def test_process_sentence_russian(self):
    """Check `_process_sentence` on a Russian sentence with lemma=True.

    The expected output is the list of lowercase lemmas, one per word,
    with the trailing full stop absent.
    """
    generator = gensent.SentenceGenerator(language='russian', lemma=True)
    expected = [
        'три',
        'девица',
        'под',
        'окно',
        'прясть',
        'поздно',
        'вечерок',
    ]
    actual = generator._process_sentence(
        "Три девицы под окном Пряли поздно вечерком.")
    self.assertEqual(actual, expected)
def test_dutch(self):
    """Check `_process_sentence` on a Dutch sentence with lemma=True.

    The expected tokens are lowercase, and 'ga' is expected to come back
    as 'gaan'.
    """
    generator = gensent.SentenceGenerator(language='dutch', lemma=True)
    expected = ['ik', 'gaan', 'naar', 'buiten', 'toe']
    actual = generator._process_sentence("Ik ga naar buiten toe")
    self.assertEqual(actual, expected)
def test_process_numbers(self):
    """Check that a decimal number is replaced by the generator's NUM value."""
    generator = gensent.SentenceGenerator()
    actual = generator._process_sentence("Pi is 3.14159")
    # The literal "3.14159" is expected to collapse into the NUM placeholder.
    expected = ['pi', 'is', generator.NUM]
    self.assertEqual(actual, expected)
def test_process_EU_money(self):
    """Check that a euro amount is replaced by the generator's NUM value."""
    generator = gensent.SentenceGenerator()
    actual = generator._process_sentence("Breakfast cost me €5.60")
    # "€5.60" (currency sign included) is expected to become a single NUM.
    expected = ['breakfast', 'cost', 'me', generator.NUM]
    self.assertEqual(actual, expected)
def test_process_sentence(self):
    """Check `_process_sentence` on the first fixture sentence.

    The input comes from `self.sentence_list[0]`; the expected tokens are
    lowercase and the hyphenated word 'sam-i-am' stays in one piece.
    """
    generator = gensent.SentenceGenerator()
    first_sentence = self.sentence_list[0]
    actual = generator._process_sentence(first_sentence)
    expected = ['i', 'am', 'sam', 'sam-i-am']
    self.assertEqual(actual, expected)
def test_two_passes(self):
    """Verify the sentence generator can be iterated over more than once."""
    generator = gensent.SentenceGenerator()
    generator.read_sentence_list(self.sentence_list)
    # Every list() call makes one complete pass over the iterator, so two
    # calls in a row give us the two passes we want to compare.
    first_pass = list(generator)
    second_pass = list(generator)
    self.assertEqual(first_pass, second_pass)
def test_generator_unprepared(self):
    """Make sure an unprepared sentence generator throws an error.

    The generator is created without any sentence source being loaded,
    and producing sentences from it is expected to raise.
    """
    sentences = gensent.SentenceGenerator()
    # The work must happen *inside* the assertRaises context. Calling
    # `sentences._gen_sentences()` in the argument list (as this test used
    # to) runs it before assertRaises is entered, so the assertion only
    # ever saw the call's return value, never the call itself.
    with self.assertRaises(Exception):
        # list() forces iteration: if `_gen_sentences` is a generator
        # function (its definition is not visible here), its body does not
        # execute until the first item is requested.
        list(sentences._gen_sentences())
action='store_true', default=False, help='lemmatize the sentences before training word vectors')
    args = parser.parse_args()
    return args


# Parse the command-line options once, at import/run time.
args = parse_args()

print('Working on Dutch...')
# Wall-clock timer covering corpus reading, training and saving.
start_time = time.time()

# The Dutch corpus is read from the 'nl' subdirectory of the data directory.
nl_direc = os.path.join(args.data_dir, 'nl')
nl_sents = gensent.SentenceGenerator(language='dutch', lemma=args.lemma, cstlemma_dir=args.cstlemma_dir)
nl_sents.read_directory(nl_direc)

# Train Word2Vec on the sentence generator; the hyperparameters come from
# the project's w2vconfig.gensim_config mapping.
nl_model = gensim.models.Word2Vec(nl_sents, **w2vconfig.gensim_config)
nl_vectors = nl_model.wv

print('Dutch word tokens: {}'.format(nl_sents.word_token_count))
# NOTE(review): `wv.vocab` looks tied to older gensim releases -- confirm
# the gensim version this script targets before upgrading.
print('Dutch vocab size: {}'.format(len(nl_model.wv.vocab)))

# The output file name records whether lemmatization was switched on.
if args.lemma:
    nl_vectors_fp = os.path.join(args.vectors_dir, 'nl_vectors_lemma.txt')
else:
    nl_vectors_fp = os.path.join(args.vectors_dir, 'nl_vectors_nolemma.txt')
# binary=False writes the vectors in the plain-text word2vec format.
nl_vectors.save_word2vec_format(nl_vectors_fp, binary=False)

elapsed_time = time.time() - start_time
print('Elapsed time:', elapsed_time)