def test_training_fromfile(self):
    with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file:
        utils.save_as_line_sentence(sentences, corpus_file)

        model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
        model.build_vocab(corpus_file=corpus_file)
        self.model_sanity(model)

        model.train(corpus_file=corpus_file, total_words=model.corpus_total_words, epochs=model.iter)
        sims = model.most_similar('graph', topn=10)

        self.assertEqual(model.wv.syn0.shape, (12, 10))
        self.assertEqual(len(model.wv.vocab), 12)
        self.assertEqual(model.wv.syn0_vocab.shape[1], 10)
        self.assertEqual(model.wv.syn0_ngrams.shape[1], 10)
        self.model_sanity(model)

        # test querying for "most similar" by vector
        graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index]
        sims2 = model.most_similar(positive=[graph_vector], topn=11)
        sims2 = [(w, sim) for w, sim in sims2 if w != 'graph']  # ignore 'graph' itself
        self.assertEqual(sims, sims2)

        # verify in-vocab and oov-word vector retrieval
        invocab_vec = model['minors']  # in-vocab word
        self.assertEqual(len(invocab_vec), 10)

        oov_vec = model['minor']  # oov word
        self.assertEqual(len(oov_vec), 10)
def load_file(title, out_path):
    path = '../../../datasets/newspapers_clean/{}'.format(title)
    print(path)
    allFiles = glob.glob(path + "/articles/*.tsv")
    print(allFiles)
    for f in allFiles:
        df = pd.read_csv(f, delimiter='\t', parse_dates=True)
        df = df.dropna(subset=['ocr'])  # remove rows with an empty ocr field
        # remove duplicate header rows
        df = df[~df['date'].str.contains('date')]
        # remove rows that contain an error message
        excludes = ['objecttype', 'file directory not found']
        df = df[~df['ocr'].astype(str).str.contains('|'.join(excludes))]
        df['date'] = pd.to_datetime(df['date'])
        # use positional indexing: label 0 may have been dropped by the filters above
        year = df['date'].dt.year.iloc[0]
        print('making sentences: {}'.format(year))
        df['ocr'] = df['ocr'].apply(lambda x: unidecode.unidecode(x))
        docs = df['ocr'].values
        CORPUS_FILE = out_path + '/{}_{}.txt'.format(title, year)
        save_as_line_sentence(process_corpus(docs), CORPUS_FILE)
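# A minimal follow-up sketch (not part of the original script; the file name
# below is a hypothetical placeholder): each yearly file written by load_file
# can be consumed directly through gensim's corpus_file training path, which
# avoids re-tokenizing in Python. Assumes the gensim 3.x API used elsewhere
# in this collection (`size` rather than `vector_size`).
from gensim.models import Word2Vec

yearly_model = Word2Vec(corpus_file='corpora/sometitle_1950.txt', size=100, workers=4)
yearly_model.save('models/sometitle_1950.model')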
def test_sg_neg_training_fromfile(self):
    with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file:
        model_gensim = FT_gensim(
            size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=0, negative=5,
            min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3,
            min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0)

        lee_data = LineSentence(datapath('lee_background.cor'))
        utils.save_as_line_sentence(lee_data, corpus_file)

        model_gensim.build_vocab(corpus_file=corpus_file)
        orig0 = np.copy(model_gensim.wv.vectors[0])
        model_gensim.train(corpus_file=corpus_file,
                           total_words=model_gensim.corpus_total_words,
                           epochs=model_gensim.epochs)
        self.assertFalse((orig0 == model_gensim.wv.vectors[0]).all())  # vector should vary after training

        sims_gensim = model_gensim.wv.most_similar('night', topn=10)
        sims_gensim_words = [word for (word, distance) in sims_gensim]  # get similar words
        expected_sims_words = [
            u'night.', u'night,', u'eight', u'overnight', u'overnight.',
            u'month', u'land', u'firm', u'singles', u'death']
        overlap_count = len(set(sims_gensim_words).intersection(expected_sims_words))
        self.assertGreaterEqual(overlap_count, 2)
def process_and_save(self):
    logging.info('Start processing of file.')
    try:
        text = codecs.open(os.path.join(tpath, self.input), 'r',
                           encoding='utf-8', errors='ignore').readlines()
        text = remove_punctuation(text)
        text = remove_double_spaces(text)
        text = remove_noisy_digits(text)
        text = remove_dash_and_minus_signs(text)
        text = replace_digits(text)
        text = remove_double_spaces(text)
        text = reduce_numerical_sequences(text)
        text = filter_doc(text)
        text = [removeGermanChainWords(line) for line in text]
        logging.info('Chainword splitting finished')
        text = [remove_hyphens(line) for line in text]
        text = [lemmatizer.lemmatize(line) for line in text]
        logging.info('Lemmatizing finished')
        text = [lowercase(line) for line in text]
        text = [removeUmlauts(line) for line in text]
        text = [harmonizeSpelling(line) for line in text]
        if self.input.endswith('.txt'):
            save_as_line_sentence(text, f'{self.input[:-4]}_processed.txt')
        else:
            save_as_line_sentence(text, f'{self.input}_processed.txt')
        logging.info('Processing finished')
    except FileNotFoundError:
        print('File was not found.')
def test_save_as_line_sentence_ru(self):
    corpus_file = get_tmpfile('gensim_utils.tst')
    ref_sentences = [l.split() for l in utils.any2unicode('привет мир\nкак ты поживаешь').split('\n')]
    utils.save_as_line_sentence(ref_sentences, corpus_file)

    with utils.smart_open(corpus_file, encoding='utf8') as fin:
        sentences = [line.strip().split() for line in fin.read().strip().split('\n')]
    self.assertEqual(sentences, ref_sentences)
def test_save_as_line_sentence_ru(self):
    corpus_file = get_tmpfile('gensim_utils.tst')
    ref_sentences = [l.split() for l in utils.any2unicode('привет мир\nкак ты поживаешь').split('\n')]
    utils.save_as_line_sentence(ref_sentences, corpus_file)

    with utils.open(corpus_file, 'rb', encoding='utf8') as fin:
        sentences = [line.strip().split() for line in fin.read().strip().split('\n')]
    self.assertEqual(sentences, ref_sentences)
def test_save_as_line_sentence_en(self):
    corpus_file = get_tmpfile('gensim_utils.tst')
    ref_sentences = [l.split() for l in utils.any2unicode('hello world\nhow are you').split('\n')]
    utils.save_as_line_sentence(ref_sentences, corpus_file)

    with utils.smart_open(corpus_file, encoding='utf8') as fin:
        sentences = [line.strip().split() for line in fin.read().strip().split('\n')]
    self.assertEqual(sentences, ref_sentences)
def test_save_as_line_sentence_en(self):
    corpus_file = get_tmpfile('gensim_utils.tst')
    ref_sentences = [l.split() for l in utils.any2unicode('hello world\nhow are you').split('\n')]
    utils.save_as_line_sentence(ref_sentences, corpus_file)

    with utils.open(corpus_file, 'rb', encoding='utf8') as fin:
        sentences = [line.strip().split() for line in fin.read().strip().split('\n')]
    self.assertEqual(sentences, ref_sentences)
def test_online_learning_fromfile(self):
    with temporary_file(get_tmpfile('gensim_fasttext1.tst')) as corpus_file, \
            temporary_file(get_tmpfile('gensim_fasttext2.tst')) as new_corpus_file:
        utils.save_as_line_sentence(sentences, corpus_file)
        utils.save_as_line_sentence(new_sentences, new_corpus_file)

        model_hs = FT_gensim(corpus_file=corpus_file, size=10, min_count=1, seed=42, hs=1, negative=0)
        self.assertEqual(len(model_hs.wv.vocab), 12)
        self.assertEqual(model_hs.wv.vocab['graph'].count, 3)

        model_hs.build_vocab(corpus_file=new_corpus_file, update=True)  # update vocab
        self.assertEqual(len(model_hs.wv.vocab), 14)
        self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
        self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
def test_persistence_fromfile(self):
    with temporary_file(get_tmpfile('gensim_fasttext1.tst')) as corpus_file:
        utils.save_as_line_sentence(sentences, corpus_file)

        tmpf = get_tmpfile('gensim_fasttext.tst')
        model = FT_gensim(corpus_file=corpus_file, min_count=1)
        model.save(tmpf)
        self.models_equal(model, FT_gensim.load(tmpf))

        # test persistence of the KeyedVectors of a model
        wv = model.wv
        wv.save(tmpf)
        loaded_wv = FastTextKeyedVectors.load(tmpf)
        self.assertTrue(np.allclose(wv.syn0_ngrams, loaded_wv.syn0_ngrams))
        self.assertEqual(len(wv.vocab), len(loaded_wv.vocab))
def test_online_learning_after_save_fromfile(self):
    with temporary_file(get_tmpfile('gensim_fasttext1.tst')) as corpus_file, \
            temporary_file(get_tmpfile('gensim_fasttext2.tst')) as new_corpus_file:
        utils.save_as_line_sentence(sentences, corpus_file)
        utils.save_as_line_sentence(new_sentences, new_corpus_file)

        tmpf = get_tmpfile('gensim_fasttext.tst')
        model_neg = FT_gensim(corpus_file=corpus_file, size=10, min_count=0, seed=42, hs=0, negative=5)
        model_neg.save(tmpf)
        model_neg = FT_gensim.load(tmpf)
        self.assertEqual(len(model_neg.wv.vocab), 12)

        model_neg.build_vocab(corpus_file=new_corpus_file, update=True)  # update vocab
        model_neg.train(corpus_file=new_corpus_file,
                        total_words=model_neg.corpus_total_words,
                        epochs=model_neg.iter)
        self.assertEqual(len(model_neg.wv.vocab), 14)
def process_and_save(self):
    logging.info(f'Start processing of files from folder {self.dirname}')
    i = 0
    files_total = len(os.listdir(self.dirname))
    logging.info(f'{files_total} files were found.')
    border = round(files_total / 10)
    for num in range(1, files_total + 1):
        if not os.path.isfile(os.path.join(f'{self.dirname}_processed', f'{num}_sents.txt')):
            try:
                text = codecs.open(os.path.join(self.dirname, f'{num}_sents.txt'),
                                   'r', encoding='utf-8').readlines()
                # Steps applied only when kind == 'BRD': they have already been
                # applied to the Reichstag protocols when extracting them from
                # the original documents.
                if self.kind == 'BRD':
                    regex_patterns = bundestag_patterns()
                    text = remove_punctuation(text)
                    text = remove_double_spaces(text)
                    text = extract_protocol_bundestag(text, *regex_patterns)
                    text = remove_linebreaks(text)
                if self.kind == 'BRD':
                    text = remove_noisy_digits(text)
                text = remove_dash_and_minus_signs(text)
                text = replace_digits(text)
                text = remove_double_spaces(text)
                text = reduce_numerical_sequences(text)
                text = filter_doc(text)
                text = [remove_german_chainwords(line) for line in text]
                text = [remove_hyphens(line) for line in text]
                text = [lemmatizer.lemmatize(line) for line in text]
                text = [lowercase(line) for line in text]
                text = [remove_umlauts(line) for line in text]
                text = [harmonizeSpelling(line, spelling_dict) for line in text]
                save_as_line_sentence(text, f'{self.dirname}_processed/{num}_sents.txt')
                i += 1
                if i % border == 0:
                    logging.info('Processing {:03.1f} percent finished'.format(int((i / files_total) * 100)))
            except FileNotFoundError:
                print(f'File {num} was not found.')
def save_lee_corpus_as_line_sentence(corpus_file):
    utils.save_as_line_sentence((doc.words for doc in DocsLeeCorpus()), corpus_file)
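# A minimal round-trip sketch (not part of the original test suite): a file
# written by save_as_line_sentence can be read back with gensim's LineSentence
# iterator, which yields one list of tokens per line. The temp file name is a
# hypothetical placeholder.
from gensim.models.word2vec import LineSentence
from gensim.test.utils import get_tmpfile

tmp = get_tmpfile('lee_corpus.txt')
save_lee_corpus_as_line_sentence(tmp)
for sentence in LineSentence(tmp):
    print(sentence[:5])  # first few tokens of the first document
    break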
class MyCorpus:
    """Streams preprocessed Reddit comments.

    `text_stream` (an open file of JSON-encoded comment lines) and `remove`
    (posts to skip) are assumed to be defined elsewhere in the script.
    """

    def __iter__(self):
        word_count = 0
        comment_count = 0
        for line in text_stream:
            # If under a set number of words, then include the next comment
            if word_count < 35000000:
                line = json.loads(line)
                post = line['body']
                sub = line['subreddit']
                # Uncomment if only reading from specific subreddits
                # if sub in subs:
                if post not in remove:
                    # Process post as required for a word2vec corpus
                    processed_post = utils.simple_preprocess(post)
                    comment_size = len(processed_post)
                    # Ensure each comment reaches the length threshold
                    if comment_size >= 10:
                        # Increase word and comment counts
                        comment_count += 1
                        word_count += comment_size
                        yield processed_post
            else:
                break
        # Print counts upon completion
        print('Number of comments in corpus: {}'.format(comment_count))
        print('Number of total words in corpus: {}'.format(word_count))


corpus = MyCorpus()
# Save corpus as a line_sentence file for a word2vec model to be built from
utils.save_as_line_sentence(corpus, r"C:\Users\Eric\Documents\COG 403\Project\Reddit\Data\Comments\RC_2019-09-news.txt")
class MyCorpus:
    """Streams preprocessed sentences from the CNN stories dataset."""

    def __iter__(self):
        directory = r'C:\Users\Eric\Documents\COG 403\Project\Reddit\Data\CNN\cnn\stories'
        story_count = 0
        word_count = 0
        for filename in os.listdir(directory):
            # Stop once the corpus reaches the word budget
            if word_count >= 35000000:
                break
            name = directory + '\\' + filename
            f = open(name, encoding='utf-8')
            f.readline()
            line = f.readline()
            while line == '\n':
                line = f.readline()
            # Read story lines until the first '@highlight' marker;
            # guard against EOF so line[0] cannot raise IndexError
            while line and line[0] != '@':
                processed_line = utils.simple_preprocess(line)
                word_count += len(processed_line)
                yield processed_line
                line = f.readline()
                while line == '\n':
                    line = f.readline()
            story_count += 1
        print('Number of stories in corpus: {}'.format(story_count))
        print('Number of total words in corpus: {}'.format(word_count))


corpus = MyCorpus()
# Save corpus as a line_sentence file for a word2vec model to be built from
utils.save_as_line_sentence(
    corpus, r"C:\Users\Eric\Documents\COG 403\Project\Reddit\Data\CNN\cnn_corpus.txt")
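# A minimal follow-up sketch (not part of the original script): the saved
# line-sentence file can be fed straight to Word2Vec through corpus_file,
# which trains from disk without a Python iterable. Assumes the gensim 3.x
# API used in the tests above; the query word 'news' is illustrative only.
from gensim.models import Word2Vec

w2v = Word2Vec(
    corpus_file=r"C:\Users\Eric\Documents\COG 403\Project\Reddit\Data\CNN\cnn_corpus.txt",
    size=100, window=5, min_count=5, workers=4)
print(w2v.wv.most_similar('news', topn=5))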