def get_bigram_phraser(directory):
    if os.path.isfile(BIGRAM):
        return Phraser.load(BIGRAM)
    else:
        bigram = Phraser(Phrases(corpus(directory)))
        bigram.save(BIGRAM)
        return bigram
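# A minimal sketch of the helpers this snippet assumes: a BIGRAM save path and a
# corpus(directory) generator yielding token lists. Both names are assumptions,
# not part of the original code.
import os
from gensim.models.phrases import Phrases, Phraser
from gensim.utils import simple_preprocess

BIGRAM = "bigram.model"  # assumed save path

def corpus(directory):
    # yield one tokenized sentence per line of each file in the directory
    for name in os.listdir(directory):
        with open(os.path.join(directory, name), encoding="utf-8") as f:
            for line in f:
                yield simple_preprocess(line)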
def train(args):
    # Log progress during training
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    # Use the text8 corpus as training data; haikus don't provide sufficient context
    training_data = api.load('text8')
    # Use the phrase model to recognize bigrams like "White House" or "Climate Change"
    bigram_model = Phrases(training_data)
    # Export the trained model: uses less RAM and is faster, but model updates are no longer possible
    bigrams = Phraser(bigram_model)
    # Create and train the model
    model = Word2Vec(bigrams[training_data], size=args.embedding_dim)
    word_list = list(model.wv.vocab.keys())
    vector_list = [model.wv[word] for word in word_list]
    # The basic model doesn't seem to support item assignment,
    # but WordEmbeddingsKeyedVectors does
    kv = WordEmbeddingsKeyedVectors(args.embedding_dim)
    kv.add(word_list, vector_list)
    kv.add(["<eos>", "<n>", "<unk>"], np.random.rand(3, args.embedding_dim))
    # Just to be safe, clear the cache of normalized vectors;
    # see https://github.com/RaRe-Technologies/gensim/issues/2532
    del kv.vectors_norm
    # Save the new models
    bigrams.save(f"{args.model_path}/bigram.model")
    kv.save(f"{args.model_path}/word2vec.model")
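# A minimal sketch (assumed paths) of reloading the two artifacts saved by
# train() above; "models" stands in for args.model_path.
from gensim.models import KeyedVectors
from gensim.models.phrases import Phraser

bigrams = Phraser.load("models/bigram.model")
kv = KeyedVectors.load("models/word2vec.model")
tokens = bigrams["the white house said".split()]
print(tokens)  # e.g. ['the', 'white_house', 'said'] if the bigram was learned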
def fit(self, sentencesPath):
    """
    Train phrasers.
    :param sentencesPath: path of the text file; the file should contain one sentence per line
    """
    self.phrasers = []
    # check that all target directories exist
    for path in self.savePhraserPaths:
        if not os.path.exists(os.path.dirname(path)):
            raise FileNotFoundError(os.path.dirname(path) + " does not exist")
    for path in self.savePhraserPaths:
        if not os.path.exists(path):  # at least one model is missing, so retrain
            self.phrasers = None
            break
    if self.phrasers is not None and not self.file_overwrite:
        logging.info("models already exist, loading them")
        for path in self.savePhraserPaths:
            self.phrasers.append(Phraser.load(path))
        return True
    self.phrasers = []
    c = 2
    for path in self.savePhraserPaths:
        logging.info("getting %d-gram phrases......" % c)
        c += 1
        phraser = Phraser(
            Phrases(sentences=TxtIter(sentences=codecs.open(sentencesPath, mode="r", encoding="utf-8"),
                                      ngrams=self.phrasers),
                    min_count=self.min_count, threshold=self.threshold,
                    max_vocab_size=self.max_vocab_size, delimiter=self.delimiter,
                    scoring=self.scoring))
        phraser.save(path)
        self.phrasers.append(phraser)
def train_w2v_model() -> (Phraser, Word2Vec):
    # Build the Word2Vec model
    if not Path(model_file).exists():
        sent = [row.split() for row in df['clean_lyrics'] if row]
        # Build collocations
        if not Path(bigrams_file).exists():
            bigram_phrases = Phrases(sent, min_count=30, progress_per=10000,
                                     max_vocab_size=200000, common_terms=sentiment_terms)
            bigram = Phraser(bigram_phrases)
            bigram.save(bigrams_file)
            trigram_phrases = Phrases(bigram[sent], min_count=30, progress_per=10000,
                                      max_vocab_size=200000, common_terms=sentiment_terms)
            trigram = Phraser(trigram_phrases)
            trigram.save(trigrams_file)
        # the file was saved from a Phraser, so load it with Phraser.load
        trigram = Phraser.load(trigrams_file)
        sentences = trigram[sent]
        cores = multiprocessing.cpu_count()
        w2v_model = Word2Vec(
            min_count=20,  # remove rare words
            window=2,
            size=300,
            sample=6e-5,
            alpha=0.03,
            min_alpha=0.0007,
            negative=20,
            workers=cores - 1)
        t = time()
        w2v_model.build_vocab(sentences, progress_per=10000)
        print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))
        w2v_model.vocabulary.save(vocabulary_file)
        t = time()
        w2v_model.train(sentences, total_examples=w2v_model.corpus_count,
                        epochs=30, report_delay=1)
        print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))
        w2v_model.save(model_file)
    trigram = Phraser.load(trigrams_file)
    w2v_model = Word2Vec.load(model_file)
    return trigram, w2v_model
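# Usage sketch: train_w2v_model() returns the trigram Phraser plus the trained
# Word2Vec model; df, model_file, trigrams_file and vocabulary_file are assumed
# to be module-level globals, as in the snippet above.
trigram, w2v_model = train_w2v_model()
phrased = trigram["i love rock and roll".split()]
print(w2v_model.wv.most_similar("love", topn=5))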
def testSaveLoad(self):
    """Saving and loading a Phraser object."""
    with temporary_file("test.pkl") as fpath:
        bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1))
        bigram.save(fpath)
        bigram_loaded = Phraser.load(fpath)
        self.assertEqual(
            bigram_loaded[['graph', 'minors', 'survey', 'human', 'interface', 'system']],
            ['graph_minors', 'survey', 'human_interface', 'system'])
def make_bi_tri(paths, tri=False, bigram_path="bigram.model", trigram_path="trigram.model"):
    # save() requires a target path; the default filenames here are placeholders
    sentences = PathLineSentences(paths)
    phrases = Phrases(sentences)
    bigram = Phraser(phrases)
    bigram.save(bigram_path)
    if tri:
        triphrases = Phrases(bigram[sentences])
        trigram = Phraser(triphrases)
        trigram.save(trigram_path)
def get_trigram_phraser(directory):
    if os.path.isfile(TRIGRAM):
        return Phraser.load(TRIGRAM)
    else:
        bigram = get_bigram_phraser(directory)
        sentence_stream = (bigram[sentence] for sentence in corpus(directory))
        trigram = Phraser(Phrases(sentence_stream))
        trigram.save(TRIGRAM)
        return trigram
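# Usage sketch: the trigram phraser expects bigram-joined input, so chain both
# cached phrasers; "corpus_dir" is a placeholder directory.
bigram = get_bigram_phraser("corpus_dir")
trigram = get_trigram_phraser("corpus_dir")
print(trigram[bigram["new york stock exchange opened higher".split()]])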
def testSaveLoadCustomScorer(self):
    """Saving and loading a Phraser object with a custom scorer."""
    with temporary_file("test.pkl") as fpath:
        bigram = Phraser(
            Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer))
        bigram.save(fpath)
        bigram_loaded = Phraser.load(fpath)
        # we don't do much with scoring here, just verify it's the one expected
        self.assertEqual(bigram_loaded.scoring, dumb_scorer)
def make_trigrams(
        sentences: Iterable,
        save_model_path: Path,
        **phrases_kw
):
    """Train a gensim trigram phraser and save it."""
    bigram = Phrases(sentences, **phrases_kw)
    bigram_phraser = Phraser(bigram)
    tokens = bigram_phraser[sentences]
    trigram = Phrases(tokens, delimiter=b" ")
    trigram_phraser = Phraser(trigram)
    trigram_phraser.save(str(save_model_path))
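# Usage sketch (hypothetical path): note that only the trigram phraser is saved
# above, so at inference time incoming tokens must already be bigram-joined the
# same way, or the bigram phraser must be retrained or saved separately.
from gensim.models.phrases import Phraser

trigram_phraser = Phraser.load("models/trigram.phraser")
print(trigram_phraser[["new york", "stock", "exchange"]])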
def mk_bigrams():
    with open(dump_base + "judgments", 'r', encoding="utf-8") as f:
        judgments = f.read()
    sentences = [list(gensim.utils.simple_tokenize(s))
                 for s in textcleaner.split_sentences(judgments)]
    bigramer = Phraser(Phrases(sentences))
    bigramer.save(dump_base + "bigramer")
    return [bigramer[s] for s in sentences]
def ngram_model_to_disk(sents, output_fp):
    '''
    Helper function that saves an n-gram model to disk and returns it for further use.
    '''
    ngrams = Phrases(sents, min_count=40, common_terms=frozenset(en.STOP_WORDS))
    ngram_phraser = Phraser(ngrams)
    ngram_phraser.save(output_fp)
    return ngram_phraser
def train_phraser(sentence_stream, stopword_list, threshold, model_path, save_prefix):
    phrases_model = Phrases(sentence_stream, common_terms=stopword_list, threshold=threshold)
    phrases_model.save(os.path.join(model_path, '{}_phrases.bin'.format(save_prefix)))
    phraser_model = Phraser(phrases_model)
    phraser_model.save(os.path.join(model_path, '{}_phraser.bin'.format(save_prefix)))
    return phraser_model
def create_n_grams(text):
    bigram = Phrases(text, min_count=20, threshold=10, delimiter=b' ')
    bigram_phraser = Phraser(bigram)
    bigram_phraser.save("./bigram_model.pkl")
    tokens_bigram = bigram_phraser[text]
    trigram = Phrases(tokens_bigram, min_count=10, threshold=10, delimiter=b' ')
    trigram_phraser = Phraser(trigram)
    trigram_phraser.save("./trigram_model.pkl")
    tokens_trigram = trigram_phraser[tokens_bigram]
    return tokens_bigram, tokens_trigram
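# Usage sketch for create_n_grams(); both return values are lazy transformed
# corpora, so materialize them with list() when needed.
docs = [["new", "york", "city", "is", "big"],
        ["san", "francisco", "bay", "area", "is", "foggy"]]
tokens_bigram, tokens_trigram = create_n_grams(docs)
print(list(tokens_trigram))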
class GramFacade:
    def __init__(self, model_dir, min_count_bigrams=8, min_count_trigrams=7):
        self.model_dir = model_dir
        self.min_count_bigrams = min_count_bigrams
        self.min_count_trigrams = min_count_trigrams

    def load_models(self):
        self.bigrams_phraser = Phraser.load(self.model_dir + '/' + BIGRAMS_PHRASER_FILENAME)
        self.trigrams_phraser = Phraser.load(self.model_dir + '/' + TRIGRAMS_PHRASER_FILENAME)

    def load_phrases(self):
        self.bigrams_phrases = Phrases.load(self.model_dir + '/' + BIGRAMS_PHRASES_FILENAME)
        self.trigrams_phrases = Phrases.load(self.model_dir + '/' + TRIGRAMS_PHRASES_FILENAME)

    def export_bigrams(self, docs):
        return [self.bigrams_phraser[doc] for doc in docs]

    def export_trigrams(self, bigrams):
        return [self.trigrams_phraser[bigram] for bigram in bigrams]

    def phrase(self, doc):
        bigrams = self.bigrams_phraser[doc]
        trigrams = self.trigrams_phraser[bigrams]
        return trigrams

    def create_model(self, doc_list):
        self.bigrams_phrases = Phrases(doc_list, min_count=self.min_count_bigrams)
        self.bigrams_phraser = Phraser(self.bigrams_phrases)
        self.trigrams_phrases = Phrases(self.bigrams_phraser[doc_list],
                                        min_count=self.min_count_trigrams)
        self.trigrams_phraser = Phraser(self.trigrams_phrases)
        self.bigrams_phraser.save(self.model_dir + '/' + BIGRAMS_PHRASER_FILENAME)
        self.trigrams_phraser.save(self.model_dir + '/' + TRIGRAMS_PHRASER_FILENAME)
        self.bigrams_phrases.save(self.model_dir + '/' + BIGRAMS_PHRASES_FILENAME)
        self.trigrams_phrases.save(self.model_dir + '/' + TRIGRAMS_PHRASES_FILENAME)

    def words_not_in_vocab(self, tok_doc, threshold):
        word_not_in_doc = set(
            x for x in tok_doc
            if self.trigrams_phrases.vocab[str.encode(x)] < threshold)
        return word_not_in_doc
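# Usage sketch for this GramFacade variant; model_dir must exist and the
# *_FILENAME constants are assumed to be defined at module level.
facade = GramFacade(model_dir="models")
facade.create_model([["new", "york", "city", "marathon"]] * 20)
print(facade.phrase(["new", "york", "city", "marathon"]))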
def build_phraser(self, threshold: int = None):
    tokens = ReadThreads(self.board, self.input_dir,
                         return_func=lambda x, y: y.split())
    bigram = Phrases(tokens, min_count=5, threshold=threshold)
    trigram = Phrases(bigram[tokens], threshold=threshold)
    bigram_mod = Phraser(bigram)
    trigram_mod = Phraser(trigram)
    filename = op.join(self.input_dir, f'{self.board}.bigrams')
    bigram_mod.save(filename)
    filename = op.join(self.input_dir, f'{self.board}.trigrams')
    trigram_mod.save(filename)
    return trigram_mod
def phrase_detect_train(sentences, min_count, threshold, common_terms,
                        phrase_model_save_path=None):
    """
    input:
        sentences: tokenized sentences
    """
    print('Transforming sentences to trigrams .........\n')
    bi_phrases = Phrases(sentences, min_count=min_count, threshold=threshold,
                         common_terms=common_terms)
    bigram_transformer = Phraser(bi_phrases)
    if phrase_model_save_path is not None:
        bi_phrases.save(phrase_model_save_path)
        bigram_transformer.save(phrase_model_save_path + '_transformer')
    sentences = list(bigram_transformer[sentences])
    # if you want to inspect the detected phrases
    phrases_list = list(bigram_transformer.phrasegrams)
    print('Phrase model training done.')
    return sentences
def build_ngram_model(docs):
    bigram_model_path = Path('bigram_phraser.pkl')
    trigram_model_path = Path('trigram_phraser.pkl')
    if not bigram_model_path.exists() or not trigram_model_path.exists():
        print('Building n-gram models')
        bigram = Phrases(docs, min_count=3, threshold=6)
        trigram = Phrases(bigram[docs], min_count=3, threshold=6)
        bigram_model = Phraser(bigram)
        trigram_model = Phraser(trigram)
        bigram_model.save(str(bigram_model_path))
        trigram_model.save(str(trigram_model_path))
    else:
        print('Loading saved n-gram models')
        bigram_model = Phraser.load(str(bigram_model_path))
        trigram_model = Phraser.load(str(trigram_model_path))
    return (bigram_model, trigram_model)
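# Usage sketch: build (or reload) both phrasers once, then apply them in order.
docs = [["machine", "learning", "is", "fun"]] * 10
bigram_model, trigram_model = build_ngram_model(docs)
phrased_docs = [trigram_model[bigram_model[doc]] for doc in docs]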
def get_phraser(self):
    """Get the trained phraser, or train a new phraser to extract the phrases."""
    phraser_path = os.path.join(self.data_path, self.phraser_name)
    if os.path.isfile(phraser_path):
        return Phraser.load(phraser_path)
    challenge_req = self.get_challenge_req()
    sentences = [tokenize_str(req) for cha_id, req in challenge_req.itertuples()]
    phrases = Phrases(sentences=sentences, min_count=1, threshold=0.2,
                      common_terms=TC_STOP_WORDS, scoring='npmi')
    trained_phraser = Phraser(phrases)
    trained_phraser.save(phraser_path)
    return trained_phraser
def extract_phrases(df: pd.DataFrame):
    """
    Train bigram and trigram phrasers.
    Input:
        - df: dataframe with a "text" column
    """
    def wrapper(generator):
        for item in generator:
            yield item.text.split(" ")

    bigram_phrases = Phrases(wrapper(df.itertuples()), min_count=5, threshold=1)
    bigram = Phraser(bigram_phrases)
    trigram_phrases = Phrases(bigram[wrapper(df.itertuples())], min_count=5, threshold=1)
    trigram = Phraser(trigram_phrases)
    bigram.save("./vocab/bigram")
    trigram.save("./vocab/trigram")
def BuildPhraser(save_to_file=True,
                 model_file_name=os.getcwd() + "/models/" + "bigram_model.pkl",
                 min_count=10, threshold=.7, common_terms=STOPWORDS,
                 training_data=None):
    # Load training data.
    sentences = Text8Corpus(training_data)
    # Train the bigram model.
    phrases = Phrases(sentences, min_count=min_count, threshold=threshold,
                      common_terms=common_terms)
    # Export the trained model: uses less RAM and is faster, but model updates are no longer possible.
    bigram_model = Phraser(phrases)
    # Save the model to file.
    if save_to_file:
        bigram_model.save(fname_or_handle=model_file_name)
    return bigram_model
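# Usage sketch; training_data should point at a text8-format file on disk, and
# the path here is only a placeholder.
phraser = BuildPhraser(save_to_file=False, training_data="Data/text8")
print(phraser["the united states of america".split()])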
def build_phrase_model():
    phrase_list = load_phrases()
    phrases = Phrases(Corpus(CORPUS_FILE))
    bigrams = Phraser(phrases)
    bigrams.save(MODEL_FILE)
    years = Corpus(CORPUS_FILE).get_years()
    authors = Corpus(CORPUS_FILE).get_authors()
    with open(OUT_FILE, "w") as f:
        for i, line in tqdm(enumerate(bigrams[Corpus(CORPUS_FILE)])):
            line = remove_under(line)
            line = check_phrase_list(phrase_list, line)
            line = [authors[i]] + line
            line = [years[i]] + line
            f.write("{}\n".format(" ".join(remove_under(line))))
def compute_bigram(self):
    '''
    Find and save bigrams occurring among the tweets.
    :update: [covid_tweets].[token_tweets]
    '''
    print("Computing bigram.")
    cnxn = sqlite3.connect("covid_tweets.db")
    cursor = cnxn.cursor()
    count_query = '''SELECT count(tweet_id) FROM token_tweets
                     WHERE date = ?'''
    cursor.execute(count_query, (self.date, ))
    num_tweets = cursor.fetchone()[0]
    print(self.date, num_tweets, "to have bigram computed.")
    query = '''SELECT tweet_id, tokenized_tweet FROM token_tweets
               WHERE date = ?'''
    cursor.execute(query, (self.date, ))
    results = cursor.fetchall()
    cnxn.close()
    retokenized_tweets = []
    for tweet_id, tokenized_tweet in results:
        tweet_tokens = tokenized_tweet.split(" ")
        retokenized_tweets.append(tweet_tokens)
    phrases = Phrases(retokenized_tweets, min_count=self.b_min)
    bigram = Phraser(phrases)
    bigram.save(f"./tmp/{self.date}_bigram_model_{self.b_min}.pkl")
    print("Bigram computed.")
def main():
    sentence_stream = []
    start = timeit.default_timer()
    print('start the reuters')
    extract("/home/huicheng/Documents/datas/ReutersNews106521", sentence_stream)
    print(len(sentence_stream))
    print('start the bloombergs')
    # extract("/home/huicheng/Documents/datas/20061020_20131126_bloomberg_news", sentence_stream)
    print(len(sentence_stream))
    print('start ours')
    # new(sentence_stream)
    print('before:{}'.format(len(sentence_stream)))
    print(timeit.default_timer() - start)
    start = timeit.default_timer()
    sentence_stream = list(filter(None, sentence_stream))
    print('after:{}'.format(len(sentence_stream)))
    print(timeit.default_timer() - start)
    print('generating phrase and word2vec')
    start = timeit.default_timer()
    os.chdir("/home/huicheng/Documents/datas/")
    with open("sentence.csv", "w") as f:
        writer = csv.writer(f)
        writer.writerows(sentence_stream)
    phrases = Phrases(sentence_stream, min_count=500, threshold=2)
    bigram = Phraser(phrases)
    # print(list(bigram[sentence_stream]))
    print(bigram['u', 's', 'wall', 'st', 'wall', 'street', 's', 'p', '500', 's', 'p', 'xxx'])
    # TODO phrase
    bigram.save("big_phrase.pickle")
    print('finish phrase time:{}'.format(timeit.default_timer() - start))
    print('start trigram')
    start = timeit.default_timer()
    phrases = Phrases(bigram[sentence_stream], min_count=500, threshold=2)
    trigram = Phraser(phrases)
    trigram.save("trig_phrase.pickle")
    print(trigram[bigram['u', 's', 'wall', 'st', 'wall', 'street', 'bank', 'of',
                         'america', 's', 'p', '500', 's', 'p', 'xxx']])
    print('finish phrase time:{}'.format(timeit.default_timer() - start))
    # TODO: word2vec
def save_models(dataset_name, num_topics):
    # load inputs and labels
    dataset = pd.read_csv("../cleaned/" + dataset_name +
                          "_stems.csv").astype(str).values.tolist()
    # remove placeholders from the stems dataset
    for index, sample in enumerate(dataset):
        dataset[index] = list(filter((" ").__ne__, sample))
    # create dictionary, corpus and LDA model
    dic = gs.corpora.Dictionary(dataset)
    dic.save("../models/dictionary/" + dataset_name + "_dictionary")
    corpus = [dic.doc2bow(sample) for sample in dataset]
    lda_model = gensim.models.ldamulticore.LdaMulticore(
        corpus=corpus,
        id2word=dic,
        num_topics=num_topics,
        random_state=100,
        chunksize=100,
        passes=10,
        per_word_topics=True)  # update_every=1,
    lda_model.save("../models/topic_models/" + dataset_name + "_ldamodel")
    # FastText expects lists of tokens; passing joined strings would be
    # treated as sequences of characters, so train on the token lists directly
    vector_model = FastText(size=32, window=3, min_count=1)
    vector_model.build_vocab(dataset)
    vector_model.train(sentences=dataset,
                       total_examples=len(dataset),
                       total_words=vector_model.corpus_total_words,
                       epochs=10)
    vector_model.save("../models/word_embeddings/" + dataset_name + "_fasttext")
    # make the bigram model
    sentences = pd.read_csv("../cleaned/" + dataset_name + "_clean.csv")["t"].tolist()
    tokenized = [t.split() for t in sentences]
    phrases = Phrases(tokenized)
    bigram = Phraser(phrases)
    bigram.save("../models/bigrams/bigram_" + dataset_name + ".pkl")
class GramFacade:
    def __init__(self, model_dir, bigrams_threshold=0.88, trigrams_threshold=0.88):
        self.model_dir = model_dir
        self.bigrams_threshold = bigrams_threshold
        self.trigrams_threshold = trigrams_threshold

    def load_models(self):
        self.bigrams_phraser = Phraser.load(self.model_dir + '/' + BIGRAMS_PHRASER_FILENAME)
        self.trigrams_phraser = Phraser.load(self.model_dir + '/' + TRIGRAMS_PHRASER_FILENAME)

    def load_phrases(self):
        self.bigrams_phrases = Phrases.load(self.model_dir + '/' + BIGRAMS_PHRASES_FILENAME)
        self.trigrams_phrases = Phrases.load(self.model_dir + '/' + TRIGRAMS_PHRASES_FILENAME)

    def export_bigrams(self, docs):
        return [self.bigrams_phraser[doc] for doc in docs]

    def export_trigrams(self, bigrams):
        return [self.trigrams_phraser[bigram] for bigram in bigrams]

    def phrase(self, doc):
        bigrams = self.bigrams_phraser[doc]
        trigrams = self.trigrams_phraser[bigrams]
        return trigrams

    def create_model(self, doc_list):
        self.bigrams_phrases = Phrases(doc_list, scoring='npmi',
                                       threshold=self.bigrams_threshold)
        self.bigrams_phraser = Phraser(self.bigrams_phrases)
        self.trigrams_phrases = Phrases(self.bigrams_phraser[doc_list], scoring='npmi',
                                        threshold=self.trigrams_threshold)
        self.trigrams_phraser = Phraser(self.trigrams_phrases)
        self.bigrams_phraser.save(self.model_dir + '/' + BIGRAMS_PHRASER_FILENAME)
        self.trigrams_phraser.save(self.model_dir + '/' + TRIGRAMS_PHRASER_FILENAME)
        self.bigrams_phrases.save(self.model_dir + '/' + BIGRAMS_PHRASES_FILENAME)
        self.trigrams_phrases.save(self.model_dir + '/' + TRIGRAMS_PHRASES_FILENAME)

    def words_not_in_vocab(self, tok_doc, threshold):
        word_not_in_doc = set(
            x for x in tok_doc
            if self.trigrams_phrases.vocab[str.encode(x)] < threshold)
        return word_not_in_doc

    def retrieve_grams(self):
        pgrams = self.trigrams_phraser.phrasegrams
        gram_list = []
        for word, values in pgrams.items():
            gram = b'_'.join(word)
            count, score = values[0], values[1]
            gram_list.append({
                "gram": gram.decode("utf-8"),
                "count": count,
                "score": score
            })
        gram_sorted = sorted(gram_list, key=lambda x: x["score"], reverse=True)
        return gram_sorted
data = file.readlines()

# prepare data for the phraser
sentence_stream = [line.split(" ") for line in data]
bigram = Phrases(sentence_stream, min_count=3, threshold=5, delimiter=b'%%')
trigram = Phrases(bigram[sentence_stream], min_count=3, threshold=5, delimiter=b'%%')
bigram_phraser = Phraser(bigram)
trigram_phraser = Phraser(trigram)
# to see the detected phrases, use:
# print(trigram_phraser.phrasegrams.items())
bigram.save("phrase_bigram.model")
trigram.save("phrase_trigram.model")
bigram_phraser.save("phraser_bigram.model")
trigram_phraser.save("phraser_trigram.model")

# Test sentences
sent1 = [u'der', u'Dialog', u'im', u'Dunkeln', u'in', u'Hamburg']
sent2 = [u'Samstag', u'ist', u'die', u'Lange', u'Nacht', u'der', u'Museen']
sent3 = [u'Sonderaustellung', u'Archäologische', u'Museum', u'Hamburg', u'lustig']
sent4 = [u'Early', u'Bird', u'Ticket', u'Hamburg']
sent5 = [u'Hafen', u'City', u'Hamburg']
sent6 = [u'FC', u'St', u'Pauli', u'spielt']
sent7 = [u'das', u'Tor', u'zur', u'Welt']
sent8 = [u'Museum', u'für', u'Hamburger', u'Geschichte']
sent9 = [u'Besuch', u'im', u'Alten', u'Elbtunnel']
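# Sketch: apply the trained phrasers to the test sentences above; token pairs
# that co-occur often enough in the corpus come back joined with the '%%' delimiter.
for sent in (sent1, sent2, sent3, sent4, sent5, sent6, sent7, sent8, sent9):
    print(trigram_phraser[bigram_phraser[sent]])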
class Builder(object):
    def __init__(self, ndocs, phrase_min_count=5, vocabulary_size=10000,
                 bigram_min_count=5, bigram_threshold=10,
                 trigram_min_count=5, trigram_threshold=10,
                 substitutions=dict(), data_directory='./data',
                 model_directory='./model'):
        self.ndocs = ndocs
        self.phrase_min_count = phrase_min_count
        self.vocabulary_size = vocabulary_size
        self.bigram_min_count = bigram_min_count
        self.bigram_threshold = bigram_threshold
        self.trigram_min_count = trigram_min_count
        self.trigram_threshold = trigram_threshold
        self.substitutions = substitutions
        self.data_directory = data_directory
        self.model_directory = model_directory
        self.load_bad_phrases()

    def tokenize(self, text):
        return [token.lower() for token in word_tokenize(text)]

    def stream_sentences(self, texts, description="Streaming sentences ..."):
        with tqdm.tqdm(texts) as pbar:
            pbar.set_description(description)
            for text in pbar:
                for sentence in sent_tokenize(text):
                    yield self.tokenize(sentence)

    def load_bad_phrases(self):
        with open("%s/bad-phrases.txt" % self.data_directory,
                  mode='r', encoding='UTF-8') as fp:
            self.bad_phrases = set([phrase.strip() for phrase in fp.readlines()])

    def add_bad_phrase(self, phrase):
        self.bad_phrases.add(phrase)

    def save_bad_phrases(self):
        bad_phrases = list(self.bad_phrases)
        bad_phrases.sort()
        with open("%s/bad-phrases.txt" % self.data_directory,
                  mode='w', encoding='UTF-8') as fp:
            for phrase in bad_phrases:
                fp.write("%s\n" % phrase)

    def train_phrasers(self, texts):
        bigrams = Phrases(self.stream_sentences(
            texts, description="Streaming text for bigram phraser ..."),
            min_count=self.bigram_min_count,
            threshold=self.bigram_threshold)
        # print("Training bigram phraser ...")
        self.bigram_phraser = Phraser(bigrams)
        # print("Collecting trigrams ...")
        trigrams = Phrases(self.bigram_phraser[self.stream_sentences(
            texts, description="Streaming text for trigram phraser ...")],
            min_count=self.trigram_min_count,
            threshold=self.trigram_threshold)
        # print("Training trigram phraser ...")
        self.trigram_phraser = Phraser(trigrams)

    def save_phrasers(self):
        path = os.path.join(self.model_directory, "bigram-phraser.pkl")
        self.bigram_phraser.save(path)
        path = os.path.join(self.model_directory, "trigram-phraser.pkl")
        self.trigram_phraser.save(path)

    def load_phrasers(self):
        path = os.path.join(self.model_directory, "bigram-phraser.pkl")
        self.bigram_phraser = Phraser.load(path)
        path = os.path.join(self.model_directory, "trigram-phraser.pkl")
        self.trigram_phraser = Phraser.load(path)

    def prepare_text(self, text):
        for key, value in self.substitutions.items():
            text = text.replace(key, value)
        tokens = self.tokenize(text)
        tokens = self.bigram_phraser[tokens]
        tokens = self.trigram_phraser[tokens]
        return [token for token in tokens if token not in self.bad_phrases]

    def prepare_texts(self, texts):
        with tqdm.tqdm(texts) as pbar:
            pbar.set_description("Preparing texts ...")
            prepared_texts = [self.prepare_text(text) for text in pbar]
        return prepared_texts

    def keep_phrase(self, phrase, cnt):
        if "'" in phrase:
            return False
        for c in PUNCTUATION:
            if c in phrase:
                return False
        if phrase in self.bad_phrases:
            return False
        phrase_set = set(phrase)
        if SYMBOLS & phrase_set:
            return False
        if (LETTERS & set(phrase)) and cnt > self.phrase_min_count:
            return True
        return False

    def build_vocabulary(self, texts, save=False):
        self.ndocs = len(texts)
        with tqdm.tqdm(texts) as pbar:
            pbar.set_description("Building vocabulary over %d documents." % self.ndocs)
            phrase_map = {}
            for document in pbar:
                for phrase in document:
                    if phrase not in phrase_map:
                        phrase_map[phrase] = 0
                    phrase_map[phrase] += 1
        phrases = list(phrase_map.keys())
        phrases = sorted(phrases, key=lambda phrase: -phrase_map[phrase])
        vocabulary = [phrase for phrase in phrases
                      if self.keep_phrase(phrase, phrase_map[phrase])]
        hyphenated = {phrase.replace('-', '_')
                      for phrase in vocabulary if "-" in phrase}
        vocabulary = [phrase for phrase in vocabulary
                      if phrase not in hyphenated][:self.vocabulary_size]
        if save:
            path = os.path.join(
                self.data_directory,
                "vocabulary-%d-%d-%d.tsv" % (len(texts), self.phrase_min_count,
                                             self.vocabulary_size))
            with open(path, mode='w', encoding='UTF-8') as fp:
                for phrase in vocabulary:
                    fp.write("%s\t%d\n" % (phrase, phrase_map[phrase]))
        self.vocabulary = set(vocabulary)

    def load_vocabulary(self):
        path = os.path.join(
            self.data_directory,
            "vocabulary-%d-%d-%d.tsv" % (self.ndocs, self.phrase_min_count,
                                         self.vocabulary_size))
        self.vocabulary = set()
        with open(path, mode='r', encoding='UTF-8') as fp:
            for line in fp:
                line = line.strip()
                if line:
                    phrase, cnt = line.split('\t')
                    self.vocabulary.add(phrase)

    def build_document(self, text):
        return [phrase for phrase in text if phrase in self.vocabulary]

    def build_corpus(self, texts):
        with tqdm.tqdm(texts) as pbar:
            pbar.set_description("Building corpus ...")
            corpus = [self.build_document(text) for text in pbar]
        return corpus

    def build_dictionary(self, corpus, save=False):
        self.dictionary = Dictionary(corpus)
        self.dictionary.filter_extremes(no_below=self.phrase_min_count,
                                        no_above=0.6,
                                        keep_n=self.vocabulary_size)
        if save:
            self.save_dictionary()

    def save_dictionary(self, path=None):
        if path is None:
            path = os.path.join(self.model_directory, "dictionary.pkl")
        self.dictionary.save(path)

    def load_dictionary(self, path=None):
        if path is None:
            path = os.path.join(self.model_directory, "dictionary.pkl")
        self.dictionary = Dictionary.load(path)

    def encode_corpus(self, corpus):
        return [self.dictionary.doc2bow(document) for document in corpus]
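# End-to-end usage sketch for Builder, under the assumption that texts is a
# list of raw document strings and ./data/bad-phrases.txt exists.
texts = ["New York City is in New York state."] * 50
builder = Builder(ndocs=len(texts))
builder.train_phrasers(texts)
builder.save_phrasers()
prepared = builder.prepare_texts(texts)
builder.build_vocabulary(prepared, save=True)
corpus = builder.build_corpus(prepared)
builder.build_dictionary(corpus, save=True)
bow = builder.encode_corpus(corpus)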
def nlp_preprocess(filepath_dict: dict,
                   col: str,
                   df=None,
                   verbose: bool = True,
                   overwrite_interim: bool = True) -> pd.DataFrame:
    def clean_doc(corpus):
        '''
        generator function to read in docs from the file,
        and substitute and remove substrings
        '''
        for doc in corpus:
            yield au_tu.remove_substrings(
                au_tu.clean_tokens(
                    doc,
                    tokens=to_replace_dict,
                    whole_words_only=whole_words_only,
                    ignore_case=ignore_case,
                ),
                to_remove_list=to_remove_list,
                whole_words_only=whole_words_only,
                ignore_case=ignore_case)

    def tokenize_entities(parsed_doc):
        txt = parsed_doc.text
        for ent in parsed_doc.ents:
            txt = txt[:ent.start_char] + ent.text.replace(' ', '_') + txt[ent.end_char:]
        return txt

    def cleaned_doc_corpus(corpus):
        '''
        generator function to use spaCy to parse docs, clean docs,
        tokenize named entities, and yield documents
        '''
        for parsed_doc in nlp.pipe(clean_doc(corpus),
                                   batch_size=nlp_batch_size,
                                   n_threads=nlp_n_threads):
            yield tokenize_entities(parsed_doc)

    def punct_space_more(token):
        '''
        helper function to eliminate tokens that are pure punctuation,
        whitespace, digits, or only 1 character
        '''
        return (token.is_punct or token.is_space or token.is_digit
                or token.text == "'s" or token.lemma_ == '-PRON-'
                # or token.lemma_ == 'say'
                # or token.lemma_ == 'tell'
                # or token.lemma_ == 'be'
                or len(token.text) <= 1)

    def line_doc(filename):
        '''
        generator function to read in docs from the file,
        un-escape the original line breaks in the text,
        and do additional cleaning
        '''
        def hyp_to_us(doc):
            return re.sub(r'\b-\b', '_', doc)

        def remove_punct(doc):
            # keep: alphanumeric (\w), spaces (\s), single quote, underscore
            return re.sub(r'[^\w\s\'_]+', '', doc)

        # with codecs.open(filename, encoding='utf_8') as f:
        with smart_open(filename) as f:
            for doc in f:
                yield remove_punct(hyp_to_us(doc.decode())).replace('\\n', '\n')

    def lemmatized_sentence_corpus(filename):
        '''
        generator function to use spaCy to parse docs,
        lemmatize the text, and yield sentences
        '''
        for parsed_doc in nlp.pipe(line_doc(filename),
                                   batch_size=nlp_batch_size,
                                   n_threads=nlp_n_threads):
            for sent in parsed_doc.sents:
                yield ' '.join([token.lemma_ for token in sent
                                if not punct_space_more(token)])

    if verbose:
        logger.info(f'Working on text from: {col}')
    # # debug - only getting from the sample dataframe here
    # df_phrased = df.loc[df[col].notnull(), ['tfa_master_uid', 'app_year', col]].sample(n=50).copy()
    df_phrased = df.loc[df[col].notnull(), ['tfa_master_uid', 'app_year', col]].copy()
    nlp = spacy.load('en', disable=[])
    # clean text and tokenize entities
    if verbose:
        logger.info('Cleaning docs...')
    df_phrased[col] = list(cleaned_doc_corpus(df_phrased[col].values))
    # remove 'the_' from NER tokens
    df_phrased[col] = df_phrased[col].apply(
        lambda x: ' '.join([re.sub('^the_', 'the ', y) for y in x.split()]))
    if verbose:
        logger.info('\tDone.')
    # create & open a new file in write mode
    if verbose:
        logger.info('Saving documents, one per line...')
    doc_count = 0
    with codecs.open(filepath_dict['doc_txt_filepath'], 'w', encoding='utf_8') as doc_txt_file:
        for doc in df_phrased[[col]].apply(lambda x: ' '.join(x), axis=1).tolist():
            # write the doc as a line in the new file,
            # escaping newline characters in the original doc text
            doc_txt_file.write(doc.replace('\n', '\\n') + '\n')
            doc_count += 1
    if verbose:
        logger.info(f"Text from {doc_count:,} docs written to: {filepath_dict['doc_txt_filepath']}")
    nlp = spacy.load('en', disable=['ner'])
    # lemmatize and save sentences
    if overwrite_interim:
        if verbose:
            logger.info(f"Processing documents into unigram sentences: {filepath_dict['unigram_sentences_filepath']}")
        # with codecs.open(filepath_dict['unigram_sentences_filepath'], 'w', encoding='utf_8') as f:
        with smart_open(filepath_dict['unigram_sentences_filepath'], 'w') as f:
            for sentence in lemmatized_sentence_corpus(filepath_dict['doc_txt_filepath']):
                f.write(sentence + '\n')
        if verbose:
            logger.info('Done.')
    unigram_sentences = LineSentence(filepath_dict['unigram_sentences_filepath'])
    if verbose:
        logger.info('Unigram examples:')
        for unigram_sentence in it.islice(unigram_sentences, 10, 20):
            logger.info(u' '.join(unigram_sentence))
            logger.info('=' * 30)
    if verbose:
        logger.info('Finding bigram phrases')
    # create the bigram model
    bigram = Phrases(unigram_sentences,
                     min_count=phrase_min_count,
                     threshold=phrase_threshold,
                     max_vocab_size=phrase_max_vocab_size,
                     progress_per=phrase_progress_per,
                     scoring=phrase_scoring,
                     common_terms=phrase_common_terms)
    bigram_model = Phraser(bigram)
    bigram_model.save(filepath_dict['bigram_model_filepath'])
    if verbose:
        logger.info(f"Saving bigram phrased sentences: {filepath_dict['bigram_sentences_filepath']}")
    # save bigram sentences
    with codecs.open(filepath_dict['bigram_sentences_filepath'], 'w', encoding='utf_8') as f:
        for unigram_sentence in unigram_sentences:
            bigram_sentence = u' '.join(bigram_model[unigram_sentence])
            f.write(bigram_sentence + '\n')
    bigram_sentences = LineSentence(filepath_dict['bigram_sentences_filepath'])
    if verbose:
        logger.info('Bigram examples:')
        for bigram_sentence in it.islice(bigram_sentences, 10, 20):
            logger.info(u' '.join(bigram_sentence))
            logger.info('=' * 30)
    if verbose:
        logger.info('Finding trigram phrases')
    # create the trigram model
    trigram = Phrases(bigram_sentences,
                      min_count=phrase_min_count,
                      threshold=phrase_threshold,
                      max_vocab_size=phrase_max_vocab_size,
                      progress_per=phrase_progress_per,
                      scoring=phrase_scoring,
                      common_terms=phrase_common_terms)
    trigram_model = Phraser(trigram)
    trigram_model.save(filepath_dict['trigram_model_filepath'])
    if verbose:
        logger.info(f"Saving trigram phrased sentences: {filepath_dict['trigram_sentences_filepath']}")
    # create trigram sentences
    with codecs.open(filepath_dict['trigram_sentences_filepath'], 'w', encoding='utf_8') as f:
        for bigram_sentence in bigram_sentences:
            trigram_sentence = u' '.join(trigram_model[bigram_sentence])
            f.write(trigram_sentence + '\n')
    trigram_sentences = LineSentence(filepath_dict['trigram_sentences_filepath'])
    if verbose:
        logger.info('Trigram examples:')
        for trigram_sentence in it.islice(trigram_sentences, 10, 20):
            logger.info(u' '.join(trigram_sentence))
            logger.info('=' * 30)
    if verbose:
        logger.info(f"Saving phrased docs using saved models: {filepath_dict['trigram_docs_filepath']}")
    # extended stopword list (hoisted out of the loop; the original defined
    # stop_words_extend but filtered on stop_words_extended, a NameError)
    stop_words_extended = [
        'from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could',
        '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many',
        'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily',
        'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right',
        'line', 'also', 'may', 'take', 'come'
    ]
    # using the saved models, write transformed text out to a new file, one doc per line
    with codecs.open(filepath_dict['trigram_docs_filepath'], 'w', encoding='utf_8') as f:
        for parsed_doc in nlp.pipe(line_doc(filepath_dict['doc_txt_filepath']),
                                   batch_size=nlp_batch_size,
                                   n_threads=nlp_n_threads):
            # remove punctuation and whitespace
            unigram_doc = [token.lemma_ for token in parsed_doc
                           if not punct_space_more(token)]
            # apply the first-order and second-order phrase models
            bigram_doc = bigram_model[unigram_doc]
            trigram_doc = trigram_model[bigram_doc]
            # remove any remaining stopwords
            trigram_doc = [term for term in trigram_doc
                           if term not in nlp.Defaults.stop_words]
            trigram_doc = [term for term in trigram_doc
                           if term not in stop_words_extended]
            # write the transformed doc as a line in the new file
            trigram_doc = ' '.join(trigram_doc)
            f.write(trigram_doc + '\n')
    if verbose:
        logger.info('Done.')
    # put the text back in the dataframe
    trigram_docs = LineSentence(filepath_dict['trigram_docs_filepath'])
    if len([doc for doc in trigram_docs]) == df_phrased.shape[0]:
        for i, doc in enumerate(trigram_docs):
            df_phrased.iloc[i, df_phrased.columns.get_loc(col)] = ' '.join(doc)
    else:
        raise ValueError('Different number of processed and original documents')
    # save the dataframe
    if verbose:
        logger.info('Saving NLP processed data: {}'.format(filepath_dict['filepath_out']))
    df_phrased.to_csv(filepath_dict['filepath_out'])
    return df_phrased
    ['high_fat', 'intake'],
    ['type_of', 'care'],
    ['population_based', 'SNP'],
    ['anal_gland', 'neoplasms'],
    ['acute_myelocytic', 'leukemia'],
    ['Samson_Gardner', 'syndrome'],
    ['colon_mucinous', 'adenocarcinoma']]

# note: custom_bigrams/custom_trigrams are not stock gensim Phrases parameters;
# this snippet presumably uses a modified Phrases class
phrases = Phrases(contents, threshold=0.25, scoring="npmi", custom_bigrams=my_bigrams)
bigram = Phraser(phrases)
tri_phase = Phrases(bigram[contents], custom_bigrams=my_trigrams,
                    threshold=0.25, scoring="npmi")
trigram = Phraser(tri_phase)
sent = [u'red', u'shift', u'square', u'pants', u'bit', u'parts', u'transverse',
        u'colon', u'cancer', u'trans', u'atlantic', u'ocean']
print(bigram[sent])
print(trigram[bigram[sent]])
# print(item)
bigram.save('./preprocessed_big_phrases')
print("ngrams saved")
trigram.save('./preprocessed_trigram_phrases')
print("ngrams saved")
# The output file name
modelFile = "Feb2017FullCorpus300D"
# Original corpus available at: http://files.pushshift.io/reddit/comments/ (RC_2017-02.BZ2)
# Retrieves the corpus file. This corpus was generated using the output from the upgradedCleaner.py file
sentences = MySentences("Data\\Feb2017.txt")
# Creates a Phrases object from the corpus
myPhrases = Phrases(sentences, min_count=20)
# Creates a much smaller Phraser object from the Phrases object
bigram_transformer = Phraser(myPhrases)
# Saves it so you don't have to redo this every time
bigram_transformer.save("Feb2017BigramTransformer")
bigram_transformer = Phraser.load("Feb2017BigramTransformer")
# Create and save the actual model
model = Word2Vec(PhrasingIterable(bigram_transformer, "Data\\Feb2017.txt"),
                 min_count=15, workers=4, size=300, window=8)
model.save('Models\\' + modelFile)
model = Word2Vec.load('Models\\' + modelFile)
# Accuracy tests
model.accuracy('questions-words.txt')
testingSuite(modelFile)
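# PhrasingIterable is not shown in this snippet; a minimal sketch of what it
# plausibly does (the name and behavior here are assumptions, mirroring the
# line-per-sentence style of MySentences):
class PhrasingIterable:
    def __init__(self, phraser, path):
        self.phraser = phraser
        self.path = path

    def __iter__(self):
        with open(self.path, encoding="utf-8") as f:
            for line in f:
                # apply the trained bigram phraser to each tokenized line
                yield self.phraser[line.split()]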
def load_data(data_folder, use_old_models):
    try:
        if not use_old_models:  # force re-preprocessing when use_old_models is falsy
            raise ValueError("use_old_models is not set")
        clean_data = pd.read_csv(os.path.join(data_folder, "clean_data.csv"),
                                 encoding="ISO-8859-1")
        nlp_dict = corpora.Dictionary.load(os.path.join(data_folder, 'nlp_dict.dict'))
        processed_texts = np.load(os.path.join(data_folder, "processed_texts.npy")).tolist()
        print("loaded preprocessed df")
        bigram = Phraser.load(os.path.join(data_folder, 'bigram'))
    except Exception:
        print("new preprocessing")
        cols_to_use = [
            'age', 'body_type', 'diet', 'drinks', 'drugs', 'education',
            'essay0', 'essay1', 'essay2', 'essay5', 'essay6', 'essay7',
            'ethnicity', 'income', 'job', 'orientation', 'pets', 'religion',
            'smokes', 'speaks', 'status'
        ]
        data = pd.read_csv(os.path.join(data_folder, "profiles.csv"), usecols=cols_to_use)
        data.columns = [
            'age', 'text_body_type', 'text_diet', 'text_drinks', 'text_drugs',
            'text_education', 'text_self_sum', 'text_life', 'text_goodat',
            'text_6things', 'text_thinking', 'text_friday', 'text_ethnicity',
            'income', 'text_job', 'text_orientation', 'text_pets',
            'text_religion', 'text_smokes', 'text_speaks', 'text_status'
        ]
        # cleaning diet
        data["text_diet"] = data.text_diet.str.replace("strictly ", "") \
            .str.replace("mostly ", "").str.replace("other", "anything")
        data["text_diet"] = data.text_diet.replace(np.nan, "anything")
        # cleaning body type (the original assigned text_diet here, which looks like a bug)
        data["text_body_type"] = data.text_body_type.replace(np.nan, "average")
        # cleaning drinks
        data["text_drinks"] = data.text_drinks.replace(np.nan, "socially")
        # cleaning drugs
        data["text_drugs"] = data.text_drugs.replace(np.nan, "never")
        # cleaning education
        data["text_education"] = data.text_education.replace(np.nan, "high school")
        data.loc[data.text_education.str.contains("space"), "text_education"] = "high school"
        searchfor = ['university', 'college']
        data.loc[data.text_education.str.contains('|'.join(searchfor)),
                 "text_education"] = 'bachelor'
        searchfor = ['masters', 'law', 'med']
        data.loc[data.text_education.str.contains('|'.join(searchfor)),
                 "text_education"] = 'masters'
        data.loc[data.text_education.str.contains('ph.d'), "text_education"] = 'ph.d'
        data.loc[data.text_education.str.contains('high school'),
                 "text_education"] = 'high school'
        clean_data = data
        columns_with_text = [c for c in clean_data.columns.tolist() if "text" in c]
        for each_text_col in columns_with_text:
            clean_data[each_text_col] = clean_data[each_text_col].replace(np.nan, "")
            clean_data[each_text_col] = clean_data[each_text_col].apply(str)
        clean_data['all_texts'] = clean_data[columns_with_text].apply(
            lambda x: ' / '.join(x), axis=1)
        # keep only texts of a reasonable size
        clean_data = clean_data[(clean_data["all_texts"].str.len() < 21000)
                                & (clean_data["all_texts"].str.len() > 860)]
        clean_data.to_csv(os.path.join(data_folder, "clean_data.csv"))
        # train the bigram model
        bigram = Phraser(Phrases(text_tokenize_gen(clean_data.all_texts.values.tolist())))
        processed_texts = [
            [text for text in my_lemmatize(texts)]
            for texts in bigram[text_tokenize_gen(clean_data.all_texts.values)]
        ]
        np.save(os.path.join(data_folder, "processed_texts"), processed_texts)
        bigram.save(os.path.join(data_folder, 'bigram'))
        nlp_dict = corpora.Dictionary(processed_texts)
        # in case you want to filter out some words
        nlp_dict.filter_extremes(no_below=0.1, no_above=0.4)
        # store the dictionary for future reference
        nlp_dict.save(os.path.join(data_folder, 'nlp_dict.dict'))
        nlp_dict = nlp_dict.load(os.path.join(data_folder, 'nlp_dict.dict'))
        bigram["high school".split()]  # smoke test: at least we know it works
    return nlp_dict, bigram, clean_data, processed_texts