def train(args):
    """Train bigram-aware word2vec embeddings on text8 and save them.

    Reads ``args.embedding_dim`` (vector size) and ``args.model_path``
    (output directory). Saves the frozen bigram detector as ``bigram.model``
    and the keyed vectors, extended with three special tokens, as
    ``word2vec.model`` inside that directory.
    """
    # Output during training
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # use text8 corpus as training data, haikus dont provide sufficient context
    training_data = api.load('text8')

    # use the phrase model to recognize bigrams like "White House" or "Climate Change"
    bigram_model = Phrases(training_data)
    # Export the trained model = use less RAM, faster processing.
    # Model updates no longer possible.
    bigrams = Phraser(bigram_model)

    # create and train model
    model = Word2Vec(bigrams[training_data], size=args.embedding_dim)

    word_list = list(model.wv.vocab)
    # Look vectors up through ``model.wv``: indexing the model object itself
    # is a deprecated alias for exactly this call.
    vector_list = [model.wv[word] for word in word_list]

    # the basic model doesnt seem to be supporting item assignment
    # but WordEmbeddingsKeyedVectors does
    kv = WordEmbeddingsKeyedVectors(args.embedding_dim)
    kv.add(word_list, vector_list)
    # special tokens get random vectors
    kv.add(["<eos>", "<n>", "<unk>"], np.random.rand(3, args.embedding_dim))

    # just to be safe, clear the cache of normalized vectors
    # as i had a similar issue as
    # https://github.com/RaRe-Technologies/gensim/issues/2532
    del kv.vectors_norm

    # save the new models
    bigrams.save(f"{args.model_path}/bigram.model")
    kv.save(f"{args.model_path}/word2vec.model")
def extract_corpus(column):
    """Tokenise the strings in ``column`` and merge detected bigrams/trigrams.

    :param column: iterable of strings (one document each)
    :return: list of token lists with detected phrases joined by a space,
        or ``None`` if anything fails (the error is printed, not raised)
    """
    try:
        # Whitespace tokenisation: every document becomes its list of unigrams.
        lst_corpus = [string.split() for string in column]

        ## detect bigrams and trigrams
        bigrams_detector = Phrases(lst_corpus, delimiter=" ".encode(),
                                   min_count=10, threshold=10)
        bigrams_detector = Phraser(bigrams_detector)
        trigrams_detector = Phrases(bigrams_detector[lst_corpus],
                                    delimiter=" ".encode(),
                                    min_count=15, threshold=10)
        trigrams_detector = Phraser(trigrams_detector)

        ## detect common bigrams and trigrams using the fitted detectors
        lst_corpus = list(bigrams_detector[lst_corpus])
        lst_corpus = list(trigrams_detector[lst_corpus])
        return lst_corpus
    except Exception as e:
        # Deliberately best-effort: report the problem and signal failure
        # with None rather than propagating the exception.
        print(e)
        return None
def trigramGenerator(self):
    """Learn bigram and trigram phrasers and append the found phrases to disk.

    The sentence stream is read three times through ``self.sentenceStream()``:
    to train the bigram model, to train the trigram model on bigrammed
    sentences, and to collect every resulting token that contains ``"_"``.
    The unique tokens are appended, sorted, one per line, to
    ``TC-phrases-bi-tri.txt`` inside ``self.trainingLocation``.
    """
    biGramPhrases = Phrases(self.sentenceStream(),
                            min_count=self.bigramMinCount,
                            threshold=self.thresholdBigram)
    bigram = Phraser(biGramPhrases)

    bigramSentences = (bigram[sentence] for sentence in self.sentenceStream())
    triGramPhrases = Phrases(bigramSentences,
                             min_count=self.trigramMinCount,
                             threshold=self.thresholdTrigram)
    trigram = Phraser(triGramPhrases)

    trigramList = set()
    for sentence in self.sentenceStream():
        for item in trigram[bigram[sentence]]:
            if "_" in item:
                trigramList.add(item)

    print("Number of Unique Trigrams = ", len(trigramList))

    if not trigramList:
        # Nothing to write: leave the file system untouched.
        return

    # Create the directory and open the file once instead of once per phrase.
    os.makedirs(self.trainingLocation, exist_ok=True)
    outPath = os.path.join(self.trainingLocation, "TC-phrases-bi-tri.txt")
    with open(outPath, "a") as outFile:
        outFile.writelines(item + "\n" for item in sorted(trigramList))
def fit(self, sentencesPath):
    """
    train phrases

    One phraser is produced per entry of ``self.savePhraserPaths``; each is
    trained with the previously trained phrasers handed to ``TxtIter`` as
    ``ngrams``, then saved to its path and appended to ``self.phrasers``.

    :param sentencesPath: the path of text file, the text file should be
        the format: one line one sentence
    :return: ``True`` when every saved model already exists and was loaded
        (``self.file_overwrite`` is ``False``); ``None`` after training
    :raises FileNotFoundError: if the directory of any save path is missing
    """
    self.phrasers = []

    # path detect
    for path in self.savePhraserPaths:
        save_dir = os.path.dirname(path)
        if not os.path.exists(save_dir):
            raise FileNotFoundError(save_dir + " not exist")

    # Training is needed as soon as one saved model is missing.
    all_saved = all(os.path.exists(path) for path in self.savePhraserPaths)
    if all_saved and self.file_overwrite == False:
        logging.info("models are already exist, will read it")
        for path in self.savePhraserPaths:
            self.phrasers.append(Phraser.load(path))
        return True

    self.phrasers = []
    for order, path in enumerate(self.savePhraserPaths, start=2):
        logging.info("getting %d-gram phrase......", order)
        # Close the sentence file as soon as this pass is done; the phraser
        # is built inside the ``with`` so the stream is still readable while
        # it is being consumed.
        with codecs.open(sentencesPath, mode="r",
                         encoding="utf-8") as sentences_file:
            phraser = Phraser(
                Phrases(sentences=TxtIter(sentences=sentences_file,
                                          ngrams=self.phrasers),
                        min_count=self.min_count,
                        threshold=self.threshold,
                        max_vocab_size=self.max_vocab_size,
                        delimiter=self.delimiter,
                        scoring=self.scoring))
        phraser.save(path)
        self.phrasers.append(phraser)
def get_text_search_terms(keywords, synonyms_threshold, fasttext_model):
    """Turn a keyword string into search terms expanded with synonyms.

    :param keywords: space-separated query string
    :param synonyms_threshold: minimum similarity a synonym must reach
    :param fasttext_model: model whose ``wv.most_similar(token)`` yields
        ``(word, similarity)`` pairs
    :return: the cleaned, stemmed, phrase-merged terms followed by every
        synonym whose similarity is at least ``synonyms_threshold``
    """
    bi_gram_model = Phraser.load('src/models/bi_gram_model.pkl')
    tri_gram_model = Phraser.load('src/models/tri_gram_model.pkl')

    # clean tokens
    cleaned_terms = clean_tokenized_sentence(keywords.split(' ')).split()
    # remove empty terms, then stem
    cleaned_terms = [ps.stem(term) for term in cleaned_terms if term]

    # create bi-grams; the token list is passed as-is because joining and
    # re-splitting it would turn an empty list into [''] and send an empty
    # token to the synonym lookup
    terms_with_bigrams = bi_gram_model[cleaned_terms]
    # create tri-grams
    terms_with_trigrams = tri_gram_model[terms_with_bigrams]

    # expand query with synonyms, keeping those at or above the threshold
    synonyms = [
        word
        for token in terms_with_trigrams
        for word, similarity in fasttext_model.wv.most_similar(token)
        if similarity >= synonyms_threshold
    ]

    # expand keywords with synonyms
    return list(terms_with_trigrams) + synonyms
def getTrigramList(g_DataQueue, g_FinishRead, savePath, bigramPath, trigramPath):
    """
    Count phrase occurrences over queued sentence batches and save them as JSON.

    :param g_DataQueue: queue whose ``get()`` returns a batch of sentences
    :param g_FinishRead: object whose ``value`` is 0 while more batches may
        still arrive (presumably set by the producer; verify against caller)
    :param savePath: path where the dictionary is saved
    :param bigramPath: path of the saved bigram ``Phrases`` model
    :param trigramPath: path of the saved trigram ``Phrases`` model
    :return: None
    """
    count = 0
    vocabulary_dic = {}
    bigram = Phraser(Phrases.load(bigramPath))
    trigram = Phraser(Phrases.load(trigramPath))

    # NOTE(review): ``get()`` blocks; if the producer finishes between the
    # check below and the call, this loop could wait forever -- confirm.
    while g_FinishRead.value == 0 or not g_DataQueue.empty():
        words = g_DataQueue.get()
        count += len(words)
        print("have processed sentences:", count)

        # get the phrases
        trigram_list = trigram[bigram[words]]
        del words
        gc.collect()

        # put them into the dictionary
        for phrase_list in trigram_list:
            for phrase in phrase_list:
                vocabulary_dic[phrase] = vocabulary_dic.get(phrase, 0) + 1

    # save locally; the context manager closes the file even when
    # serialisation or writing fails
    with codecs.open(savePath, "w", encoding="utf-8") as fw:
        fw.write(json.dumps(vocabulary_dic))

    del vocabulary_dic
    gc.collect()
def main():
    """Train bigram and trigram phrasers on the corpus and print trigram stats.

    Prints the ten highest-valued three-word phrases followed by a summary
    (count, mean, max, min) of the values stored for them.
    """
    get_args()

    def sentences():
        # A fresh pass over every sentence of every corpus slice.
        return chain.from_iterable(
            read_slice(data) for data in read_corpus())

    bigram_phraser = Phraser(
        Phrases(sentences(), min_count=1, threshold=1, delimiter=b' '))
    bigrammed = map(lambda x: bigram_phraser[x], sentences())
    trigram_phraser = Phraser(
        Phrases(bigrammed, min_count=1, threshold=1, delimiter=b' '))

    # Keep only phrases made of exactly three words (two delimiters).
    only_trigrams = {}
    for trigram_tuple, score in trigram_phraser.phrasegrams.items():
        phrase = b' '.join(trigram_tuple)
        if phrase.count(b' ') == 2:
            only_trigrams[phrase] = score

    ranked = sorted(only_trigrams.items(),
                    key=lambda item: item[1],
                    reverse=True)
    for key, value in ranked[:10]:
        print(key, value)

    scores = list(only_trigrams.values())
    if scores:
        mean_score, max_score, min_score = mean(scores), max(scores), min(scores)
    else:
        mean_score = max_score = min_score = 0

    print(""" Unique trigrams: {unique} Mean score:{mean} Max score:{max} Min score:{min} """.format(
        unique=len(only_trigrams),
        mean=mean_score,
        max=max_score,
        min=min_score))
def get_bigram_phraser(directory):
    """Return the bigram phraser, training and caching it when needed.

    If the file ``BIGRAM`` exists it is loaded; otherwise a phraser is
    trained on ``corpus(directory)``, saved to ``BIGRAM`` and returned.
    """
    if not os.path.isfile(BIGRAM):
        trained = Phraser(Phrases(corpus(directory)))
        trained.save(BIGRAM)
        return trained
    return Phraser.load(BIGRAM)
def phrasing_sentences(sentences):
    """Merge detected bigrams and then trigrams in ``sentences``.

    :param sentences: re-iterable collection of token lists (it is traversed
        several times, so a one-shot generator is not suitable)
    :return: an iterator over the sentences with detected phrases joined
    """
    phrases_bi = Phrases(sentences, min_count=5, threshold=1)
    bigram = Phraser(phrases_bi)

    # Keep the transformed corpus itself rather than wrapping it in ``map``:
    # a map object can be consumed only once, so training the trigram model
    # on it would leave nothing to transform for the return value.
    bigrammed = bigram[sentences]

    phrases_tri = Phrases(bigrammed, min_count=5, threshold=1)
    trigram = Phraser(phrases_tri)

    # Callers previously received a lazy one-shot iterator; keep that shape.
    return iter(trigram[bigrammed])
def _preprocess(self, text, min_tok_len=1):
    """Clean ``text`` into lemmatised tokens and merge detected phrases.

    Words tagged ``NNP`` are dropped, the rest is tokenised with
    ``simple_preprocess``, English stop words and tokens of at most
    ``min_tok_len`` characters are removed, and the remaining tokens are
    lemmatised as verbs before bigram/trigram detection.

    :param text: raw document string
    :param min_tok_len: tokens must be strictly longer than this
    :return: a one-element list holding the processed token list
    """
    stop_words = set(nltk.corpus.stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()  # built once, not once per token

    # remove proper nouns
    tagged_sent = pos_tag(text.split())
    no_proper = ' '.join(word for word, pos in tagged_sent if pos != 'NNP')

    result = [
        lemmatizer.lemmatize(token, pos='v')
        for token in simple_preprocess(no_proper)
        if len(token) > min_tok_len and token not in stop_words
    ]

    # Build the bigram and trigram models. Phrases expects a corpus, i.e. an
    # iterable of token lists; a bare token list would be read as many
    # "sentences" made of single characters, so wrap the document first.
    sentences = [result]
    bigram = Phrases(sentences, min_count=5, threshold=10)  # higher threshold fewer phrases.
    trigram = Phrases(bigram[sentences], threshold=10)

    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = Phraser(bigram)
    trigram_mod = Phraser(trigram)

    result = trigram_mod[bigram_mod[result]]
    return [result]
def tokeniseAll(posts, stopWords, urduNames, n_grams=3):
    '''Function to tokenise all posts, including ngrams

    Parameters
    ---------------------------------------
    posts: iterable of documents; each one is passed to ``createToken``
    stopWords: A list of stopwords
    urduNames: A list of common Urdu names
    n_grams: highest phrase order to merge -- 1 leaves the tokens as they
        are, 2 merges bigrams, 3 or more (default) merges bigrams and then
        trigrams

    Returns
    ---------------------------------------
    A list with one token list per post.'''
    tokenized_corp = [createToken(doc, stopWords, urduNames) for doc in posts]

    if n_grams <= 1:
        return tokenized_corp

    # Add n_grams
    bigram = Phrases(tokenized_corp, min_count=5, threshold=10)
    bigram_mod = Phraser(bigram)
    if n_grams == 2:
        return [bigram_mod[doc] for doc in tokenized_corp]

    trigram = Phrases(bigram[tokenized_corp], threshold=10)
    trigram_mod = Phraser(trigram)
    # One bigram pass per document, fed straight into the trigram pass.
    return [trigram_mod[bigram_mod[doc]] for doc in tokenized_corp]
def visulaizer_of_gensim(content_list):
    """Fit a 20-topic LDA model on ``content_list`` and prepare its pyLDAvis view.

    The documents are tokenised, stripped of English stop words, merged into
    bigrams/trigrams and lemmatised (nouns, adjectives, verbs, adverbs)
    before the bag-of-words corpus is built.

    :return: the object produced by ``pyLDAvis.gensim.prepare``
    """
    english_stops = stopwords.words('english')
    tokenized_docs = list(sent_to_words(content_list))

    # Phrase models are learnt on the raw tokens, before stop-word removal.
    bigram_phrases = Phrases(tokenized_docs, min_count=5, threshold=100)
    trigram_phrases = Phrases(bigram_phrases[tokenized_docs], threshold=100)
    bigram_mod = Phraser(bigram_phrases)
    trigram_mod = Phraser(trigram_phrases)

    without_stops = remove_stopwords(tokenized_docs, english_stops)
    with_bigrams = make_bigrams(without_stops, bigram_mod)
    with_trigrams = make_trigrams(with_bigrams, bigram_mod, trigram_mod)
    lemmatized = lemmatization(
        with_trigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    id2word = corpora.Dictionary(lemmatized)
    bow_corpus = [id2word.doc2bow(doc) for doc in lemmatized]

    lda_model = LdaModel(
        corpus=bow_corpus,
        id2word=id2word,
        num_topics=20,
        random_state=100,
        update_every=1,
        chunksize=100,
        passes=10,
        alpha='auto',
        per_word_topics=True,
    )
    return pyLDAvis.gensim.prepare(lda_model, bow_corpus, id2word)
def __init__(self, sentences_file: str, bigram_model_path: str,
             trigram_model_path: str, fasttext_model_path: str):
    """Load the sentence CSV, both phrase models and the synonyms model.

    :param sentences_file: CSV providing the metadata columns listed below
        plus a ``cleaned_sentence`` column
    :param bigram_model_path: path of the saved bigram ``Phraser``
    :param trigram_model_path: path of the saved trigram ``Phraser``
    :param fasttext_model_path: path handed to ``Synonyms``
    """
    print(f'Loading CSV: {sentences_file} and building mapping dictionary...')
    sentences_df = pd.read_csv(sentences_file)

    # Columns copied into the per-sentence metadata, keyed by row index.
    metadata_columns = ('paper_id', 'cord_uid', 'source', 'publish_time',
                        'authors', 'section', 'sentence')
    self.sentence_id_to_metadata = {}
    for row_index, row in sentences_df.iterrows():
        self.sentence_id_to_metadata[row_index] = {
            column: row[column] for column in metadata_columns
        }
    print(f'Finished loading CSV: {sentences_file} and building mapping dictionary')

    self.cleaned_sentences = sentences_df['cleaned_sentence'].tolist()
    print(f'Loaded {len(self.cleaned_sentences)} sentences')

    print(f'Loading bi-gram model: {bigram_model_path}')
    self.bigram_model = Phraser.load(bigram_model_path)
    print(f'Finished loading bi-gram model: {bigram_model_path}')

    print(f'Loading tri-gram model: {trigram_model_path}')
    self.trigram_model = Phraser.load(trigram_model_path)
    print(f'Finished loading tri-gram model: {trigram_model_path}')

    self.synonyms_model = Synonyms(fasttext_model_path)
def split(self):
    """Tokenise ``self.series`` and store the result in ``self.prepared``.

    Every entry is cleaned with ``self.clean`` and split on whitespace.
    ``self.grams`` selects the output: 1 keeps plain tokens, 2 merges
    detected bigrams, any other value merges bigrams and then trigrams.
    The document count and the cleaning time are printed.
    """
    started = time()
    order = self.grams
    cleaned = self.series.apply(self.clean)
    tokenized = [text.split() for text in cleaned]
    print(len(tokenized))
    print('used: {:.2f}s'.format(time() - started))

    if order == 1:
        self.prepared = tokenized
        return

    # Both remaining modes start with a bigram pass.
    bigram_phraser = Phraser(Phrases(tokenized))
    with_bigrams = [bigram_phraser[doc] for doc in tokenized]
    if order == 2:
        self.prepared = with_bigrams
        return

    trigram_phraser = Phraser(Phrases(with_bigrams))
    self.prepared = [trigram_phraser[doc] for doc in with_bigrams]
def create_bigram_and_trigram(sentences):
    """Return ``sentences`` with detected bigrams and trigrams merged.

    Phrase words are joined with a single space. The result holds
    trigrams, bigrams and the remaining single tokens.
    """
    phrase_options = dict(min_count=10, threshold=10, delimiter=b' ')

    bigram_phraser = Phraser(Phrases(sentences, **phrase_options))
    bigrammed = bigram_phraser[sentences]

    # The trigram model is trained on its own bigram-transformed view.
    trigram_model = Phrases(bigram_phraser[sentences], **phrase_options)
    trigram_phraser = Phraser(trigram_model)

    return trigram_phraser[bigrammed]
def __init__(self, tsv_path, n_examples=100000):
    """Open ``tsv_path`` for reading and set up empty phrasers.

    :param tsv_path: path of the document file; kept open in binary mode as
        ``self.fin``
    :param n_examples: stored as ``self.n_examples``
    """
    print("Getting %s iterator..." % tsv_path)
    self.n_examples = n_examples
    self.document_path = tsv_path
    self.fin = open(self.document_path, 'rb')

    # Count the lines with a handle that is closed right away instead of
    # leaving a second open file behind.
    with open(tsv_path) as line_counter:
        self.instances = sum(1 for _ in line_counter)

    # Untrained placeholders: phrasers built from empty Phrases models.
    self.bigram = Phraser(Phrases())
    self.trigram = Phraser(Phrases())
def create_ngram_models(documents):
    """Train bigram and trigram phrasers on ``documents``.

    :param documents: corpus of token lists
    :return: ``(bigram_model, trigram_model)`` as ``Phraser`` objects
    """
    bigram_phrases = Phrases(documents, min_count=5, threshold=100)
    # The trigram model learns from the bigram-transformed corpus.
    trigram_phrases = Phrases(bigram_phrases[documents], threshold=100)
    return Phraser(bigram_phrases), Phraser(trigram_phrases)
def make_ngrams_model(tokenized_sentences, set_min_count=30, set_threshold=80):
    """Train bigram and trigram phrasers on ``tokenized_sentences``.

    :param tokenized_sentences: corpus of token lists
    :param set_min_count: ``min_count`` of the bigram model only
    :param set_threshold: ``threshold`` of both models
    :return: ``(bigram_mod, trigram_mod)`` as ``Phraser`` objects
    """
    bigram_phrases = Phrases(
        tokenized_sentences,
        min_count=set_min_count,
        threshold=set_threshold,
    )
    bigrammed_sentences = bigram_phrases[tokenized_sentences]
    trigram_phrases = Phrases(bigrammed_sentences, threshold=set_threshold)
    return Phraser(bigram_phrases), Phraser(trigram_phrases)
def load_phrasers(directory=MODEL_DIRECTORY):
    """Load the saved bigram and trigram phrasers from ``directory``.

    :return: ``(bigram_phraser, trigram_phraser)``
    """
    file_names = ("bigram-phraser.pkl", "trigram-phraser.pkl")
    return tuple(
        Phraser.load(os.path.join(directory, file_name))
        for file_name in file_names
    )
def train_model(texts: List[Text], savedir: PathType) -> None:
    """Train a chain of phrasers on ``texts`` and save each one.

    The first phraser is trained on the whitespace-tokenised texts; each
    following one is trained on the corpus transformed by all previous ones.
    They are saved in ``savedir`` as ``2gramsphraser`` to ``5gramsphraser``.

    :param texts: raw documents
    :param savedir: directory receiving the saved phrasers
    """
    print(f"Trainning of phraser model")
    tokenized = [t.split() for t in texts]

    phraser = Phraser(Phrases(tokenized, min_count=100, delimiter=b'_'))
    corpus = phraser[tokenized]

    last_order = 5
    for order in range(2, last_order + 1):
        save_phraser(phraser, os.path.join(savedir, f"{order}gramsphraser"))
        if order == last_order:
            # Every model that gets saved is trained; a further one would be
            # discarded, so stop here.
            break
        phraser = Phraser(Phrases(corpus, delimiter=b'_'))
        corpus = phraser[corpus]
def from_file(cls, dict_fname, phraser_fname=None):
    """Build an instance from a saved dictionary and an optional phraser.

    :param dict_fname: file written by gensim ``Dictionary.save``
    :param phraser_fname: saved ``Phraser`` file; when omitted a phraser
        built from an empty corpus is used instead
    """
    dictionary = Dictionary.load(str(dict_fname))
    if phraser_fname is None:
        phraser = Phraser(Phrases([[]]))
    else:
        phraser = Phraser.load(phraser_fname)
    return cls(dictionary, phraser)
def get_trigram_phraser(directory):
    """Return the trigram phraser, training and caching it when needed.

    If the file ``TRIGRAM`` exists it is loaded; otherwise a phraser is
    trained on the bigram-transformed sentences of ``corpus(directory)``,
    saved to ``TRIGRAM`` and returned.
    """
    if not os.path.isfile(TRIGRAM):
        bigram = get_bigram_phraser(directory)
        bigrammed_sentences = (bigram[sentence]
                               for sentence in corpus(directory))
        trained = Phraser(Phrases(bigrammed_sentences))
        trained.save(TRIGRAM)
        return trained
    return Phraser.load(TRIGRAM)
def testEmptyPhrasifiedSentencesIterator(self):
    """A phrasified corpus yields the same non-empty result on every pass."""
    bigram_phraser = Phraser(Phrases(self.sentences))
    trigram_phraser = Phraser(Phrases(bigram_phraser[self.sentences]))

    trigrams = trigram_phraser[bigram_phraser[self.sentences]]
    first_pass = list(trigrams)
    second_pass = list(trigrams)

    self.assertEqual(first_pass, second_pass)
    self.assertNotEqual(second_pass, [])
def testSaveLoad(self):
    """A Phraser restored from disk still merges the known bigrams."""
    tokens = ['graph', 'minors', 'survey', 'human', 'interface', 'system']
    expected = ['graph_minors', 'survey', 'human_interface', 'system']

    with temporary_file("test.pkl") as fpath:
        Phraser(Phrases(self.sentences, min_count=1, threshold=1)).save(fpath)
        restored = Phraser.load(fpath)

    self.assertEqual(restored[tokens], expected)
def __init__(self):
    """Unpickle the stored bigram/trigram transformers and wrap them in Phrasers."""

    def _unpickle(path):
        # NOTE(review): pickle.load runs code embedded in the file; these
        # paths are assumed to be trusted local model files -- confirm.
        with open(path, "rb") as handle:
            return pickle.load(handle)

    # read the bigram and trigram objects
    bigram_transformer = _unpickle(r"../model/bigram_transformer.pickle")
    trigram_transformer = _unpickle(r"../model/trigram_transformer.pickle")

    self.bigram_phraser = Phraser(bigram_transformer)
    self.trigram_phraser = Phraser(trigram_transformer)
def testSaveLoadCustomScorer(self):
    """A Phraser with a custom scorer keeps that scorer across save/load."""
    with temporary_file("test.pkl") as fpath:
        phrases = Phrases(self.sentences, min_count=1, threshold=.001,
                          scoring=dumb_scorer)
        Phraser(phrases).save(fpath)
        restored = Phraser.load(fpath)

    # the scorer is not exercised here, only its identity is checked
    self.assertEqual(restored.scoring, dumb_scorer)
def testSaveLoad(self):
    """Round-trip a Phraser through a file and check its bigram output."""
    with temporary_file("test.pkl") as fpath:
        original = Phraser(Phrases(self.sentences, min_count=1, threshold=1))
        original.save(fpath)
        reloaded = Phraser.load(fpath)

    sentence = ['graph', 'minors', 'survey', 'human', 'interface', 'system']
    merged = reloaded[sentence]
    self.assertEqual(
        merged, ['graph_minors', 'survey', 'human_interface', 'system'])
def make_trigrams(
    sentences: Iterable, save_model_path: Path, **phrases_kw
):
    """Train a gensim trigram phraser and save it to ``save_model_path``.

    A bigram model is trained on ``sentences`` with ``phrases_kw``; the
    trigram model is then trained on the bigram-transformed sentences with a
    single space as delimiter. Only the trigram phraser is saved.
    """
    bigram_phraser = Phraser(Phrases(sentences, **phrases_kw))
    bigrammed = bigram_phraser[sentences]

    trigram_model = Phrases(bigrammed, delimiter=b" ")
    Phraser(trigram_model).save(str(save_model_path))
def make_trigrams(self):
    """Return the stop-word-free documents with bigrams and trigrams merged.

    Phrase models are trained on ``self.sent_to_words()``; they are applied
    to the documents returned by ``self.remove_stopwords()``.
    """
    bigram_phrases = Phrases(self.sent_to_words(), min_count=5, threshold=100)
    bigram_mod = Phraser(bigram_phrases)
    bigrammed_docs = [bigram_mod[doc] for doc in self.remove_stopwords()]

    trigram_phrases = Phrases(bigram_phrases[self.sent_to_words()],
                              threshold=100)
    trigram_mod = Phraser(trigram_phrases)

    # NOTE(review): the documents are already bigram-merged at this point,
    # so the bigram phraser is applied a second time here -- looks
    # redundant, confirm before removing.
    trigrammed_docs = []
    for doc in bigrammed_docs:
        trigrammed_docs.append(trigram_mod[bigram_mod[doc]])
    return trigrammed_docs
def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH,
             limit=None):
    """Store the iteration settings and load both saved phrasers.

    :param source: stored as ``self.source``
    :param max_sentence_length: stored as ``self.max_sentence_length``
    :param limit: stored as ``self.limit``
    """
    self.source, self.max_sentence_length, self.limit = (
        source, max_sentence_length, limit)

    # Phrase models are read from fixed paths relative to the working dir.
    self.bigram = Phraser.load('./preprocessed_big_phrases')
    self.trigram = Phraser.load('./preprocessed_trigram_phrases')
def mk_bigrams():
    """Train a bigram phraser on the judgments dump and apply it.

    The text is read from ``dump_base + "judgments"``, split into sentences
    and tokenised; the trained phraser is saved as ``dump_base + "bigramer"``.

    :return: list of the tokenised sentences with bigrams merged
    """
    with open(dump_base + "judgments", 'r', encoding="utf-8") as f:
        judgments = f.read()

    sentences = []
    for raw_sentence in textcleaner.split_sentences(judgments):
        sentences.append(list(gensim.utils.simple_tokenize(raw_sentence)))

    bigramer = Phraser(Phrases(sentences))
    bigramer.save(dump_base + "bigramer")

    return [bigramer[tokens] for tokens in sentences]
def train_phraser(sentence_stream, stopword_list, threshold, model_path,
                  save_prefix):
    """Train a phrase model, save it and its frozen Phraser, return the Phraser.

    :param sentence_stream: corpus of token lists
    :param stopword_list: passed to ``Phrases`` as ``common_terms``
    :param threshold: phrase score threshold
    :param model_path: directory receiving both files
    :param save_prefix: prefix of the ``_phrases.bin`` / ``_phraser.bin`` files
    :return: the trained ``Phraser``
    """
    phrases_file = os.path.join(model_path,
                                '{}_phrases.bin'.format(save_prefix))
    trained_phrases = Phrases(sentence_stream,
                              common_terms=stopword_list,
                              threshold=threshold)
    trained_phrases.save(phrases_file)

    frozen = Phraser(trained_phrases)
    phraser_file = os.path.join(model_path,
                                '{}_phraser.bin'.format(save_prefix))
    frozen.save(phraser_file)
    return frozen
def testCompatibilty(self):
    """Models saved by version 3.6.0 load and still merge the known bigram."""
    phraser = Phraser.load(datapath("phraser-3.6.0.model"))
    phrases = Phrases.load(datapath("phrases-3.6.0.model"))

    tokens = ['trees', 'graph', 'minors']
    expected = ['trees', 'graph_minors']

    from_phraser = phraser[tokens]
    from_phrases = phrases[tokens]

    self.assertEqual(from_phraser, expected)
    self.assertEqual(from_phrases, expected)
def build_phrases(self):
    """Write every thread as one line of phrase-merged, filtered words.

    The saved ``<board>.trigrams`` phraser is applied to each thread; words
    found in ``STOPWORDS`` or shorter than three characters are dropped.
    Each output line of ``<board>.phrases`` is the thread number, a tab and
    the remaining words joined by spaces.
    """
    threads = ReadThreads(
        self.board, self.input_dir,
        return_func=lambda x, y: (x, y.split()))

    trigram_mod = Phraser.load(
        op.join(self.input_dir, f'{self.board}.trigrams'))

    filename = op.join(self.input_dir, f'{self.board}.phrases')
    with open(filename, 'wt') as f:
        for num, thread in threads:
            kept_words = []
            for word in trigram_mod[thread]:
                if word in STOPWORDS or len(word) < 3:
                    continue
                kept_words.append(word)
            line = ' '.join(kept_words)
            print(f'{num}\t{line}', file=f)
def build_doc2vec_model(self, vectors: int=200):
    """Train a Doc2Vec model on the board's phrase file and save it.

    Each thread is split on whitespace, passed through the saved
    ``<board>.phraser`` model and tagged with its thread identifier.

    :param vectors: dimensionality of the document vectors
    :return: the trained model, also saved as ``<board>.doc2vec``
    """
    filename = op.join(self.input_dir, f'{self.board}.phraser')
    phraser = Phraser.load(filename)

    documents = ReadThreads(
        self.board, input_dir=self.input_dir, file_type='phrases',
        return_func=lambda x, y: TaggedDocument(phraser[y.split()], [x]))

    model = Doc2Vec(vector_size=vectors, window=2, min_count=5, workers=3)
    model.build_vocab(documents=documents)
    model.train(
        documents=documents,
        total_examples=model.corpus_count,
        # ``epochs`` is the current attribute name; ``iter`` is only a
        # deprecated alias of it.
        epochs=model.epochs,
    )

    filename = op.join(self.input_dir, f'{self.board}.doc2vec')
    model.save(filename)
    return model
def build_phraser(self, threshold: int=None):
    """Train and save the board's bigram and trigram phrasers.

    :param threshold: phrase score threshold for both models; when ``None``
        the library default is used
    :return: the trigram ``Phraser`` (the bigram one is saved as
        ``<board>.bigrams``, the trigram one as ``<board>.trigrams``)
    """
    tokens = ReadThreads(
        self.board, self.input_dir,
        return_func=lambda x, y: y.split())

    # Forward ``threshold`` only when the caller supplied one: Phrases
    # compares it with 0, which fails for None.
    phrase_kwargs = {} if threshold is None else {'threshold': threshold}

    bigram = Phrases(tokens, min_count=5, **phrase_kwargs)
    trigram = Phrases(bigram[tokens], **phrase_kwargs)

    bigram_mod = Phraser(bigram)
    trigram_mod = Phraser(trigram)

    filename = op.join(self.input_dir, f'{self.board}.bigrams')
    bigram_mod.save(filename)

    filename = op.join(self.input_dir, f'{self.board}.trigrams')
    trigram_mod.save(filename)

    return trigram_mod
def testSaveLoadNoCommonTerms(self):
    """A Phraser saved before ``common_terms`` existed loads with an empty set."""
    legacy_path = datapath("phraser-no-common-terms.pkl")
    restored = Phraser.load(legacy_path)
    self.assertEqual(restored.common_terms, frozenset())
def testSaveLoadNoScoring(self):
    """A Phraser saved without a scoring parameter loads with the original scorer.

    Guards backwards compatibility with files written by older versions.
    """
    legacy_path = datapath("phraser-no-scoring.pkl")
    restored = Phraser.load(legacy_path)
    # the scorer is not exercised here, only its identity is checked
    self.assertEqual(restored.scoring, original_scorer)