def generateBigrams(sentences):
    # Train a bigram detector; export_phrases(sentences) (gensim < 4.0)
    # yields (bytes_phrase, score) pairs for every detected occurrence.
    bigram_transformer = Phrases(sentences, min_count=20, threshold=500)
    bigram_transformer.save("bigrams", pickle_protocol=3)
    fd = open("bigrams.txt", 'a')
    for phrase, score in bigram_transformer.export_phrases(sentences):
        # write one "<phrase> <score>" record per line
        fd.write(u'{0} {1}\n'.format(phrase.decode('utf-8'), score))
    fd.close()
    return bigram_transformer
def extract_phrases(reviews_sents, reviews_docs, save=False):
    logging.info("Extracting phrases...")
    # First pass detects bigrams; second pass, run over the bigrammed corpus, detects trigrams.
    bigram = Phrases(reviews_sents, threshold=5, min_count=5)
    trigram = Phrases(bigram[reviews_sents], threshold=3, min_count=3)
    if save:
        with open('../data/phrase/phrases_%d_%s' % (3, 'app_review'), 'wb') as fout:
            ph_dic = {}
            for phrase, score in bigram.export_phrases(reviews_sents):
                ph_dic[phrase] = score
            for phrase, score in trigram.export_phrases(bigram[reviews_sents]):
                ph_dic[phrase] = score
            for phrase, score in ph_dic.items():
                if re.search(rb'\d+', phrase):  # phrases are bytes here; skip any containing digits
                    continue
                phrase = b"_".join(phrase.split(b' '))
                fout.write(phrase + b'\n')
        bigram.save("../model/bigram.model")
        trigram.save("../model/trigram.model")
    return trigram[bigram[reviews_docs]]
def phrs_model(sentences):
    '''
    Train a Phrases model to find potential phrases and save the phrases to a CSV file.
    Input:
        sentences (list of list of words): sentences with stop words removed
    '''
    model_ph = Phrases(sentences)
    # model_ph.save(PHRS_MODEL_NAME)
    gensim_phrs = model_ph.export_phrases(sentences)
    gensim_phrs = list(set(gensim_phrs))
    # keep only phrases whose two words differ; phrases come back as bytes, so decode them
    gensim_phrs = [g[0].decode("utf-8") for g in gensim_phrs
                   if g[0].split()[0] != g[0].split()[1]]
    with open(PHRS_OUTFILE, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        seen = set()
        for phrase in gensim_phrs:
            if phrase not in seen:
                writer.writerow([phrase])
                seen |= {phrase}
if REGENERATE:
    print("Generating data from scratch.")
    texts = pickle.load(open(OUTFILE, 'rb'))[0]
    # This splits your list of texts into a list of sentences.
    # At this point (in the training data) document borders are removed.
    sentences = [item for text in texts
                 for item in PunktSentenceTokenizer().tokenize(text.decode("utf8"))]
    sentences = [i.strip(' \n,.;:').replace('\n', ' ').split(' ') for i in sentences]
    # Create and train bigram/trigram converters.
    # threshold=inf means this pass detects no phrases; it only provides a baseline set.
    unigram = Phrases(sentences, threshold=float("inf"))
    unigrams = set(unigram.export_phrases(sentences))  # materialise: the generator can only be consumed once
    grams = [unigram]
    sentences_copy = sentences
    threshold = 9.0
    # Keep re-phrasing the corpus until a pass finds no phrases beyond the baseline.
    while True:
        bigram = Phrases(sentences_copy, threshold=threshold)
        bigrams = set(bigram.export_phrases(sentences_copy))
        new_phrases = bigrams - unigrams
        if len(new_phrases) == 0:
            break
        else:
            sentences_copy = bigram[sentences_copy]
def extract_collocations(documents_lemmas, min_count, threshold):
    bigram = Phrases(documents_lemmas, min_count=min_count, threshold=threshold)
    return list(bigram.export_phrases(documents_lemmas))
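# Hypothetical usage sketch for extract_collocations() above, assuming gensim < 4.0,
# where export_phrases(sentences) yields (bytes_phrase, score) pairs per occurrence.
# The toy corpus and parameter values are illustrative only; requires
# `from gensim.models import Phrases` to be in scope for the function above.
toy_docs = [["new", "york", "is", "big"], ["i", "love", "new", "york"]] * 10
for phrase, score in extract_collocations(toy_docs, min_count=1, threshold=0.1):
    print(phrase.decode("utf-8"), score)  # e.g. "new york" together with its score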
    token_regex=token_regex,
    n_process=n_process,
    stopwords=None,  # do not delete until doing connection below
)
phraser = Phrases(
    tqdm((toks for toks, id in doc_tokens), total=total_docs),
    delimiter="~",
    min_count=min_count,
    threshold=threshold,
    max_vocab_size=max_vocab_size,
    progress_per=float("inf"),
    connector_words=connector_words,
)
# for future passes: pre-existing entities/noun chunks in the vocab use '_',
# while phrases detected in this pass use the '~' delimiter
entities_and_noun_chunks = [
    w for w in phraser.vocab if '_' in w and '~' not in w
]
detected_phrases = [
    w.replace("~", "_") for w in phraser.export_phrases()
]
phrases = detected_phrases + entities_and_noun_chunks
detect_entities = False  # these will have been added to `phrases`
detect_noun_chunks = False
if max_phrase_len:
    phrases = [p for p in phrases if p.count("_") + 1 <= max_phrase_len]
return list(set(phrases))
import io

import gensim
from gensim.models import Phrases
from gensim.models.phrases import Phraser

# Read pre-tokenised sentences (one per line, space-separated tokens).
sentences = []
f = io.open('../_work_align/20-zh-tocolloc.txt', encoding='utf-8')
for line in f:
    sentences.append(line.split())

phrases = Phrases(sentences, scoring='npmi', threshold=0.5)

# Dump every detected phrase and its NPMI score, tab-separated.
g = io.open('../_work_align/21-tmp-zh-bigrams.txt', mode='w', encoding='utf-8')
# for key in phrases.vocab.keys():
#     g.write(key.encode('utf-8'))
#     g.write('\n')
for phrase, score in phrases.export_phrases(sentences):
    g.write(str(phrase, 'utf-8'))
    g.write('\t')
    g.write(str(score))
    g.write('\n')

# Re-segment the corpus with detected bigrams merged; a Phraser is indexed
# with a sentence (phraser[sent]), not called like a function.
phraser = Phraser(phrases)
h = io.open('../_work_align/21-tmp-zh-seg-bi.txt', mode='w', encoding='utf-8')
for sent in sentences:
    grams = phraser[sent]
    first = True
    for gram in grams:
        if not first:
            h.write(' ')
        first = False
        h.write(gram)
    h.write('\n')
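# Note: the snippets above mix two generations of the gensim API. Most call
# export_phrases(sentences) (gensim < 4.0), which yields (bytes_phrase, score)
# pairs; the connector_words / export_phrases() variant targets gensim >= 4.0,
# where the method takes no arguments and returns a {phrase: score} dict keyed
# by the delimiter-joined phrase string. A minimal sketch of the 4.x form,
# using a purely illustrative toy corpus and thresholds:
from gensim.models import Phrases
from gensim.models.phrases import FrozenPhrases  # `Phraser` remains as an alias in 4.x

toy_sentences = [["new", "york", "is", "big"], ["i", "love", "new", "york"]] * 10

phrase_model = Phrases(toy_sentences, min_count=1, threshold=0.1, delimiter="_")
for phrase, score in phrase_model.export_phrases().items():  # {"new_york": <score>, ...}
    print(phrase, score)

frozen = FrozenPhrases(phrase_model)  # frozen, lighter-weight transformer
print(frozen[["i", "love", "new", "york"]])  # tokens above the threshold come back joined, e.g. "new_york"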