Example #1
from gensim.models import Phrases


def generateBigrams(sentences):
    # Train a bigram detector: a pair must co-occur at least 20 times and
    # clear a high score threshold before it is joined.
    bigram_transformer = Phrases(sentences, min_count=20, threshold=500)
    bigram_transformer.save("bigrams", pickle_protocol=3)

    # Append each detected phrase and its score, one per line.
    # In gensim 3.x, export_phrases yields (bytes, score) pairs.
    with open("bigrams.txt", 'a') as fd:
        for phrase, score in bigram_transformer.export_phrases(sentences):
            fd.write(u'{0}   {1}\n'.format(phrase.decode('utf-8'), score))

    return bigram_transformer
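
A minimal usage sketch for the function above, assuming gensim 3.x (where export_phrases takes the corpus and yields byte strings) and a hypothetical toy corpus; with min_count=20 and a threshold of 500, little or nothing will be promoted on data this small:

# Hypothetical toy corpus, repeated so word pairs clear min_count=20.
corpus = [["new", "york", "is", "big"], ["new", "york", "city"]] * 25
model = generateBigrams(corpus)
# Apply the trained detector to fresh tokens; joined bigrams use "_".
print(model[["i", "love", "new", "york"]])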
Example #2
import logging
import re

from gensim.models import Phrases


def extract_phrases(reviews_sents, reviews_docs, save=False):
    logging.info("Extracting phrases...")
    bigram = Phrases(reviews_sents, threshold=5, min_count=5)
    trigram = Phrases(bigram[reviews_sents], threshold=3, min_count=3)
    if save:
        with open('../data/phrase/phrases_%d_%s' % (3, 'app_review'), 'wb') as fout:
            # Collect bigram and trigram phrases, keeping the latest score
            # per phrase; export_phrases yields byte strings in gensim 3.x.
            ph_dic = {}
            for phrase, score in bigram.export_phrases(reviews_sents):
                ph_dic[phrase] = score
            for phrase, score in trigram.export_phrases(bigram[reviews_sents]):
                ph_dic[phrase] = score
            for phrase, score in ph_dic.items():
                if re.search(rb'\d+', phrase):  # skip phrases containing digits
                    continue
                phrase = b"_".join(phrase.split(b' '))
                fout.write(phrase + b'\n')
        bigram.save("../model/bigram.model")
        trigram.save("../model/trigram.model")

    return trigram[bigram[reviews_docs]]
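
A sketch of reloading the persisted models and applying them to new documents, assuming gensim 3.x and the relative paths used above:

from gensim.models import Phrases

bigram = Phrases.load("../model/bigram.model")
trigram = Phrases.load("../model/trigram.model")
docs = [["the", "app", "crashes", "on", "start", "up"]]
# Chaining the two models reproduces the function's return value.
print(list(trigram[bigram[docs]]))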
Example #3
import csv

from gensim.models import Phrases


def phrs_model(sentences):
    '''
    Generate a Phrases model to find potential phrases and
    save the phrases into a csv file.

    Input:
    sentences (list of list of words): sentences without stop words
    '''
    # PHRS_OUTFILE (and PHRS_MODEL_NAME) are assumed module-level constants.
    model_ph = Phrases(sentences)
    #model_ph.save(PHRS_MODEL_NAME)
    gensim_phrs = model_ph.export_phrases(sentences)
    gensim_phrs = list(set(gensim_phrs))
    # Decode each byte-string phrase, dropping pairs of identical words.
    gensim_phrs = [g[0].decode("utf-8") for g in gensim_phrs
                   if g[0].split()[0] != g[0].split()[1]]

    with open(PHRS_OUTFILE, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        seen = set()
        for phrase in gensim_phrs:
            if phrase not in seen:
                writer.writerow([phrase])
                seen.add(phrase)
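
A hypothetical invocation, assuming PHRS_OUTFILE is the module-level path constant the function expects:

PHRS_OUTFILE = "phrases.csv"  # hypothetical value for illustration
sents = [["machine", "learning", "is", "fun"],
         ["machine", "learning", "is", "hard"]] * 10
phrs_model(sents)  # writes any detected phrases, one per csv row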
Example #4
import pickle

from gensim.models import Phrases
from nltk.tokenize.punkt import PunktSentenceTokenizer

# REGENERATE and OUTFILE are assumed module-level constants.
if REGENERATE:

    print("Generating data from scratch.")

    with open(OUTFILE, 'rb') as f:
        texts = pickle.load(f)[0]

    # This splits your list of texts into a list of sentences.
    # At this point (in the training data) document borders are removed.
    sentences = [item for text in texts
                 for item in PunktSentenceTokenizer().tokenize(text.decode("utf8"))]
    sentences = [i.strip(' \n,.;:').replace('\n', ' ').split(' ') for i in sentences]

    # Create and train bigram/trigram converters.
    # With an infinite threshold no pairs are ever promoted, so this model's
    # exported phrases form an empty baseline.
    unigram = Phrases(sentences, threshold=float("inf"))
    # Materialize once: export_phrases returns a generator, which would be
    # exhausted after the first comparison in the loop below.
    unigrams = set(unigram.export_phrases(sentences))

    grams = [unigram]

    sentences_copy = sentences

    threshold = 9.0

    # Keep stacking Phrases models until a pass finds no new phrases.
    while True:
        bigram = Phrases(sentences_copy, threshold=threshold)
        bigrams = bigram.export_phrases(sentences_copy)
        z = list(set(bigrams) - unigrams)
        if len(z) == 0:
            break
        else:
            grams.append(bigram)  # collect this pass's model
            sentences_copy = bigram[sentences_copy]
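
With the models collected in grams, a small helper (not part of the original snippet) could replay the passes on unseen text in training order:

def apply_grams(tokens, models):
    # Apply each stacked Phrases model in the order it was trained.
    for m in models:
        tokens = m[tokens]
    return tokens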
Example #5
from gensim.models import Phrases


def extract_collocations(documents_lemmas, min_count, threshold):
    # Train a bigram model on lemmatized documents and return every
    # detected collocation as a (phrase, score) pair.
    bigram = Phrases(documents_lemmas,
                     min_count=min_count,
                     threshold=threshold)
    return list(bigram.export_phrases(documents_lemmas))
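
A toy call, assuming gensim 3.x, where the returned phrases are byte strings:

docs = [["data", "science", "team"], ["data", "science", "lab"]] * 5
for phrase, score in extract_collocations(docs, min_count=2, threshold=0.1):
    print(phrase.decode("utf-8"), score)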
Example #6
            token_regex=token_regex,
            n_process=n_process,
            stopwords=None,  # do not delete until doing connection below
        )

        phraser = Phrases(
            tqdm((toks for toks, doc_id in doc_tokens), total=total_docs),
            delimiter="~",  # "~" marks newly detected phrases; pre-joined tokens use "_"
            min_count=min_count,
            threshold=threshold,
            max_vocab_size=max_vocab_size,
            progress_per=float("inf"),
            connector_words=connector_words,
        )

        # for future passes: pre-joined entities/noun chunks carry "_",
        # while newly detected phrases carry "~"
        entities_and_noun_chunks = [
            w for w in phraser.vocab if '_' in w and '~' not in w
        ]
        detected_phrases = [
            w.replace("~", "_") for w in phraser.export_phrases()
        ]
        phrases = detected_phrases + entities_and_noun_chunks
        detect_entities = False  # these will have been added to `phrases`
        detect_noun_chunks = False

    if max_phrase_len:
        phrases = [p for p in phrases if p.count("_") + 1 <= max_phrase_len]

    return list(set(phrases))
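
The delimiter trick above can be shown in isolation; a sketch assuming gensim 4.x (which the connector_words argument implies) and made-up data:

from gensim.models import Phrases

# Pre-joined entities keep "_", so training with "~" keeps the two
# phrase sources distinguishable until they are merged afterwards.
docs = [["barack_obama", "gave", "a", "speech"]] * 20
model = Phrases(docs, min_count=5, threshold=0.1, delimiter="~")
print([w.replace("~", "_") for w in model.export_phrases()])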
Example #7
import io

from gensim.models import Phrases
from gensim.models.phrases import Phraser

sentences = []
f = io.open('../_work_align/20-zh-tocolloc.txt', encoding='utf-8')
for line in f:
    sentences.append(line.split())

# NPMI scoring bounds scores to [-1, 1], so a 0.5 threshold is meaningful.
phrases = Phrases(sentences, scoring='npmi', threshold=0.5)
g = io.open('../_work_align/21-tmp-zh-bigrams.txt', mode='w', encoding='utf-8')
# for key in phrases.vocab.keys():
#     g.write(key.encode('utf-8'))
#     g.write('\n')

# Write each detected bigram and its NPMI score, tab-separated.
for phrase, score in phrases.export_phrases(sentences):
    g.write(str(phrase, 'utf-8'))
    g.write('\t')
    g.write(str(score))
    g.write('\n')

# Re-segment the corpus, joining detected bigram tokens with "_".
phraser = Phraser(phrases)
h = io.open('../_work_align/21-tmp-zh-seg-bi.txt', mode='w', encoding='utf-8')
for sent in sentences:
    h.write(' '.join(phraser[sent]))
    h.write('\n')
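
Since Phraser discards the training-only statistics, it is cheap to persist for later segmentation runs; a sketch with an assumed output path:

phraser.save('../_work_align/21-zh-bigram.phraser')  # hypothetical filename
reloaded = Phraser.load('../_work_align/21-zh-bigram.phraser')
print(' '.join(reloaded[sentences[0]]))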