Example #1
def get_bigram_phraser(directory):
    if os.path.isfile(BIGRAM):
        return Phraser.load(BIGRAM)
    else:
        bigram = Phraser(Phrases(corpus(directory)))
        bigram.save(BIGRAM)
        return bigram
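A minimal usage sketch, assuming BIGRAM points to a writable model path and corpus(directory) yields tokenized sentences; the returned Phraser joins detected collocations with an underscore:

phraser = get_bigram_phraser("data/texts")  # hypothetical directory
print(phraser[["new", "york", "is", "a", "big", "city"]])
# e.g. ['new_york', 'is', 'a', 'big', 'city'] if the pair scored above the threshold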
Example #2
def train(args):
    # Output during training
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # use the text8 corpus as training data; haikus don't provide sufficient context
    training_data = api.load('text8')

    # use the phrase model to recognize bigrams like "White House" or "Climate Change"
    bigram_model = Phrases(training_data)
    # Export the trained model: uses less RAM, faster processing. Model updates are no longer possible.
    bigrams = Phraser(bigram_model)

    # create and train model
    model = Word2Vec(bigrams[training_data], size=args.embedding_dim)

    word_list = list(model.wv.vocab.keys())
    vector_list = [model.wv[word] for word in word_list]

    # the basic Word2Vec model doesn't seem to support item assignment,
    # but WordEmbeddingsKeyedVectors does
    kv = WordEmbeddingsKeyedVectors(args.embedding_dim)
    kv.add(word_list, vector_list)

    kv.add(["<eos>", "<n>", "<unk>"], np.random.rand(3, args.embedding_dim))

    # just to be safe, clear the cache of normalized vectors
    # as i had a similar issue as https://github.com/RaRe-Technologies/gensim/issues/2532
    del kv.vectors_norm

    # save the new models
    bigrams.save(f"{args.model_path}/bigram.model")
    kv.save(f"{args.model_path}/word2vec.model")
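A hedged sketch of how the artifacts saved above could be loaded back at inference time (gensim 3.x API; model_path below is a stand-in for args.model_path):

from gensim.models import KeyedVectors
from gensim.models.phrases import Phraser

model_path = "models"  # stand-in for args.model_path
bigrams = Phraser.load(f"{model_path}/bigram.model")
kv = KeyedVectors.load(f"{model_path}/word2vec.model")

tokens = bigrams["climate change is real".split()]
# unknown tokens fall back to the <unk> vector added during training
vectors = [kv[t] if t in kv.vocab else kv["<unk>"] for t in tokens]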
Example #3
 def fit(self, sentencesPath):
     """
     Train the phrase models.
     :param sentencesPath: path to a text file containing one sentence per line
     """
     self.phrasers = []
     # path detect
     for path in self.savePhraserPaths:
         if not os.path.exists(os.path.dirname(path)):
             raise FileNotFoundError(os.path.dirname(path) + " does not exist")
     for path in self.savePhraserPaths:
         if not os.path.exists(path):  # model missing, needs training
             self.phrasers = None
             break
     if self.phrasers is not None and not self.file_overwrite:
         logging.info("models already exist, loading them")
         for path in self.savePhraserPaths:
             self.phrasers.append(Phraser.load(path))
         return True
     self.phrasers = []
     c = 2
     for path in self.savePhraserPaths:
         logging.info("building %d-gram phraser ..." % c)
         c += 1
         phraser = Phraser(
             Phrases(sentences=TxtIter(sentences=codecs.open(
                 sentencesPath, mode="r", encoding="utf-8"),
                                       ngrams=self.phrasers),
                     min_count=self.min_count,
                     threshold=self.threshold,
                     max_vocab_size=self.max_vocab_size,
                     delimiter=self.delimiter,
                     scoring=self.scoring))
         phraser.save(path)
         self.phrasers.append(phraser)
Example #4
def train_w2v_model() -> (Phraser, Word2Vec):
    # Build Word2Vec model
    if not Path(model_file).exists():
        sent = [row.split() for row in df['clean_lyrics'] if row]
        # Build collocations
        if not Path(bigrams_file).exists():
            bigram_phrases = Phrases(sent,
                                     min_count=30,
                                     progress_per=10000,
                                     max_vocab_size=200000,
                                     common_terms=sentiment_terms)
            bigram = Phraser(bigram_phrases)
            bigram.save(bigrams_file)
            trigram_phrases = Phrases(bigram[sent],
                                      min_count=30,
                                      progress_per=10000,
                                      max_vocab_size=200000,
                                      common_terms=sentiment_terms)
            trigram = Phraser(trigram_phrases)
            trigram.save(trigrams_file)

        trigram = Phraser.load(trigrams_file)

        sentences = trigram[sent]

        cores = multiprocessing.cpu_count()
        w2v_model = Word2Vec(
            min_count=20,  # Remove rare words
            window=2,
            size=300,
            sample=6e-5,
            alpha=0.03,
            min_alpha=0.0007,
            negative=20,
            workers=cores - 1)

        t = time()

        w2v_model.build_vocab(sentences, progress_per=10000)

        print('Time to build vocab: {} mins'.format(round((time() - t) / 60,
                                                          2)))
        w2v_model.vocabulary.save(vocabulary_file)

        t = time()

        w2v_model.train(sentences,
                        total_examples=w2v_model.corpus_count,
                        epochs=30,
                        report_delay=1)

        print('Time to train the model: {} mins'.format(
            round((time() - t) / 60, 2)))

        w2v_model.save(model_file)
    trigram = Phraser.load(trigrams_file)
    w2v_model = Word2Vec.load(model_file)

    return trigram, w2v_model
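A hedged usage sketch for the function above (it relies on the same module-level df, model_file and trigrams_file); the returned trigram phraser is applied to tokenized text before querying the Word2Vec vocabulary:

trigram, w2v = train_w2v_model()
tokens = trigram["my heart will go on".split()]
known = [t for t in tokens if t in w2v.wv.vocab]
if known:
    print(w2v.wv.most_similar(positive=known, topn=5))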
Example #5
 def testSaveLoad(self):
     """ Saving and loading a Phraser object."""
     with temporary_file("test.pkl") as fpath:
         bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1))
         bigram.save(fpath)
         bigram_loaded = Phraser.load(fpath)
         self.assertEqual(
             bigram_loaded[['graph', 'minors', 'survey', 'human', 'interface', 'system']],
             ['graph_minors', 'survey', 'human_interface', 'system'])
Example #6
def make_bi_tri(paths, tri=False):
    sentences = PathLineSentences(paths)
    phases = Phrases(sentences)
    bigram = Phraser(phases)
    bigram.save("bigram_phraser.pkl")  # Phraser.save() requires a target path
    if tri:
        triphases = Phrases(bigram[sentences])
        trigram = Phraser(triphases)
        trigram.save("trigram_phraser.pkl")  # Phraser.save() requires a target path
Example #7
def get_trigram_phraser(directory):
    if os.path.isfile(TRIGRAM):
        return Phraser.load(TRIGRAM)
    else:
        bigram = get_bigram_phraser(directory)
        sentence_stream = (bigram[sentence] for sentence in corpus(directory))
        trigram = Phraser(Phrases(sentence_stream))
        trigram.save(TRIGRAM)
        return trigram
Example #8
 def testSaveLoad(self):
     """ Saving and loading a Phraser object."""
     with temporary_file("test.pkl") as fpath:
         bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1))
         bigram.save(fpath)
         bigram_loaded = Phraser.load(fpath)
         self.assertEqual(
             bigram_loaded[[
                 'graph', 'minors', 'survey', 'human', 'interface', 'system'
             ]], ['graph_minors', 'survey', 'human_interface', 'system'])
Example #9
    def testSaveLoadCustomScorer(self):
        """Saving and loading a Phraser object with a custom scorer """

        with temporary_file("test.pkl") as fpath:
            bigram = Phraser(
                Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer))
            bigram.save(fpath)
            bigram_loaded = Phraser.load(fpath)
            # we don't do much with scoring here, just verify it's the one expected
            self.assertEqual(bigram_loaded.scoring, dumb_scorer)
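dumb_scorer is referenced but not shown here; in gensim's test suite it is a trivial scoring function, and any custom scorer passed to Phrases must accept the same six arguments, roughly:

def dumb_scorer(worda_count, wordb_count, bigram_count,
                len_vocab, min_count, corpus_word_count):
    # scores every candidate pair identically; a real scorer would weigh the counts
    return 1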
Example #10
def make_trigrams(
    sentences: Iterable, save_model_path: Path, **phrases_kw
):
    """Train gensim bigram and trigram phrase models."""
    bigram = Phrases(sentences, **phrases_kw)
    bigram_phraser = Phraser(bigram)
    tokens = bigram_phraser[sentences]
    trigram = Phrases(tokens, delimiter=b" ")
    trigram_phraser = Phraser(trigram)
    trigram_phraser.save(str(save_model_path))
Example #11
File: ex9.py Project: mat-hek/pjn
def mk_bigrams():
    with open(dump_base + "judgments", 'r', encoding="utf-8") as f:
        judgments = f.read()

    sentences = [list(gensim.utils.simple_tokenize(s)) for s in textcleaner.split_sentences(judgments)]

    bigramer = Phraser(Phrases(sentences))

    bigramer.save(dump_base + "bigramer")

    return [bigramer[s] for s in sentences]
Example #12
def ngram_model_to_disk(sents, output_fp):
    '''
    helper function that saves ngram model to disk and returns
    it for further use
    '''
    ngrams = Phrases(sents,
                     min_count=40,
                     common_terms=frozenset(en.STOP_WORDS))
    ngram_phraser = Phraser(ngrams)
    ngram_phraser.save(output_fp)
    return ngram_phraser
Example #13
def train_phraser(sentence_stream, stopword_list, threshold, model_path,
                  save_prefix):
    phrases_model = Phrases(sentence_stream,
                            common_terms=stopword_list,
                            threshold=threshold)
    phrases_model.save(
        os.path.join(model_path, '{}_phrases.bin'.format(save_prefix)))
    phraser_model = Phraser(phrases_model)
    phraser_model.save(
        os.path.join(model_path, '{}_phraser.bin'.format(save_prefix)))
    return phraser_model
Example #14
def create_n_grams(text):
    bigram = Phrases(text, min_count=20, threshold=10, delimiter=b' ')
    bigram_phraser = Phraser(bigram)
    bigram_phraser.save("./bigram_model.pkl")
    tokens_bigram = bigram_phraser[text]
    trigram = Phrases(tokens_bigram,
                      min_count=10,
                      threshold=10,
                      delimiter=b' ')
    trigram_phraser = Phraser(trigram)
    trigram_phraser.save("./trigram_model.pkl")
    tokens_trigram = trigram_phraser[tokens_bigram]
    return tokens_bigram, tokens_trigram
Example #15
    def testSaveLoadCustomScorer(self):
        """Saving and loading a Phraser object with a custom scorer """

        with temporary_file("test.pkl") as fpath:
            bigram = Phraser(
                Phrases(self.sentences,
                        min_count=1,
                        threshold=.001,
                        scoring=dumb_scorer))
            bigram.save(fpath)
            bigram_loaded = Phraser.load(fpath)
            # we don't do much with scoring here, just verify it's the one expected
            self.assertEqual(bigram_loaded.scoring, dumb_scorer)
Example #16
class GramFacade:
    def __init__(self, model_dir, min_count_bigrams=8, min_count_trigrams=7):
        self.model_dir = model_dir
        self.min_count_bigrams = min_count_bigrams
        self.min_count_trigrams = min_count_trigrams

    def load_models(self):
        self.bigrams_phraser = Phraser.load(self.model_dir + '/' +
                                            BIGRAMS_PHRASER_FILENAME)
        self.trigrams_phraser = Phraser.load(self.model_dir + '/' +
                                             TRIGRAMS_PHRASER_FILENAME)

    def load_phrases(self):
        self.bigrams_phrases = Phrases.load(self.model_dir + '/' +
                                            BIGRAMS_PHRASES_FILENAME)
        self.trigrams_phrases = Phrases.load(self.model_dir + '/' +
                                             TRIGRAMS_PHRASES_FILENAME)

    def export_bigrams(self, docs):
        return [self.bigrams_phraser[doc] for doc in docs]

    def export_trigrams(self, bigrams):
        return [self.trigrams_phraser[bigram] for bigram in bigrams]

    def phrase(self, doc):
        bigrams = self.bigrams_phraser[doc]
        trigrams = self.trigrams_phraser[bigrams]
        return trigrams

    def create_model(self, doc_list):
        self.bigrams_phrases = Phrases(doc_list,
                                       min_count=self.min_count_bigrams)
        self.bigrams_phraser = Phraser(self.bigrams_phrases)
        self.trigrams_phrases = Phrases(self.bigrams_phraser[doc_list],
                                        min_count=self.min_count_trigrams)
        self.trigrams_phraser = Phraser(self.trigrams_phrases)
        self.bigrams_phraser.save(self.model_dir + '/' +
                                  BIGRAMS_PHRASER_FILENAME)
        self.trigrams_phraser.save(self.model_dir + '/' +
                                   TRIGRAMS_PHRASER_FILENAME)
        self.bigrams_phrases.save(self.model_dir + '/' +
                                  BIGRAMS_PHRASES_FILENAME)
        self.trigrams_phrases.save(self.model_dir + '/' +
                                   TRIGRAMS_PHRASES_FILENAME)

    def words_not_in_vocab(self, tok_doc, threshold):
        word_not_in_doc = set([
            x for x in tok_doc
            if self.trigrams_phrases.vocab[str.encode(x)] < threshold
        ])
        return word_not_in_doc
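A hedged sketch of driving this facade end to end; the model directory, corpus.txt and the *_FILENAME constants are placeholders assumed to be defined alongside the class:

facade = GramFacade("models")                    # hypothetical model directory
docs = [line.split() for line in open("corpus.txt", encoding="utf-8")]
facade.create_model(docs)                        # trains and saves the bigram/trigram models
phrased = facade.phrase("new york stock exchange".split())
print(phrased)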
Example #17
    def build_phraser(self, threshold: int=None):
        tokens = ReadThreads(
            self.board, self.input_dir, return_func=lambda x, y: y.split())
        bigram = Phrases(tokens, min_count=5, threshold=threshold)
        trigram = Phrases(bigram[tokens], threshold=threshold)

        bigram_mod = Phraser(bigram)
        trigram_mod = Phraser(trigram)

        filename = op.join(self.input_dir, f'{self.board}.bigrams')
        bigram_mod.save(filename)
        filename = op.join(self.input_dir, f'{self.board}.trigrams')
        trigram_mod.save(filename)

        return trigram_mod
Example #18
def phrase_detect_train(sentences, min_count, threshold, common_terms, phrase_model_save_path=None):
    """
    input:
        sentences: tokenized sentences
    """
    print('Transforming sentences to bigrams .........\n')
    bi_phrases = Phrases(sentences, min_count=min_count, threshold=threshold, common_terms=common_terms)
    bigram_transformer = Phraser(bi_phrases)
    if phrase_model_save_path is not None:
        bi_phrases.save(phrase_model_save_path)
        bigram_transformer.save(phrase_model_save_path + '_transformer')

    sentences = list(bigram_transformer[sentences])
    # if you want to inspect the detected phrases
    phrases_list = list(bigram_transformer.phrasegrams)
    print('Phrase model training done.')
    return sentences
Example #19
def build_ngram_model(docs):
    bigram_model_path = Path('bigram_phraser.pkl')
    trigram_model_path = Path('trigram_phraser.pkl')
    if not bigram_model_path.exists() or not trigram_model_path.exists():
        print('Building n-gram models')
        bigram = Phrases(docs, min_count=3, threshold=6)
        trigram = Phrases(bigram[docs], min_count=3, threshold=6)

        bigram_model = Phraser(bigram)
        trigram_model = Phraser(trigram)

        bigram_model.save('bigram_phraser.pkl')
        trigram_model.save('trigram_phraser.pkl')
    else:
        print('Loading saved n-gram models')
        bigram_model = Phraser.load('bigram_phraser.pkl')
        trigram_model = Phraser.load('trigram_phraser.pkl')

    return (bigram_model, trigram_model)
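Note the ordering when using the two returned models: the trigram phraser was trained on bigram-transformed text, so at apply time the bigram model must run first. A minimal sketch, with docs standing for the same iterable of tokenized documents passed to build_ngram_model:

bigram_model, trigram_model = build_ngram_model(docs)
doc_tokens = ["new", "york", "stock", "exchange", "opened", "higher"]
ngram_tokens = trigram_model[bigram_model[doc_tokens]]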
Example #20
    def get_phraser(self):
        """ Get trained phraser or train a new phraser to extract the phrases"""
        phraser_path = os.path.join(self.data_path, self.phraser_name)

        if os.path.isfile(phraser_path):
            return Phraser.load(phraser_path)

        challenge_req = self.get_challenge_req()
        sentences = [
            tokenize_str(req) for cha_id, req in challenge_req.itertuples()
        ]

        phrases = Phrases(sentences=sentences,
                          min_count=1,
                          threshold=0.2,
                          common_terms=TC_STOP_WORDS,
                          scoring='npmi')
        trained_phraser = Phraser(phrases)
        trained_phraser.save(phraser_path)

        return trained_phraser
Example #21
def extract_phrases(df: pd.DataFrame):
    """
    Train bigram and trigram phrasers
    Input:
    - df: dataframe with column "text"
    """
    def wrapper(generator):
        for item in generator:
            yield item.text.split(" ")

    vocab = Counter()
    vocab_final = Counter()
    bigram_phrases = Phrases(wrapper(df.itertuples()),
                             min_count=5,
                             threshold=1)
    bigram = Phraser(bigram_phrases)
    trigram_phrases = Phrases(bigram[wrapper(df.itertuples())],
                              min_count=5,
                              threshold=1)
    trigram = Phraser(trigram_phrases)
    bigram.save("./vocab/bigram")
    trigram.save("./vocab/trigram")
Example #22
def BuildPhraser(save_to_file=True,
                 model_file_name=os.getcwd() + "/models/" + "bigram_model.pkl",
                 min_count=10,
                 threshold=.7,
                 common_terms=STOPWORDS,
                 training_data=None):
    # Load training data.
    sentences = Text8Corpus(training_data)

    # Train bigram model.
    phrases = Phrases(sentences,
                      min_count=min_count,
                      threshold=threshold,
                      common_terms=common_terms)

    # Export the trained model: uses less RAM, faster processing. Model updates are no longer possible.
    bigram_model = Phraser(phrases)

    # save the model to file
    if save_to_file:
        bigram_model.save(fname_or_handle=model_file_name)
    return bigram_model
Example #23
def build_phrase_model():

    phrase_list = load_phrases()

    phrases = Phrases(Corpus(CORPUS_FILE))
    bigrams = Phraser(phrases)

    bigrams.save(MODEL_FILE)

    years = Corpus(CORPUS_FILE).get_years()
    authors = Corpus(CORPUS_FILE).get_authors()

    with open(OUT_FILE, "w") as f:
        for i, line in tqdm(enumerate(bigrams[Corpus(CORPUS_FILE)])):

            line = remove_under(line)
            line = check_phrase_list(phrase_list, line)

            line = [authors[i]] + line
            line = [years[i]] + line

            f.write("{}\n".format(" ".join(remove_under(line))))
Example #24
    def compute_bigram(self):
        '''
        Find and save bigrams living among the tweets

        :update: [covid_tweets].[token_tweets]
        '''
        print("Computing bigram.")
        cnxn = sqlite3.connect("covid_tweets.db")
        cursor = cnxn.cursor()

        count_query = '''
            SELECT count(tweet_id)
            FROM token_tweets
            WHERE date = ?'''

        cursor.execute(count_query, (self.date, ))
        num_tweets = cursor.fetchone()[0]
        print(self.date, num_tweets, "to have bigram computed.")

        query = '''
            SELECT tweet_id, tokenized_tweet
            FROM token_tweets
            WHERE date = ?'''

        cursor.execute(query, (self.date, ))
        results = cursor.fetchall()

        cnxn.close()

        retokenized_tweets = []
        for tweet_id, tokenized_tweet in results:
            tweet_tokens = tokenized_tweet.split(" ")
            retokenized_tweets.append(tweet_tokens)

        phrases = Phrases(retokenized_tweets, min_count=self.b_min)
        bigram = Phraser(phrases)

        bigram.save(f"./tmp/{self.date}_bigram_model_{self.b_min}.pkl")
        print("Bigram computed.")
Example #25
def main():
    sentence_stream = []
    start = timeit.default_timer()
    print('start the reuters')
    extract("/home/huicheng/Documents/datas/ReutersNews106521", sentence_stream)
    print(len(sentence_stream))
    print('start the bloombergs')
    #extract("/home/huicheng/Documents/datas/20061020_20131126_bloomberg_news", sentence_stream)
    print(len(sentence_stream))
    print('start ours')
    #new(sentence_stream)
    print('before:{}'.format(len(sentence_stream)))
    print(timeit.default_timer() - start)
    start = timeit.default_timer()
    sentence_stream = list(filter(None, sentence_stream))
    print('after:{}'.format(len(sentence_stream)))
    print(timeit.default_timer() - start)
    print('generating phrase and word2vec')
    start = timeit.default_timer()
    os.chdir("/home/huicheng/Documents/datas/")
    with open("sentence.csv", "w") as f:
        writer = csv.writer(f)
        writer.writerows(sentence_stream)
    phrases = Phrases(sentence_stream,min_count=500, threshold=2)
    bigram = Phraser(phrases)
    # print(list(bigram[sentence_stream]))
    print(bigram['u', 's', 'wall', 'st', 'wall', 'street','s','p','500','s','p','xxx'])
    ##TODO phrase
    bigram.save("big_phrase.pickle")
    print('finish phrase time:{}'.format(timeit.default_timer() - start))
    print('start trigram')
    start = timeit.default_timer()
    phrases = Phrases(bigram[sentence_stream],min_count=500, threshold=2)
    trigram = Phraser(phrases)
    trigram.save("trig_phrase.pickle")
    print(trigram[bigram['u', 's', 'wall', 'st', 'wall', 'street','bank','of','america','s','p','500','s','p','xxx']])
    print('finish phrase time:{}'.format(timeit.default_timer() - start))
    '''##TODO word2vec
Example #26
def save_models(dataset_name, num_topics):
    # load inputs and labels
    dataset = pd.read_csv("../cleaned/" + dataset_name +
                          "_stems.csv").astype(str).values.tolist()
    # remove placeholders from the stems dataset
    for index, sample in enumerate(dataset):
        dataset[index] = list(filter((" ").__ne__, sample))
    # create dic, copora and lda-model
    dic = gs.corpora.Dictionary(dataset)
    dic.save("../models/dictionary/" + dataset_name + "_dictionary")
    corpus = [dic.doc2bow(sample) for sample in dataset]
    lda_model = gensim.models.ldamulticore.LdaMulticore(
        corpus=corpus,
        id2word=dic,
        num_topics=num_topics,
        random_state=100,
        chunksize=100,
        passes=10,
        per_word_topics=True)  # update_every=1,
    lda_model.save("../models/topic_models/" + dataset_name + "_ldamodel")
    inputs = dataset  # FastText expects tokenized sentences (lists of tokens), not joined strings
    vector_model = FastText(size=32, window=3, min_count=1)
    vector_model.build_vocab(inputs)
    vector_model.train(sentences=inputs,
                       total_examples=len(inputs),
                       total_words=vector_model.corpus_total_words,
                       epochs=10)
    vector_model.save("../models/word_embeddings/" + dataset_name +
                      "_fasttext")
    # make bigram model
    sentences = pd.read_csv("../cleaned/" + dataset_name +
                            "_clean.csv")["t"].tolist()
    tokenized = [t.split() for t in sentences]
    phrases = Phrases(tokenized)
    bigram = Phraser(phrases)
    bigram.save("../models/bigrams/bigram_" + dataset_name + ".pkl")
Example #27
class GramFacade:
    def __init__(self,
                 model_dir,
                 bigrams_threshold=0.88,
                 trigrams_threshold=0.88):
        self.model_dir = model_dir
        self.bigrams_threshold = bigrams_threshold
        self.trigrams_threshold = trigrams_threshold

    def load_models(self):
        self.bigrams_phraser = Phraser.load(self.model_dir + '/' +
                                            BIGRAMS_PHRASER_FILENAME)
        self.trigrams_phraser = Phraser.load(self.model_dir + '/' +
                                             TRIGRAMS_PHRASER_FILENAME)

    def load_phrases(self):
        self.bigrams_phrases = Phrases.load(self.model_dir + '/' +
                                            BIGRAMS_PHRASES_FILENAME)
        self.trigrams_phrases = Phrases.load(self.model_dir + '/' +
                                             TRIGRAMS_PHRASES_FILENAME)

    def export_bigrams(self, docs):
        return [self.bigrams_phraser[doc] for doc in docs]

    def export_trigrams(self, bigrams):
        return [self.trigrams_phraser[bigram] for bigram in bigrams]

    def phrase(self, doc):
        bigrams = self.bigrams_phraser[doc]
        trigrams = self.trigrams_phraser[bigrams]
        return trigrams

    def create_model(self, doc_list):
        self.bigrams_phrases = Phrases(doc_list,
                                       scoring='npmi',
                                       threshold=self.bigrams_threshold)
        self.bigrams_phraser = Phraser(self.bigrams_phrases)
        self.trigrams_phrases = Phrases(self.bigrams_phraser[doc_list],
                                        scoring='npmi',
                                        threshold=self.trigrams_threshold)
        self.trigrams_phraser = Phraser(self.trigrams_phrases)
        self.bigrams_phraser.save(self.model_dir + '/' +
                                  BIGRAMS_PHRASER_FILENAME)
        self.trigrams_phraser.save(self.model_dir + '/' +
                                   TRIGRAMS_PHRASER_FILENAME)
        self.bigrams_phrases.save(self.model_dir + '/' +
                                  BIGRAMS_PHRASES_FILENAME)
        self.trigrams_phrases.save(self.model_dir + '/' +
                                   TRIGRAMS_PHRASES_FILENAME)

    def words_not_in_vocab(self, tok_doc, threshold):
        word_not_in_doc = set([
            x for x in tok_doc
            if self.trigrams_phrases.vocab[str.encode(x)] < threshold
        ])
        return word_not_in_doc

    def retrieve_grams(self):
        pgrams = self.trigrams_phraser.phrasegrams
        gram_list = []
        for word, values in pgrams.items():
            gram = b'_'.join(word)
            count, score = values[0], values[1]
            gram_list.append({
                "gram": gram.decode("utf-8"),
                "count": count,
                "score": score
            })
        gram_sorted = sorted(gram_list, key=lambda x: x["score"], reverse=True)
        return gram_sorted
Example #28
data = file.readlines()

#prepare data for phraser
sentence_stream = [line.split(" ") for line in data]

bigram = Phrases(sentence_stream, min_count=3, threshold=5, delimiter=b'%%')
trigram = Phrases(bigram[sentence_stream],  min_count=3, threshold=5, delimiter=b'%%')
bigram_phraser = Phraser(bigram)
trigram_phraser = Phraser(trigram)

# want to see the phrases use:
#print(trigram_phraser.phrasegrams.items())

bigram.save("phrase_bigram.model")
trigram.save("phrase_trigram.model")
bigram_phraser.save("phraser_bigram.model")
trigram_phraser.save("phraser_trigram.model")



#Testing Sentences...
sent1 = [u'der', u'Dialog', u'im', u'Dunkeln', u'in', u'Hamburg']
sent2 = [u'Samstag', u'ist', u'die', u'Lange', u'Nacht', u'der', u'Museen']
sent3 = [u'Sonderaustellung', u'Archäologische', u'Museum', u'Hamburg', u'lustig']
sent4 = [u'Early', u'Bird', u'Ticket', u'Hamburg']
sent5 = [u'Hafen', u'City', u'Hamburg']
sent6 = [u'FC', u'St', u'Pauli', u'spielt']
sent7 = [u'das', u'Tor', u'zur', u'Welt']
sent8 = [u'Museum', u'für', u'Hamburger', u'Geschichte']
sent9 = [u'Besuch', u'im', u'Alten', u'Elbtunnel']
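The snippet above stops after defining the test sentences; presumably they were meant to be pushed through the phrasers saved earlier, along these lines:

for sent in (sent1, sent2, sent3, sent4, sent5, sent6, sent7, sent8, sent9):
    # bigram phraser first, then the trigram phraser that was trained on its output
    print(trigram_phraser[bigram_phraser[sent]])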
Example #29
class Builder(object):
    def __init__(self,
                 ndocs,
                 phrase_min_count=5,
                 vocabulary_size=10000,
                 bigram_min_count=5,
                 bigram_threshold=10,
                 trigram_min_count=5,
                 trigram_threshold=10,
                 substitutions=dict(),
                 data_directory='./data',
                 model_directory='./model'):
        self.ndocs = ndocs
        self.phrase_min_count = phrase_min_count
        self.vocabulary_size = vocabulary_size
        self.bigram_min_count = bigram_min_count
        self.bigram_threshold = bigram_threshold
        self.trigram_min_count = trigram_min_count
        self.trigram_threshold = trigram_threshold
        self.substitutions = substitutions
        self.data_directory = data_directory
        self.model_directory = model_directory
        self.load_bad_phrases()

    def tokenize(self, text):
        return [token.lower() for token in word_tokenize(text)]

    def stream_sentences(self, texts, description="Streaming sentences ..."):
        with tqdm.tqdm(texts) as pbar:
            pbar.set_description(description)
            for text in pbar:
                for sentence in sent_tokenize(text):
                    yield self.tokenize(sentence)

    def load_bad_phrases(self):
        with open("%s/bad-phrases.txt" % self.data_directory,
                  mode='r',
                  encoding='UTF-8') as fp:
            self.bad_phrases = set(
                [phrase.strip() for phrase in fp.readlines()])

    def add_bad_phrase(self, phrase):
        self.bad_phrases.add(phrase)

    def save_bad_phrases(self):
        bad_phrases = list(self.bad_phrases)
        bad_phrases.sort()
        with open("%s/bad-phrases.txt" % self.data_directory,
                  mode='w',
                  encoding='UTF-8') as fp:
            for phrase in bad_phrases:
                fp.write("%s\n" % phrase)

    def train_phrasers(self, texts):
        bigrams = Phrases(self.stream_sentences(
            texts, description="Streaming text for bigram phraser  ..."),
                          min_count=self.bigram_min_count,
                          threshold=self.bigram_threshold)
        #print("Training bigram phraser ...")
        self.bigram_phraser = Phraser(bigrams)

        #print("Collecting trigrams ...")
        trigrams = Phrases(self.bigram_phraser[self.stream_sentences(
            texts, description="Streaming text for trigram phraser ...")],
                           min_count=self.trigram_min_count,
                           threshold=self.trigram_threshold)
        #print("Training trigram phraser ...")
        self.trigram_phraser = Phraser(trigrams)

    def save_phrasers(self):
        path = os.path.join(self.model_directory, "bigram-phraser.pkl")
        self.bigram_phraser.save(path)

        path = os.path.join(self.model_directory, "trigram-phraser.pkl")
        self.trigram_phraser.save(path)

    def load_phrasers(self):
        path = os.path.join(self.model_directory, "bigram-phraser.pkl")
        self.bigram_phraser = Phraser.load(path)

        path = os.path.join(self.model_directory, "trigram-phraser.pkl")
        self.trigram_phraser = Phraser.load(path)

    def prepare_text(self, text):
        for key, value in self.substitutions.items():
            text = text.replace(key, value)
        tokens = self.tokenize(text)
        tokens = self.bigram_phraser[tokens]
        tokens = self.trigram_phraser[tokens]
        return [token for token in tokens if not token in self.bad_phrases]

    def prepare_texts(self, texts):
        with tqdm.tqdm(texts) as pbar:
            pbar.set_description("Preparing texts ...")
            prepared_texts = [self.prepare_text(text) for text in pbar]
        return prepared_texts

    def keep_phrase(self, phrase, cnt):
        if "'" in phrase: return False
        for c in PUNCTUATION:
            if c in phrase: return False
        if phrase in self.bad_phrases: return False
        phrase_set = set(phrase)
        if SYMBOLS & phrase_set: return False
        if (LETTERS & set(phrase)) and cnt > self.phrase_min_count: return True
        return False

    def build_vocabulary(self, texts, save=False):
        self.ndocs = len(texts)
        with tqdm.tqdm(texts) as pbar:
            pbar.set_description("Building vocabulary over %d documents." %
                                 self.ndocs)
            phrase_map = {}
            for document in pbar:
                for phrase in document:
                    if not phrase in phrase_map: phrase_map[phrase] = 0
                    phrase_map[phrase] += 1
        phrases = list(phrase_map.keys())
        phrases = sorted(phrases, key=lambda phrase: -phrase_map[phrase])

        vocabulary = [
            phrase for phrase in phrases
            if self.keep_phrase(phrase, phrase_map[phrase])
        ]

        hyphenated = {
            phrase.replace('-', '_')
            for phrase in vocabulary if "-" in phrase
        }
        vocabulary = [
            phrase for phrase in vocabulary if not phrase in hyphenated
        ][:self.vocabulary_size]
        if save:
            path = os.path.join(
                self.data_directory, "vocabulary-%d-%d-%d.tsv" %
                (len(texts), self.phrase_min_count, self.vocabulary_size))
            fp = open(path, mode='w', encoding='UTF-8')
            for phrase in vocabulary:
                fp.write("%s\t%d\n" % (phrase, phrase_map[phrase]))
            fp.close()
        self.vocabulary = set(vocabulary)

    def load_vocabulary(self):
        path = os.path.join(
            self.data_directory, "vocabulary-%d-%d-%d.tsv" %
            (self.ndocs, self.phrase_min_count, self.vocabulary_size))
        fp = open(path, mode='r', encoding='UTF-8')
        self.vocabulary = set([])
        for line in fp:
            line = line.strip()
            if line:
                phrase, cnt = line.split('\t')
                self.vocabulary.add(phrase)
        fp.close()

    def build_document(self, text):
        return [phrase for phrase in text if phrase in self.vocabulary]

    def build_corpus(self, texts):
        with tqdm.tqdm(texts) as pbar:
            pbar.set_description("Building corpus ...")
            corpus = [self.build_document(text) for text in pbar]
        return corpus

    def build_dictionary(self, corpus, save=False):
        self.dictionary = Dictionary(corpus)
        self.dictionary.filter_extremes(no_below=self.phrase_min_count,
                                        no_above=0.6,
                                        keep_n=self.vocabulary_size)
        if save: self.save_dictionary()

    def save_dictionary(self, path=None):
        if path is None:
            path = os.path.join(self.model_directory, "dictionary.pkl")
        self.dictionary.save(path)

    def load_dictionary(self, path=None):
        if path is None:
            path = os.path.join(self.model_directory, "dictionary.pkl")
        self.dictionary = Dictionary.load(path)

    def encode_corpus(self, corpus):
        return [self.dictionary.doc2bow(document) for document in corpus]
Example #30
def nlp_preprocess(filepath_dict: dict,
                   col: str,
                   df=None,
                   verbose: bool = True,
                   overwrite_interim: bool = True) -> pd.DataFrame:
    def clean_doc(corpus):
        '''
        generator function to read in docs from the file,
        and substitute and remove substrings
        '''
        for doc in corpus:
            yield au_tu.remove_substrings(au_tu.clean_tokens(
                doc,
                tokens=to_replace_dict,
                whole_words_only=whole_words_only,
                ignore_case=ignore_case,
            ),
                                          to_remove_list=to_remove_list,
                                          whole_words_only=whole_words_only,
                                          ignore_case=ignore_case)

    def tokenize_entities(parsed_doc):
        txt = parsed_doc.text
        for ent in parsed_doc.ents:
            txt = txt[:ent.start_char] + ent.text.replace(
                ' ', '_') + txt[ent.end_char:]
        return txt

    def cleaned_doc_corpus(corpus):
        '''
        generator function to use spaCy to parse docs, clean docs,
        tokenize named entities, and yield documents
        '''
        for parsed_doc in nlp.pipe(clean_doc(corpus),
                                   batch_size=nlp_batch_size,
                                   n_threads=nlp_n_threads):
            yield tokenize_entities(parsed_doc)

    def punct_space_more(token):
        '''
        helper function to eliminate tokens that are
        pure punctuation or whitespace or digits or only 1 character
        '''
        return (
            token.is_punct or token.is_space or token.is_digit
            or token.text == "'s" or token.lemma_ == '-PRON-' or
            # token.lemma_ == 'say' or
            # token.lemma_ == 'tell' or
            # token.lemma_ == 'be' or
            len(token.text) <= 1)

    def line_doc(filename):
        '''
        generator function to read in docs from the file,
        un-escape the original line breaks in the text,
        and do additional cleaning
        '''
        def hyp_to_us(doc):
            return re.sub(r'\b-\b', '_', doc)

        def remove_punct(doc):
            # keep: alphanumeric (\w), whitespace (\s), single quote, underscore
            return re.sub(r'[^\w\s\'_]+', '', doc)

        # with codecs.open(filename, encoding='utf_8') as f:
        with smart_open(filename) as f:
            for doc in f:
                yield remove_punct(hyp_to_us(doc.decode())).replace(
                    '\\n', '\n')

    def lemmatized_sentence_corpus(filename):
        '''
        generator function to use spaCy to parse docs,
        lemmatize the text, and yield sentences
        '''
        for parsed_doc in nlp.pipe(line_doc(filename),
                                   batch_size=nlp_batch_size,
                                   n_threads=nlp_n_threads):

            for sent in parsed_doc.sents:
                yield ' '.join([
                    token.lemma_ for token in sent
                    if not punct_space_more(token)
                ])

    if verbose:
        logger.info(f'Working on text from: {col}')

    # # debug - only getting from the sample dataframe here
    # df_phrased = df.loc[df[col].notnull(), ['tfa_master_uid', 'app_year', col]].sample(n=50).copy()

    df_phrased = df.loc[df[col].notnull(),
                        ['tfa_master_uid', 'app_year', col]].copy()

    nlp = spacy.load('en', disable=[])

    # clean text and tokenize entities
    if verbose:
        logger.info('Cleaning docs...')
    df_phrased[col] = list(cleaned_doc_corpus(df_phrased[col].values))
    # remove 'the_' from NER tokens
    df_phrased[col] = df_phrased[col].apply(
        lambda x: ' '.join([re.sub('^the_', 'the ', y) for y in x.split()]))
    if verbose:
        logger.info('\tDone.')

    # create & open a new file in write mode
    if verbose:
        logger.info('Saving documents, one per line...')
    doc_count = 0
    with codecs.open(filepath_dict['doc_txt_filepath'], 'w',
                     encoding='utf_8') as doc_txt_file:
        for doc in df_phrased[[col]].apply(lambda x: ' '.join(x),
                                           axis=1).tolist():
            # write the doc as a line in the new file
            # escape newline characters in the original doc text
            doc_txt_file.write(doc.replace('\n', '\\n') + '\n')
            doc_count += 1
    if verbose:
        logger.info(
            f"Text from {doc_count:,} docs written to: {filepath_dict['doc_txt_filepath']}"
        )

    nlp = spacy.load('en', disable=['ner'])

    # lemmatize and save sentences

    if overwrite_interim:
        if verbose:
            logger.info(
                f"Processing documents into unigram sentences: {filepath_dict['unigram_sentences_filepath']}"
            )
        # with codecs.open(filepath_dict['unigram_sentences_filepath'], 'w', encoding='utf_8') as f:
        with smart_open(filepath_dict['unigram_sentences_filepath'], 'w') as f:
            for sentence in lemmatized_sentence_corpus(
                    filepath_dict['doc_txt_filepath']):
                f.write(sentence + '\n')
            if verbose:
                logger.info('Done.')
        unigram_sentences = LineSentence(
            filepath_dict['unigram_sentences_filepath'])

        if verbose:
            logger.info('Unigram examples:')
            for unigram_sentence in it.islice(unigram_sentences, 10, 20):
                logger.info(u' '.join(unigram_sentence))
                logger.info('=' * 30)

        if verbose:
            logger.info('Finding bigram phrases')
        # create the bigram model
        bigram = Phrases(unigram_sentences,
                         min_count=phrase_min_count,
                         threshold=phrase_threshold,
                         max_vocab_size=phrase_max_vocab_size,
                         progress_per=phrase_progress_per,
                         scoring=phrase_scoring,
                         common_terms=phrase_common_terms)
        bigram_model = Phraser(bigram)
        bigram_model.save(filepath_dict['bigram_model_filepath'])

        if verbose:
            logger.info(
                f"Saving bigram phrased sentences: {filepath_dict['bigram_sentences_filepath']}"
            )
        # save bigram sentences
        with codecs.open(filepath_dict['bigram_sentences_filepath'],
                         'w',
                         encoding='utf_8') as f:
            for unigram_sentence in unigram_sentences:
                bigram_sentence = u' '.join(bigram_model[unigram_sentence])
                f.write(bigram_sentence + '\n')

        bigram_sentences = LineSentence(
            filepath_dict['bigram_sentences_filepath'])
        if verbose:
            logger.info('Bigram examples:')
            for bigram_sentence in it.islice(bigram_sentences, 10, 20):
                logger.info(u' '.join(bigram_sentence))
                logger.info('=' * 30)

        if verbose:
            logger.info('Finding trigram phrases')
        # create the trigram model
        trigram = Phrases(bigram_sentences,
                          min_count=phrase_min_count,
                          threshold=phrase_threshold,
                          max_vocab_size=phrase_max_vocab_size,
                          progress_per=phrase_progress_per,
                          scoring=phrase_scoring,
                          common_terms=phrase_common_terms)
        trigram_model = Phraser(trigram)
        trigram_model.save(filepath_dict['trigram_model_filepath'])

        if verbose:
            logger.info(
                f"Saving trigram phrased sentences: {filepath_dict['trigram_sentences_filepath']}"
            )
        # create trigram sentences
        with codecs.open(filepath_dict['trigram_sentences_filepath'],
                         'w',
                         encoding='utf_8') as f:
            for bigram_sentence in bigram_sentences:
                trigram_sentence = u' '.join(trigram_model[bigram_sentence])
                f.write(trigram_sentence + '\n')

        trigram_sentences = LineSentence(
            filepath_dict['trigram_sentences_filepath'])
        if verbose:
            logger.info('Trigram examples:')
            for trigram_sentence in it.islice(trigram_sentences, 10, 20):
                logger.info(u' '.join(trigram_sentence))
                logger.info('=' * 30)

    if verbose:
        logger.info(
            f"Saving phrased docs using saved models: {filepath_dict['trigram_docs_filepath']}"
        )
    # using saved models, write transformed text out to a new file, one doc per line
    with codecs.open(filepath_dict['trigram_docs_filepath'],
                     'w',
                     encoding='utf_8') as f:
        for parsed_doc in nlp.pipe(line_doc(filepath_dict['doc_txt_filepath']),
                                   batch_size=nlp_batch_size,
                                   n_threads=nlp_n_threads):

            # removing punctuation and whitespace
            unigram_doc = [
                token.lemma_ for token in parsed_doc
                if not punct_space_more(token)
            ]

            # apply the first-order and second-order phrase models
            bigram_doc = bigram_model[unigram_doc]
            trigram_doc = trigram_model[bigram_doc]

            # remove any remaining stopwords
            trigram_doc = [
                term for term in trigram_doc
                if term not in nlp.Defaults.stop_words
            ]

            # extend the stop words
            stop_words_extended = [
                'from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say',
                'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done',
                'try', 'many', 'some', 'nice', 'thank', 'think', 'see',
                'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want',
                'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also',
                'may', 'take', 'come'
            ]
            trigram_doc = [
                term for term in trigram_doc if term not in stop_words_extended
            ]

            # write the transformed doc as a line in the new file
            trigram_doc = ' '.join(trigram_doc)
            f.write(trigram_doc + '\n')
    if verbose:
        logger.info('Done.')

    # put the text back in the dataframe
    trigram_docs = LineSentence(filepath_dict['trigram_docs_filepath'])

    if len([doc for doc in trigram_docs]) == df_phrased.shape[0]:
        for i, doc in enumerate(trigram_docs):
            df_phrased.iloc[i, df_phrased.columns.get_loc(col)] = ' '.join(doc)
    else:
        raise ValueError(
            'Different number of processed and original documents')

    # save dataframe
    if verbose:
        logger.info('Saving NLP processed data: {}'.format(
            filepath_dict['filepath_out']))
    df_phrased.to_csv(filepath_dict['filepath_out'])

    return df_phrased
Example #31
               ['high_fat', 'intake'], ['type_of', 'care'],
               ['population_based', 'SNP'], ['anal_gland', 'neoplasms'],
               ['acute_myelocytic', 'leukemia'],
               ['Samson_Gardner', 'syndrome'],
               ['colon_mucinous', 'adenocarcinoma']]

phrases = Phrases(contents,
                  threshold=0.25,
                  scoring="npmi",
                  custom_bigrams=my_bigrams)
bigram = Phraser(phrases)
tri_phase = Phrases(bigram[contents],
                    custom_bigrams=my_trigrams,
                    threshold=0.25,
                    scoring="npmi")
trigram = Phraser(tri_phase)

sent = [
    u'red', u'shift', u'square', u'pants', u'bit', u'parts', u'transverse',
    u'colon', u'cancer', u'trans', u'atlantic', u'ocean'
]
print(bigram[sent])

print(trigram[bigram[sent]])

# print(item)
bigram.save('./preprocessed_big_phrases')
print("ngrams saved")
trigram.save('./preprocessed_trigram_phrases')
print("ngrams saved")
Example #32
#The output file name
modelFile = "Feb2017FullCorpus300D"

#Original corpus available at: http://files.pushshift.io/reddit/comments/ (RC_2017-02.BZ2)
# Retrieves the corpus file. This corpus was generated using the output from the upgradedCleaner.py file
sentences = MySentences("Data\\Feb2017.txt")

#Creates a Phrases object from the corpus
myPhrases = Phrases(sentences, min_count=20)

#Creates a much smaller Phraser object from the Phrases object
bigram_transformer = Phraser(myPhrases)

#Saves it so you don't have to redo this every time.
bigram_transformer.save("Feb2017BigramTransformer")
bigram_transformer = Phraser.load("Feb2017BigramTransformer")

#Create and save the actual model
model = Word2Vec(PhrasingIterable(bigram_transformer, "Data\\Feb2017.txt"),
                 min_count=15,
                 workers=4,
                 size=300,
                 window=8)
model.save('Models\\' + modelFile)
model = Word2Vec.load('Models\\' + modelFile)

# Accuracy tests
model.accuracy('questions-words.txt')
testingSuite(modelFile)
Example #33
def load_data(data_folder, use_old_models):

    try:
        1 / use_old_models  # if use_old_models=0, then this fails

        clean_data = pd.read_csv(os.path.join(data_folder, "clean_data.csv"),
                                 encoding="ISO-8859-1")

        nlp_dict = corpora.Dictionary.load(
            os.path.join(data_folder, 'nlp_dict.dict'))
        processed_texts = np.load(
            os.path.join(data_folder, "processed_texts.npy")).tolist()

        print("loaded preprocessed df")

        bigram = Phraser.load(os.path.join(data_folder, 'bigram'))

    except:
        print("new preprocessing")

        cols_to_use = [
            'age', 'body_type', 'diet', 'drinks', 'drugs', 'education',
            'essay0', 'essay1', 'essay2', 'essay5', 'essay6', 'essay7',
            'ethnicity', 'income', 'job', 'orientation', 'pets', 'religion',
            'smokes', 'speaks', 'status'
        ]

        data = pd.read_csv(os.path.join(data_folder, "profiles.csv"),
                           usecols=cols_to_use)

        data.columns = [
            'age', 'text_body_type', 'text_diet', 'text_drinks', 'text_drugs',
            'text_education', 'text_self_sum', 'text_life', 'text_goodat',
            'text_6things', 'text_thinking', 'text_friday', 'text_ethnicity',
            'income', 'text_job', 'text_orientation', 'text_pets',
            'text_religion', 'text_smokes', 'text_speaks', 'text_status'
        ]

        # cleaning diet
        data["text_diet"] = data.text_diet.str.replace(
            "strictly ", "").str.replace("mostly ",
                                         "").str.replace("other", "anything")
        data["text_diet"] = data.text_diet.replace(np.nan, "anything")

        # cleaning body type
        data["text_body_type"] = data.text_body_type.replace(np.nan, "average")

        # cleaning drinks
        data["text_drinks"] = data.text_drinks.replace(np.nan, "socially")

        # cleaning drugs
        data["text_drugs"] = data.text_drugs.replace(np.nan, "never")

        # cleaning education

        data["text_education"] = data.text_education.replace(
            np.nan, "high school")

        data.loc[data.text_education.str.contains("space"),
                 "text_education"] = "high school"

        searchfor = ['university', 'college']
        data.loc[data.text_education.str.contains('|'.join(searchfor)),
                 "text_education"] = 'bachelor'

        searchfor = ['masters', 'law', 'med']
        data.loc[data.text_education.str.contains('|'.join(searchfor)),
                 "text_education"] = 'masters'

        data.loc[data.text_education.str.contains('ph.d'),
                 "text_education"] = 'ph.d'
        data.loc[data.text_education.str.contains('high school'),
                 "text_education"] = 'high school'

        clean_data = data

        columns_with_text = [
            each_text_col for each_text_col in clean_data.columns.tolist()
            if "text" in each_text_col
        ]

        for each_text_col in columns_with_text:
            clean_data[each_text_col] = clean_data[each_text_col].replace(
                np.nan, "")
            clean_data[each_text_col].apply(str)

        clean_data['all_texts'] = clean_data[columns_with_text].apply(
            lambda x: ' / '.join(x), axis=1)

        clean_data = clean_data[(clean_data["all_texts"].str.len() < 21000) &
                                (clean_data["all_texts"].str.len() >
                                 860)]  # want text of a minimum size

        clean_data.to_csv(os.path.join(data_folder, "clean_data.csv"))

        # train the bigram
        bigram = Phraser(
            Phrases(text_tokenize_gen(clean_data.all_texts.values.tolist())))

        processed_texts = [[
            text for text in my_lemmatize(texts)
        ] for texts in bigram[text_tokenize_gen(clean_data.all_texts.values)]]

        np.save(os.path.join(data_folder, "processed_texts"), processed_texts)

        bigram.save(os.path.join(data_folder, 'bigram'))
        nlp_dict = corpora.Dictionary(processed_texts)
        # in case you want to filter out some words
        nlp_dict.filter_extremes(no_below=0.1, no_above=0.4)
        # store the dictionary, for future reference
        nlp_dict.save(os.path.join(data_folder, 'nlp_dict.dict'))
        nlp_dict = nlp_dict.load(os.path.join(data_folder, 'nlp_dict.dict'))

        bigram["high school".split()]  # at least I know it works

    return nlp_dict, bigram, clean_data, processed_texts