Example #1
    def get_trigrams(self):
        """
        Builds bigram and trigram phrase models from the unigram sentences.
        Writes the transformed text of each model to a separate file.

        """
        unigram_sentences = LineSentence(self.unigram_sentences_filepath)
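        # Phrases scans the streamed sentences and learns which adjacent tokens co-occur often enough to be merged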
        bigram_model = Phrases(unigram_sentences)
        bigram_model.save(self.bigram_model_filepath)
        bigram_model = Phrases.load(self.bigram_model_filepath)
        with open(self.bigram_sentences_filepath, 'w', encoding="utf-8") as f:
            for unigram_sentence in unigram_sentences:
                bigram_sent = " ".join(bigram_model[unigram_sentence])  # a bit confused by this.
                f.write(bigram_sent)
        bigram_sentences = LineSentence(self.bigram_sentences_filepath)
        trigram_model = Phrases(bigram_sentences)
        trigram_model.save(self.trigram_model_filepath)
        trigram_model = Phrases.load(self.trigram_model_filepath)
        with open(self.trigram_sentences_filepath, 'w', encoding="utf-8") as f:
            for bigram_sentence in bigram_sentences:
                trigram_sentence = " ".join(trigram_model[bigram_sentence])
                f.write(trigram_sentence + '\n')
        trigram_sentences = LineSentence(self.trigram_sentences_filepath)
        with open(self.trigram_articles_filepath, 'w', encoding="utf-8") as f:
            for parsed_article in self.line_article("../data/article_texts"):
                unigram_article = [token.lemma_ for token in self.nlp(parsed_article)
                                   if not self.punct_space(token)]
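                # apply the phrase models in order: bigrams first, then trigrams on top of the bigrammed tokens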
                bigram_article = bigram_model[unigram_article]
                trigram_article = trigram_model[bigram_article]
                trigram_article = [term for term in trigram_article
                                    if term not in STOP_WORDS]
                trigram_article = " ".join(trigram_article)
                f.write(trigram_article + '\n')
    def train_with_trigrams(self):
        trigram_model = Phrases.load(self.trigram_model_filepath)
        bigram_model = Phrases.load(self.bigram_model_filepath)
        for doc, id in self.es_docs():
            unigrams = text_cleaner.clean_tokens(doc)
            bigrams = bigram_model[unigrams]
            trigrams = trigram_model[bigrams]
            trigrams = text_cleaner.filter_terms(trigrams)
            td = TaggedDocument(trigrams, [id])
            self.taggeddoc.append(td)
        print('Data Loading finished')
        print(len(self.taggeddoc), type(self.taggeddoc))
        model = gensim.models.Doc2Vec(self.taggeddoc,
                                      dm=0,
                                      iter=1,
                                      window=15,
                                      seed=1337,
                                      min_count=5,
                                      workers=4,
                                      alpha=0.025,
                                      size=200,
                                      min_alpha=0.025)
        for epoch in range(200):
            if epoch % 20 == 0:
                print('Now training epoch %s' % epoch)
            model.train(self.taggeddoc,
                        total_examples=model.corpus_count,
                        epochs=model.iter)
            model.alpha -= 0.002  # decrease the learning rate
            model.min_alpha = model.alpha  # fix the learning rate, no decay
        model.save(self.model_file)
        model.save_word2vec_format(self.model_file + '.word2vec')
Example #3
def getTopics(jobs_):
  
    bigram_model = Phrases.load('data/bigram_model_all')
    trigram_model = Phrases.load('data/trigram_model_all')
    trigram_dictionary = Dictionary.load('data/trigram_dict_all.dict')
    lda = LdaMulticore.load('data/lda_model_all')

    topic_names = {0:u'Risk Management Bank', 
                   1:u'Big Data Report', 
                   2:u'Automotive SAP', 
                   3:u'Microsoft Java Scrum', 
                   4:u'Medical Consultant', 
                   5:u'Java Engineer', 
                   6:u'Computer Vision Developer', 
                   7:u'Data Analyst', 
                   8:u'BI SAP BW', 
                   9:u'IOT Reporting R', 
                   10:u'Global Project Presentation',
                   11:u'Cloud Engineer IOT', 
                   12:u'Industry 4.0', 
                   13:u'Risk Consulting', 
                   14:u'Machine Learning Data Science'}
    
    topics_ = []
    
    for job_ in jobs_:
      if job_ is not None:
        #print(job_[0])
        topics_.append(lda_description(bigram_model, trigram_model, trigram_dictionary, lda, topic_names, job_[1], job_[0]))
Example #4
def get_test_reviews():
    doc_reviews = {}
    sent_reviews = {}
    num_docs = 0
    num_words = 0
    apk_path = os.path.join("..", "data", "raw")
    apk_lst_path = os.path.join(apk_path, "package_names.txt")
    # load phrases
    bigram = Phrases.load(os.path.join("..", "model", "bigram.model"))
    trigram = Phrases.load(os.path.join("..", "model", "trigram.model"))
    with open(apk_lst_path) as fin:
        apk_lst = [apk_name.strip() for apk_name in fin.readlines()]
    for apk_name in apk_lst:
        file = os.path.join(apk_path, "mongodb", apk_name, "review.txt")
        with open(file) as fin:
            reviews_sent = []
            reviews_doc = []
            for line in fin.readlines():
                words_sents, wc = extractSentenceWords(line)
                reviews_sent.append(words_sents)
                reviews_doc.append(list(itertools.chain.from_iterable(words_sents)))
                num_docs += 1
                num_words += wc
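            # nest the transforms: the bigram pass runs first, then the trigram pass over its output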
            sent_reviews[apk_name] = trigram[bigram[reviews_sent]]
            doc_reviews[apk_name] = trigram[bigram[reviews_doc]]

    logging.info("Read %d docs, %d words!" % (num_docs, num_words))
    return sent_reviews, doc_reviews
Example #5
def load_model():
    bigram = Phrases.load(os.path.join("..", "model", "bigram.model"))
    trigram = Phrases.load(os.path.join("..", "model", "trigram.model"))
    wv_model = Word2Vec.load(
        os.path.join("..", "model", "appreviews_word2vec.model"))
    logging.info("Load word2vec model finished")
    return bigram, trigram, wv_model
    def save_sentences_trigram(self):
        trigram_model = Phrases.load(self.trigram_model_filepath)
        bigram_model = Phrases.load(self.bigram_model_filepath)
        with open(self.trigram_sentences_filepath, 'w') as f:
            for doc, id in self.es_docs():
                unigrams = text_cleaner.clean_tokens(doc)
                bigrams = bigram_model[unigrams]
                trigrams = trigram_model[bigrams]
                trigrams = text_cleaner.filter_terms(trigrams)
                trigrams = u' '.join(trigrams)
                f.write(trigrams + '\n')
Example #7
def trigrams(corpus, output_prefix):
    print("----- Trigrams -----")
    if os.path.exists(output_prefix + "_trigram_phrases"):
        trigram_phrases = Phrases.load(output_prefix + "_trigram_phrases")
        print("Loaded trigram phrases")
    else:
        bigram_phrases = Phrases(corpus, min_count=CONFIG["bigram_phrase_min_count"], threshold=CONFIG["bigram_phrase_threshold"], progress_per=CONFIG["bigram_phrase_progress_per"], delimiter=CONFIG["bigram_phrase_delimiter"])
        trigram_phrases = Phrases(bigram_phrases[corpus], min_count=CONFIG["trigram_phrase_min_count"], threshold=CONFIG["trigram_phrase_threshold"], delimiter=CONFIG["trigram_phrase_delimiter"])
        trigram_phrases.save(output_prefix + "_trigram_phrases")
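    # Phraser is a lightweight, frozen export of Phrases: faster lookups and lower memory, but no further training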
    trigram_transformer = Phraser(trigram_phrases)
    dct = Dictionary(trigram_transformer[corpus])
    dct.save(output_prefix + "_dictionary_trigram")
    print("Training tf-idf from trigrams")
    bow_corpus = [dct.doc2bow(line) for line in trigram_transformer[corpus]]
    tfidf = gensim.models.TfidfModel(bow_corpus, smartirs='ntc')
    tfidf.save(output_prefix + "_tfidf_trigram")
    print("Training word2vec model with trigram")
    start_time = time()
    trigram_model = gensim.models.Word2Vec(trigram_transformer[corpus], size=CONFIG['vector_size'], window=CONFIG['window_size'],
                                   min_count=CONFIG['min_count'], workers=CONFIG['worker_count'], sg=CONFIG['sg'],
                                   negative=CONFIG['negative_size'], alpha=CONFIG['alpha'], min_alpha = CONFIG['min_alpha'],
                                   iter=CONFIG['train_epoch'])
    trigram_model.save(output_prefix + "_trigram")
    print("Time :", format_time(time() - start_time))
    return trigram_model
Example #8
    def __getitem__(self, word):
        global _phrases

        # If a phrases model is already loaded, just use that
        if _phrases is not None:
            self.conn = None

        # Otherwise, try to connect to the separate process.
        # Fall back to loading the phrase model here
        elif not hasattr(self, 'conn'):
            try:
                print('Connecting to phrases process...')
                address = ('localhost', 6001)
                self.conn = Client(address, authkey=b'password')
                print('Done connecting to phrases')

            except ConnectionRefusedError:
                self.conn = None
                print('Could not connect to phrases process,')
                print('Loading phrases model...')
                _phrases = Phrases.load('data/bigram_model.phrases')
                print('Done loading phrases')

        if self.conn is not None:
            self.conn.send(word)
            return self.conn.recv()
        else:
            return _phrases[word]
def main():
    # -------------------------------------------------------------------------------
    # Parameters

    # the script will most likely work if we swap the TEXTS variable
    # with any iterable of text (where one element represents a document,
    # and the whole iterable is the corpus)
    newsgroups_train = fetch_20newsgroups(subset='train')
    TEXTS = newsgroups_train.data

    # spacy's english model for text preprocessing
    NLP = spacy.load('en')

    # a set of stopwords built-in to spacy, we can always
    # expand this set for the problem that we are working on,
    # here we include python built-in string punctuation mark
    STOPWORDS = spacy.en.STOP_WORDS | set(punctuation) | set(
        ENGLISH_STOP_WORDS)

    # create a directory called 'model' to store all outputs in later section
    MODEL_DIR = 'model'
    UNIGRAM_PATH = os.path.join(MODEL_DIR, 'unigram.txt')
    PHRASE_MODEL_CHECKPOINT = os.path.join(MODEL_DIR, 'phrase_model')
    BIGRAM_PATH = os.path.join(MODEL_DIR, 'bigram.txt')
    WORD2VEC_CHECKPOINT = os.path.join(MODEL_DIR, 'word2vec')

    # -------------------------------------------------------------------------------
    logger.info('job started')
    if not os.path.isdir(MODEL_DIR):
        os.mkdir(MODEL_DIR)

    if not os.path.exists(UNIGRAM_PATH):
        logger.info('preprocessing text')
        export_unigrams(UNIGRAM_PATH,
                        texts=TEXTS,
                        parser=NLP,
                        stopwords=STOPWORDS)

    if os.path.exists(PHRASE_MODEL_CHECKPOINT):
        phrase_model = Phrases.load(PHRASE_MODEL_CHECKPOINT)
    else:
        logger.info('training phrase model')
        # use LineSentence to stream text as opposed to loading it all into memory
        unigram_sentences = LineSentence(UNIGRAM_PATH)
        phrase_model = Phrases(unigram_sentences)
        phrase_model.save(PHRASE_MODEL_CHECKPOINT)

    if not os.path.exists(BIGRAM_PATH):
        logger.info('converting words to phrases')
        export_bigrams(UNIGRAM_PATH, BIGRAM_PATH, phrase_model)

    if os.path.exists(WORD2VEC_CHECKPOINT):
        word2vec = Word2Vec.load(WORD2VEC_CHECKPOINT)
    else:
        logger.info('training word2vec')
        sentences = LineSentence(BIGRAM_PATH)
        word2vec = Word2Vec(sentences, workers=cpu_count())
        word2vec.save(WORD2VEC_CHECKPOINT)

    logger.info('job completed')
def extractFeaturesW2V(
        w2vmodel="skip_nostop_multi_300features_10minwords_10context",
        phrasemodel="phrase.model",
        useDev=False):

    if useDev == False:
        tweets_train, targets_train, labels_train = readTweetsOfficial(
            tokenize_tweets.FILETRAIN, 'windows-1252', 2)
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(
            tokenize_tweets.FILEDEV, 'windows-1252', 2)
    else:
        tweets_train, targets_train, labels_train = readTweetsOfficial(
            tokenize_tweets.FILETRAIN, 'windows-1252', 2)
        tweets_origdev, targets_origdev, labels_origdev = readTweetsOfficial(
            tokenize_tweets.FILEDEV, 'windows-1252', 2)
        tweets_train.extend(tweets_origdev)
        targets_train.extend(targets_origdev)
        labels_train.extend(labels_origdev)
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(
            tokenize_tweets.FILETEST, 'windows-1252', 2)

    phmodel = Phrases.load(phrasemodel)
    w2vmodel = word2vec.Word2Vec.load(w2vmodel)
    features_train_w2v = extractW2VAggrFeatures(w2vmodel, phmodel,
                                                tweets_train, targets_train,
                                                labels_train)
    features_dev_w2v = extractW2VAggrFeatures(w2vmodel, phmodel, tweets_dev,
                                              targets_dev, labels_dev)

    return features_train_w2v, labels_train, features_dev_w2v, labels_dev
def generate_bow(corpus_filename, category, use_bigrams, no_above, no_below):
    if not os.path.exists('./data/%s' % category):
        os.makedirs('./data/%s' % category)

    tokens = [
        utils.tokenize(line)
        for line, label in zip(open('./data/%s.csv' % corpus_filename),
                               open('./data/corpus-labels.csv'))
        if category in label
    ]
    print 'First token', tokens[1]

    category_filename = corpus_filename.replace('corpus', 'category')

    #Each category gets its own dictionary and its own corpus, but uses the same bigram model
    #that was computed on all the abstracts
    if use_bigrams:
        if not os.path.exists('./data/%s/bigram.bin' % category):
            bigram = Phrases(
                utils.tokenize(line)
                for line, label in zip(open('./data/%s.csv' % corpus_filename),
                                       open('./data/corpus-labels.csv'))
                if category in label)
            Phrases.save(bigram, './data/%s/bigram.bin' % category)
        else:
            bigram = Phrases.load('./data/%s/bigram.bin' % category)

        tokens = [bigram[token] for token in tokens]
        print 'First bigram token', tokens[1]

    #Make the dictionary, a collection of statistics about all tokens in the corpus
    #This is the mapping from words to their id's. It's the lookup table for features.
    dictionary = corpora.Dictionary(tokens)

    # filter out very rare and very common tokens
    dictionary.filter_extremes(
        no_below=no_below, no_above=no_above)  # no_above=0.05, no_below=10 yielded good results
    # remove gaps in id sequence after words that were removed
    dictionary.compactify()

    # store the dictionary, for future reference
    dictionary.save('./data/%s/%s.dict' % (category, category_filename))

    # memory-friendly bag-of-words class
    class BOW(object):
        def __iter__(self):
            for line, label in zip(open('./data/%s.csv' % corpus_filename),
                                   open('./data/corpus-labels.csv')):
                # assume there's one document per line, tokens separated by whitespace
                if category in label:
                    yield dictionary.doc2bow(utils.tokenize(line))
                else:
                    pass

    # Now we can make a bag of words and do something with it by iterating over it
    arxiv_bow = BOW()
    corpora.MmCorpus.serialize('./data/%s/%s.mm' %
                               (category, category_filename),
                               arxiv_bow)  # store to disk, for later use
Example #12
def create_bigram(unigram_resume):
    bigram_model = Phrases.load(
        os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..', '..', 'www',
                         'Parsing', 'bigram_model')))
    #bigram_model.add_vocab(unigram_resume)
    bigram_resume = [bigram_model[sentence] for sentence in unigram_resume]
    return bigram_resume
Example #13
def create_trigram(bigram_resume):
    trigram_model = Phrases.load(
        os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..', '..', 'www',
                         'Parsing', 'trigram_model')))
    #trigram_model.add_vocab(bigram_resume)
    trigram_resume = [trigram_model[sentence] for sentence in bigram_resume]
    return trigram_resume
Example #14
def get_bigram_model():
    model_exists = os.path.exists(bigram_model_filepath)
    if model_exists:
        bigram_model = Phrases.load(bigram_model_filepath)
    else:
        unigram_sentences = get_unigram_sentences()
        bigram_model = Phrases(unigram_sentences)
        bigram_model.save(bigram_model_filepath)
    return bigram_model
Example #15
def main():
    # -------------------------------------------------------------------------------
    # Parameters

    # the script will most likely work if we swap the TEXTS variable
    # with any iterable of text (where one element represents a document,
    # and the whole iterable is the corpus)
    newsgroups_train = fetch_20newsgroups(subset = 'train')
    TEXTS = newsgroups_train.data

    # spacy's english model for text preprocessing
    NLP = spacy.load('en')

    # a set of stopwords built-in to spacy, we can always
    # expand this set for the problem that we are working on,
    # here we include python built-in string punctuation mark
    STOPWORDS = spacy.en.STOP_WORDS | set(punctuation) | set(ENGLISH_STOP_WORDS)

    # create a directory called 'model' to store all outputs in later section
    MODEL_DIR = 'model'
    UNIGRAM_PATH = os.path.join(MODEL_DIR, 'unigram.txt')
    PHRASE_MODEL_CHECKPOINT = os.path.join(MODEL_DIR, 'phrase_model')
    BIGRAM_PATH = os.path.join(MODEL_DIR, 'bigram.txt')
    WORD2VEC_CHECKPOINT = os.path.join(MODEL_DIR, 'word2vec')

    # -------------------------------------------------------------------------------
    logger.info('job started')
    if not os.path.isdir(MODEL_DIR):
        os.mkdir(MODEL_DIR)

    if not os.path.exists(UNIGRAM_PATH):
        logger.info('preprocessing text')
        export_unigrams(UNIGRAM_PATH, texts = TEXTS, parser = NLP, stopwords = STOPWORDS)

    if os.path.exists(PHRASE_MODEL_CHECKPOINT):
        phrase_model = Phrases.load(PHRASE_MODEL_CHECKPOINT)
    else:
        logger.info('training phrase model')
        # use LineSentence to stream text as opposed to loading it all into memory
        unigram_sentences = LineSentence(UNIGRAM_PATH)
        phrase_model = Phrases(unigram_sentences)
        phrase_model.save(PHRASE_MODEL_CHECKPOINT)

    if not os.path.exists(BIGRAM_PATH):
        logger.info('converting words to phrases')
        export_bigrams(UNIGRAM_PATH, BIGRAM_PATH, phrase_model)

    if os.path.exists(WORD2VEC_CHECKPOINT):
        word2vec = Word2Vec.load(WORD2VEC_CHECKPOINT)
    else:
        logger.info('training word2vec')
        sentences = LineSentence(BIGRAM_PATH)
        word2vec = Word2Vec(sentences, workers = cpu_count())
        word2vec.save(WORD2VEC_CHECKPOINT)

    logger.info('job completed')
Example #16
def train_phrase_model(cleaned_filepath, bigram_model_filepath,
                       run_or_load_flag):

    if run_or_load_flag:
        unigram_sentences = LineSentence(cleaned_filepath)
        bigram_model = Phrases(unigram_sentences)
        bigram_model.save(bigram_model_filepath)
    else:
        bigram_model = Phrases.load(bigram_model_filepath)

    return bigram_model
Example #17
def extractFeaturesW2V(trainingdata,
                       testdata,
                       w2vpath="GoogleNews-vectors-negative300.bin",
                       gnews=True,
                       dim=300,
                       usePhrase=False,
                       phrasemodelpath="phrase_all.model",
                       cross_features="true"):

    stopwords = "most"
    if usePhrase == True:
        phmodel = Phrases.load(phrasemodelpath)

    #tweets, targets, labels, ids, ids_new, id_dict, id_dict_rev = readRTE(trainingdata)
    tweets_train, targets_train, labels_train, ids_train = readTweetsOfficial(
        trainingdata)

    tweet_tokens = tokenise_tweets(tweets_train, stopwords)
    target_tokens = tokenise_tweets(targets_train, stopwords)

    if usePhrase == True:
        tweet_tokens = phmodel[tweet_tokens]
        target_tokens = phmodel[target_tokens]

    #tweets_test, targets_test, labels_test, ids_test, ids_test_new, id_dict_test, id_dict_rev_test = readRTE(testdata)
    tweets_test, targets_test, labels_test, ids_test = readTweetsOfficial(
        testdata)

    tweet_tokens_test = tokenise_tweets(tweets_test, stopwords)
    target_tokens_test = tokenise_tweets(targets_test, stopwords)

    if usePhrase == True:
        tweet_tokens_test = phmodel[tweet_tokens_test]
        target_tokens_test = phmodel[target_tokens_test]

    if gnews == True:
        w2vmodel = word2vec.Word2Vec.load_word2vec_format(w2vpath, binary=True)
    else:
        w2vmodel = word2vec.Word2Vec.load(w2vpath)

    features_train_w2v_tweet = encodeSentW2V(w2vmodel, tweet_tokens, dim)
    features_train_w2v_targ = encodeSentW2V(w2vmodel, target_tokens, dim)

    features_dev_w2v_tweet = encodeSentW2V(w2vmodel, tweet_tokens_test, dim)
    features_dev_w2v_target = encodeSentW2V(w2vmodel, target_tokens_test, dim)

    features_train_w2v = extrFeaturesW2V(features_train_w2v_tweet,
                                         features_train_w2v_targ,
                                         cross_features)
    features_dev_w2v = extrFeaturesW2V(features_dev_w2v_tweet,
                                       features_dev_w2v_target, cross_features)

    return features_train_w2v, labels_train, features_dev_w2v, labels_test
def load_ngrams_models():
    global bigrams_model
    global trigrams_model
    bigrams_model_name = join(dirname(__file__),
                              "../texttoolkit/models/bigrams_phraser.bin")
    if exists(bigrams_model_name):
        if not bigrams_model:
            bigrams_model = Phrases.load(bigrams_model_name)
    else:
        print(
            "oops, couldn't find `models/bigrams_phraser.bin`. Try rerunning `$ bash setup.sh`"
        )
        exit(1)

    trigrams_model_name = join(dirname(__file__),
                               "../texttoolkit/models/trigrams_phraser.bin")
    if exists(trigrams_model_name):
        if not trigrams_model:
            trigrams_model = Phrases.load(trigrams_model_name)
    else:
        print(
            "oops, couldn't find `models/trigrams_phraser.bin`. Try rerunning `$ bash setup.sh`"
        )
        exit(1)
Example #19
def phrases():
    print('Loading phrases model...')
    bigram = Phrases.load('data/nyt/bigram_model.phrases')

    print('Creating listener...')
    address = ('localhost', 6001)
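    # serve the in-memory phrases model over a local socket so other processes don't each have to load it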
    with Listener(address, authkey=b'password') as listener:
        while True:
            with listener.accept() as conn:
                print('connection accepted from {0}'.format(listener.last_accepted))
                while True:
                    try:
                        msg = conn.recv()
                        conn.send(bigram[msg])
                    except (EOFError, ConnectionResetError):
                        break
Example #20
def load_transformer_list():
    output_directory = 'vocabularies'
    output_basename = 'en_embeddings_200M_200d'

    path = os.path.join(output_directory, output_basename)
    config_fname = os.path.join(path, 'config.json')
    with open(config_fname, 'r') as json_data:
        wemb_config = json.load(json_data)
        ngrams = wemb_config['ngrams']

        transformers = []
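        # load one phrase model per n-gram order; applying them in sequence builds progressively longer phrases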
        for i in range(ngrams - 1):
            phrase_model = Phrases.load(os.path.join(path, '{}gram'.format(i)))
            transformers.append(phrase_model)

    return transformers
def extractW2VFeaturesSim(w2vmodelfile, phrasemodel, tweets, targets, labels):
    phmodel = Phrases.load(phrasemodel)
    w2vmodel = word2vec.Word2Vec.load(w2vmodelfile)

    inv_topics = {v: k for k, v in tokenize_tweets.TOPICS_LONG.items()}


    for i, tweet in enumerate(tweets):

        # get the neut/pos/neg hashtags
        neut = KEYWORDS_NEUT[inv_topics[targets[i]]]
        pos = KEYWORDS_POS[inv_topics[targets[i]]]
        neg = KEYWORDS_NEG[inv_topics[targets[i]]]

        tokenised_tweet = tokenize(tweet.lower())
        words = filterStopwords(tokenised_tweet)

        neutcnt, poscnt, negcnt = 0, 0, 0
        neutsc, possc, negsc = 0.0, 0.0, 0.0


        # transform, as earlier, with the phrase model
        for token in phmodel[words]:
            try:
                neutsim = w2vmodel.similarity(neut, token)
                neutcnt += 1
                neutsc += neutsim
            except KeyError:
                neutsim = 0
            try:
                possim = w2vmodel.similarity(pos, token)
                possc += possim
                poscnt += 1
            except KeyError:
                possim = 0
            try:
                negsim = w2vmodel.similarity(neg, token)
                negsc += negsim
                negcnt += 1
            except KeyError:
                negsim = 0
            #print targets[i], "\t", token, "\t", neutsim, "\t", possim, "\t", negsim
        neutsc_tweet = neutsc/neutcnt
        possc_tweet = possc/poscnt
        negsc_tweet = negsc/negcnt
        print(targets[i], "\t", labels[i], "\t", neutsc_tweet, "\t", possc_tweet, "\t", negsc_tweet)
Example #22
    def __init__(self, remote):
        global _phrases
        global _phrases_conn

        self.remote = remote
        if not remote and _phrases is None:
            print('Loading phrases model...')

            # Trained on 100-200k NYT articles
            _phrases = Phrases.load('data/nyt/bigram_model.phrases')
            print('Done loading phrases')
        elif _phrases_conn is None:
            print('Connecting to phrases process...')
            address = ('localhost', 6001)
            _phrases_conn = Client(address, authkey=b'password')
            print('Done connecting to phrases')
        self.conn = _phrases_conn
def load_phrasers(model_dir):
    """

    """
    phraser_files = sorted(glob(f"{model_dir}/phrasers/*.phraser"))
    if len(phraser_files) == 0:
        raise FileNotFoundError(
            "No phrasers found in the given model directory.")
    phrasers = []
    for pf in phraser_files:
        pf_ngram = int(os.path.basename(pf).split(".phraser")[0])
        pf_phraser = Phrases.load(pf)
        phrasers.append((pf_ngram, pf_phraser))
    phrasers = sorted(phrasers, key=lambda x: x[0])
    ngrams = [p[0] for p in phrasers]
    phrasers = [p[1] for p in phrasers]
    return phrasers, ngrams
Example #24
def phrases():
    print('Loading phrases model...')
    bigram = Phrases.load('data/nyt/bigram_model.phrases')

    print('Creating listener...')
    address = ('localhost', 6001)
    with Listener(address, authkey=b'password') as listener:
        while True:
            with listener.accept() as conn:
                print('connection accepted from {0}'.format(
                    listener.last_accepted))
                while True:
                    try:
                        msg = conn.recv()
                        conn.send(bigram[msg])
                    except (EOFError, ConnectionResetError):
                        break
Example #25
    def __init__(self, remote):
        global _phrases
        global _phrases_conn

        self.remote = remote
        if not remote and _phrases is None:
            print('Loading phrases model...')

            # Trained on 100-200k NYT articles
            _phrases = Phrases.load('data/nyt/bigram_model.phrases')
            print('Done loading phrases')
        elif _phrases_conn is None:
            print('Connecting to phrases process...')
            address = ('localhost', 6001)
            _phrases_conn = Client(address, authkey=b'password')
            print('Done connecting to phrases')
        self.conn = _phrases_conn
Example #26
def generate_bow(corpus_filename, use_bigrams, no_above, no_below):

    tokens = [
        utils.tokenize(line)
        for line in open('./data/%s.csv' % corpus_filename)
    ]
    print 'First token', tokens[1]

    if use_bigrams:

        if not os.path.exists('./data/bigram.bin'):
            print "data/bigram.bin doesn't exist. Generating and saving bigram model. This could take a while."
            bigram = Phrases(
                utils.tokenize(line)
                for line in open('./data/%s.csv' % corpus_filename))
            bigram.save('./data/bigram.bin')

        bigram = Phrases.load('./data/bigram.bin')
        tokens = [bigram[token] for token in tokens]
        print 'First bigram token', tokens[1]

    #Make the dictionary, a collection of statistics about all tokens in the corpus
    #This is the mapping from words to their id's. It's the lookup table for features.
    dictionary = corpora.Dictionary(tokens)

    # filter out very rare and very common tokens
    dictionary.filter_extremes(
        no_below=no_below, no_above=no_above)  # no_above=0.05, no_below=10 yielded good results
    # remove gaps in id sequence after words that were removed
    dictionary.compactify()

    # store the dictionary, for future reference
    dictionary.save('./data/%s.dict' % corpus_filename)

    # memory-friendly bag-of-words class
    class BOW(object):
        def __iter__(self):
            for token in tokens:
                # assume there's one document per line, tokens separated by whitespace
                yield dictionary.doc2bow(token)

    # Now we can make a bag of words and do something with it by iterating over it
    arxiv_bow = BOW()
    corpora.MmCorpus.serialize('./data/%s.mm' % corpus_filename,
                               arxiv_bow)  # store to disk, for later use
Example #27
def extractW2VFeaturesSim(w2vmodelfile, phrasemodel, tweets, targets, labels):
    phmodel = Phrases.load(phrasemodel)
    w2vmodel = word2vec.Word2Vec.load(w2vmodelfile)

    inv_topics = {v: k for k, v in tokenize_tweets.TOPICS_LONG.items()}

    for i, tweet in enumerate(tweets):

        # get the neut/pos/neg hashtags
        neut = KEYWORDS_NEUT[inv_topics[targets[i]]]
        pos = KEYWORDS_POS[inv_topics[targets[i]]]
        neg = KEYWORDS_NEG[inv_topics[targets[i]]]

        tokenised_tweet = tokenize(tweet.lower())
        words = filterStopwords(tokenised_tweet)

        neutcnt, poscnt, negcnt = 0, 0, 0
        neutsc, possc, negsc = 0.0, 0.0, 0.0

        # transform, as earlier, with the phrase model
        for token in phmodel[words]:
            try:
                neutsim = w2vmodel.similarity(neut, token)
                neutcnt += 1
                neutsc += neutsim
            except KeyError:
                neutsim = 0
            try:
                possim = w2vmodel.similarity(pos, token)
                possc += possim
                poscnt += 1
            except KeyError:
                possim = 0
            try:
                negsim = w2vmodel.similarity(neg, token)
                negsc += negsim
                negcnt += 1
            except KeyError:
                negsim = 0
            #print targets[i], "\t", token, "\t", neutsim, "\t", possim, "\t", negsim
        neutsc_tweet = neutsc / neutcnt
        possc_tweet = possc / poscnt
        negsc_tweet = negsc / negcnt
        print(targets[i], "\t", labels[i], "\t", neutsc_tweet, "\t",
              possc_tweet, "\t", negsc_tweet)
Example #28
def train():
    sents = LineSentence(args.sents)

    bi_path = os.path.join(args.save_dir, bi_model)
    print 'Bigram: ', bi_path
    if os.path.exists(bi_path):
        bigram = Phrases.load(bi_path)
    else:
        bigram = Phrases(sents,
                         min_count=args.min_count,
                         threshold=args.bi_threshold)
        bigram.save(bi_path)

    tri_path = os.path.join(args.save_dir, bi_model + '_' + tri_model)
    print 'Trigram: ', tri_path
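    # training on bigram[sents] lets already-merged bigrams combine with a third word to form trigrams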
    trigram = Phrases(bigram[sents],
                      min_count=args.min_count,
                      threshold=args.tri_threshold)
    trigram.save(tri_path)
Example #29
def text2words_to_csv(dataDirectory, fname, bigrams=False):
    bigram = False
    if bigrams:
        bigram = Phrases.load("bigrams")
    for filename in os.listdir(dataDirectory):
        if filename.endswith(".xml"):
            fd = open(fname, 'a', encoding='utf8')
            path = os.path.join(dataDirectory, filename)
            eligibility = retrieve_info(path, ['eligibility'])
            if eligibility.__len__() > 0:
                for line in sentencesSplitter(eligibility['eligibility']):
                    line = clean(line,
                                 convertnum2words=True,
                                 removeSingles=False)
                    if bigrams:
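                        # the phrase model expects a token list, so split the cleaned line before transforming it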
                        line = bigram[line.split()]
                    fd.write(" ".join(line) + " ")

            fd.close()
def extractFeaturesW2V(w2vmodel="skip_nostop_multi_300features_10minwords_10context", phrasemodel="phrase.model", useDev = False):

    if useDev == False:
        tweets_train, targets_train, labels_train = readTweetsOfficial(tokenize_tweets.FILETRAIN, 'windows-1252', 2)
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(tokenize_tweets.FILEDEV, 'windows-1252', 2)
    else:
        tweets_train, targets_train, labels_train = readTweetsOfficial(tokenize_tweets.FILETRAIN, 'windows-1252', 2)
        tweets_origdev, targets_origdev, labels_origdev = readTweetsOfficial(tokenize_tweets.FILEDEV, 'windows-1252', 2)
        tweets_train.extend(tweets_origdev)
        targets_train.extend(targets_origdev)
        labels_train.extend(labels_origdev)
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(tokenize_tweets.FILETEST, 'windows-1252', 2)

    phmodel = Phrases.load(phrasemodel)
    w2vmodel = word2vec.Word2Vec.load(w2vmodel)
    features_train_w2v = extractW2VAggrFeatures(w2vmodel, phmodel, tweets_train, targets_train, labels_train)
    features_dev_w2v = extractW2VAggrFeatures(w2vmodel, phmodel, tweets_dev, targets_dev, labels_dev)

    return features_train_w2v, labels_train, features_dev_w2v, labels_dev
Example #31



import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Phrases
from nytnlp.keywords import rake
from textblob import Blobber
from textblob_aptagger import PerceptronTagger
blob = Blobber(pos_tagger=PerceptronTagger())
stops = stopwords.words('english')
lem = WordNetLemmatizer()
dash_map = {ord(p): ' ' for p in '—-'}
punct_map = {ord(p): '' for p in string.punctuation + '“”—’‘'}

# Trained on 100-200k NYT articles
bigram = Phrases.load('data/bigram_model.phrases')

def clean_doc(doc):
    doc = doc.lower()
    doc = doc.replace('\'s ', ' ')
    doc = doc.translate(dash_map)
    doc = doc.translate(punct_map)
    return doc


def keyword_tokenize(doc):
    """
    Tokenizes a document so that only keywords and phrases
    are returned. Keywords are returned as lemmas.
    """
    doc = clean_doc(doc)
def extractFeaturesMulti(features=["auto_false", "bow", "targetInTweet", "emoticons", "affect", "w2v", "bow_phrase"]
        , automodel="model.ckpt", w2vmodel="skip_nostop_multi_300features_10minwords_10context", phrasemodel="phrase.model",
        useDev=True):
    if useDev==False:
        tweets_train, targets_train, labels_train = readTweetsOfficial(tokenize_tweets.FILETRAIN, 'windows-1252', 2)
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(tokenize_tweets.FILEDEV, 'windows-1252', 2)
    else:
        tweets_train, targets_train, labels_train = readTweetsOfficial(tokenize_tweets.FILETRAIN, 'windows-1252', 2)
        tweets_origdev, targets_origdev, labels_origdev = readTweetsOfficial(tokenize_tweets.FILEDEV, 'windows-1252', 2)
        tweets_train.extend(tweets_origdev)
        targets_train.extend(targets_origdev)
        labels_train.extend(labels_origdev)
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(tokenize_tweets.FILETEST, 'windows-1252', 2)

    features_final = []

    if features.__contains__("bow"):
        features_final = extractFeatureVocab(tweets_train)
        features_train = extractFeaturesBOW(tweets_train, targets_train, features_final)
        features_dev = extractFeaturesBOW(tweets_dev, targets_dev, features_final)
    elif features.__contains__("targetInTweet"):
        features_train = extractFeaturesCrossTweetTarget(tweets_train, targets_train)
        features_dev = extractFeaturesCrossTweetTarget(tweets_dev, targets_dev)
        features_final.append("targetInTweet")

    if features.__contains__("bow_phrase") or features.__contains__("bow_phrase_anon"):
        if features.__contains__("bow_phrase"):
            features_vocab = extractFeatureVocab(tweets_train, usephrasemodel=True)
            features_train_phrbow = extractFeaturesBOW(tweets_train, targets_train, features_vocab, usephrasemodel=True)
            features_dev_phrbow = extractFeaturesBOW(tweets_dev, targets_dev, features_vocab, usephrasemodel=True)
        elif features.__contains__("bow_phrase_anon"):
            features_vocab = extractFeatureVocab(tweets_train, usephrasemodel=True, anon_targets=True)
            features_train_phrbow = extractFeaturesBOW(tweets_train, targets_train, features_vocab, usephrasemodel=True, anon_targets=True)
            features_dev_phrbow = extractFeaturesBOW(tweets_dev, targets_dev, features_vocab, usephrasemodel=True, anon_targets=True)
        features_final.extend(features_vocab)

    if features.__contains__("auto_added"):
        useph=False
        if "phrase" in automodel:
            useph=True
        features_train_auto, labels_train, features_dev_auto, labels_dev = extractFeaturesAutoencoder(automodel, tweets_train, targets_train, labels_train, tweets_dev, targets_dev, labels_dev, "added", usephrasemodel=useph)
    elif features.__contains__("auto_true"):
        useph=False
        if "phrase" in automodel:
            useph=True
        features_train_auto, labels_train, features_dev_auto, labels_dev = extractFeaturesAutoencoder(automodel, tweets_train, targets_train, labels_train, tweets_dev, targets_dev, labels_dev, "true", usephrasemodel=useph)
    elif features.__contains__("auto_false"):
        useph=False
        if "phrase" in automodel:
            useph=True
        features_train_auto, labels_train, features_dev_auto, labels_dev = extractFeaturesAutoencoder(automodel, tweets_train, targets_train, labels_train, tweets_dev, targets_dev, labels_dev, "false", usephrasemodel=useph)

    targetInTweetTrain = []
    targetInTweetDev = []
    if features.__contains__("targetInTweet") and features.__contains__("bow"):
        targetInTweetTrain = extractFeaturesCrossTweetTarget(tweets_train, targets_train)
        targetInTweetDev = extractFeaturesCrossTweetTarget(tweets_dev, targets_dev)
        features_final.append("targetInTweet")
    if features.__contains__("emoticons"):
        emoticons_train, emoticons_vocab = extractEmoticons(tweets_train)
        emoticons_dev, emoticons_vocab = extractEmoticons(tweets_dev)
        for emo in emoticons_vocab:
            features_final.append("Emoticon_" + emo)
    if features.__contains__("affect"):
        affect_train, affect_vocab = getAffect(tweets_train)
        affect_dev, affect_vocab = getAffect(tweets_dev)
        for aff in affect_vocab:
            features_final.append("WNaffect_" + aff)

    if features.__contains__("hash"):
        phmodel = Phrases.load(phrasemodel)
        w2vmodel = word2vec.Word2Vec.load(w2vmodel)
        features_train_w2v, features_w2v_vocab = extractW2VHashFeatures(w2vmodel, phmodel, "hash", tweets_train, targets_train, labels_train)
        features_dev_w2v, features_w2v_vocab = extractW2VHashFeatures(w2vmodel, phmodel, "hash", tweets_dev, targets_dev, labels_dev)
    elif features.__contains__("w2v_hash"): # this contains hash
        phmodel = Phrases.load(phrasemodel)
        w2vmodel = word2vec.Word2Vec.load(w2vmodel)
        features_train_w2v, features_w2v_vocab = extractW2VHashFeatures(w2vmodel, phmodel, "w2v_hash", tweets_train, targets_train, labels_train)
        features_dev_w2v, features_w2v_vocab = extractW2VHashFeatures(w2vmodel, phmodel, "w2v_hash", tweets_dev, targets_dev, labels_dev)

    # combine features
    for i, featvec in enumerate(features_train):#features_train_auto)
        if features.__contains__("auto_added") or features.__contains__("auto_true") or features.__contains__("auto_false"):
            features_train[i] = np.append(features_train[i], features_train_auto[i])  # numpy append works as extend works for python lists
        if features.__contains__("targetInTweet") and features.__contains__("bow"):
            features_train[i] = np.append(features_train[i], targetInTweetTrain[i])
        if features.__contains__("bow_phrase") or features.__contains__("bow_phrase_anon"):
            features_train[i] = np.append(features_train[i], features_train_phrbow[i])
        if features.__contains__("emoticons"):
            features_train[i] = np.append(features_train[i], emoticons_train[i])
        if features.__contains__("affect"):
            features_train[i] = np.append(features_train[i], affect_train[i])
        if features.__contains__("w2v_hash") or features.__contains__("hash"):
            features_train[i] = np.append(features_train[i], features_train_w2v[i])
    for i, featvec in enumerate(features_dev):#features_dev_auto):
        if features.__contains__("auto_added") or features.__contains__("auto_true") or features.__contains__("auto_false"):
            features_dev[i] = np.append(features_dev[i], features_dev_auto[i])
        if features.__contains__("targetInTweet") and features.__contains__("bow"):
            features_dev[i] = np.append(features_dev[i], targetInTweetDev[i])
        if features.__contains__("bow_phrase") or features.__contains__("bow_phrase_anon"):
            features_dev[i] = np.append(features_dev[i], features_dev_phrbow[i])
        if features.__contains__("emoticons"):
            features_dev[i] = np.append(features_dev[i], emoticons_dev[i])
        if features.__contains__("affect"):
            features_dev[i] = np.append(features_dev[i], affect_dev[i])
        if features.__contains__("w2v_hash") or features.__contains__("hash"):
            features_dev[i] = np.append(features_dev[i], features_dev_w2v[i])


    return features_train, labels_train, features_dev, labels_dev, features_final
  w2v_model = Word2Vec.load(model_filepath)  # C binary format
except IndexError:
  print("using default model")
  current_dir = os.path.dirname(__file__)
  model_filepath = os.path.join(current_dir, 'model_sentences_raw_words_trigrams_min_count_50_size_200_downsampling_0.001.bin')
  w2v_model = Word2Vec.load(model_filepath)  # C binary format
print("using model from " + model_filepath)

bigrams_model_name = 'bigrams_model_nyt_sentences_5.5M_5.bin'
trigrams_model_name = "trigrams_model_nyt_sentences_5.5M_5.bin"
ngrams_models = {
  "bigrams": bigrams_model_name,
  "trigrams": trigrams_model_name
}
which_ngrams_model = "trigrams"
ngrams_model = Phrases.load(ngrams_models[which_ngrams_model])


print("finish loading w2v" +  str(datetime.now()))
print("loading w2v took  " + str((datetime.now() - start).seconds) + " seconds")

@w2v_api.route("/")
def hello():
    return json.dumps({"loaded": True})

@w2v_api.route("/similarize/<word>")
def similarize(word):
  try:
    try: 
      similar_words = cached_synonyms[word]
    except KeyError:
Example #34
def create_trigram (bigram_resume):
    trigram_model = Phrases.load(trigram_model_path)
    trigram_resume = [trigram_model[sentence] for sentence in bigram_resume]
    return trigram_resume
Example #35
def create_bigram (unigram_resume):
    bigram_model = Phrases.load(bigram_model_path)
    bigram_resume = [bigram_model[sentence] for sentence in unigram_resume]
    return bigram_resume
Example #36
        s = s.replace(code[1], code[0])
    return s


import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Phrases
from nytnlp.keywords import rake
from textblob import Blobber
from textblob_aptagger import PerceptronTagger
blob = Blobber(pos_tagger=PerceptronTagger())
stops = stopwords.words('english')
lem = WordNetLemmatizer()
dash_map = {ord(p): ' ' for p in '—-'}
punct_map = {ord(p): '' for p in string.punctuation + '“”—’‘'}

# Trained on 100-200k NYT articles
bigram = Phrases.load('data/bigram_model.phrases')


def clean_doc(doc):
    doc = doc.lower()
    doc = doc.replace('\'s ', ' ')
    doc = doc.translate(dash_map)
    doc = doc.translate(punct_map)
    return doc


def keyword_tokenize(doc):
    """
    Tokenizes a document so that only keywords and phrases
    are returned. Keywords are returned as lemmas.
    """
Example #37
    # LOGGING/SET-UP
    # ------------------------------------------------------------------

    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("Starting %s" % ' '.join(sys.argv))

    # Inputs
    wikiTextFile = sys.argv[1]
    bigramFile = sys.argv[2]

    # Load Bigram File
    logger.info('Loading the bigram model...')
    bigram = Phrases.load(bigramFile)

    # ------------------------------------------------------------------
    # FIND SENTENCES, LOOK FOR BIGRAMS AND TRAIN TEXT
    # ------------------------------------------------------------------

    # Find the sentences
    logger.info('Accessing sentences with gensim.LineSentence...')
    sentences = LineSentence(wikiTextFile)

    # Analyse text for bigrams
    logger.info('Looking in text for bigrams...')
    newSentences = bigram[sentences]

    # Training text
    model = Word2Vec(newSentences,
Example #38
import string 
import re
import numpy as np
from numpy import prod, dot
from gensim.models import Doc2Vec, Phrases
root = settings.root_path
big_file_dir = os.path.expanduser('~')+'/model/corpra/'
if sys.platform=='darwin':
    root = root.replace(os.path.expanduser('~'),
                        os.path.expanduser('~')+'/Dropbox')

########################################################################
# Find nearest neighbors in product space
#######################################################################
model = Doc2Vec.load(root+"model/movie_space/idf_reddit")
bigram = Phrases.load(big_file_dir+'movies_bigram_large.p')
book_data = pickle.load( open(root+"model/movie_space/book_meta_data.p", "rb" ) )
title2asin = pickle.load( open(root+"model/movie_space/title2asin.p", "rb" ) )

def get_similar(query_book, pos_words, neg_words, topn=100):
    try:
        pos_vecs = []
        all_query_words = []
        for book in query_book:
            if book in title2asin:
                print "\tFound book: ", title2asin[book]            
                all_query_words.append(title2asin[book])
                pos_vecs.append(model.docvecs[title2asin[book]])

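        # run the query words through the bigram model so multi-word phrases match tokens in the model vocabulary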
        for word in bigram[pos_words.replace(',', ' ').lower().split()]:
            if word in model:
Example #39
    if not 0<=k<=len(seq):
        for e in seq:
            yield e
    else:
        numbersPicked = 0
        for i,number in enumerate(seq):
            prob = (k-numbersPicked)/(len(seq)-i)
            if random.random() < prob:
                yield number
                numbersPicked += 1

f = open("tokenizer.pk", "rb")
tokenizer = pickle.load(f)
f.close()

bigram = Phrases.load('bigrams.pk')
trigram = Phrases.load('trigrams.pk')
ngram = Phrases.load('ngrams.pk')

print 'SemEval data'
for semeval_file in semeval_files:
    print 'File', semeval_file
    with open(semeval_file, 'r') as f:
        st = []
        for line in f:
            st += [line.strip()]
        text = read_visit_sem(st)
        text = [nltk.word_tokenize(s.lower()) for s in tokenizer.tokenize(text)]
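        # stack the phrase models: bigrams, then trigrams, then higher-order n-grams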
        text = ngram[trigram[bigram[text]]]
        for sent in text:
            print '->', ' '.join(sent)
Example #40
if len(user_input_text) < 50:
    parser.error('Input text must be more than 50 words.')

######################################################################
############################ Set up model ############################
######################################################################

# load the finished dictionary from disk

model_dir = 'models/' + args.model_version

trigram_dictionary = Dictionary.load(
    os.path.join(model_dir, 'trigram_dictionary.dict'))

bigram_model = Phrases.load(os.path.join(model_dir, 'bigram_model_pos'))
trigram_model = Phrases.load(os.path.join(model_dir, 'trigram_model_pos'))

# load the finished LDA model from disk
lda = LdaModel.load(os.path.join(model_dir, 'lda_alpha_eta_auto_27'))

topic_names = {
    1: u'Consulting and Contracting',
    2: u'DevOps',
    3: u'* Meta Job Description Topic: Students and Education',
    4: u'Finance and Risk',
    5: u'* Meta Job Description Topic: Benefits',
    6: u'* Meta Job Description Topic: Facebook Advertising',
    7: u'Aerospace and Flight Technology',
    8: u'* Meta Job Description Topic: Soft Skills',
    9: u'Product Management',
Example #41
    def __init__(self):
        super().__init__(multithreaded=False)

        print('Loading phrases model...')
        self.bigram = Phrases.load('data/bigram_model.phrases')