def get_trigrams(self):
    """
    Builds unigram, bigram, and trigram models respectively.
    Writes the text of each model to a separate file.
    """
    unigram_sentences = LineSentence(self.unigram_sentences_filepath)
    bigram_model = Phrases(unigram_sentences)
    bigram_model.save(self.bigram_model_filepath)
    bigram_model = Phrases.load(self.bigram_model_filepath)
    with open(self.bigram_sentences_filepath, 'w', encoding="utf-8") as f:
        for unigram_sentence in unigram_sentences:
            # applying the phrase model merges detected bigrams into single tokens
            bigram_sent = " ".join(bigram_model[unigram_sentence])
            f.write(bigram_sent + '\n')  # one sentence per line, as LineSentence expects
    bigram_sentences = LineSentence(self.bigram_sentences_filepath)
    trigram_model = Phrases(bigram_sentences)
    trigram_model.save(self.trigram_model_filepath)
    trigram_model = Phrases.load(self.trigram_model_filepath)
    with open(self.trigram_sentences_filepath, 'w', encoding="utf-8") as f:
        for bigram_sentence in bigram_sentences:
            trigram_sentence = " ".join(trigram_model[bigram_sentence])
            f.write(trigram_sentence + '\n')
    trigram_sentences = LineSentence(self.trigram_sentences_filepath)
    with open(self.trigram_articles_filepath, 'w', encoding="utf-8") as f:
        for parsed_article in self.line_article("../data/article_texts"):
            unigram_article = [token.lemma_ for token in self.nlp(parsed_article)
                               if not self.punct_space(token)]
            bigram_article = bigram_model[unigram_article]
            trigram_article = trigram_model[bigram_article]
            trigram_article = [term for term in trigram_article
                               if term not in STOP_WORDS]
            trigram_article = " ".join(trigram_article)
            f.write(trigram_article + '\n')
def train_with_trigrams(self):
    trigram_model = Phrases.load(self.trigram_model_filepath)
    bigram_model = Phrases.load(self.bigram_model_filepath)
    for doc, id in self.es_docs():
        unigrams = text_cleaner.clean_tokens(doc)
        bigrams = bigram_model[unigrams]
        trigrams = trigram_model[bigrams]
        trigrams = text_cleaner.filter_terms(trigrams)
        td = TaggedDocument(trigrams, [id])
        self.taggeddoc.append(td)
    print('Data Loading finished')
    print(len(self.taggeddoc), type(self.taggeddoc))
    model = gensim.models.Doc2Vec(self.taggeddoc, dm=0, iter=1, window=15,
                                  seed=1337, min_count=5, workers=4,
                                  alpha=0.025, size=200, min_alpha=0.025)
    for epoch in range(200):
        if epoch % 20 == 0:
            print('Now training epoch %s' % epoch)
        model.train(self.taggeddoc, total_examples=model.corpus_count,
                    epochs=model.iter)
        model.alpha -= 0.002  # decrease the learning rate
        model.min_alpha = model.alpha  # fix the learning rate, no decay
    model.save(self.model_file)
    model.save_word2vec_format(self.model_file + '.word2vec')
def getTopics(jobs_):
    bigram_model = Phrases.load('data/bigram_model_all')
    trigram_model = Phrases.load('data/trigram_model_all')
    trigram_dictionary = Dictionary.load('data/trigram_dict_all.dict')
    lda = LdaMulticore.load('data/lda_model_all')
    topic_names = {0: u'Risk Management Bank',
                   1: u'Big Data Report',
                   2: u'Automotive SAP',
                   3: u'Microsoft Java Scrum',
                   4: u'Medical Consultant',
                   5: u'Java Engineer',
                   6: u'Computer Vision Developer',
                   7: u'Data Analyst',
                   8: u'BI SAP BW',
                   9: u'IOT Reporting R',
                   10: u'Global Project Presentation',
                   11: u'Cloud Engineer IOT',
                   12: u'Industry 4.0',
                   13: u'Risk Consulting',
                   14: u'Machine Learning Data Science'}
    topics_ = []
    for job_ in jobs_:
        if job_ is not None:
            # print(job_[0])
            topics_.append(lda_description(bigram_model, trigram_model,
                                           trigram_dictionary, lda, topic_names,
                                           job_[1], job_[0]))
    return topics_  # the collected topics were built but never returned
def get_test_reviews():
    doc_reviews = {}
    sent_reviews = {}
    num_docs = 0
    num_words = 0
    apk_path = os.path.join("..", "data", "raw")
    apk_lst_path = os.path.join(apk_path, "package_names.txt")
    # load phrases
    bigram = Phrases.load(os.path.join("..", "model", "bigram.model"))
    trigram = Phrases.load(os.path.join("..", "model", "trigram.model"))
    with open(apk_lst_path) as fin:
        apk_lst = [apk_name.strip() for apk_name in fin.readlines()]
    for apk_name in apk_lst:
        file = os.path.join(apk_path, "mongodb", apk_name, "review.txt")
        with open(file) as fin:
            reviews_sent = []
            reviews_doc = []
            for line in fin.readlines():
                words_sents, wc = extractSentenceWords(line)
                reviews_sent.append(words_sents)
                reviews_doc.append(list(itertools.chain.from_iterable(words_sents)))
                num_docs += 1
                num_words += wc
            sent_reviews[apk_name] = trigram[bigram[reviews_sent]]
            doc_reviews[apk_name] = trigram[bigram[reviews_doc]]
    logging.info("Read %d docs, %d words!" % (num_docs, num_words))
    return sent_reviews, doc_reviews
def load_model():
    bigram = Phrases.load(os.path.join("..", "model", "bigram.model"))
    trigram = Phrases.load(os.path.join("..", "model", "trigram.model"))
    wv_model = Word2Vec.load(
        os.path.join("..", "model", "appreviews_word2vec.model"))
    logging.info("Load word2vec model finished")
    return bigram, trigram, wv_model
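# Usage sketch (not from the original source): one way the three models returned
# by load_model() above might be chained. The helper name and the token lists it
# expects are illustrative assumptions only.
def phrase_and_embed(bigram, trigram, wv_model, sentences):
    # Apply the bigram pass first, then the trigram pass, sentence by sentence.
    phrased = [trigram[bigram[sentence]] for sentence in sentences]
    # Look up vectors only for tokens the Word2Vec vocabulary actually contains.
    return [[wv_model.wv[token] for token in sentence if token in wv_model.wv]
            for sentence in phrased]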
def save_sentences_trigram(self):
    trigram_model = Phrases.load(self.trigram_model_filepath)
    bigram_model = Phrases.load(self.bigram_model_filepath)
    # use a context manager so the sentences file is closed once writing is done
    with open(self.trigram_sentences_filepath, 'w') as f:
        for doc, id in self.es_docs():
            unigrams = text_cleaner.clean_tokens(doc)
            bigrams = bigram_model[unigrams]
            trigrams = trigram_model[bigrams]
            trigrams = text_cleaner.filter_terms(trigrams)
            trigrams = u' '.join(trigrams)
            f.write(trigrams + '\n')
def trigrams(corpus, output_prefix):
    print("----- Trigrams -----")
    if os.path.exists(output_prefix + "_trigram_phrases"):
        trigram_phrases = Phrases.load(output_prefix + "_trigram_phrases")
        print("Loaded trigram phrases")
    else:
        bigram_phrases = Phrases(corpus,
                                 min_count=CONFIG["bigram_phrase_min_count"],
                                 threshold=CONFIG["bigram_phrase_threshold"],
                                 progress_per=CONFIG["bigram_phrase_progress_per"],
                                 delimiter=CONFIG["bigram_phrase_delimiter"])
        trigram_phrases = Phrases(bigram_phrases[corpus],
                                  min_count=CONFIG["trigram_phrase_min_count"],
                                  threshold=CONFIG["trigram_phrase_threshold"],
                                  delimiter=CONFIG["trigram_phrase_delimiter"])
        trigram_phrases.save(output_prefix + "_trigram_phrases")
    trigram_transformer = Phraser(trigram_phrases)
    dct = Dictionary(trigram_transformer[corpus])
    dct.save(output_prefix + "_dictionary_trigram")
    print("Training tf-idf from trigrams")
    bow_corpus = [dct.doc2bow(line) for line in trigram_transformer[corpus]]
    tfidf = gensim.models.TfidfModel(bow_corpus, smartirs='ntc')
    tfidf.save(output_prefix + "_tfidf_trigram")
    print("Training word2vec model with trigram")
    start_time = time()
    trigram_model = gensim.models.Word2Vec(trigram_transformer[corpus],
                                           size=CONFIG['vector_size'],
                                           window=CONFIG['window_size'],
                                           min_count=CONFIG['min_count'],
                                           workers=CONFIG['worker_count'],
                                           sg=CONFIG['sg'],
                                           negative=CONFIG['negative_size'],
                                           alpha=CONFIG['alpha'],
                                           min_alpha=CONFIG['min_alpha'],
                                           iter=CONFIG['train_epoch'])
    trigram_model.save(output_prefix + "_trigram")
    print("Time :", format_time(time() - start_time))
    return trigram_model
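# Minimal self-contained sketch (assumption, not from the original project) of the
# two-pass Phrases -> Phraser pattern that trigrams() above relies on, using a toy
# corpus and made-up thresholds in place of the CONFIG values.
from gensim.models.phrases import Phrases, Phraser

def demo_two_pass_phrases():
    toy_corpus = [["new", "york", "stock", "exchange"],
                  ["the", "new", "york", "stock", "exchange", "opens"],
                  ["new", "york", "city"]] * 10
    # First pass learns bigrams; second pass learns trigrams on the bigrammed text.
    bigram_phrases = Phrases(toy_corpus, min_count=1, threshold=0.1)
    trigram_phrases = Phrases(bigram_phrases[toy_corpus], min_count=1, threshold=0.1)
    bigram_transformer = Phraser(bigram_phrases)
    trigram_transformer = Phraser(trigram_phrases)
    # Apply both passes in order when transforming a new sentence.
    return trigram_transformer[bigram_transformer[["new", "york", "stock", "exchange"]]]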
def __getitem__(self, word):
    global _phrases
    # If a phrases model is already loaded, just use that
    if _phrases is not None:
        self.conn = None
    # Otherwise, try to connect to the separate process.
    # Fall back to loading the phrase model here
    elif not hasattr(self, 'conn'):
        try:
            print('Connecting to phrases process...')
            address = ('localhost', 6001)
            self.conn = Client(address, authkey=b'password')
            print('Done connecting to phrases')
        except ConnectionRefusedError:
            self.conn = None
            print('Could not connect to phrases process,')
            print('Loading phrases model...')
            _phrases = Phrases.load('data/bigram_model.phrases')
            print('Done loading phrases')
    if self.conn is not None:
        self.conn.send(word)
        return self.conn.recv()
    else:
        return _phrases[word]
def main():
    # -------------------------------------------------------------------------------
    # Parameters

    # the script will most likely work if we swap the TEXTS variable
    # with any iterable of text (where one element represents a document,
    # and the whole iterable is the corpus)
    newsgroups_train = fetch_20newsgroups(subset='train')
    TEXTS = newsgroups_train.data

    # spacy's english model for text preprocessing
    NLP = spacy.load('en')

    # a set of stopwords built-in to spacy, we can always
    # expand this set for the problem that we are working on,
    # here we include python's built-in string punctuation marks
    STOPWORDS = spacy.en.STOP_WORDS | set(punctuation) | set(ENGLISH_STOP_WORDS)

    # create a directory called 'model' to store all outputs in later sections
    MODEL_DIR = 'model'
    UNIGRAM_PATH = os.path.join(MODEL_DIR, 'unigram.txt')
    PHRASE_MODEL_CHECKPOINT = os.path.join(MODEL_DIR, 'phrase_model')
    BIGRAM_PATH = os.path.join(MODEL_DIR, 'bigram.txt')
    WORD2VEC_CHECKPOINT = os.path.join(MODEL_DIR, 'word2vec')
    # -------------------------------------------------------------------------------

    logger.info('job started')
    if not os.path.isdir(MODEL_DIR):
        os.mkdir(MODEL_DIR)

    if not os.path.exists(UNIGRAM_PATH):
        logger.info('preprocessing text')
        export_unigrams(UNIGRAM_PATH, texts=TEXTS, parser=NLP, stopwords=STOPWORDS)

    if os.path.exists(PHRASE_MODEL_CHECKPOINT):
        phrase_model = Phrases.load(PHRASE_MODEL_CHECKPOINT)
    else:
        logger.info('training phrase model')
        # use LineSentence to stream text as opposed to loading it all into memory
        unigram_sentences = LineSentence(UNIGRAM_PATH)
        phrase_model = Phrases(unigram_sentences)
        phrase_model.save(PHRASE_MODEL_CHECKPOINT)

    if not os.path.exists(BIGRAM_PATH):
        logger.info('converting words to phrases')
        export_bigrams(UNIGRAM_PATH, BIGRAM_PATH, phrase_model)

    if os.path.exists(WORD2VEC_CHECKPOINT):
        word2vec = Word2Vec.load(WORD2VEC_CHECKPOINT)
    else:
        logger.info('training word2vec')
        sentences = LineSentence(BIGRAM_PATH)
        word2vec = Word2Vec(sentences, workers=cpu_count())
        word2vec.save(WORD2VEC_CHECKPOINT)

    logger.info('job completed')
def extractFeaturesW2V(w2vmodel="skip_nostop_multi_300features_10minwords_10context",
                       phrasemodel="phrase.model", useDev=False):
    if useDev == False:
        tweets_train, targets_train, labels_train = readTweetsOfficial(
            tokenize_tweets.FILETRAIN, 'windows-1252', 2)
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(
            tokenize_tweets.FILEDEV, 'windows-1252', 2)
    else:
        tweets_train, targets_train, labels_train = readTweetsOfficial(
            tokenize_tweets.FILETRAIN, 'windows-1252', 2)
        tweets_origdev, targets_origdev, labels_origdev = readTweetsOfficial(
            tokenize_tweets.FILEDEV, 'windows-1252', 2)
        tweets_train.extend(tweets_origdev)
        targets_train.extend(targets_origdev)
        labels_train.extend(labels_origdev)
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(
            tokenize_tweets.FILETEST, 'windows-1252', 2)

    phmodel = Phrases.load(phrasemodel)
    w2vmodel = word2vec.Word2Vec.load(w2vmodel)
    features_train_w2v = extractW2VAggrFeatures(w2vmodel, phmodel, tweets_train,
                                                targets_train, labels_train)
    features_dev_w2v = extractW2VAggrFeatures(w2vmodel, phmodel, tweets_dev,
                                              targets_dev, labels_dev)
    return features_train_w2v, labels_train, features_dev_w2v, labels_dev
def generate_bow(corpus_filename, category, use_bigrams, no_above, no_below):
    if not os.path.exists('./data/%s' % category):
        os.makedirs('./data/%s' % category)

    tokens = [utils.tokenize(line)
              for line, label in zip(open('./data/%s.csv' % corpus_filename),
                                     open('./data/corpus-labels.csv'))
              if category in label]
    print 'First token', tokens[1]

    category_filename = corpus_filename.replace('corpus', 'category')

    # Each category gets its own dictionary and its own corpus, but uses the same
    # bigram model that was computed on all the abstracts
    if use_bigrams:
        if not os.path.exists('./data/%s/bigram.bin' % category):
            bigram = Phrases(utils.tokenize(line)
                             for line, label in zip(open('./data/%s.csv' % corpus_filename),
                                                    open('./data/corpus-labels.csv'))
                             if category in label)
            bigram.save('./data/%s/bigram.bin' % category)
        else:
            bigram = Phrases.load('./data/%s/bigram.bin' % category)

        tokens = [bigram[token] for token in tokens]
        print 'First bigram token', tokens[1]

    # Make the dictionary, a collection of statistics about all tokens in the corpus.
    # This is the mapping from words to their id's. It's the lookup table for features.
    dictionary = corpora.Dictionary(tokens)

    # filter out words that are too rare or too frequent
    # no_above=0.05, no_below=10 yielded good results
    dictionary.filter_extremes(no_above=no_above, no_below=no_below)

    # remove gaps in id sequence after words that were removed
    dictionary.compactify()

    # store the dictionary, for future reference
    dictionary.save('./data/%s/%s.dict' % (category, category_filename))

    # memory-friendly bag-of-words class
    class BOW(object):
        def __iter__(self):
            for line, label in zip(open('./data/%s.csv' % corpus_filename),
                                   open('./data/corpus-labels.csv')):
                # assume there's one document per line, tokens separated by whitespace
                if category in label:
                    yield dictionary.doc2bow(utils.tokenize(line))
                else:
                    pass

    # Now we can make a bag of words and do something with it by iterating over it
    arxiv_bow = BOW()
    corpora.MmCorpus.serialize('./data/%s/%s.mm' % (category, category_filename),
                               arxiv_bow)  # store to disk, for later use
def create_bigram(unigram_resume):
    bigram_model = Phrases.load(
        os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..', '..', 'www',
                         'Parsing', 'bigram_model')))
    # bigram_model.add_vocab(unigram_resume)
    bigram_resume = [bigram_model[sentence] for sentence in unigram_resume]
    return bigram_resume
def create_trigram(bigram_resume):
    trigram_model = Phrases.load(
        os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..', '..', 'www',
                         'Parsing', 'trigram_model')))
    # trigram_model.add_vocab(bigram_resume)
    trigram_resume = [trigram_model[sentence] for sentence in bigram_resume]
    return trigram_resume
def get_bigram_model():
    model_exists = os.path.exists(bigram_model_filepath)
    if model_exists:
        bigram_model = Phrases.load(bigram_model_filepath)
    else:
        unigram_sentences = get_unigram_sentences()
        bigram_model = Phrases(unigram_sentences)
        bigram_model.save(bigram_model_filepath)
    return bigram_model
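# Usage sketch (assumption): get_bigram_model() above caches the Phrases model on
# disk, so a second call loads it instead of retraining. The helper below and the
# example tokens in its comment are illustrative, not part of the original module.
def apply_bigrams(sentence_tokens):
    bigram_model = get_bigram_model()
    # Phrases.__getitem__ merges detected collocations into single tokens,
    # e.g. ["machine", "learning"] may become ["machine_learning"].
    return bigram_model[sentence_tokens]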
def train_phrase_model(cleaned_filepath, bigram_model_filepath, run_or_load_flag):
    if run_or_load_flag:
        unigram_sentences = LineSentence(cleaned_filepath)
        bigram_model = Phrases(unigram_sentences)
        bigram_model.save(bigram_model_filepath)
    else:
        bigram_model = Phrases.load(bigram_model_filepath)
    return bigram_model
def extractFeaturesW2V(trainingdata, testdata,
                       w2vpath="GoogleNews-vectors-negative300.bin", gnews=True,
                       dim=300, usePhrase=False, phrasemodelpath="phrase_all.model",
                       cross_features="true"):
    stopwords = "most"
    if usePhrase == True:
        phmodel = Phrases.load(phrasemodelpath)

    #tweets, targets, labels, ids, ids_new, id_dict, id_dict_rev = readRTE(trainingdata)
    tweets_train, targets_train, labels_train, ids_train = readTweetsOfficial(trainingdata)
    tweet_tokens = tokenise_tweets(tweets_train, stopwords)
    target_tokens = tokenise_tweets(targets_train, stopwords)
    if usePhrase == True:
        tweet_tokens = phmodel[tweet_tokens]
        target_tokens = phmodel[target_tokens]

    #tweets_test, targets_test, labels_test, ids_test, ids_test_new, id_dict_test, id_dict_rev_test = readRTE(testdata)
    tweets_test, targets_test, labels_test, ids_test = readTweetsOfficial(testdata)
    tweet_tokens_test = tokenise_tweets(tweets_test, stopwords)
    target_tokens_test = tokenise_tweets(targets_test, stopwords)
    if usePhrase == True:
        tweet_tokens_test = phmodel[tweet_tokens_test]
        target_tokens_test = phmodel[target_tokens_test]

    if gnews == True:
        w2vmodel = word2vec.Word2Vec.load_word2vec_format(w2vpath, binary=True)
    else:
        w2vmodel = word2vec.Word2Vec.load(w2vpath)

    features_train_w2v_tweet = encodeSentW2V(w2vmodel, tweet_tokens, dim)
    features_train_w2v_targ = encodeSentW2V(w2vmodel, target_tokens, dim)
    features_dev_w2v_tweet = encodeSentW2V(w2vmodel, tweet_tokens_test, dim)
    features_dev_w2v_target = encodeSentW2V(w2vmodel, target_tokens_test, dim)

    features_train_w2v = extrFeaturesW2V(features_train_w2v_tweet,
                                         features_train_w2v_targ, cross_features)
    features_dev_w2v = extrFeaturesW2V(features_dev_w2v_tweet,
                                       features_dev_w2v_target, cross_features)

    return features_train_w2v, labels_train, features_dev_w2v, labels_test
def load_ngrams_models():
    global bigrams_model
    global trigrams_model

    bigrams_model_name = join(dirname(__file__),
                              "../texttoolkit/models/bigrams_phraser.bin")
    if exists(bigrams_model_name):
        if not bigrams_model:
            bigrams_model = Phrases.load(bigrams_model_name)
    else:
        print("oops, couldn't find `models/bigrams_phraser.bin`. Try rerunning `$ bash setup.sh`")
        exit(1)

    trigrams_model_name = join(dirname(__file__),
                               "../texttoolkit/models/trigrams_phraser.bin")
    if exists(trigrams_model_name):
        if not trigrams_model:
            trigrams_model = Phrases.load(trigrams_model_name)
    else:
        print("oops, couldn't find `models/trigrams_phraser.bin`. Try rerunning `$ bash setup.sh`")
        exit(1)
def phrases():
    print('Loading phrases model...')
    bigram = Phrases.load('data/nyt/bigram_model.phrases')

    print('Creating listener...')
    address = ('localhost', 6001)
    with Listener(address, authkey=b'password') as listener:
        while True:
            with listener.accept() as conn:
                print('connection accepted from {0}'.format(listener.last_accepted))
                while True:
                    try:
                        msg = conn.recv()
                        conn.send(bigram[msg])
                    except (EOFError, ConnectionResetError):
                        break
def load_transformer_list():
    output_directory = 'vocabularies'
    output_basename = 'en_embeddings_200M_200d'
    path = os.path.join(output_directory, output_basename)

    config_fname = os.path.join(path, 'config.json')
    with open(config_fname, 'r') as json_data:
        wemb_config = json.load(json_data)
        ngrams = wemb_config['ngrams']

    transformers = []
    for i in range(ngrams - 1):
        phrase_model = Phrases.load(os.path.join(path, '{}gram'.format(i)))
        transformers.append(phrase_model)
    return transformers
def extractW2VFeaturesSim(w2vmodelfile, phrasemodel, tweets, targets, labels):
    phmodel = Phrases.load(phrasemodel)
    w2vmodel = word2vec.Word2Vec.load(w2vmodelfile)

    inv_topics = {v: k for k, v in tokenize_tweets.TOPICS_LONG.items()}

    for i, tweet in enumerate(tweets):
        # get the neut/pos/neg hashtags
        neut = KEYWORDS_NEUT[inv_topics[targets[i]]]
        pos = KEYWORDS_POS[inv_topics[targets[i]]]
        neg = KEYWORDS_NEG[inv_topics[targets[i]]]

        tokenised_tweet = tokenize(tweet.lower())
        words = filterStopwords(tokenised_tweet)

        neutcnt, poscnt, negcnt = 0, 0, 0
        neutsc, possc, negsc = 0.0, 0.0, 0.0

        # transform, as earlier, with the phrase model
        for token in phmodel[words]:
            try:
                neutsim = w2vmodel.similarity(neut, token)
                neutcnt += 1
                neutsc += neutsim
            except KeyError:
                neutsim = 0
            try:
                possim = w2vmodel.similarity(pos, token)
                possc += possim
                poscnt += 1
            except KeyError:
                possim = 0
            try:
                negsim = w2vmodel.similarity(neg, token)
                negsc += negsim
                negcnt += 1
            except KeyError:
                negsim = 0
            #print targets[i], "\t", token, "\t", neutsim, "\t", possim, "\t", negsim
        neutsc_tweet = neutsc / neutcnt
        possc_tweet = possc / poscnt
        negsc_tweet = negsc / negcnt
        print(targets[i], "\t", labels[i], "\t", neutsc_tweet, "\t", possc_tweet,
              "\t", negsc_tweet)
def __init__(self, remote):
    global _phrases
    global _phrases_conn
    self.remote = remote

    if not remote and _phrases is None:
        print('Loading phrases model...')
        # Trained on 100-200k NYT articles
        _phrases = Phrases.load('data/nyt/bigram_model.phrases')
        print('Done loading phrases')
    elif _phrases_conn is None:
        print('Connecting to phrases process...')
        address = ('localhost', 6001)
        _phrases_conn = Client(address, authkey=b'password')
        print('Done connecting to phrases')
    self.conn = _phrases_conn
def load_phrasers(model_dir):
    """
    Load all saved n-gram phrasers from the model directory, returning them
    sorted by n-gram order along with the list of orders.
    """
    phraser_files = sorted(glob(f"{model_dir}/phrasers/*.phraser"))
    if len(phraser_files) == 0:
        raise FileNotFoundError("No phrasers found in the given model directory.")
    phrasers = []
    for pf in phraser_files:
        pf_ngram = int(os.path.basename(pf).split(".phraser")[0])
        pf_phraser = Phrases.load(pf)
        phrasers.append((pf_ngram, pf_phraser))
    phrasers = sorted(phrasers, key=lambda x: x[0])
    ngrams = [p[0] for p in phrasers]
    phrasers = [p[1] for p in phrasers]
    return phrasers, ngrams
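# Usage sketch (assumption): applying the phrasers returned by load_phrasers()
# in ascending n-gram order, which is the order they are sorted in above. The
# helper name is hypothetical.
def apply_phrasers(phrasers, tokens):
    # Each pass may merge tokens produced by the previous, lower-order pass.
    for phraser in phrasers:
        tokens = phraser[tokens]
    return tokens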
def generate_bow(corpus_filename, use_bigrams, no_above, no_below):
    tokens = [utils.tokenize(line)
              for line in open('./data/%s.csv' % corpus_filename)]
    print 'First token', tokens[1]

    if use_bigrams:
        if not os.path.exists('./data/bigram.bin'):
            print "data/bigram.bin doesn't exist. Generating and saving bigram model. This could take a while."
            bigram = Phrases(utils.tokenize(line)
                             for line in open('./data/%s.csv' % corpus_filename))
            bigram.save('./data/bigram.bin')

        bigram = Phrases.load('./data/bigram.bin')
        tokens = [bigram[token] for token in tokens]
        print 'First bigram token', tokens[1]

    # Make the dictionary, a collection of statistics about all tokens in the corpus.
    # This is the mapping from words to their id's. It's the lookup table for features.
    dictionary = corpora.Dictionary(tokens)

    # filter out words that are too rare or too frequent
    # no_above=0.05, no_below=10 yielded good results
    dictionary.filter_extremes(no_above=no_above, no_below=no_below)

    # remove gaps in id sequence after words that were removed
    dictionary.compactify()

    # store the dictionary, for future reference
    dictionary.save('./data/%s.dict' % corpus_filename)

    # memory-friendly bag-of-words class
    class BOW(object):
        def __iter__(self):
            for token in tokens:
                # assume there's one document per line, tokens separated by whitespace
                yield dictionary.doc2bow(token)

    # Now we can make a bag of words and do something with it by iterating over it
    arxiv_bow = BOW()
    corpora.MmCorpus.serialize('./data/%s.mm' % corpus_filename,
                               arxiv_bow)  # store to disk, for later use
def train():
    sents = LineSentence(args.sents)

    bi_path = os.path.join(args.save_dir, bi_model)
    print 'Bigram: ', bi_path
    if os.path.exists(bi_path):
        bigram = Phrases.load(bi_path)
    else:
        bigram = Phrases(sents, min_count=args.min_count,
                         threshold=args.bi_threshold)
        bigram.save(bi_path)

    tri_path = os.path.join(args.save_dir, bi_model + '_' + tri_model)
    print 'Trigram: ', tri_path
    trigram = Phrases(bigram[sents], min_count=args.min_count,
                      threshold=args.tri_threshold)
    trigram.save(tri_path)
def text2words_to_csv(dataDirectory, fname, bigrams=False):
    bigram = False
    if bigrams:
        bigram = Phrases.load("bigrams")
    for filename in os.listdir(dataDirectory):
        if filename.endswith(".xml"):
            fd = open(fname, 'a', encoding='utf8')
            path = os.path.join(dataDirectory, filename)
            eligibility = retrieve_info(path, ['eligibility'])
            if len(eligibility) > 0:
                for line in sentencesSplitter(eligibility['eligibility']):
                    line = clean(line, convertnum2words=True, removeSingles=False)
                    if bigrams:
                        line = bigram[line.split()]
                    fd.write(" ".join(line) + " ")
            fd.close()
from gensim.models import Phrases
from nytnlp.keywords import rake
from textblob import Blobber
from textblob_aptagger import PerceptronTagger

blob = Blobber(pos_tagger=PerceptronTagger())
stops = stopwords.words('english')
lem = WordNetLemmatizer()
dash_map = {ord(p): ' ' for p in '—-'}
punct_map = {ord(p): '' for p in string.punctuation + '“”—’‘'}

# Trained on 100-200k NYT articles
bigram = Phrases.load('data/bigram_model.phrases')


def clean_doc(doc):
    doc = doc.lower()
    doc = doc.replace('\'s ', ' ')
    doc = doc.translate(dash_map)
    doc = doc.translate(punct_map)
    return doc


def keyword_tokenize(doc):
    """
    Tokenizes a document so that only keywords and phrases
    are returned. Keywords are returned as lemmas.
    """
    doc = clean_doc(doc)
def extractFeaturesMulti(features=["auto_false", "bow", "targetInTweet", "emoticons",
                                   "affect", "w2v", "bow_phrase"],
                         automodel="model.ckpt",
                         w2vmodel="skip_nostop_multi_300features_10minwords_10context",
                         phrasemodel="phrase.model", useDev=True):
    if useDev == False:
        tweets_train, targets_train, labels_train = readTweetsOfficial(tokenize_tweets.FILETRAIN, 'windows-1252', 2)
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(tokenize_tweets.FILEDEV, 'windows-1252', 2)
    else:
        tweets_train, targets_train, labels_train = readTweetsOfficial(tokenize_tweets.FILETRAIN, 'windows-1252', 2)
        tweets_origdev, targets_origdev, labels_origdev = readTweetsOfficial(tokenize_tweets.FILEDEV, 'windows-1252', 2)
        tweets_train.extend(tweets_origdev)
        targets_train.extend(targets_origdev)
        labels_train.extend(labels_origdev)
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(tokenize_tweets.FILETEST, 'windows-1252', 2)

    features_final = []
    if features.__contains__("bow"):
        features_final = extractFeatureVocab(tweets_train)
        features_train = extractFeaturesBOW(tweets_train, targets_train, features_final)
        features_dev = extractFeaturesBOW(tweets_dev, targets_dev, features_final)
    elif features.__contains__("targetInTweet"):
        features_train = extractFeaturesCrossTweetTarget(tweets_train, targets_train)
        features_dev = extractFeaturesCrossTweetTarget(tweets_dev, targets_dev)
        features_final.append("targetInTweet")

    if features.__contains__("bow_phrase") or features.__contains__("bow_phrase_anon"):
        if features.__contains__("bow_phrase"):
            features_vocab = extractFeatureVocab(tweets_train, usephrasemodel=True)
            features_train_phrbow = extractFeaturesBOW(tweets_train, targets_train, features_vocab, usephrasemodel=True)
            features_dev_phrbow = extractFeaturesBOW(tweets_dev, targets_dev, features_vocab, usephrasemodel=True)
        elif features.__contains__("bow_phrase_anon"):
            features_vocab = extractFeatureVocab(tweets_train, usephrasemodel=True, anon_targets=True)
            features_train_phrbow = extractFeaturesBOW(tweets_train, targets_train, features_vocab, usephrasemodel=True, anon_targets=True)
            features_dev_phrbow = extractFeaturesBOW(tweets_dev, targets_dev, features_vocab, usephrasemodel=True, anon_targets=True)
        features_final.extend(features_vocab)

    if features.__contains__("auto_added"):
        useph = False
        if "phrase" in automodel:
            useph = True
        features_train_auto, labels_train, features_dev_auto, labels_dev = extractFeaturesAutoencoder(automodel, tweets_train, targets_train, labels_train, tweets_dev, targets_dev, labels_dev, "added", usephrasemodel=useph)
    elif features.__contains__("auto_true"):
        useph = False
        if "phrase" in automodel:
            useph = True
        features_train_auto, labels_train, features_dev_auto, labels_dev = extractFeaturesAutoencoder(automodel, tweets_train, targets_train, labels_train, tweets_dev, targets_dev, labels_dev, "true", usephrasemodel=useph)
    elif features.__contains__("auto_false"):
        useph = False
        if "phrase" in automodel:
            useph = True
        features_train_auto, labels_train, features_dev_auto, labels_dev = extractFeaturesAutoencoder(automodel, tweets_train, targets_train, labels_train, tweets_dev, targets_dev, labels_dev, "false", usephrasemodel=useph)

    targetInTweetTrain = []
    targetInTweetDev = []
    if features.__contains__("targetInTweet") and features.__contains__("bow"):
        targetInTweetTrain = extractFeaturesCrossTweetTarget(tweets_train, targets_train)
        targetInTweetDev = extractFeaturesCrossTweetTarget(tweets_dev, targets_dev)
        features_final.append("targetInTweet")
    if features.__contains__("emoticons"):
        emoticons_train, emoticons_vocab = extractEmoticons(tweets_train)
        emoticons_dev, emoticons_vocab = extractEmoticons(tweets_dev)
        for emo in emoticons_vocab:
            features_final.append("Emoticon_" + emo)
    if features.__contains__("affect"):
        affect_train, affect_vocab = getAffect(tweets_train)
        affect_dev, affect_vocab = getAffect(tweets_dev)
        for aff in affect_vocab:
            features_final.append("WNaffect_" + aff)

    if features.__contains__("hash"):
        phmodel = Phrases.load(phrasemodel)
        w2vmodel = word2vec.Word2Vec.load(w2vmodel)
        features_train_w2v, features_w2v_vocab = extractW2VHashFeatures(w2vmodel, phmodel, "hash", tweets_train, targets_train, labels_train)
        features_dev_w2v, features_w2v_vocab = extractW2VHashFeatures(w2vmodel, phmodel, "hash", tweets_dev, targets_dev, labels_dev)
    elif features.__contains__("w2v_hash"):  # this contains hash
        phmodel = Phrases.load(phrasemodel)
        w2vmodel = word2vec.Word2Vec.load(w2vmodel)
        features_train_w2v, features_w2v_vocab = extractW2VHashFeatures(w2vmodel, phmodel, "w2v_hash", tweets_train, targets_train, labels_train)
        features_dev_w2v, features_w2v_vocab = extractW2VHashFeatures(w2vmodel, phmodel, "w2v_hash", tweets_dev, targets_dev, labels_dev)

    # combine features
    for i, featvec in enumerate(features_train):  # features_train_auto
        if features.__contains__("auto_added") or features.__contains__("auto_true") or features.__contains__("auto_false"):
            # numpy append works as extend works for python lists
            features_train[i] = np.append(features_train[i], features_train_auto[i])
        if features.__contains__("targetInTweet") and features.__contains__("bow"):
            features_train[i] = np.append(features_train[i], targetInTweetTrain[i])
        if features.__contains__("bow_phrase") or features.__contains__("bow_phrase_anon"):
            features_train[i] = np.append(features_train[i], features_train_phrbow[i])
        if features.__contains__("emoticons"):
            features_train[i] = np.append(features_train[i], emoticons_train[i])
        if features.__contains__("affect"):
            features_train[i] = np.append(features_train[i], affect_train[i])
        if features.__contains__("w2v_hash") or features.__contains__("hash"):
            features_train[i] = np.append(features_train[i], features_train_w2v[i])
    for i, featvec in enumerate(features_dev):  # features_dev_auto
        if features.__contains__("auto_added") or features.__contains__("auto_true") or features.__contains__("auto_false"):
            features_dev[i] = np.append(features_dev[i], features_dev_auto[i])
        if features.__contains__("targetInTweet") and features.__contains__("bow"):
            features_dev[i] = np.append(features_dev[i], targetInTweetDev[i])
        if features.__contains__("bow_phrase") or features.__contains__("bow_phrase_anon"):
            features_dev[i] = np.append(features_dev[i], features_dev_phrbow[i])
        if features.__contains__("emoticons"):
            features_dev[i] = np.append(features_dev[i], emoticons_dev[i])
        if features.__contains__("affect"):
            features_dev[i] = np.append(features_dev[i], affect_dev[i])
        if features.__contains__("w2v_hash") or features.__contains__("hash"):
            features_dev[i] = np.append(features_dev[i], features_dev_w2v[i])

    return features_train, labels_train, features_dev, labels_dev, features_final
    w2v_model = Word2Vec.load(model_filepath)  # C binary format
except IndexError:
    print("using default model")
    current_dir = os.path.dirname(__file__)
    model_filepath = os.path.join(
        current_dir,
        'model_sentences_raw_words_trigrams_min_count_50_size_200_downsampling_0.001.bin')
    w2v_model = Word2Vec.load(model_filepath)  # C binary format
    print("using model from " + model_filepath)

bigrams_model_name = 'bigrams_model_nyt_sentences_5.5M_5.bin'
trigrams_model_name = "trigrams_model_nyt_sentences_5.5M_5.bin"
ngrams_models = {"bigrams": bigrams_model_name,
                 "trigrams": trigrams_model_name}
which_ngrams_model = "trigrams"
ngrams_model = Phrases.load(ngrams_models[which_ngrams_model])

print("finish loading w2v" + str(datetime.now()))
print("loading w2v took " + str((datetime.now() - start).seconds) + " seconds")


@w2v_api.route("/")
def hello():
    return json.dumps({"loaded": True})


@w2v_api.route("/similarize/<word>")
def similarize(word):
    try:
        try:
            similar_words = cached_synonyms[word]
        except KeyError:
def create_trigram(bigram_resume):
    trigram_model = Phrases.load(trigram_model_path)
    trigram_resume = [trigram_model[sentence] for sentence in bigram_resume]
    return trigram_resume
def create_bigram(unigram_resume):
    bigram_model = Phrases.load(bigram_model_path)
    bigram_resume = [bigram_model[sentence] for sentence in unigram_resume]
    return bigram_resume
    s = s.replace(code[1], code[0])
    return s


from gensim.models import Phrases
from nytnlp.keywords import rake
from textblob import Blobber
from textblob_aptagger import PerceptronTagger

blob = Blobber(pos_tagger=PerceptronTagger())
stops = stopwords.words('english')
lem = WordNetLemmatizer()
dash_map = {ord(p): ' ' for p in '—-'}
punct_map = {ord(p): '' for p in string.punctuation + '“”—’‘'}

# Trained on 100-200k NYT articles
bigram = Phrases.load('data/bigram_model.phrases')


def clean_doc(doc):
    doc = doc.lower()
    doc = doc.replace('\'s ', ' ')
    doc = doc.translate(dash_map)
    doc = doc.translate(punct_map)
    return doc


def keyword_tokenize(doc):
    """
    Tokenizes a document so that only keywords and phrases
    are returned. Keywords are returned as lemmas.
    """
# LOGGING/SET-UP
# ------------------------------------------------------------------
program = os.path.basename(sys.argv[0])
logger = logging.getLogger(program)

logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)
logger.info("Starting %s" % ' '.join(sys.argv))

# Inputs
wikiTextFile = sys.argv[1]
bigramFile = sys.argv[2]

# Load Bigram File
logger.info('Loading the bigram model...')
bigram = Phrases.load(bigramFile)

# ------------------------------------------------------------------
# FIND SENTENCES, LOOK FOR BIGRAMS AND TRAIN TEXT
# ------------------------------------------------------------------
# Find the sentences
logger.info('Accessing sentences with gensim.LineSentence...')
sentences = LineSentence(wikiTextFile)

# Analyse text for bigrams
logger.info('Looking in text for bigrams...')
newSentences = bigram[sentences]

# Training text
model = Word2Vec(newSentences,
import string
import re
import numpy as np
from numpy import prod, dot
from gensim.models import Doc2Vec, Phrases

root = settings.root_path
big_file_dir = os.path.expanduser('~') + '/model/corpra/'
if sys.platform == 'darwin':
    root = root.replace(os.path.expanduser('~'),
                        os.path.expanduser('~') + '/Dropbox')

########################################################################
# Find nearest neighbors in product space
#######################################################################
model = Doc2Vec.load(root + "model/movie_space/idf_reddit")
bigram = Phrases.load(big_file_dir + 'movies_bigram_large.p', 'rb')
book_data = pickle.load(open(root + "model/movie_space/book_meta_data.p", "rb"))
title2asin = pickle.load(open(root + "model/movie_space/title2asin.p", "rb"))


def get_similar(query_book, pos_words, neg_words, topn=100):
    try:
        pos_vecs = []
        all_query_words = []
        for book in query_book:
            if book in title2asin:
                print "\tFound book: ", title2asin[book]
                all_query_words.append(title2asin[book])
                pos_vecs.append(model.docvecs[title2asin[book]])
        for word in bigram[pos_words.replace(',', ' ').lower().split()]:
            if word in model:
    if not 0 <= k <= len(seq):
        for e in seq:
            yield e
    else:
        numbersPicked = 0
        for i, number in enumerate(seq):
            prob = (k - numbersPicked) / (len(seq) - i)
            if random.random() < prob:
                yield number
                numbersPicked += 1


f = open("tokenizer.pk", "rb")
tokenizer = pickle.load(f)
f.close()

bigram = Phrases.load('bigrams.pk')
trigram = Phrases.load('trigrams.pk')
ngram = Phrases.load('ngrams.pk')

print 'SemEval data'
for semeval_file in semeval_files:
    print 'File', semeval_file
    with open(semeval_file, 'r') as f:
        st = []
        for line in f:
            st += [line.strip()]
        text = read_visit_sem(st)
        text = [nltk.word_tokenize(s.lower()) for s in tokenizer.tokenize(text)]
        text = ngram[trigram[bigram[text]]]
        for sent in text:
            print '->', ' '.join(sent)
if len(user_input_text) < 50:
    parser.error('Input text must be more than 50 words.')

######################################################################
############################ Set up model ############################
######################################################################

# load the finished dictionary from disk
model_dir = 'models/' + args.model_version
trigram_dictionary = Dictionary.load(
    os.path.join(model_dir, 'trigram_dictionary.dict'))
bigram_model = Phrases.load(os.path.join(model_dir, 'bigram_model_pos'))
trigram_model = Phrases.load(os.path.join(model_dir, 'trigram_model_pos'))

# load the finished LDA model from disk
lda = LdaModel.load(os.path.join(model_dir, 'lda_alpha_eta_auto_27'))

topic_names = {
    1: u'Consulting and Contracting',
    2: u'DevOps',
    3: u'* Meta Job Description Topic: Students and Education',
    4: u'Finance and Risk',
    5: u'* Meta Job Description Topic: Benefits',
    6: u'* Meta Job Description Topic: Facebook Advertising',
    7: u'Aerospace and Flight Technology',
    8: u'* Meta Job Description Topic: Soft Skills',
    9: u'Product Management',
def __init__(self):
    super().__init__(multithreaded=False)
    print('Loading phrases model...')
    self.bigram = Phrases.load('data/bigram_model.phrases')