def prepare_vector(df):
    """
    Applies all the methods above to clean the data, then trains the word2vec
    mapping as well as the TF-IDF vectorizer on the same tokens as word2vec,
    so that get_mean_vectors_idf can be applied.
    """
    df_text_abstract_split = prepare_series_text_split(df['abstract'])
    phrases = Phrases(df_text_abstract_split)
    bigram = Phraser(phrases)
    listsentences = df_text_abstract_split.values.tolist()
    # Build the vocabulary and train on the bigrammed sentences so that merged
    # tokens (e.g. "machine_learning") are actually present in the vocabulary.
    model = gensim.models.Word2Vec(bigram[listsentences], size=128, window=8,
                                   min_count=1, workers=4)
    model.train(bigram[listsentences],
                total_examples=len(df_text_abstract_split), epochs=10)
    word2vec = model.wv
    tvec_full = TfidfVectorizer(analyzer='word', tokenizer=dummy_fun,
                                preprocessor=dummy_fun, token_pattern=None,
                                min_df=.0025, max_df=0.4)
    tvec_full.fit(bigram[listsentences])
    idf_weighted_vectors = get_mean_vectors_idf(bigram[listsentences], word2vec, tvec_full)
    df_vectorized = pd.DataFrame(idf_weighted_vectors, index=df_text_abstract_split.index)
    return df_vectorized
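# get_mean_vectors_idf and prepare_series_text_split are helpers defined elsewhere
# in that codebase. A minimal sketch of what the IDF-weighted averaging *might*
# look like (an assumption, not the original implementation): average each
# document's word vectors, weighting every token by the vectorizer's learned IDF.
import numpy as np

def get_mean_vectors_idf(docs, word2vec, tfidf_vectorizer):
    # get_feature_names() is the pre-1.0 scikit-learn spelling; newer releases use get_feature_names_out()
    idf = dict(zip(tfidf_vectorizer.get_feature_names(), tfidf_vectorizer.idf_))
    vectors = []
    for doc in docs:
        # keep only tokens known to both the word2vec vocabulary and the TF-IDF vocabulary
        weighted = [word2vec[tok] * idf[tok] for tok in doc
                    if tok in word2vec and tok in idf]
        if weighted:
            vectors.append(np.mean(weighted, axis=0))
        else:
            vectors.append(np.zeros(word2vec.vector_size))
    return np.vstack(vectors)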
def _phrase(self, token):
    bigram = Phrases(token, min_count=5, threshold=100)
    bigram_mod = Phraser(bigram)
    # trigram = Phrases(bigram_mod[token], min_count=5, threshold=100)
    # trigram_mod = Phraser(trigram)
    # return [trigram_mod[bigram_mod[doc]] for doc in token]
    return [bigram_mod[doc] for doc in token]
def email2phrases(email_contents):
    phrased_input = []
    bi_gram = Phrases(email_contents, min_count=1, threshold=1)
    bg_phraser = Phraser(bi_gram)
    for sentence in email_contents:
        phrased_input.append(bg_phraser[sentence])
    return phrased_input
def build_phrases(sentences):
    phrases = Phrases(
        sentences,
        min_count=2,
        threshold=10,
    )
    return Phraser(phrases)
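# A hedged usage sketch for build_phrases (not part of the original snippet):
# the toy corpus below is illustrative only, and whether a pair such as
# "new york" is actually merged depends on its co-occurrence statistics
# relative to min_count and threshold.
sentences = [["new", "york", "is", "big"],
             ["i", "love", "new", "york"],
             ["new", "york", "city"]]
phraser = build_phrases(sentences)
print(phraser[["she", "moved", "to", "new", "york"]])
# pairs scoring above the threshold come back joined with "_" (e.g. "new_york");
# on a corpus this small they may simply pass through unchanged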
def __init__(self, df):
    self.sent = df.tolist()
    self.phrases = Phrases(self.sent, min_count=30, threshold=1)
    self.bigram = Phraser(self.phrases)
    self.sentences = self.bigram[self.sent]
    self.w2v_model = Word2Vec(min_count=30,
                              window=3,
                              size=252,
                              sample=6e-5,
                              alpha=0.01,  # sample=1e-5
                              min_alpha=0.0005,
                              negative=5,
                              workers=multiprocessing.cpu_count() - 1)
def createEmbeddingSpace(filename):
    # you need to remake key common phrases...
    # "new york" should really be "new_york" as a collective, since "new" and "york"
    # have different meanings when they are used together vs separately
    # https://stackoverflow.com/questions/35716121/how-to-extract-phrases-from-corpus-using-gensim
    with open(filename, 'r') as f:
        sentencesAll = [line.split(" ") for line in f if line is not None]  # takes about ~10 min

    random.shuffle(sentencesAll)
    phrases = Phrases(sentencesAll, min_count=1, threshold=2, progress_per=10000)
    bigram = Phraser(phrases)
    sentences = bigram[sentencesAll]
    print(len(sentences))  # 15,786,808
    print(sentences[0])

    # Building and Training the Model
    cores = multiprocessing.cpu_count()
    # I removed min_count... idk how to see which were not used
    w2v_model = Word2Vec(window=6,
                         size=100,
                         sample=6e-5,
                         alpha=0.03,
                         min_alpha=0.0007,
                         negative=20,
                         workers=cores - 1)

    t = time()
    w2v_model.build_vocab(sentences, progress_per=10000)
    print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))  # 6.71 mins

    t = time()
    w2v_model.train(sentences, total_examples=w2v_model.corpus_count,
                    epochs=30, report_delay=1)
    print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

    print("Sentence[0]: in embedding Model {}".format(sentences[0]))
    print("Sentence[1]: in embedding Model {}".format(sentences[1]))
    print("Similarity is: {}".format(
        w2v_model.wv.wmdistance(sentences[0], sentences[1])))
    return w2v_model
def preprocess(segments, dct=None, bigram=None):
    processed_segments = []
    for seg in segments:
        processed_seg = []
        for word in seg:
            if word.is_space or word.is_stop or word.is_punct:
                continue
            word = word.lemma_
            word = word.lower()
            processed_seg.append(word)
        processed_segments.append(processed_seg)

    if bigram is None:
        phrases = Phrases(processed_segments, min_count=3, threshold=3)
        bigram = Phraser(phrases)
    processed_segments = bigram[processed_segments]

    if dct is None:
        dct = Dictionary(processed_segments)
    else:
        dct.add_documents(processed_segments)

    return [dct.doc2bow(line) for line in processed_segments], dct, processed_segments, bigram
def preprocess(data, ngrams=False):
    '''
    Input:
        data - List of articles/titles
        ngrams - Train own n-grams using Gensim's Phraser or incorporate
                 pretrained (applies to Google News Word2Vec)
    Output:
        List of tokenized words for each title/article
    '''
    # remove links
    processed_data = [re.sub(r'^https?:\/\/.*?[\r\n\s]+', '', article, flags=re.MULTILINE)
                      for article in data]

    # remove punctuation and tokenize by word
    tokenizer = RegexpTokenizer(r'\w+')
    processed_data = [tokenizer.tokenize(article) for article in processed_data]

    # remove stopwords
    stop_words = stopwords.words('english')
    rm_stop = [[word for word in article if word.lower() not in stop_words]
               for article in processed_data]

    # incorporate bigrams and trigrams
    if ngrams:
        bigram = Phrases(rm_stop, min_count=5, threshold=10)
        trigram = Phrases(bigram[rm_stop], threshold=10)
        bigram_mod = Phraser(bigram)
        trigram_mod = Phraser(trigram)
        # apply the bigram model first so the trigram model sees the merged tokens
        with_trigram = [trigram_mod[bigram_mod[article]] for article in rm_stop]
        return with_trigram

    return rm_stop
def __entrenar_trigramas__(self, set_entrenamiento):
    if self.bigramas is None:
        return
    oraciones_con_bigramas = self.bigramas[set_entrenamiento]
    trifrases = Phrases(oraciones_con_bigramas, min_count=5, threshold=1,
                        progress_per=10000)
    self.trigramas = Phraser(trifrases)
def train(args):
    # Output during training
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # use the text8 corpus as training data; haikus don't provide sufficient context
    training_data = api.load('text8')

    # use the phrase model to recognize bigrams like "White House" or "Climate Change"
    bigram_model = Phrases(training_data)
    # Export the trained model: uses less RAM, faster processing. Model updates are no longer possible.
    bigrams = Phraser(bigram_model)

    # create and train the model
    model = Word2Vec(bigrams[training_data], size=args.embedding_dim)

    word_list = list(model.wv.vocab.keys())
    vector_list = [model.wv[word] for word in word_list]

    # the basic model doesn't seem to support item assignment,
    # but WordEmbeddingsKeyedVectors does
    kv = WordEmbeddingsKeyedVectors(args.embedding_dim)
    kv.add(word_list, vector_list)
    kv.add(["<eos>", "<n>", "<unk>"], np.random.rand(3, args.embedding_dim))

    # just to be safe, clear the cache of normalized vectors,
    # as I had a similar issue to https://github.com/RaRe-Technologies/gensim/issues/2532
    del kv.vectors_norm

    # save the new models
    bigrams.save(f"{args.model_path}/bigram.model")
    kv.save(f"{args.model_path}/word2vec.model")
def _n_gram(self, n=3):
    # Build the bigram and trigram models
    bigram = Phrases(self.token_list, min_count=5, threshold=10)  # higher threshold, fewer phrases
    trigram = Phrases(bigram[self.token_list], threshold=10)

    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = Phraser(bigram)
    trigram_mod = Phraser(trigram)

    if n == 3:
        self.token_list = [trigram_mod[bigram_mod[doc]] for doc in self.token_list]
    if n == 2:
        self.token_list = [bigram_mod[doc] for doc in self.token_list]
def trigrams(corpus, output_prefix):
    print("----- Trigrams -----")
    if os.path.exists(output_prefix + "_trigram_phrases"):
        trigram_phrases = Phrases.load(output_prefix + "_trigram_phrases")
        print("Loaded trigram phrases")
    else:
        bigram_phrases = Phrases(corpus,
                                 min_count=CONFIG["bigram_phrase_min_count"],
                                 threshold=CONFIG["bigram_phrase_threshold"],
                                 progress_per=CONFIG["bigram_phrase_progress_per"],
                                 delimiter=CONFIG["bigram_phrase_delimiter"])
        trigram_phrases = Phrases(bigram_phrases[corpus],
                                  min_count=CONFIG["trigram_phrase_min_count"],
                                  threshold=CONFIG["trigram_phrase_threshold"],
                                  delimiter=CONFIG["trigram_phrase_delimiter"])
        trigram_phrases.save(output_prefix + "_trigram_phrases")
    trigram_transformer = Phraser(trigram_phrases)

    dct = Dictionary(trigram_transformer[corpus])
    dct.save(output_prefix + "_dictionary_trigram")

    print("Training tf-idf from trigrams")
    bow_corpus = [dct.doc2bow(line) for line in trigram_transformer[corpus]]
    tfidf = gensim.models.TfidfModel(bow_corpus, smartirs='ntc')
    tfidf.save(output_prefix + "_tfidf_trigram")

    print("Training word2vec model with trigram")
    start_time = time()
    trigram_model = gensim.models.Word2Vec(trigram_transformer[corpus],
                                           size=CONFIG['vector_size'],
                                           window=CONFIG['window_size'],
                                           min_count=CONFIG['min_count'],
                                           workers=CONFIG['worker_count'],
                                           sg=CONFIG['sg'],
                                           negative=CONFIG['negative_size'],
                                           alpha=CONFIG['alpha'],
                                           min_alpha=CONFIG['min_alpha'],
                                           iter=CONFIG['train_epoch'])
    trigram_model.save(output_prefix + "_trigram")
    print("Time :", format_time(time() - start_time))
    return trigram_model
def bigrams(list_of_list, occ, th):
    phrases = Phrases(list_of_list, min_count=occ, threshold=th)
    bigram = Phraser(phrases)
    for index, sentence in enumerate(list_of_list):
        list_of_list[index] = bigram[sentence]
    c = bigram.phrasegrams
    return list_of_list, c
def gensim_w2v():
    '''
    w2v using the gensim lib
    '''
    cleaned_corpus = tokenize_corpus()
    # a Phraser takes a list of lists of words as input
    phrases = Phrases(cleaned_corpus, min_count=30, progress_per=10)
    # construct the bigram object from the extracted phrases
    bigram = Phraser(phrases)
    # this will turn word pairs like "northern california" into "northern_california"
    sentences = bigram[cleaned_corpus]

    # count the number of cores in our computer
    n_cores = multiprocessing.cpu_count()
    w2vec = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=n_cores - 1)
    w2vec.build_vocab(sentences)  # build the vocab given the sentences
    w2vec.train(sentences, total_examples=w2vec.corpus_count,
                epochs=30, report_delay=1)  # train

    emb_matrix = w2vec.wv[w2vec.wv.vocab]  # save for viz maybe?
    # this will be used for the <UNK> tokens on test data
    mean_vector = np.mean(emb_matrix, axis=0)
def preprocess(self):
    from nltk import word_tokenize
    print("Starting to preprocess...")
    for split in ['train', 'test']:
        unigrams = [word_tokenize(sentence[0]) for sentence in self.data[split].values]
        ps = PorterStemmer()
        for idx, review in enumerate(unigrams):
            stemmedSentence = []
            for word in review:
                # stemmedSentence.append(ps.stem(word))  # stemming takes too long ...
                stemmedSentence.append(word)
            self.data[split].iloc[idx, 0] = " ".join(stemmedSentence)
        bigrams = Phrases(unigrams, min_count=2)
        bigram_phraser = Phraser(bigrams)
        if self.representation == 'GloVe':
            # let X be a list of tokenized texts (i.e. list of lists of tokens)
            self.word_model = gensim.models.Word2Vec(bigram_phraser[unigrams], min_count=1)
            self.w2v = dict(zip(self.word_model.wv.index2word, self.word_model.wv.syn0))
        elif self.representation == 'fasttext':
            self.word_model = FastText(bigram_phraser[unigrams], min_count=1)
            self.w2v = dict(zip(self.word_model.wv.index2word, self.word_model.wv.syn0))
    print("Finished preprocessing.")
def run(self):
    data = self._load_file_data(self.filename)

    handler = EpochCallbackHandler(self.iter, self.signals.Progress,
                                   self.signals.ProgressBar)

    # sg selects the training algorithm: 0 - CBOW, 1 - skip-gram
    self.model = Word2Vec(size=self.size,
                          alpha=self.learn_rate,
                          sg=self.sg,
                          min_count=self.min_count,
                          iter=self.iter,
                          window=self.window,
                          ns_exponent=self.ns_exponent,
                          negative=self.negative,
                          workers=4,
                          callbacks=[handler])
    self.signals.ProgressBar.emit(10)

    phrases = Phrases(data, min_count=self.min_count + 10, progress_per=10000)
    bigram = Phraser(phrases)
    sentences = bigram[data]

    self.model.build_vocab(sentences, progress_per=10000)
    self.signals.PrintInfo.emit('Словарь Word2Vec создан.')  # "Word2Vec vocabulary built."

    self.signals.PrintInfo.emit(
        'Тренируем модель Word2Vec {0} эпох.'.format(self.iter))  # "Training the Word2Vec model for {0} epochs."
    self.model.train(sentences, total_examples=self.model.corpus_count,
                     epochs=self.iter, report_delay=1)
    self.signals.PrintInfo.emit('Модель Word2Vec прошла обучение.')  # "The Word2Vec model has been trained."
    self.model.callbacks = ()

    self.signals.PrintInfo.emit('Расчеты закончены!')  # "Calculations finished!"
    self.signals.Finished.emit()
    self.signals.ProgressBar.emit(100)
def get_bigram_phraser(directory):
    if os.path.isfile(BIGRAM):
        return Phraser.load(BIGRAM)
    else:
        bigram = Phraser(Phrases(corpus(directory)))
        bigram.save(BIGRAM)
        return bigram
def generate_sent_tokens(corpus, n_ngrams):
    punctuations = set(string.punctuation).union(set(("``", "''")))
    tokenized_corpus = []
    for text in corpus:
        tok_text = word_tokenize(text.lower())
        clean_text = ' '
        for word in tok_text:
            if word not in punctuations and not check_2(word):
                clean_text += word + ' '
        tokenized_sentences = list(map(list, (ngrams(clean_text.split(), n_ngrams))))
        if len(tokenized_sentences) == 0:
            tokenized_sentences = [clean_text.split()]
        tokenized_corpus.extend(tokenized_sentences)

    # Phrase Detection
    # Give some common terms that can be ignored in phrase detection
    # For example, 'state_of_affairs' will be detected because 'of' is provided here:
    common_terms = ["of", "with", "without", "and", "or", "the", "a"]
    # Create the relevant phrases from the list of sentences:
    phrases = Phrases(tokenized_corpus, common_terms=common_terms)
    # The Phraser object is used from now on to transform sentences
    bigram = Phraser(phrases)
    # Applying the Phraser to transform our sentences is simple:
    tokenized_corpus = list(bigram[tokenized_corpus])
    return tokenized_corpus
def _train_phraser(self, min_count, phrase_threshold, delimiter):
    print("Training collocation detector...")
    return Phraser(
        Phrases(self.line_iterator,
                min_count=min_count,
                threshold=phrase_threshold,
                delimiter=delimiter))
def get_bigram_list(full_sentence_list, stem=False):
    sentence_stream = [doc.split(" ") for doc in full_sentence_list]
    # print(sentence_stream)
    stemmer = RafiStemmer()
    bigram = Phrases(sentence_stream, min_count=2, threshold=5, delimiter=b'_')
    bigram_phraser = Phraser(bigram)
    # print(bigram_phraser)

    bigram_list = []
    for sent in sentence_stream:
        tokens_ = bigram_phraser[sent]
        for each_bigram in tokens_:
            if each_bigram.count('_') == 1:
                # print(each_bigram)
                if stem:
                    bigram_list.append(stemmer.stem_word(each_bigram))
                else:
                    bigram_list.append(each_bigram)

    bigram_count_list = []
    for each_unique_bigram in set(bigram_list):
        bigram_count_list.append(
            [each_unique_bigram, bigram_list.count(each_unique_bigram)])
    return bigram_count_list
def testSaveLoadNoCommonTerms(self):
    """Ensure backwards compatibility with old versions of Phrases, before common_terms."""
    bigram_loaded = Phrases.load(datapath("phrases-no-common-terms.pkl"))
    self.assertEqual(bigram_loaded.common_terms, frozenset())
    # can make a phraser, cf #1751
    phraser = Phraser(bigram_loaded)  # does not raise
    phraser[["human", "interface", "survey"]]  # does not raise
def create_dictionary_and_corpus(documents):
    bigram = Phrases(documents, min_count=20, threshold=20)
    bigram_model = Phraser(bigram)
    arr = [bigram_model[d] for d in documents]
    dic = Dictionary(arr)
    corpus = [dic.doc2bow(text) for text in arr]
    return dic, corpus
def collocation(in_path):
    """Creates a corpus considering collocations: frequently co-occurring bigrams
    are merged (new york -> new_york)."""
    corpus = LineSentence(in_path)
    bigram = Phraser(Phrases(corpus))
    collocation_corpus = bigram[corpus]
    for sentence in collocation_corpus:
        print(' '.join(sentence))
def main(dump_file, corpus_file, out_file, phrase, **kwargs):
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)

    start_time = time.time()
    with open(corpus_file, 'w') as f:
        wiki_corpus = WikiCorpus(dump_file, lemmatize=False, dictionary={})
        for text in wiki_corpus.get_texts():
            f.write(' '.join(text) + '\n')

    corpus_time = time.time()
    print('Elapsed: %d seconds' % (corpus_time - start_time))

    if phrase:
        phraser = Phraser(Phrases(LineSentence(corpus_file)))
        sentences = phraser[LineSentence(corpus_file)]
    else:
        sentences = LineSentence(corpus_file)

    model = Word2Vec(sentences, sg=1, **kwargs)
    model.save(out_file)

    now = time.time()
    print('Total: %d seconds' % (now - start_time))
    print('Preprocess: %d seconds' % (corpus_time - start_time))
    print('Train: %d seconds' % (now - corpus_time))
def bigrammer(source_file, outfile, mincount=100, threshold=0.99, scoring='npmi',
              commonfile='common_tagged.txt'):
    """
    :param source_file: path to the sentence-per-line input corpus
    :param outfile: path the bigrammed text is appended to
    :param mincount: minimum collocation frequency
    :param threshold: phrase-scoring threshold
    :param scoring: phrase-scoring function ('npmi' by default)
    :param commonfile: file listing common (stop) words ignored in phrase detection
    :return: number of detected phrasegrams
    """
    common = set([word.strip() for word in open(commonfile, 'r').readlines()])
    data = LineSentence(source_file)
    bigram_transformer = Phrases(sentences=data, min_count=mincount, threshold=threshold,
                                 scoring=scoring, max_vocab_size=400000000,
                                 delimiter=b':::', progress_per=100000,
                                 common_terms=common)
    bigrams = Phraser(bigram_transformer)
    tempfile = smart_open(outfile, 'a')
    print('Writing bigrammed text to %s' % outfile, file=sys.stderr)
    for i in bigrams[data]:
        tempfile.write(' '.join(i) + '\n')
    tempfile.close()
    return len(bigrams.phrasegrams)
def get_clusters(df):
    df = df.drop_duplicates().reset_index(drop=True)
    nlp = spacy.load('en')
    df['text_processed'] = df.text.apply(
        lambda x: ' '.join(word for word in simple_preprocess(x)))
    df['tokens'] = df.text_processed.apply(
        lambda x: ' '.join(token.text for token in nlp.tokenizer(x)))
    texts = [row.split() for row in df.tokens]

    bigram = Phrases(texts)
    bigram_model = Phraser(bigram)
    texts = [bigram_model[doc] for doc in texts]

    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(texts)]
    doc2vec = Doc2Vec(workers=4, seed=23)
    doc2vec.build_vocab(documents)
    for epoch in tqdm(range(10)):
        doc2vec.train(documents, total_examples=doc2vec.corpus_count, epochs=1)
        doc2vec.alpha -= 0.0002
        doc2vec.min_alpha = doc2vec.alpha

    X = np.array([doc2vec.infer_vector(text) for text in texts])
    model = KMeans(n_clusters=15, n_jobs=-1)
    model.fit(X)
    return model, X, texts
def load_vector_data(dataset_name, bgr=False):
    sentences = pd.read_csv("../cleaned/" + dataset_name + "_stems.csv",
                            delimiter=",").astype(str).fillna("").values.tolist()
    targets = pd.read_csv("../cleaned/" + dataset_name + "_clean.csv",
                          delimiter=",", dtype=types).astype(str)["a"].tolist()
    # note: gensim's FastText.load() does not accept a 'binary' keyword,
    # so the saved model is loaded directly
    vector_model = FastText.load("../models/word_embeddings/" + dataset_name + "_fasttext")

    # replace placeholders (" "), make one-string-sentences
    for index, sample in enumerate(sentences):
        sentences[index] = list(filter((" ").__ne__, sample))
    inputs = [" ".join(sentence) for sentence in sentences]

    if bgr:
        tokenized = [t.split() for t in inputs]
        phrases = Phrases(tokenized)
        bigram = Phraser(phrases)
        bigrammed = []
        # make bigrams for inputs
        for sentence in inputs:
            sentence = [t.split() for t in [sentence]]
            bigrammed.append(bigram[sentence[0]])
        inputs = []
        for sent in bigrammed:
            if sent:
                inputs.append(np.sum(vector_model.wv[sent], 0).tolist())
            else:
                inputs.append(np.zeros(32))
    else:
        inputs = [vector_model.wv[sample] for sample in inputs]

    inputs = np.array(inputs)
    train_x, test_x, train_y, test_y = train_test_split(inputs, targets, test_size=0.2)
    return train_x, test_x, train_y, test_y
def build_ngram(walks, ngram, min_count=5, threshold=10.0, max_vocab_size=40000000,
                delimiter=b'_', scoring='default'):
    """
    Compose n-grams on the fly given tunable parameters; works for both in-memory
    and out-of-core computations.

    Required Parameters
    - walks: iterable list of str (iterable list of list of string, or deepwalk.walks.WalksCorpus object)
        Input random walk sequences. Can be either 'List of list of tokens' (in-memory)
        or a 'deepwalk.walks.WalksCorpus' object (out-of-core).
    - ngram: int
        Specify the n of n-gram, e.g. ngram=2 to compose bigrams.

    Optional Parameters
        Refer to gensim.models.phrases.Phrases.

    Return
    - iterable list of str (iterable list of list of string, or deepwalk.walks.WalksCorpus object),
      together with the list of trained Phraser objects
    """
    if ngram < 2:
        logger.warning("ngram must be >= 2! Skip building ngram.")
        # return an empty phraser list so callers can always unpack two values
        return walks, []
    ngram_phrasers = []
    for n in range(2, ngram + 1):
        logger.info("Composing " + str(n) + "-grams...")
        ngram_phrases = Phrases(walks, min_count=min_count, threshold=threshold,
                                max_vocab_size=max_vocab_size, delimiter=delimiter,
                                scoring=scoring)
        ngram_phraser = Phraser(ngram_phrases)
        walks = ngram_phraser[walks]
        ngram_phrasers.append(ngram_phraser)
    return walks, ngram_phrasers
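# Hedged usage sketch (not part of the original snippet): `walks` here is a made-up
# list of token lists standing in for deepwalk random-walk sequences, and the low
# min_count/threshold values are illustrative only. The returned phrasers can be
# replayed, in order, to transform new walks consistently.
walks = [["a", "b", "c", "a", "b"], ["a", "b", "d"], ["c", "a", "b"]]
trigram_walks, phrasers = build_ngram(walks, ngram=3, min_count=1, threshold=0.1)

new_walk = ["a", "b", "c"]
for phraser in phrasers:
    new_walk = phraser[new_walk]
print(list(trigram_walks)[0], new_walk)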
def make_bigram(text_csv_file, out_file_name, out_vocab_file_name, min_count=50, threshold=10):
    df = pd.read_csv(text_csv_file, index_col=0)
    all_tokens = []
    for text in df['Text']:
        # all_tokens.append(word_tokenize(text))
        all_tokens.append(word_tokenize(text.replace(".", "")))

    bigram = Phrases(all_tokens, min_count=min_count, threshold=threshold, delimiter=b'_')
    bigram_phraser = Phraser(bigram)

    new_texts = []
    for tokens in all_tokens:
        new_texts.append(' '.join(str(x) for x in bigram_phraser[tokens]))

    df_meta = df.loc[:, df.columns != 'Text']
    df = pd.DataFrame(data={'Text': new_texts})
    df = pd.concat([df, df_meta], axis=1)
    df.index = np.arange(len(df))
    print(df.shape)

    counter = countWordsOnTexts(df)
    # print(counter)
    print('Bigram Vocabulary size: ' + str(len(counter)))
    f = open(out_vocab_file_name, 'w')
    f.write(repr(counter))
    f.close()

    df.to_csv(out_file_name)
    return df
def pre_process_pipeline(corpus, bigram_min_count=25, bigram_threshold=10, infreq_threshold=25):
    # Tokenize corpus
    corpus = list(sent_to_words(corpus))

    # Remove stop words
    corpus = remove_stopwords(corpus)

    # Find and replace empirically modelled bigrams
    bigram = Phrases(corpus, min_count=bigram_min_count, threshold=bigram_threshold)
    # More efficient method to find and replace bigrams
    bigram_mod = Phraser(bigram)
    corpus = make_bigrams(corpus, bigram_mod)

    # Initialize spacy 'en' model, keeping only the tagger component (for efficiency)
    # python3 -m spacy download en
    nlp = spacy.load('en', disable=['parser', 'ner'])

    # Do lemmatization, keeping only nouns, adjectives, verbs and adverbs
    corpus = lemmatization(corpus, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    # Remove infrequent tokens
    corpus, infreq_vocab = remove_infrequent(corpus, threshold=infreq_threshold)

    # Remove stop words - 2nd time
    corpus = remove_stopwords(corpus)

    return corpus