def clean_text(df_input, col='content', remove_unusual=False, remove_stopwords=False,
               toRemove=None, remove_numbers=False, stem_words=False, lemmatize=False,
               nGram=False):
    # Clean mails
    # avoid the mutable-default-argument pitfall: copy the caller's list instead of extending it in place
    toRemove = [] if toRemove is None else list(toRemove)
    if remove_stopwords:
        toRemove.extend(stopWords())
    usual_words = []
    if remove_unusual:
        usual_words = usualWords()

    ## Clean content of mails
    # tokenization and lemmatization/stemming
    df_input[col] = df_input[col].map(
        lambda x: text_to_words(x, remove_numbers, stem_words, lemmatize))
    # removing stopwords and unusual words
    df_input[col] = df_input[col].map(lambda x: remove_words(
        x, remove_unusual, remove_stopwords, toRemove, usual_words))
    # bigrams and trigrams
    if nGram:
        phrase = phrases.Phrases(df_input[col], min_count=30, threshold=300)
        bigram = phrases.Phraser(phrase)
        trigram = phrases.Phrases(bigram[df_input[col]])
        df_input[col] = [trigram[bigram[sent]] for sent in df_input[col]]
    print("data cleaned")
    return df_input
def main(args): """Reads csv data file containing sentences, tokenizes and uses them to train word2vec model""" data = pd.read_csv(args['data_csv'], index_col=0) # tokenize and preprocess sentences sentences = [ stripword(row.translate(translator).lower).split(' ') for row in data['Sentence'] ] # create bigrams to capture word combinations (e.g. New_York) bigram_transformer = phrases.Phrases(sentences) bigram = phrases.Phraser(bigram_transformer) # train word2vec model according to the hyperparameters chosen currentmodel = Word2Vec(bigram[sentences], workers=-1, sg=0, size=args['model_size'], min_count=5, window=['window_size'], sample=1e-3) currentmodel.init_sims(replace=True) currentmodel.save("app/word2vec/word2vec_retrained") print('Saved as app/word2vec/word2vec_retrained')
def make_bigrams(data_words, texts):
    from gensim.models import Phrases
    from gensim.models import phrases
    bigram = Phrases(data_words, min_count=5, threshold=100)
    bigram_mod = phrases.Phraser(bigram)
    return [bigram_mod[doc] for doc in texts]
def train_phrase():
    sentence_stream = list()
    for doc in documentList:
        wordlist = doc.split(" ")
        sentence_stream.append(wordlist)
    ps = phrase.Phrases(sentence_stream)
    bigram = phrase.Phraser(ps)
    return bigram
def phrase_detection(df):
    """
    Given the emails dataframe, form bigrams based on the text in the "Body" field
    """
    sentences = [text.split() for text in df["Body"]]
    phrases_ = phrases.Phrases(sentences,
                               min_count=params.bigrams_min_count,
                               threshold=params.bigrams_threshold)
    bigram = phrases.Phraser(phrases_)
    # for phr, score in phrases_.export_phrases(sentences):
    #     print(u'{0} {1}'.format(phr, score))
    return bigram
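# Hypothetical usage sketch (not from the original source) of the kind of Phraser
# returned above: train on a toy tokenized corpus, then apply it to new text.
# All data and parameter values here are illustrative assumptions.
from gensim.models import phrases

toy_sentences = [
    ["new", "york", "is", "a", "big", "city"],
    ["i", "love", "new", "york"],
    ["new", "york", "has", "great", "food"],
]
toy_phrases = phrases.Phrases(toy_sentences, min_count=1, threshold=0.1)
toy_bigram = phrases.Phraser(toy_phrases)
print(toy_bigram[["i", "visited", "new", "york"]])  # e.g. ['i', 'visited', 'new_york']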
def make_bigram(dirpaths):
    sentences = corpora(dirpaths, loop_or_not=False)
    print('Start phrasing:')
    phrase = phrases.Phrases(sentences,
                             max_vocab_size=DICTLENGTH,
                             min_count=1,
                             threshold=5,
                             common_terms={'of', 'and', 'the', 'with'})
    bigram = phrases.Phraser(phrase)
    bigram.save(SAVED_BIGRAM_PATH)
    print('bigram phraser saved.')
def __init__(self, model_path, create=False, corpus=None, bigrams=True):
    """
    Initializes the rewriter, given a particular Word2Vec corpus.
    A good example corpus is the Wikipedia Text8Corpus.
    You only need the corpus if you are recreating the model from scratch.

    If ``create == True``, this generates a new Word2Vec model (which takes a
    really long time to build). If ``False``, this loads an existing model we
    already saved.

    :param str model_path: where to store the model files. This file needn't
        exist, but its parent folder should.
    :param bool create: True to create a new Word2Vec model, False to use the
        one stored at ``model_path``.
    :param Iterable corpus: only needed if ``create=True``. Defines a corpus
        for Word2Vec to learn from.
    :param bool bigrams: only needed if ``create=True``. If True, takes some
        more time to build a model that supports bigrams (e.g. ``new_york``).
        Otherwise, it'll only support one-word searches. ``bigrams=True`` makes
        this slower but more complete.
    """
    self.model_path = model_path

    # TODO: add logic around defaulting to creating or not
    if create:
        # generate a new Word2Vec model... takes a while!
        # TODO optimize parameters
        transformed_corpus = None
        if bigrams:
            # TODO save the phraser somewhere... but that requires
            # even more arguments.
            # the Phrases class lets you generate bigrams, but the
            # Phraser class is a more compact version of the same
            # TODO making the phrases takes forever, making the phraser
            # takes forever, turning it into a list takes forever... this
            # is really annoying. is there any way to speed it up?
            bigram_generator = phrases.Phraser(phrases.Phrases(corpus))
            # weird bug where the bigram generator won't work unless
            # it's turned into a list first. if you try to do it straight,
            # it'll give you total gibberish. FIXME
            bigram_corpus = list(bigram_generator[corpus])
            transformed_corpus = bigram_corpus
        else:
            # no bigrams, same old corpus
            transformed_corpus = corpus
        self.model = word2vec.Word2Vec(transformed_corpus, workers=8)
        self.model.save(self.model_path)
    else:
        self.model = word2vec.Word2Vec.load(self.model_path)
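# Minimal standalone sketch of the create-branch above, assuming a toy corpus
# and an illustrative save path: detect bigrams, materialize the transformed
# corpus, train Word2Vec, then save and reload it. Data and parameters are
# assumptions, not the original project's values.
from gensim.models import word2vec, phrases

corpus = [
    ["machine", "learning", "is", "fun"],
    ["machine", "learning", "needs", "data"],
    ["deep", "learning", "needs", "more", "data"],
]
bigram_generator = phrases.Phraser(phrases.Phrases(corpus, min_count=1, threshold=0.1))
transformed_corpus = list(bigram_generator[corpus])  # materialize before training
model = word2vec.Word2Vec(transformed_corpus, min_count=1, workers=1)
model.save("example_word2vec.model")
reloaded = word2vec.Word2Vec.load("example_word2vec.model")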
def word_modeling(tokens):
    import pprint
    from gensim.corpora import Dictionary
    from gensim.models import phrases, LdaModel
    bigram = phrases.Phraser(phrases.Phrases(tokens, min_count=2))
    # append detected bigrams (tokens containing '_') to each document,
    # keeping the original unigrams as well
    for i, ts in enumerate(tokens):
        for btoken in bigram[ts]:
            if '_' in btoken and btoken not in tokens[i]:
                tokens[i].append(btoken)
    token_dict = Dictionary(tokens)
    corpus = [token_dict.doc2bow(t) for t in tokens]
    _ = token_dict[0]  # accessing an id forces the Dictionary to build id2token
    model = LdaModel(corpus=corpus, id2word=token_dict.id2token,
                     chunksize=len(tokens), alpha="auto", eta="auto",
                     iterations=400, num_topics=20, passes=20, eval_every=None)
    pprint.pprint(model.top_topics(corpus))
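# Illustrative toy run (assumed data) of the bigram-augmentation step above:
# detected bigrams are appended to each document rather than replacing the
# original unigrams, so the LDA vocabulary keeps both forms.
from gensim.models import phrases

docs = [["machine", "learning", "rocks"],
        ["machine", "learning", "is", "hard"],
        ["machine", "learning", "works"]]
bigram = phrases.Phraser(phrases.Phrases(docs, min_count=2, threshold=0.1))
for i, ts in enumerate(docs):
    for btoken in bigram[ts]:
        if '_' in btoken and btoken not in docs[i]:
            docs[i].append(btoken)
print(docs)  # e.g. [['machine', 'learning', 'rocks', 'machine_learning'], ...]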
def addSentence(self, sentence):
    try:
        #f = open("w2v_" + self.name, "r")
        model = gensim.models.KeyedVectors.load_word2vec_format("w2v_" + self.name)
        weights = model.syn0
    except FileNotFoundError:
        print(len(sentence))
        ph = phrases.Phrases(sentence)
        bigram_transformer = phrases.Phraser(ph)
        trigram = phrases.Phrases(bigram_transformer[sentence])
        ngram = phrases.Phrases(trigram[sentence])
        #ngram = phrases.Phrases(trigram[bigram_transformer[sentence]])
        model = Word2Vec(ngram[trigram[bigram_transformer[sentence]]],
                         size=40000, window=5, min_count=1, workers=4, sg=0, iter=80)
        model.wv.save_word2vec_format("w2v_" + self.name)
        #print(sentence[1:10])
        #print("Fresh :", model["fresh"])
        #print("ताजा :", model["ताजा"])
        weights = model.wv.syn0
        #print(weights)
    np.save(open("embed" + self.name + ".txt", 'wb'), weights)
    vocab = dict([(k, v.index) for k, v in model.vocab.items()])
    with open("vocab" + self.name + ".txt", 'w', encoding='utf-8') as f:
        f.write(json.dumps(vocab))
    with open("vocab" + self.name + ".txt", 'r', encoding='utf-8') as f:
        data = json.loads(f.read())
    self.word2index = data
    self.index2word = dict([(v, k) for k, v in data.items()])
    self.n_words = len(model.vocab)
    print(self.name + ":", self.n_words)
def train(self, data_iterator, **kwargs):
    # Train the phraser from gensim
    self.phraser = gensim_phrases.Phraser(
        gensim_phrases.Phrases(data_iterator, **kwargs))
def save(self, kind, bigrams=True):
    print('Initializing split word phraser')
    stream = self.stream('sentences', 'list')
    split_word_model = phrases.Phrases(self.stream('sentences', 'list'))
    # first, reunite words that shouldn't be split;
    # remove all bigrams that don't merge into a real word
    split_word_phraser = phrases.Phraser(split_word_model)
    for word_tuple in list(split_word_phraser.phrasegrams.keys()):
        if not word_tuple[0] + word_tuple[1] in nlp.vocab:
            del split_word_phraser.phrasegrams[word_tuple]
    # we don't want the merged words to have a delimiter in them
    split_word_phraser.delimiter = b''

    if bigrams is True:
        print('Initializing bigram phraser')
        # now we actually look for bigrams
        stream = self.stream('sentences', 'list')
        bigram_model = phrases.Phrases(split_word_phraser[stream])
        # this phraser will catch bigrams that are very unique but less common
        bigram_model.min_count = 20
        bigram_model.threshold = 90
        bigram_phraser_threshold = phrases.Phraser(bigram_model)
        # this one will catch bigrams that are less unique but very common
        bigram_model.min_count = 70
        bigram_model.threshold = 60
        bigram_phraser_count = phrases.Phraser(bigram_model)

    if kind == 'documents':
        save_path = self.save_dir.joinpath('line_documents.txt')
    elif kind == 'sentences':
        sp.call(['rm -rf {}/line_sentences'.format(self.save_dir.name)], shell=True)
        save_dir = self.save_dir.joinpath('line_sentences')
        save_dir.mkdir(exist_ok=True)

    for i, tokenized_text in enumerate(self.stream('documents', 'spacy')):
        print('Writing {} in line-{} format'.format(self.raw_paths[i].name, kind))
        if kind == 'sentences':
            save_path = save_dir.joinpath(self.raw_paths[i].name + '.txt')
        if kind == 'documents':
            document_tokens = []
        with save_path.open('a') as save_file:
            for sentence in tokenized_text.sents:
                sentence_tokens = []
                for token in sentence:
                    if token.pos_ in ['PROPN', 'NUM']:
                        sentence_tokens.append(token.pos_)
                    elif token.is_alpha and token.is_ascii and not token.is_oov:
                        sentence_tokens.append(token.text)
                sentence_tokens = split_word_phraser[sentence_tokens]
                if bigrams is True:
                    sentence_tokens = bigram_phraser_threshold[sentence_tokens]
                    sentence_tokens = bigram_phraser_count[sentence_tokens]
                if kind == 'sentences':
                    sentence_string = ' '.join(sentence_tokens)
                    if len(sentence_string) > 0:
                        save_file.write(sentence_string + '\n')
                if kind == 'documents':
                    document_tokens += sentence_tokens
            if kind == 'documents':
                document_string = ' '.join(document_tokens)
                save_file.write(document_string + '\n')
sentences_copy = sentences
threshold = 8.0
print("Beginning multi-gram accumulation.")
# I want to keep making larger and larger n-grams until I think
# there are no more to be made.
while True:
    bigram = Phrases(sentences_copy, threshold=threshold)
    bigrams = bigram.export_phrases(sentences_copy)
    z = list(set(bigrams) - set(unigrams))
    if len(z) == 0 or threshold > 12:
        break
    else:
        gram_bigram = gmp.Phraser(bigram)
        sentences_copy = gram_bigram[sentences_copy]
        unigrams = bigrams
        grams.append(gram_bigram)
        threshold += 1

# Maybe there's a more elegant solution to this, but this alters
# the Keras code in a minimal way.
def gram_er(sentences):
    temp = sentences
    for g in grams:
        temp = g[temp]
    return temp

num_words = len(set([i for k in sentences for i in k]))
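# Hypothetical toy demonstration of the accumulation idea above: two passes of
# Phrases/Phraser stacked so that bigrams of bigrams become longer n-grams, with
# a gram_er-style wrapper applying all phrasers in order. The corpus, pass count,
# and thresholds are illustrative assumptions, not the original script's values.
from gensim.models import Phrases
from gensim.models import phrases as gmp

sentences = [["new", "york", "city", "hall"],
             ["new", "york", "city", "park"],
             ["new", "york", "city", "hall"]] * 5
grams = []
sentences_copy = sentences
for _ in range(2):  # pass 1 finds bigrams, pass 2 merges bigrams into longer grams
    model = Phrases(sentences_copy, min_count=1, threshold=0.1)
    phraser = gmp.Phraser(model)
    grams.append(phraser)
    sentences_copy = [phraser[s] for s in sentences_copy]

def gram_er(sents):
    for g in grams:
        sents = [g[s] for s in sents]
    return sents

print(gram_er([["new", "york", "city", "hall"]]))  # e.g. [['new_york_city_hall']]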