Example #1
def prepare_vector(df):

    """
    Applies all the method above to clean the data, trains and computes the word2vec mapping as well as the
    TF-IDF vectorizer on the same tokens as word2vec to be able to apply get_mean_vectors_idf
    """

    df_text_abstract_split = prepare_series_text_split(df['abstract'])

    phrases = Phrases(df_text_abstract_split)
    bigram = Phraser(phrases)
    listsentences = df_text_abstract_split.values.tolist()
    # build the vocabulary from the bigrammed sentences, then train word2vec on them
    model = gensim.models.Word2Vec(size=128, window=8, min_count=1, workers=4)
    model.build_vocab(bigram[listsentences])
    model.train(bigram[listsentences], total_examples=model.corpus_count, epochs=10)
    word2vec = model.wv

    tvec_full = TfidfVectorizer(analyzer='word',
                                tokenizer=dummy_fun,
                                preprocessor=dummy_fun,
                                token_pattern=None,
                                min_df=.0025, max_df=0.4)

    tvec_full.fit(bigram[listsentences])

    idf_weighted_vectors = get_mean_vectors_idf(bigram[listsentences], word2vec, tvec_full)

    df_vectorized = pd.DataFrame(idf_weighted_vectors, index=df_text_abstract_split.index)

    return df_vectorized
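
A rough, hypothetical sketch of what an IDF-weighted averaging helper such as get_mean_vectors_idf could look like (an assumption for illustration only; prepare_series_text_split and dummy_fun are likewise helpers defined elsewhere in the original module):

import numpy as np

def get_mean_vectors_idf_sketch(tokenized_docs, word2vec, tfidf):
    # weight each in-vocabulary word vector by its IDF and average per document
    idf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
    dim = word2vec.vector_size
    rows = []
    for doc in tokenized_docs:
        pairs = [(word2vec[w], idf.get(w, 1.0)) for w in doc if w in word2vec]
        if pairs:
            vecs, weights = zip(*pairs)
            rows.append(np.average(vecs, axis=0, weights=weights))
        else:
            rows.append(np.zeros(dim))
    return np.vstack(rows)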
Example #2
 def _phrase(self, token):
     bigram = Phrases(token, min_count=5, threshold=100)
     bigram_mod = Phraser(bigram)
     # trigram = Phrases(bigram_mod[token],min_count=5,threshold=100)
     # trigram_mod = Phraser(trigram)
     # return [trigram_mod[bigram_mod[doc]] for doc in token]
     return [bigram_mod[doc] for doc in token]
Example #3
def email2phrases(email_contents):
    phrased_input = []
    bi_gram = Phrases(email_contents, min_count=1, threshold=1)
    bg_phraser = Phraser(bi_gram)
    for sentence in email_contents:
        phrased_input.append(bg_phraser[sentence])
    return phrased_input
Example #4
def build_phrases(sentences):
    phrases = Phrases(
        sentences,
        min_count=2,
        threshold=10,
    )
    return Phraser(phrases)
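
A minimal hedged usage sketch (the token lists are made up; a pair is only merged once it clears min_count and the threshold score):

sentences = [["new", "york", "city"], ["new", "york", "times"], ["machine", "learning"]]
phraser = build_phrases(sentences)
print(phraser[["new", "york", "city"]])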
Example #5
 def __init__(self, df):
     self.sent = df.tolist()
     self.phrases = Phrases(self.sent, min_count=30, threshold=1)
     self.bigram = Phraser(self.phrases)
     self.sentences = self.bigram[self.sent]
     self.w2v_model = Word2Vec(min_count=30, window=3, size=252, sample=6e-5, alpha=0.01,   # sample=1e-5
                               min_alpha=0.0005, negative=5, workers=multiprocessing.cpu_count()-1)
Example #6
def createEmbeddingSpace(filename):
    # you need to remake key common phrases...
    # "new york" should really become "new_york" as a single token, since "new" and "york"
    # have different meanings when used together vs. separately

    # https://stackoverflow.com/questions/35716121/how-to-extract-phrases-from-corpus-using-gensim

    with open(filename, 'r') as f:
        sentencesAll = [line.split() for line in f if line.strip()]

    #takes about ~10 min
    random.shuffle(sentencesAll)

    phrases = Phrases(sentencesAll,
                      min_count=1,
                      threshold=2,
                      progress_per=10000)
    bigram = Phraser(phrases)
    sentences = bigram[sentencesAll]

    print(len(sentences))  #15,786,808
    print(sentences[0])

    # Building and Training the Model
    cores = multiprocessing.cpu_count()

    # min_count was removed here; not sure how to check which words end up unused
    w2v_model = Word2Vec(window=6,
                         size=100,
                         sample=6e-5,
                         alpha=0.03,
                         min_alpha=0.0007,
                         negative=20,
                         workers=cores - 1)

    t = time()

    w2v_model.build_vocab(sentences, progress_per=10000)

    print('Time to build vocab: {} mins'.format(round((time() - t) / 60,
                                                      2)))  #6.71 mins

    t = time()

    w2v_model.train(sentences,
                    total_examples=w2v_model.corpus_count,
                    epochs=30,
                    report_delay=1)

    print('Time to train the model: {} mins'.format(round((time() - t) / 60,
                                                          2)))

    print("Sentence[0]: in embedding Model {}".format(sentences[0]))
    print("Sentence[1]: in embedding Model {}".format(sentences[1]))
    print("Similarity is: {}".format(
        w2v_model.wv.wmdistance(sentences[0], sentences[1])))

    return w2v_model
Example #7
def preprocess(segments, dct=None, bigram=None):
    processed_segments = []
    for seg in segments:
        processed_seg = []
        for word in seg:
            if word.is_space or word.is_stop or word.is_punct:
                continue
            word = word.lemma_
            word = word.lower()
            processed_seg.append(word)
        processed_segments.append(processed_seg)

    if bigram is None:
        phrases = Phrases(processed_segments, min_count=3, threshold=3)
        bigram = Phraser(phrases)

    processed_segments = bigram[processed_segments]

    if dct is None:
        dct = Dictionary(processed_segments)
    else:
        dct.add_documents(processed_segments)

    return [dct.doc2bow(line)
            for line in processed_segments], dct, processed_segments, bigram
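
A hedged usage sketch of the pattern above: fit the dictionary and bigram model on the training segments once, then pass them back in so new data is transformed consistently (train_segments and new_segments are assumed to be lists of spaCy-tokenized documents):

train_bow, dct, train_tokens, bigram = preprocess(train_segments)
new_bow, dct, new_tokens, _ = preprocess(new_segments, dct=dct, bigram=bigram)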
Example #8
def preprocess(data, ngrams=False):
	'''
		Input:
			data - List of articles/titles
			ngrams - train our own n-grams using Gensim's Phraser, or rely on pretrained phrases instead (as in the Google News Word2Vec)
		Output:
			List of tokenized words for each title/article
	'''

	#remove links
	processed_data = [re.sub(r'^https?:\/\/.*?[\r\n\s]+', '', article, flags=re.MULTILINE) for article in data]

	#remove punctuation
	#tokenize by word
	tokenizer = RegexpTokenizer(r'\w+')
	processed_data = [tokenizer.tokenize(article) for article in processed_data]

	#remove stopwords
	stop_words = stopwords.words('english')
	rm_stop = [[word for word in article if word.lower() not in stop_words] for article in processed_data]

	#incorporate bigrams and trigrams
	if ngrams:
		bigram = Phrases(rm_stop, min_count=5, threshold=10)
		trigram = Phrases(bigram[rm_stop], threshold=10)

		bigram_mod = Phraser(bigram)
		trigram_mod = Phraser(trigram)
		# apply the bigram model first, then the trigram model on top of it
		with_trigram = [trigram_mod[bigram_mod[article]] for article in rm_stop]

		return with_trigram

	return rm_stop
Example #9
    def __entrenar_trigramas__(self, set_entrenamiento):
        if self.bigramas is None:
            return
        oraciones_con_bigramas = self.bigramas[set_entrenamiento]

        trifrases = Phrases(oraciones_con_bigramas, min_count=5, threshold=1, progress_per=10000)
        self.trigramas = Phraser(trifrases)
Example #10
def train(args):
    # Output during training
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # use the text8 corpus as training data; haikus don't provide sufficient context
    training_data = api.load('text8')

    # use the phrase model to recognize bigrams like "White House" or "Climate Change"
    bigram_model = Phrases(training_data)
    # Export the trained phrase model as a Phraser: less RAM and faster processing, but no further model updates are possible.
    bigrams = Phraser(bigram_model)

    # create and train the model
    model = Word2Vec(bigrams[training_data], size=args.embedding_dim)

    word_list = list(model.wv.vocab.keys())
    vector_list = [model.wv[word] for word in word_list]

    # the basic model doesn't seem to support item assignment,
    # but WordEmbeddingsKeyedVectors does
    kv = WordEmbeddingsKeyedVectors(args.embedding_dim)
    kv.add(word_list, vector_list)

    kv.add(["<eos>", "<n>", "<unk>"], np.random.rand(3, args.embedding_dim))

    # just to be safe, clear the cache of normalized vectors,
    # as I hit an issue similar to https://github.com/RaRe-Technologies/gensim/issues/2532
    del kv.vectors_norm

    # save the new models
    bigrams.save(f"{args.model_path}/bigram.model")
    kv.save(f"{args.model_path}/word2vec.model")
Example #11
    def _n_gram(self, n=3):
        # Build the bigram and trigram models
        bigram = Phrases(self.token_list, min_count=5,
                         threshold=10)  # higher threshold fewer phrases.
        trigram = Phrases(bigram[self.token_list], threshold=10)

        # Faster way to get a sentence clubbed as a trigram/bigram
        bigram_mod = Phraser(bigram)
        trigram_mod = Phraser(trigram)

        if n == 3:
            self.token_list = [
                trigram_mod[bigram_mod[doc]] for doc in self.token_list
            ]
        if n == 2:
            self.token_list = [bigram_mod[doc] for doc in self.token_list]
Example #12
def trigrams(corpus, output_prefix):
    print("----- Trigrams -----")
    if os.path.exists(output_prefix + "_trigram_phrases"):
        trigram_phrases = Phrases.load(output_prefix + "_trigram_phrases")
        print("Loaded trigram phrases")
    else:
        bigram_phrases = Phrases(corpus, min_count=CONFIG["bigram_phrase_min_count"],
                                 threshold=CONFIG["bigram_phrase_threshold"],
                                 progress_per=CONFIG["bigram_phrase_progress_per"],
                                 delimiter=CONFIG["bigram_phrase_delimiter"])
        trigram_phrases = Phrases(bigram_phrases[corpus], min_count=CONFIG["trigram_phrase_min_count"],
                                  threshold=CONFIG["trigram_phrase_threshold"],
                                  delimiter=CONFIG["trigram_phrase_delimiter"])
        trigram_phrases.save(output_prefix + "_trigram_phrases")
    trigram_transformer = Phraser(trigram_phrases)
    dct = Dictionary(trigram_transformer[corpus])
    dct.save(output_prefix + "_dictionary_trigram")
    print("Training tf-idf from trigrams")
    bow_corpus = [dct.doc2bow(line) for line in trigram_transformer[corpus]]
    tfidf = gensim.models.TfidfModel(bow_corpus, smartirs='ntc')
    tfidf.save(output_prefix + "_tfidf_trigram")
    print("Training word2vec model with trigram")
    start_time = time()
    trigram_model = gensim.models.Word2Vec(trigram_transformer[corpus], size=CONFIG['vector_size'], window=CONFIG['window_size'],
                                   min_count=CONFIG['min_count'], workers=CONFIG['worker_count'], sg=CONFIG['sg'],
                                   negative=CONFIG['negative_size'], alpha=CONFIG['alpha'], min_alpha = CONFIG['min_alpha'],
                                   iter=CONFIG['train_epoch'])
    trigram_model.save(output_prefix + "_trigram")
    print("Time :", format_time(time() - start_time))
    return trigram_model
Example #13
def bigrams(list_of_list, occ, th):
    phrases = Phrases(list_of_list, min_count=occ, threshold=th)
    bigram = Phraser(phrases)
    for index, sentence in enumerate(list_of_list):
        list_of_list[index] = bigram[sentence]
    c = bigram.phrasegrams
    return list_of_list, c
Example #14
def gensim_w2v():
    '''
    w2v using the gensim lib
    '''
    cleaned_corpus = tokenize_corpus()

    # Phrases takes a list of lists of words as input
    phrases = Phrases(cleaned_corpus, min_count=30, progress_per=10)
    # construct the bigram Phraser from the extracted phrases
    bigram = Phraser(phrases)

    # this merges collocations such as "northern california" into "northern_california"
    sentences = bigram[cleaned_corpus]

    # count the number of cores on this machine
    n_cores = multiprocessing.cpu_count()
    w2vec = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=n_cores - 1)
    w2vec.build_vocab(sentences)  # build the vocab given the sentences
    w2vec.train(sentences,
                total_examples=w2vec.corpus_count,
                epochs=30,
                report_delay=1)  # train

    emb_matrix = w2vec.wv[w2vec.wv.vocab]  # save for visualisation, maybe?
    # the mean vector will be used for the <UNK> tokens on test data
    mean_vector = np.mean(emb_matrix, axis=0)
Example #15
	def preprocess(self):
		from nltk import word_tokenize
		print("Starting to preprocess...")
		for split in ['train','test']:
			unigrams = [word_tokenize(sentence[0]) for sentence in self.data[split].values]
			ps = PorterStemmer()
			for idx,review in enumerate(unigrams):
				stemmedSentence=[]
				for word in review:
					#stemmedSentence.append(ps.stem(word)) # stemming takes too long ...
					stemmedSentence.append(word)
				self.data[split].iloc[idx,0]=" ".join(stemmedSentence)

		bigrams = Phrases(unigrams, min_count=2)
		bigram_phraser = Phraser(bigrams)
		if self.representation == 'GloVe':
			# let X be a list of tokenized texts (i.e. list of lists of tokens)
			self.word_model = gensim.models.Word2Vec(bigram_phraser[unigrams], min_count=1)
			self.w2v = dict(zip(self.word_model.wv.index2word, self.word_model.wv.syn0))
		elif self.representation == 'fasttext':
			self.word_model = FastText(bigram_phraser[unigrams], min_count=1)
			self.w2v=dict(zip(self.word_model.wv.index2word, self.word_model.wv.syn0))


		print("Finished preprocessing.")
Example #16
 def run(self):
     data = self._load_file_data(self.filename)
     handler = EpochCallbackHandler(self.iter, self.signals.Progress,
                                    self.signals.ProgressBar)
     # sg selects the training algorithm: 0 = CBOW, 1 = skip-gram
     self.model = Word2Vec(size=self.size,
                           alpha=self.learn_rate,
                           sg=self.sg,
                           min_count=self.min_count,
                           iter=self.iter,
                           window=self.window,
                           ns_exponent=self.ns_exponent,
                           negative=self.negative,
                           workers=4,
                           callbacks=[handler])
     self.signals.ProgressBar.emit(10)
     phrases = Phrases(data,
                       min_count=self.min_count + 10,
                       progress_per=10000)
     bigram = Phraser(phrases)
     sentences = bigram[data]
     self.model.build_vocab(sentences, progress_per=10000)
     self.signals.PrintInfo.emit('Словарь Word2Vec создан.')
     self.signals.PrintInfo.emit(
         'Тренируем модель Word2Vec {0} эпох.'.format(self.iter))
     self.model.train(sentences,
                      total_examples=self.model.corpus_count,
                      epochs=self.iter,
                      report_delay=1)
     self.signals.PrintInfo.emit('Модель Word2Vec прошла обучение.')
     self.model.callbacks = ()
     self.signals.PrintInfo.emit('Расчеты закончены!')
     self.signals.Finished.emit()
     self.signals.ProgressBar.emit(100)
Example #17
def get_bigram_phraser(directory):
    if os.path.isfile(BIGRAM):
        return Phraser.load(BIGRAM)
    else:
        bigram = Phraser(Phrases(corpus(directory)))
        bigram.save(BIGRAM)
        return bigram
Example #18
def generate_sent_tokens(corpus, n_ngrams):
    punctuations = set(string.punctuation).union(set(("``", "''")))
    tokenized_corpus = []
    
    for text in corpus:
        tok_text = word_tokenize(text.lower())
        clean_text = ' '
        for word in tok_text:
            if word not in punctuations and not check_2(word):
                clean_text += word + ' '
        tokenized_sentences = list(map(list, (ngrams(clean_text.split(), n_ngrams))))
        if len(tokenized_sentences) == 0:
            tokenized_sentences = [clean_text.split()]
        tokenized_corpus.extend(tokenized_sentences)
    
    # Phrase Detection
    # Give some common terms that can be ignored in phrase detection
    # For example, 'state_of_affairs' will be detected because 'of' is provided here: 
    common_terms = ["of", "with", "without", "and", "or", "the", "a"]
    # Create the relevant phrases from the list of sentences:
    phrases = Phrases(tokenized_corpus, common_terms=common_terms)
    # The Phraser object is used from now on to transform sentences
    bigram = Phraser(phrases)
    # Applying the Phraser to transform our sentences is simply
    tokenized_corpus = list(bigram[tokenized_corpus])
    
    return tokenized_corpus
Example #19
 def _train_phraser(self, min_count, phrase_threshold, delimiter):
     print("Training collocation detector...")
     return Phraser(
         Phrases(self.line_iterator,
                 min_count=min_count,
                 threshold=phrase_threshold,
                 delimiter=delimiter))
Example #20
def get_bigram_list(full_sentence_list, stem=False):
    sentence_stream = [doc.split(" ") for doc in full_sentence_list]
    #print(sentence_stream)
    stemmer = RafiStemmer()

    bigram = Phrases(sentence_stream, min_count=2, threshold=5, delimiter=b'_')

    bigram_phraser = Phraser(bigram)

    bigram_list = []

    #print(bigram_phraser)
    for sent in sentence_stream:
        tokens_ = bigram_phraser[sent]

        for each_bigram in tokens_:
            if each_bigram.count('_') == 1:
                #print(each_bigram)
                if stem:
                    bigram_list.append(stemmer.stem_word(each_bigram))
                else:
                    bigram_list.append(each_bigram)

    bigram_count_list = []
    for each_unique_bigram in set(bigram_list):
        bigram_count_list.append(
            [each_unique_bigram,
             bigram_list.count(each_unique_bigram)])

    return bigram_count_list
Example #21
 def testSaveLoadNoCommonTerms(self):
     """ Ensure backwards compatibility with old versions of Phrases, before common_terms"""
     bigram_loaded = Phrases.load(datapath("phrases-no-common-terms.pkl"))
     self.assertEqual(bigram_loaded.common_terms, frozenset())
     # can make a phraser, cf #1751
     phraser = Phraser(bigram_loaded)  # does not raise
     phraser[["human", "interface", "survey"]]  # does not raise
Example #22
def create_dictionary_and_corpus(documents):
    bigram = Phrases(documents, min_count=20, threshold=20)
    bigram_model = Phraser(bigram)
    arr = [bigram_model[d] for d in documents]
    dic = Dictionary(arr)
    corpus = [dic.doc2bow(text) for text in arr]
    return dic, corpus
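
A hedged follow-up sketch: the returned dictionary and bag-of-words corpus plug straight into downstream gensim models, e.g. a TF-IDF model (the toy documents below are made up):

from gensim.models import TfidfModel

documents = [["deep", "learning", "models"], ["deep", "learning", "pipelines"], ["graph", "models"]]
dic, corpus = create_dictionary_and_corpus(documents)
tfidf = TfidfModel(corpus, smartirs='ntc')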
Example #23
def collocation(in_path):
    """Creates corpus considering collocations, frequent co-occuring bigrams are merged (new york -> new_york)"""
    corpus = LineSentence(in_path)
    bigram = Phraser(Phrases(corpus))
    collocation_corpus = bigram[corpus]
    for sentence in collocation_corpus:
        print(' '.join(sentence))
Example #24
def main(dump_file, corpus_file, out_file, phrase, **kwargs):
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)

    start_time = time.time()

    with open(corpus_file, 'w') as f:
        wiki_corpus = WikiCorpus(dump_file, lemmatize=False, dictionary={})
        for text in wiki_corpus.get_texts():
            f.write(' '.join(text) + '\n')

    corpus_time = time.time()
    print('Elapsed: %d seconds' % (corpus_time - start_time))

    if phrase:
        phraser = Phraser(Phrases(LineSentence(corpus_file)))
        sentences = phraser[LineSentence(corpus_file)]
    else:
        sentences = LineSentence(corpus_file)

    model = Word2Vec(sentences, sg=1, **kwargs)
    model.save(out_file)

    now = time.time()
    print('Total: %d seconds' % (now - start_time))
    print('Preprocess: %d seconds' % (corpus_time - start_time))
    print('Train: %d seconds' % (now - corpus_time))
Example #25
def bigrammer(source_file,
              outfile,
              mincount=100,
              threshold=0.99,
              scoring='npmi',
              commonfile='common_tagged.txt'):
    """
    :param source_file:
    :param outfile:
    :param mincount:
    :param threshold:
    :param scoring:
    :param commonfile:
    :return:
    """
    with open(commonfile, 'r') as f:
        common = set(word.strip() for word in f)
    data = LineSentence(source_file)
    bigram_transformer = Phrases(sentences=data,
                                 min_count=mincount,
                                 threshold=threshold,
                                 scoring=scoring,
                                 max_vocab_size=400000000,
                                 delimiter=b':::',
                                 progress_per=100000,
                                 common_terms=common)
    bigrams = Phraser(bigram_transformer)
    tempfile = smart_open(outfile, 'a')
    print('Writing bigrammed text to %s' % outfile, file=sys.stderr)
    for i in bigrams[data]:
        tempfile.write(' '.join(i) + '\n')
    tempfile.close()
    return len(bigrams.phrasegrams)
Example #26
def get_clusters(df):
    df = df.drop_duplicates().reset_index(drop=True)
    nlp = spacy.load('en')
    df['text_processed'] = df.text.apply(
        lambda x: ' '.join(word for word in simple_preprocess(x)))
    df['tokens'] = df.text_processed.apply(
        lambda x: ' '.join(token.text for token in nlp.tokenizer(x)))

    texts = [row.split() for row in df.tokens]
    bigram = Phrases(texts)
    bigram_model = Phraser(bigram)
    texts = [bigram_model[doc] for doc in texts]
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(texts)]

    doc2vec = Doc2Vec(workers=4, seed=23)
    doc2vec.build_vocab(documents)

    for epoch in tqdm(range(10)):
        doc2vec.train(documents, total_examples=doc2vec.corpus_count, epochs=1)
        doc2vec.alpha -= 0.0002
        doc2vec.min_alpha = doc2vec.alpha

    X = np.array([doc2vec.infer_vector(text) for text in texts])

    model = KMeans(n_clusters=15, n_jobs=-1)
    model.fit(X)

    return model, X, texts
Example #27
def load_vector_data(dataset_name, bgr=False):
    sentences = pd.read_csv("../cleaned/" + dataset_name + "_stems.csv", delimiter=",").astype(str).fillna("").values.tolist()
    targets = pd.read_csv("../cleaned/" + dataset_name + "_clean.csv", delimiter=",", dtype=types).astype(str)["a"].tolist()
    vector_model = FastText.load("../models/word_embeddings/" + dataset_name + "_fasttext")
    # replace placeholders (" ") and join each sample into a single string
    for index, sample in enumerate(sentences):
        sentences[index] = list(filter((" ").__ne__, sample))
    inputs = [" ".join(sentence) for sentence in sentences]

    if bgr:
        tokenized = [t.split() for t in inputs]
        phrases = Phrases(tokenized)
        bigram = Phraser(phrases)
        bigrammed = []
        # make bigrams for the inputs
        for sentence in inputs:
            bigrammed.append(bigram[sentence.split()])
        # sum the word vectors of each bigrammed sentence (zeros if it is empty)
        inputs = []
        for sent in bigrammed:
            if sent:
                inputs.append(np.sum(vector_model.wv[sent], 0).tolist())
            else:
                inputs.append(np.zeros(32))
    else:
        inputs = [vector_model.wv[sample] for sample in inputs]

    inputs = np.array(inputs)
    train_x, test_x, train_y, test_y = train_test_split(inputs, targets, test_size=0.2)
    return train_x, test_x, train_y, test_y
Example #28
def build_ngram(walks, ngram, min_count=5, threshold=10.0,
				max_vocab_size=40000000, delimiter=b'_', scoring='default'):
	"""
	Compose n-gram on the fly given tunable parameters, work for both in-memory or out-of-core computations.
	Required Parameters
	- walks: iterable list of str (iterable list of list of string, or deepwalk.walks.WalksCorpus object)
			Input random walk sequences. Can be either 'List of list of tokens'(in-memory) or 'deepwalk.walks.WalksCorpus' object(out-of-core)
	- ngram: int
			Specify the n of n-gram, e.g.: ngram=2 to compose bigrams.
	Optional Parameters
	  Refer to gensim.models.phrases.Phrases
	Return
	- (walks, ngram_phrasers): the transformed walk sequences (iterable of lists of str) and the list of trained Phraser objects
	"""

	if ngram<2:
		logger.warning("ngram must >=2! Skip building ngram.")
		return walks

	ngram_phrasers = []
	for n in range(2,ngram+1):
		logger.info("Composing "+str(n)+"-grams...")
		ngram_phrases = Phrases(walks, min_count=min_count, threshold=threshold, max_vocab_size=max_vocab_size,
								delimiter=delimiter, scoring=scoring)
		ngram_phraser = Phraser(ngram_phrases)
		walks = ngram_phraser[walks]
		ngram_phrasers.append(ngram_phraser)

	return walks, ngram_phrasers
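
A hedged usage sketch with made-up walks (node IDs as strings), composing up to trigrams; it assumes the module-level logger used above is configured:

walks = [["a", "b", "c", "a", "b", "c"], ["a", "b", "c", "d"]]
walks, phrasers = build_ngram(walks, ngram=3, min_count=1, threshold=0.1)
for walk in walks:
    print(walk)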
Example #29
def make_bigram(text_csv_file, out_file_name, out_vocab_file_name, min_count=50, threshold=10):
    df = pd.read_csv(text_csv_file, index_col=0)
    
    all_tokens = []
    
    for text in df['Text']:
        #all_tokens.append(word_tokenize(text))
        all_tokens.append(word_tokenize(text.replace(".","")))
        
    bigram = Phrases(all_tokens, min_count=min_count, threshold=threshold, delimiter=b'_')
    bigram_phraser = Phraser(bigram)
    
    new_texts = []
    for tokens in all_tokens:
        new_texts.append(' '.join(str(x) for x in bigram_phraser[tokens]))
        
    df_meta = df.loc[:, df.columns != 'Text']
    df = pd.DataFrame(data={'Text': new_texts})
    df = pd.concat([df, df_meta], axis=1)
    df.index = np.arange(len(df))
    print(df.shape)
        
    counter = countWordsOnTexts(df)
    #print(counter)
    print('Bigram Vocabulary size: ' + str(len(counter)))
    
    f = open(out_vocab_file_name, 'w')
    f.write(repr(counter))
    f.close()
        
    df.to_csv(out_file_name)
        
    return df
Example #30
def pre_process_pipeline(corpus,
                         bigram_min_count=25,
                         bigram_threshold=10,
                         infreq_threshold=25):

    # Tokenize corpus
    corpus = list(sent_to_words(corpus))

    # Remove Stop Words
    corpus = remove_stopwords(corpus)

    # Find and replace empirically modelled bigrams
    bigram = Phrases(corpus,
                     min_count=bigram_min_count,
                     threshold=bigram_threshold)
    # More efficient method to find and replace bigrams
    bigram_mod = Phraser(bigram)
    corpus = make_bigrams(corpus, bigram_mod)

    # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
    # python3 -m spacy download en
    nlp = spacy.load('en', disable=['parser', 'ner'])
    # Do lemmatization keeping only noun, adj, vb, adv
    corpus = lemmatization(corpus,
                           nlp,
                           allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    # Remove infrequent tokens
    corpus, infreq_vocab = remove_infrequent(corpus,
                                             threshold=infreq_threshold)

    # Remove Stop Words - 2nd time
    corpus = remove_stopwords(corpus)

    return corpus