Example #1
def fit(n_topics, min_occur, iterations, passes, texts, wc):
    """
    Fit an LDA model of n topics to texts.
    """
    bigrams = models.Phrases(texts, min_count=3)
    trigrams = models.Phrases(bigrams[texts], min_count=3)

    texts = [[w for w in t if (len(w)>1 and wc[w]>min_occur)] for t in texts]

    texts = [text + [x for x in bigrams[text] if '_' in x]
                  + [x for x in trigrams[text] if '_' in x]
             for text in texts]

    dictionary = corpora.Dictionary(texts)
    bows = [dictionary.doc2bow(text) for text in texts]

    lda = LdaModel(bows, id2word=dictionary, num_topics=n_topics,
                   alpha='auto', eta='auto',
                   iterations=iterations,
                   passes=passes,
                   eval_every=1)

    coherence_model_lda = CoherenceModel(model=lda,
                                     texts=texts,
                                     dictionary=dictionary,
                                     coherence='c_v')
    coherence = coherence_model_lda.get_coherence()

    return Namespace(**locals())
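A minimal usage sketch for the fit() helper above, assuming the same gensim/Namespace imports as the snippet; the toy corpus and the Counter-based word-count dict wc are made up for illustration:

from collections import Counter

# hypothetical tokenised corpus; in practice this comes from upstream preprocessing
texts = [["topic", "model", "inference", "works"],
         ["topic", "model", "coherence", "score"]]

# corpus-wide word counts, which fit() uses to drop tokens rarer than min_occur
wc = Counter(w for doc in texts for w in doc)

result = fit(n_topics=2, min_occur=1, iterations=50, passes=5, texts=texts, wc=wc)
print(result.coherence)           # c_v coherence of the fitted model
print(result.lda.show_topics())   # the trained LdaModel is also in the returned namespace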
Example #2
def text_pre_processing(csvFile, columnNumberForText):
    # import data-set
    # colNum becomes an index, which should start at 0, and columns in spreadsheets start at 1, so subtract 1 from columnNumberForText
    documents = importColumnFromCSV(fileName=csvFile, colNum=int(columnNumberForText) - 1, header=True)
    print "imported documents..."

    # phrase detection model training
    abstracts = []  # list of abstracts containing a list of words
    for line in documents:
        # tokenize abstract
        tokens = nltk.word_tokenize(remove_non_ascii(line))
        abstracts.append(tokens)

    # create bigram and trigram phrase models
    bigram = models.Phrases(abstracts)
    trigram = models.Phrases(bigram[abstracts])
    print "built bigram and trigram phrase detection models..."

    # text pre-processing tools
    stops = get_stopwords('en')  # stronger stopwords
    STOPS = list(' '.join(str(e).title() for e in stops).split())  # title-cased stopwords
    noNum = re.compile(r'[^a-zA-Z ]')  # number and punctuation remover

    # function that cleans the text
    def clean(text):
        clean_text = noNum.sub(' ', text)               # remove numbers and punctuations
        tokens = nltk.word_tokenize(clean_text)         # tokenize text
        filtered_words = [w for w in tokens if w not in stops]      # filter out lowercase stopwords
        double_filtered_words = [w for w in filtered_words if w not in STOPS]    # filter out title-cased stopwords

        trigrams = trigram[bigram[double_filtered_words]]   # apply the bigram and trigram models to the filtered words
        trigrams_str = ' '.join(str(x) for x in trigrams)   # stringify clean and filtered tokens
        return trigrams_str

    results = []  # create list for storing clean abstracts

    # figure out path for the text corpus
    rawFilePathBase = os.path.basename(csvFile)
    rawFileName = os.path.splitext(rawFilePathBase)[0]
    corpusPath = "../../data/" + rawFileName + "_textCorpus.txt"

    # write list of clean text documents to text corpus file
    with open(corpusPath, 'w') as f:
        print 'Cleaned up text corpus file has been created at ', corpusPath, ' ...'
        f.truncate()        # if file is not empty, remove everything inside the file
        for abstract in documents:
            text = clean(abstract)      # clean each abstract, one at a time
            f.write(text + '\n')        # write clean abstract to desired text corpus file
            results.append(text)        # append clean abstracts to list
    return results, corpusPath          # return a list of clean abstracts
Example #3
def build_phrases(filename='../data/bigrams'):
    """
    This script finds bi-grams in our corpus and stores the results to disk.
    """

    bigram = models.Phrases(MyText())
    bigram.save(filename)
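A hedged sketch of consuming the saved model later; Phrases.load is standard gensim, while the path simply reuses the default filename above and the sample sentence is made up:

from gensim import models

bigram = models.Phrases.load('../data/bigrams')   # model written by build_phrases()
print(bigram[["new", "york", "city"]])            # phrase model applied to one tokenised sentence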
Example #4
 def fit(self, X, y=None):
     """
     Fit the model according to the given training data.
     """
     self.gensim_model = models.Phrases(sentences=X, min_count=self.min_count, threshold=self.threshold,
         max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, progress_per=self.progress_per)
     return self
Example #5
    def fit(self, X, y=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : iterable of list of str
            Sequence of sentences to be used for training the model.

        Returns
        -------
        :class:`~gensim.sklearn_api.phrases.PhrasesTransformer`
            The trained model.

        """
        self.gensim_model = models.Phrases(
            sentences=X,
            min_count=self.min_count,
            threshold=self.threshold,
            max_vocab_size=self.max_vocab_size,
            delimiter=self.delimiter,
            progress_per=self.progress_per,
            scoring=self.scoring,
            connector_words=self.connector_words,
        )
        self.phraser = FrozenPhrases(self.gensim_model)
        return self
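Since the wrapper above just delegates to gensim's phrase classes, here is a hedged, self-contained sketch of the equivalent raw calls (gensim 4.x names such as FrozenPhrases and ENGLISH_CONNECTOR_WORDS assumed; the toy sentences and the low threshold are made up so the bigram actually forms):

from gensim.models.phrases import Phrases, FrozenPhrases, ENGLISH_CONNECTOR_WORDS

sentences = [
    ["new", "york", "is", "big"],
    ["new", "york", "city", "never", "sleeps"],
    ["i", "love", "new", "york"],
]

# train the phrase model, then freeze it for fast application (mirrors fit() above)
phrases = Phrases(sentences, min_count=1, threshold=0.1,
                  connector_words=ENGLISH_CONNECTOR_WORDS)
frozen = FrozenPhrases(phrases)
print(frozen[["i", "visited", "new", "york"]])    # "new" and "york" merged into "new_york"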
Example #6
def load_save_word2vec_model(line_words, model_filename):
    # model parameters
    feature_size = 500
    content_window = 5
    freq_min_count = 3
    # threads_num = 4
    negative = 3  # best: sample with the hierarchical softmax method (negative sampling favours common words) rather than negative sampling (which favours rare words)
    iter = 20

    print("word2vec...")
    tic = time.time()
    if os.path.isfile(model_filename):
        model = models.Word2Vec.load(model_filename)
        print(model.vocab)
        print("Loaded word2vec model")
    else:
        bigram_transformer = models.Phrases(line_words)
        model = models.Word2Vec(bigram_transformer[line_words],
                                size=feature_size,
                                window=content_window,
                                iter=iter,
                                min_count=freq_min_count,
                                negative=negative,
                                workers=multiprocessing.cpu_count())
        toc = time.time()
        print("Word2vec completed! Elapsed time is %s." % (toc - tic))
        model.save(model_filename)
        # model.save_word2vec_format(save_model2, binary=False)
        print("Word2vec Saved!")
    return model
Example #7
    def partial_fit(self, X):
        """Train model over a potentially incomplete set of sentences.

        This method can be used in two ways:
            1. On an unfitted model in which case the model is initialized and trained on `X`.
            2. On an already fitted model in which case the X sentences are **added** to the vocabulary.

        Parameters
        ----------
        X : iterable of list of str
            Sequence of sentences to be used for training the model.

        Returns
        -------
        :class:`~gensim.sklearn_api.phrases.PhrasesTransformer`
            The trained model.

        """
        if self.gensim_model is None:
            self.gensim_model = models.Phrases(
                sentences=X,
                min_count=self.min_count,
                threshold=self.threshold,
                max_vocab_size=self.max_vocab_size,
                delimiter=self.delimiter,
                progress_per=self.progress_per,
                scoring=self.scoring,
                common_terms=self.common_terms)

        self.gensim_model.add_vocab(X)
        self.phraser = Phraser(self.gensim_model)
        return self
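A hedged sketch of the two modes the docstring describes, expressed with the underlying gensim calls that partial_fit delegates to (Phrases plus add_vocab); the toy batches are made up:

from gensim.models.phrases import Phrases, Phraser

batch_1 = [["machine", "learning", "is", "fun"]] * 5
batch_2 = [["machine", "learning", "at", "scale"]] * 5

# mode 1: no model yet, so train a fresh phrase model on the first batch
phrases = Phrases(batch_1, min_count=1, threshold=0.1)

# mode 2: the model already exists, so later batches are *added* to its vocabulary
phrases.add_vocab(batch_2)

phraser = Phraser(phrases)                        # frozen model, faster to apply
print(phraser[["machine", "learning", "rocks"]])  # expect 'machine_learning'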
Example #8
    def partial_fit(self, X):
        if self.gensim_model is None:
            self.gensim_model = models.Phrases(sentences=X, min_count=self.min_count, threshold=self.threshold,
                max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, progress_per=self.progress_per)

        self.gensim_model.add_vocab(X)
        return self
Example #9
    def train_topic(
        self,
        num_topics,
        no_below=1,
        no_above=0.9,
        keep_n=None,
        keep_tokens=None,
        remove_most_freq_n=None,
        bad_tokens=None,
        model="ldamulticore",
        bigrams=True,
        **kwargs,
    ):
        """
        no_below (int|None) – Keep tokens which are contained in at least
        no_below documents.
        no_above (float|None): Keep tokens which are contained in no
        more than no_above documents (fraction of total corpus size,
        not an absolute number).
        keep_n (int|None) – Keep only the first keep_n most frequent
        tokens.
        keep_tokens (iterable of str) – Iterable of tokens that must stay in
        dictionary after filtering.
        remove_most_freq_n (int|None): Remove n most frequent tokens
        model ('ldamulticore'|'lda'|'ldamallet')
        """
        if bigrams is True:
            phrases = models.Phrases(self.tokenlists, delimiter=b" ")
            phraser = models.phrases.Phraser(phrases)
            self.tokenlists = [phraser[tl] for tl in self.tokenlists]

        dictionary = corpora.Dictionary(self.tokenlists)

        if remove_most_freq_n:
            dictionary.filter_n_most_frequent(remove_most_freq_n)
        dictionary.filter_extremes(
            no_below=no_below, no_above=no_above, keep_n=keep_n, keep_tokens=keep_tokens
        )

        if bad_tokens:
            dictionary.filter_tokens(
                bad_ids=[dictionary.token2id[tok] for tok in bad_tokens]
            )

        bows = [dictionary.doc2bow(tl) for tl in self.tokenlists]

        self.bows = bows
        self.dictionary = dictionary
        if model == "ldamulticore":
            self.model = models.LdaMulticore(
                bows, num_topics=num_topics, id2word=dictionary, **kwargs
            )
        if model == "lda":
            self.model = models.LdaModel(
                bows, num_topics=num_topics, id2word=dictionary, **kwargs
            )

        if model == "ldamallet":
            raise ValueError("mallet is not yet implemented")
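For reference, a hedged, self-contained sketch of the same pipeline the method wraps (Dictionary.filter_extremes followed by LdaMulticore) on a made-up toy corpus; the parameter values are illustrative only:

from gensim import corpora, models

tokenlists = [
    ["solar", "power", "grid", "storage"],
    ["wind", "power", "turbine", "grid"],
    ["solar", "panel", "efficiency", "power"],
]

dictionary = corpora.Dictionary(tokenlists)
# keep tokens in at least 1 document and in no more than 90% of documents
dictionary.filter_extremes(no_below=1, no_above=0.9)

bows = [dictionary.doc2bow(tl) for tl in tokenlists]
lda = models.LdaMulticore(bows, num_topics=2, id2word=dictionary, workers=1)
print(lda.print_topics())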
Example #10
    def transform(self, data):
        """Transform training data."""
        # For gensim we need to tokenize the data and filter out stopwords
        self.tokens = [clean_text(doc, stopwords_) for doc in data]

        # bigrams
        if self.bigrams:
            bigram = models.Phrases(
                self.tokens, min_count=5,
                threshold=100)  # higher threshold fewer phrases.
            bigram_mod = models.phrases.Phraser(bigram)
            self.tokens = make_bigrams(self.tokens, bigram_mod)

        # trigrams
        if self.trigrams:
            bigram = models.Phrases(self.tokens, min_count=5, threshold=100)
            bigram_mod = models.phrases.Phraser(bigram)
            trigram = models.Phrases(bigram[self.tokens], threshold=100)
            trigram_mod = models.phrases.Phraser(trigram)
            self.tokens = make_trigrams(self.tokens, bigram_mod, trigram_mod)

        # lemmatization
        if self.lemmatization:
            # Initialize spacy 'en_core_web_sm' model, keeping only tagger component (for efficiency)
            spacy_nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
            # Do lemmatization keeping only noun, adj, vb, adv
            self.tokens = do_lemmatization(
                spacy_nlp=spacy_nlp,
                texts=self.tokens,
                allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

        # Again remove stopwords after doing lemmatization
        self.tokens = [[token for token in doc if token not in stopwords_]
                       for doc in self.tokens]

        # Build a Dictionary - association word to numeric id
        self.dictionary = corpora.Dictionary(self.tokens)

        # Transform the collection of texts to a numerical form [(word_id, count), ...]
        self.corpus = [self.dictionary.doc2bow(text) for text in self.tokens]

        # tf-idf vectorizer
        if self.tf_idf:
            self._tfidf_model = models.TfidfModel(self.corpus,
                                                  id2word=self.dictionary)
            self.corpus = self._tfidf_model[self.corpus]
Example #11
def collocs(texts):
	prev=texts
	bigram = models.Phrases(texts)
	texts=map(lambda x : bigram[x],texts)
	
	if prev == texts:
		return texts
	else:
		return collocs(texts)
Example #12
    def __init__(self, pdf_file):
        self.extractor = pdf_extractor.PdfExtractor(pdf_file)

        #text scrapping from pdf
        self.raw_corpus, self.pages, self.headers = self.extractor.get_corpus_pages_headers(
        )
        # bpbpbp: pages -> docid
        # bpbpbp: headings -> headings

        # get stop words
        nltk.download("stopwords")
        stoplist = set(stopwords.words('english'))

        # process raw_corpus (ie. list of strings)
        tokenizer = RegexpTokenizer(
            r'\w[\w-]*|\d(?:\d|,\d|\.\d)*'
        )  # any word with hyphens, or a number with decimals/commas
        t_corpus = [tokenizer.tokenize(s.lower())
                    for s in self.raw_corpus]  #tokenized corpus

        # find bigrams (phrases of 2 words)
        bigram_ct = models.Phrases(t_corpus, common_terms=stoplist)

        # for all bigrams in t_corpus, combine into 1 word
        t_corpus = bigram_ct[t_corpus]

        # filter out stopwords from t_corpus
        st_corpus = [[w for w in t_txt if w not in stoplist]
                     for t_txt in t_corpus]  # stop_word_token_corpus

        # # grams
        # ct_ngrams = set((g[1], g[0]) for g in bigram_ct.export_phrases(t_corpus))
        # ct_ngrams = sorted(list(ct_ngrams))
        # print(len(ct_ngrams), "grams with common terms found")
        # # highest scores
        # print(ct_ngrams[-20:])

        # Count word frequencies
        from collections import defaultdict
        frequency = defaultdict(int)
        for text in st_corpus:
            for token in text:
                frequency[token] += 1

        # Only keep words that appear more than once
        processed_corpus = [[token for token in text if frequency[token] > 1]
                            for text in st_corpus]
        dictionary = corpora.Dictionary(processed_corpus)
        bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
        tfidf = models.TfidfModel(bow_corpus)
        self.id_to_score_corpus = tfidf[bow_corpus]  # bpbpbp: vector
        self.processed_corpus = processed_corpus  # bpbpbp:tokens
Example #13
    def bigram(self, threshold=10):
        '''
        Optional Create bigrams.
        
        '''

        #Colocation detector trained on the data
        phrases = models.Phrases(self.tokenised, threshold=threshold)

        bigram = models.phrases.Phraser(phrases)

        self.tokenised = bigram[self.tokenised]

        return (self)
Example #14
 def __init__(self, lang, tokenizer=None, load=True):
     self.lang = lang
     self.tokenizer = tokenizer or Tokenizer(lang)
     dirname = join(nlp_data, lang)
     dict_fname = join(dirname, DICTIONARY_FNAME)
     phrase_fname = join(dirname, PHRASES_FNAME)
     if load and exists(phrase_fname):
         self.phrases = gmodels.Phrases.load(phrase_fname)
     else:
         self.phrases = gmodels.Phrases()
     if load and exists(dict_fname):
         self.dictionary = corpora.Dictionary.load(dict_fname)
     else:
         self.dictionary = corpora.Dictionary()
Example #15
def make_ngram(tokenised_corpus, n_gram=2, threshold=10):
    """Extract bigrams from tokenised corpus
    Args:
        tokenised_corpus (list): List of tokenised corpus
        n_gram (int): maximum length of n-grams. Defaults to 2 (bigrams)
        threshold (int): score threshold for forming phrases (higher means fewer phrases)
    Returns:
        ngrammed_corpus (list)
    """

    tokenised = tokenised_corpus.copy()
    t = 1
    # Loops while the ngram length less / equal than our target
    while t < n_gram:
        phrases = models.Phrases(tokenised, threshold=threshold)
        bigram = models.phrases.Phraser(phrases)
        tokenised = bigram[tokenised]
        t += 1
    return list(tokenised)
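A hedged usage sketch for make_ngram; the toy corpus is made up and repeats the bigram six times so it clears gensim's default min_count of 5, while the lowered threshold lets the score test pass:

toy_corpus = [
    ["deep", "learning", "model"],
    ["deep", "learning", "pipeline"],
] * 3   # six documents, six occurrences of ("deep", "learning")

ngrammed = make_ngram(toy_corpus, n_gram=2, threshold=0.1)
print(ngrammed[0])   # expect ['deep_learning', 'model']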
Example #16
 def make_bigrams(self):
     texts = self.df[self.column_with_text]
     bigram = models.Phrases(texts, min_count=3, threshold=5)
     bigram_mod = models.phrases.Phraser(bigram)
     return [bigram_mod[doc] for doc in texts]
Example #17
def depunct(tokens):
    """Remove punctuations from the text"""
    return [token.translate(None, string.punctuation) for token in tokens]


data = pd.read_csv('data/calvin.csv')
documents = data['quote'].tolist()
stoplist = stopwords.words('english')

sent = [d.lower().split() for d in documents]
texts = [
    depunct(
        [word for word in document.lower().split() if word not in stoplist])
    for document in documents
]
bigrams_model = models.Phrases(texts)
bigrams = list(bigrams_model[texts])
trigrams_model = models.Phrases(bigrams)
trigrams = list(trigrams_model[bigrams])

sent.extend(trigrams)
sent.extend(bigrams)

model = models.Word2Vec()
model.build_vocab(sent)
model.train(sent)

chain = ['calvin', 'tiger', 'hobbes', 'mom']
pprint.pprint([
    k for k, v in model.most_similar(positive=chain, negative=[], topn=50)
    if '_' in k
])
Example #18
def train(texts, tokentype='lemma',
          allowed_pos=["NOUN", "ADJ", "VERB", "PROPN"],
          out_path=None):
    '''Run gensim phrase detection, remove empty, keep dates.
    Returns lists of tokens.
    
    Parameters
    ----------
    texts : list
        Assuming that texts have _already_ been preprocessed
        using text_to_x 
        (i.e. tokenization, lemmatization & feature selection)

    tokentype : str
        Either "token" or "lemma"

    allowed_pos : list
        uPOS that will be kept in texts.
        Use stanza tags.

    out_path : str (optional)
        path to a directory, where results will be saved
        (in a child directory).
    '''
    # convert to a nice format
    # keep only "meaningful" POS
    # (i.e. noun, propnoun, adj, verb, adverb)
    texts_filter = []
    for doc in texts:
        allowed_keys = [key for key, value in doc['upos'].items() if value in allowed_pos]
        texts_filter.append([word for key, word in doc[tokentype].items() if key in allowed_keys])

    # initialize phrase detection
    phrases = models.Phrases(texts_filter, delimiter=b" ")
    # find phrases
    phraser = models.phrases.Phraser(phrases)
    # extract texts with phrases detected
    phrase_list = [phraser[tl] for tl in texts_filter]

    # missing any data?
    assert len(texts_filter) == len(phrase_list)

    # put together IDs and documents
    phrase_doc = []
    for (i, tweet) in enumerate(phrase_list):
        d = dict()
        d['id'] = i
        d['text'] = tweet
        phrase_doc.append(d)

    # remove empty tweets
    phrase_doc = [doc for doc in phrase_doc if doc['text']]

    # if saving enabled
    if out_path:
        # check if file extension is specified
        if out_path.endswith('.ndjson'):
            pass
        # add it automatically if not
        else:
            print("Adding file extension (.ndjson)")
            out_path = out_path + '.ndjson'

        # export it
        with open(out_path, 'w') as f:
            ndjson.dump(phrase_doc, f)

    return phrase_doc
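A hedged sketch of the input shape train() appears to expect, guessed from the loop above (one dict per document with 'upos' and 'lemma'/'token' maps keyed by token position, as the text_to_x preprocessing mentioned in the docstring would produce); written against gensim 3.x, which the byte delimiter above implies:

# one hypothetical pre-processed document in the assumed format
doc = {
    "upos":  {0: "PROPN", 1: "VERB", 2: "NOUN", 3: "ADP"},
    "lemma": {0: "Denmark", 1: "reopen", 2: "school", 3: "after"},
    "token": {0: "Denmark", 1: "reopens", 2: "schools", 3: "after"},
}

# phrase detection on lemmas, keeping only NOUN/ADJ/VERB/PROPN; no file output
phrase_doc = train([doc], tokentype="lemma")
print(phrase_doc)   # [{'id': 0, 'text': ['Denmark', 'reopen', 'school']}]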
Example #19
# upload the pandas dataframe with all info on bills and their text.
bill_df = pd.load('fullbilldetails_dataframe')
documents = bill_df['full_cleaned_text']
aa = bill_df['full_cleaned_text']

for i in range(len(documents)):
    documents[i] = documents[i].replace('-', ' ')
    documents[i] = documents[i].replace('ab ', ' ')
    documents[i] = documents[i].replace('_', ' ')

# playing with bigrams
doclist = []
for doc in documents:
    doclist.append(doc.split())

bigram = models.Phrases(doclist)
trigram = models.Phrases(bigram[doclist])

stoplist = stop_word_list()
"""
  ------------------------------------------------------------
  Testing bigrams/trigrams.  Bigrams worked best.  0 trigrams added with frequency greater than 1.
  ------------------------------------------------------------
"""

#texts = [[word for word in document.lower().split() if (word not in stoplist and len(word)>2)]
#         for document in documents]

#texts = [[word for word in document if (word not in stoplist and len(word)>2)]
#         for document in doclist]
Example #20
    'ADJ', 'ADV', 'NOUN', 'PROPN', 'VERB'
]  # only the words having the parts of speech will be included in the dataset
for document in data:  # looping over the documents one by one
    text = []
    doc = nlp(document)  # return a spacy document object for processing
    for w in doc:  # looping over single tokens of the document object
        if (not w.is_stop) and (not w.is_punct) and (not w.like_num) and (
                w.pos_ in allowed_pos
        ):  # eliminating stop words, punctuations, numbers and the words whose POS is not included on the list described above using features provided by spacy
            text.append(w.lemma_)  # take only the lemma of the word
    texts.append(
        text
    )  # append the documents one by one to the 'texts' list created earlier

bigram = models.Phrases(
    texts, min_count=1, threshold=1
)  # create a bigram model class using the Phrases class of gensim package and fit it to the dataset
texts = [bigram[data] for data in texts
         ]  # transform and create the actual bigrams off the text
print(texts)

dictionary = corpora.Dictionary(
    texts
)  # create a bag of words model class using the Dictionary class of gensim
corpus = [dictionary.doc2bow(text) for text in texts
          ]  # create the bag of words corpus off the Dictionary class
print(corpus)

lda = models.ldamodel.LdaModel(
    corpus=corpus, id2word=dictionary, num_topics=20, passes=1000
)  # run the LdaModel class from gensim for topic modelling with our bag of words corpus, the dictionary object, the topic parameter and the number of runs the model has to run to finish modelling
Example #21
    return texts_out


#Remove Stopwords
stoplist = stopwords.read().splitlines() 
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in tweetList
]

tweetList = list(sent_to_words(texts))



# Build the bigram and trigram models
bigram = models.Phrases(tweetList, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = models.Phrases(bigram[tweetList], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = models.phrases.Phraser(bigram)
trigram_mod = models.phrases.Phraser(trigram)




texts = make_bigrams(texts)

nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
texts = lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
Example #22
from gensim import models, matutils
import numpy as np  # array handling
import os, warnings

model_filename = 'D:/MICCAI/MICCAI Corpora/2017/Bigram2017.model'
input_filename = 'D:/MICCAI/MICCAI Corpora/2018/Bigrams2018.txt'
newModel_filename = 'D:/MICCAI/MICCAI Corpora/2018/Bigram2018.model'

# ignore unicode warnings
# (they don't cause any problems, just ugly output from this code)
warnings.filterwarnings('ignore', '.*Unicode.*')

# load existing model from file
print('loading model...')
model = models.Word2Vec.load(model_filename)
print('- done')

new_sentences = models.word2vec.LineSentence(input_filename)
model.build_vocab(new_sentences, update=True)
#model.train(new_sentences)
bigram_transformer = models.Phrases(new_sentences)
model.train(bigram_transformer[new_sentences],
            total_examples=model.corpus_count,
            epochs=model.iter)
model.save(newModel_filename)

# bye
print('all done, thank you!')
Example #23
# Get the number of reviews based on the dataframe column size
num_patents = exampleData["PatentAbstract"].size
 
# Initialize an empty list to hold the clean reviews
clean_abstracts = []
 
# Loop over each review; create an index i that goes from 0 to the length
# of the patent list 
for i in xrange( 0, num_patents ):
    # Call our function for each one, and add the result to the list of
    patent = patent_to_words(exampleData["PatentAbstract"][i])
    array = patent.split()
    clean_abstracts.append(array)

# Identify Bigrams using gensim's Phrases function
bigram = models.Phrases(clean_abstracts)
 
final_abstracts = []
 
for i in xrange(0,num_patents):
    sent = clean_abstracts[i] 
    temp_bigram = bigram[sent]
    final_abstracts.append(temp_bigram)
 
# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(final_abstracts)
     
# convert tokenized documents into a document-term matrix (bag-of-words)
corpus = [dictionary.doc2bow(text) for text in final_abstracts]
 
#TF IDF
Example #24
def col1(texts):
	prev=texts
	bigram = models.Phrases(texts)
	texts=map(lambda x : bigram[x],texts)
	
	return texts
Example #25
    os.path.join(path_save_data, 'Recomended_Gram_Tab.csv'))
#
NotRecommended_gramtab = gram_table(
    data=documents[df["Recommended IND"].astype(int) == 1],
    gram=[1, 2, 3],
    length=20)
NotRecommended_gramtab.to_csv(
    os.path.join(path_save_data, 'NotRecomended_Gram_Tab.csv'))

##########################################################
# Creating Bigrams and Trigrams Models, higher threshold fewer phrases
##########################################################
print('Creating Bigrams Model...')
# Build the bigram models
t1 = time.time()
bigram = models.Phrases(texts, min_count=1, threshold=1)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = models.phrases.Phraser(bigram)
print('Time used: %s seconds' % (time.time() - t1))
# See example
print(bigram_mod[texts[9]])
texts_bigram = bigram_mod[texts]
#####################
# save
with open(os.path.join(path_save_data, 'Preprocessed Review Text Bigram.txt'),
          'w') as f:
    for text in texts_bigram:
        for item in text:
            f.write("%s " % item)

        f.write("\n")
Example #26
        doc = nlp((" ".join(doc)), disable=[
            'ner',
            'tagger',
            'textcat',
        ])
        texts_out.append([tok.lemma_ for tok in doc if tok.lemma_ != '-PRON-'])
    return pd.Series(texts_out)


text_preprocess(reviews_df.reviews.iloc[10:15])

# Commented out IPython magic to ensure Python compatibility.
# %time train_corpus = text_preprocess(reviews_df.reviews)

# create ngrams
ngram_phraser = models.Phrases(train_corpus, threshold=1)
ngram = models.phrases.Phraser(ngram_phraser)
# apply n-gram model to corpus
texts_1 = [ngram[token] for token in train_corpus]
# adding it to dataframe
texts_1 = [' '.join(text) for text in texts_1]
reviews_df['ngram'] = texts_1
reviews_df.head()


def createLabelsFromReviewPoints(
    df
):  #this function creates a new column which will be our classification label like low,medium high
    df['class'] = df.apply(lambda row: label_reviews(row), axis=1)
    return df
Example #27
#Tokenising
tokenizer = RegexpTokenizer(r'[a-zA-Z]{3,}')
english_stopwords = get_stopwords('en')
english_stopwords.append('reuters')
english_stopwords.append('said')
token_content = []
processed_content = []
for article in content:
    tokens = tokenizer.tokenize(article.lower())
    token_content.append(tokens)
    stopped_tokens = [i for i in tokens if i not in english_stopwords]
    processed_content.append(stopped_tokens)


# Creating a bigram model
bigram = models.Phrases(token_content, min_count=5, threshold = 100)
bigram_mod = models.phrases.Phraser(bigram)
bigram_content = [bigram_mod[i] for i in processed_content]

# lemmatisation
nlp = spacy.load('en',disable=['parser','ner'])
allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']
lemmatised_content=[]
for each_article in bigram_content:
    try:
        doc = nlp(" ".join(each_article))
        lemmatised_content.append([tokens1.lemma_ for tokens1 in doc if tokens1.pos_ in allowed_postags])
    except:
        print(each_article)

'''
Example #28
    result=''
    # for w in words:
    #     if w not in stopwords:
    #         result += w.encode('utf-8')  # +"/"+str(w.flag)+" "  # remove stopwords

    words = pseg.lcut(content)

    for word, flag in words:
        # print word.encode('utf-8')
        if (word not in stopwords and flag[0] in [u'n',u'f',u'a',u'z']):  # drop stopwords and words with other POS tags (e.g. anything that is not a noun/verb)
            result += word.encode('utf-8')  # +"/"+str(w.flag)+" "  # remove stopwords
    return result

input = []
for sentence in sentences:
    sentence = delNOTNeedWords(sentence,stopwords)
    input.append(jieba.lcut(sentence))

bigram_transformer = models.Phrases(input)
model = models.Word2Vec(bigram_transformer[input], size=feature_size, window=content_window, min_count=freq_min_count, negative=negative, iter=iter, workers=multiprocessing.cpu_count())
# print model.index2word
model.save(save_filename)
f = model.most_similar([u'奥迪'])
for k in f:
    print k[0].encode('utf-8'),k[1]



"""
    model.most_similar(positive=['woman', 'king'], negative=['man'])
"""
Example #29
    num_row = 0
    for row in data:
        doc_pos = engin.pos(row[0])
        doc_pos = doc_pos_tokenizer(doc_pos)
        doc_info = (row[0], doc_pos)
        doc_list.append(doc_info)
        doc_dic.append(doc_pos)
        num_row += 1
        print("Number of nouns in sentence {}: ".format(num_row) + str(len(doc_pos)))
    print("Total number of sentences: " + str(len(doc_dic)))
    print("head : " + "< " + row_data[0] + " >")
    print('\n' + "=" * 75)

    # n-gram model: https://wikidocs.net/21692
    bigram = models.Phrases(doc_dic, min_count=5, threshold=100)
    trigram = models.Phrases(bigram[doc_dic], threshold=100)
    bigram_mod = models.phrases.Phraser(bigram)
    trigram_mod = models.phrases.Phraser(trigram)

    doc_dic_bi_tri = []

    for i in range(0, len(doc_dic)):
        doc_dic_bi_tri.append(trigram_mod[bigram_mod[doc_dic[i]]])

    dictionary = corpora.Dictionary(doc_dic_bi_tri)
    dictionary.save('dictionary.dict')

    corpus = [
        dictionary.doc2bow(a_doc_dic_bi_tri)
        for a_doc_dic_bi_tri in doc_dic_bi_tri
Example #30
print("Reading data")

data = pd.read_csv(
    "/Users/dmitrys/Yandex.Disk.localized/top_russian_music/comments/union_superreduced_comments.csv"
)
print(data.shape)

data = data[~data.text_bow.isnull()]

print("Cleaning data")
data = data[data['comment_len'] > 40]
texts = data.text_bow.apply(literal_eval)
print(data.shape)
print("Bigrams")
# Build the bigram  models
bigram = models.Phrases(texts, min_count=3,
                        threshold=5)  # higher threshold fewer phrases.

# Faster way to get a sentence clubbed as a bigram
bigram_mod = models.phrases.Phraser(bigram)


def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]


texts = make_bigrams(texts)

print("Corpora")
dictionary = corpora.Dictionary(texts)  # build a dictionary of terms
print('Dictionary size before filtering: {}'.format(len(dictionary)))