def write_word_frequencies_to_file(tokenized_articles, language_code):
    # Count words using the gensim library
    dictionary = Dictionary(tokenized_articles)
    corpus = [dictionary.doc2bow(article) for article in tokenized_articles]

    total_word_count = defaultdict(int)
    for word_id, word_count in itertools.chain.from_iterable(corpus):
        total_word_count[word_id] += word_count

    corpus_word_count = sum(total_word_count.values())
    sorted_word_count = sorted(total_word_count.items(),
                               key=lambda w: w[1],
                               reverse=True)

    # Write info to file.
    # `output_path`, `encoding` and `top_n_words` are assumed to be module-level
    # settings defined elsewhere in the original script.
    dict_size = len(dictionary)
    cumulative = 0
    filename_freq = output_path + 'word_freq_' + language_code + '.txt'
    with open(filename_freq, 'w', encoding=encoding) as outputfile_freq:
        outputfile_freq.write('\nTotal nb of words: ' + str(dict_size) + '\n')
        for idx, (word_id, word_count) in enumerate(
                sorted_word_count[:min(top_n_words, dict_size)], start=1):
            frac = 100 * word_count / corpus_word_count
            cumulative += frac
            output_string = (str(idx) + ';' + dictionary.get(word_id) + ';' +
                             str(word_count) + ';' + str(round(frac, 2)) + ';' +
                             str(round(cumulative, 2)) + '\n')
            outputfile_freq.write(output_string)
Example #2
def keywords(corpus):
    # `preprocess` is assumed to be a tokenizing/cleaning helper defined elsewhere
    docs = [preprocess(doc) for doc in corpus]
    dictionary = Dictionary(docs)
    c = [dictionary.doc2bow(doc) for doc in docs]
    tfidf = TfidfModel(c)
    result = []
    for s in c:
        tfidf_weights = tfidf[s]
        r = []
        sorted_tfidf_weights = sorted(tfidf_weights, key=lambda w: w[1], reverse=True)
        for term_id, weight in sorted_tfidf_weights:
            r.append([dictionary.get(term_id), weight])
        result.append(r)
    return result
Example #3
File: lda.py  Project: AyaRamazanova/LDA
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-dir',
                        default='./data/test_arxiv_plain.txt',
                        help='Path to the input data file')
    parser.add_argument('--model-dir',
                        default='../model',
                        help='Path to directory where the model is stored')
    parser.add_argument('--train',
                        default=True,
                        type=lambda s: str(s).lower() in ('true', '1'),
                        help='True for train, False for test mode')
    parser.add_argument('--n_topic', default=20, type=int,
                        help='Number of topics')
    args = parser.parse_args()
    model_dir = './model/model'
    dict_dir = './model/dict.txt'

    if args.train:
        print('Reading texts')
        with open(args.data_dir) as f_in:
            texts = f_in.read().split('\n')
        del texts[-1]
        for i in tqdm(range(len(texts))):
            texts[i] = texts[i].split()

        print('Generating corpora')
        dictionary = Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        dictionary.save_as_text(dict_dir)

        print('Training model')
        lda = LdaModel(corpus, num_topics=args.n_topic)
        lda.save(model_dir)
    else:
        lda = LdaModel.load(model_dir, mmap='r')
        # load_from_text() is a static method that returns a new Dictionary
        dictionary = Dictionary.load_from_text(dict_dir)

    print('Processing results')
    topics = lda.print_topics()
    with open('./report.txt', 'w') as f_out:
        for topic_id, topic_pair in topics:
            print(topic_id, end=': ', file=f_out)
            topic_words = topic_pair.split('"')[1::2]
            topic_words = list(map(int, topic_words))
            topic_words = [dictionary.get(word) for word in topic_words]
            print(topic_words, file=f_out)
Example #4
    def predict_group(self,
                      texts_tokens: List[List[str]],
                      groups_keywords: Dict[str, List[str]] = None,
                      identifiers_number=20) -> List:
        if groups_keywords is None:
            groups_keywords = self.groups_keywords['industry']
        # Join each group's keyword list into a single string; build a new dict so
        # the caller's mapping (or self.groups_keywords) is not mutated in place
        groups_keywords = {key: " ".join(value)
                           for key, value in groups_keywords.items()}
        # Create a Dictionary from the articles: dictionary
        dictionary = Dictionary(texts_tokens)
        # Create a MmCorpus
        corpus = [dictionary.doc2bow(tt) for tt in texts_tokens]
        # Create a new TfidfModel
        tfidf = TfidfModel(corpus)
        tfidf_weights = [
            sorted(tfidf[doc], key=lambda w: w[1], reverse=True)
            for doc in corpus
        ]
        # Create an nlp object for each group's keyword string
        key_list = list(groups_keywords.keys())
        groups_keywords_docs = [self.nlp(value)
                                for value in groups_keywords.values()]
        # Predict the group for each text via spaCy similarity between the text's
        # top tf-idf terms and each group's keyword document.
        # Select the highest-weighted terms for each text and join them into a string
        texts_identifiers = [[
            dictionary.get(term_id)
            for term_id, weight in tfidf_weight[:identifiers_number]
        ] for tfidf_weight in tfidf_weights]
        texts_identifiers = [
            " ".join(text_identifiers)
            for text_identifiers in texts_identifiers
        ]
        groups = []
        for ti in texts_identifiers:
            # Create an nlp object based only on the text identifiers
            doc_publication = self.nlp(ti)
            similarities = [doc_publication.similarity(gkd)
                            for gkd in groups_keywords_docs]
            max_position = similarities.index(max(similarities))
            groups.append(key_list[max_position])
        return groups
Example #5
for text in data['News_content']:
    text_clean.append(text_preprocessing.preprocessText(text))

print(text_clean[:3])

# Create a Dictionary from the articles: dictionary
dictionary = Dictionary(text_clean)

# Select the id for "abuse": abuse_id
abuse_id = dictionary.token2id.get("abuse")

print(abuse_id)

# Use abuse_id with the dictionary to print the word
print(dictionary.get(abuse_id))

# Create a MmCorpus: corpus
corpus = [dictionary.doc2bow(article) for article in text_clean]

# Print the first 10 word ids with their frequency counts from the fifth document
print(corpus[4][:10])

# Save the fifth document: doc
doc = corpus[4]

# Sort the doc for frequency: bow_doc
bow_doc = sorted(doc, key=lambda w: w[1], reverse=True)

# Print the top 5 words of the document alongside the count
for word_id, word_count in bow_doc[:5]:
    print(dictionary.get(word_id), word_count)
Example #6
# Lemmatize all tokens into a new list
wordnet_lemmatizer = WordNetLemmatizer()
lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in stops_removed]
bow = Counter(lemmatized)
print(bow.most_common(10))


############# gensim
from gensim.corpora.dictionary import Dictionary
from nltk.tokenize import word_tokenize
tokenized = [word_tokenize(t.lower()) for t in text]
# map each token to a unique integer id
dictionary = Dictionary(tokenized)
dictionary.token2id
id_computer = dictionary.token2id.get('computer')
print(dictionary.get(id_computer))
# Create a corpus (token ID, frequency)
corpus = [dictionary.doc2bow(a) for a in tokenized]
print(corpus[4][:10])


from gensim.models.tfidfmodel import TfidfModel
tfidf = TfidfModel(corpus)
# Calculate the tfidf weights of one document, e.g. the fifth one
doc = corpus[4]
tfidf_weights = tfidf[doc]
# Sort the weights from highest to lowest
sorted_tfidf_weights = sorted(tfidf_weights, key = lambda w: w[1], reverse=True)
for term_id, weight in sorted_tfidf_weights:
    print(dictionary.get(term_id), weight)

Example #7
        # LDA
        if prior_distrib:
            lda = LdaModel(lda_corpus, num_topics=n_group, alpha=topic_distrib)
        else:
            lda = LdaModel(lda_corpus, num_topics=n_group)

        # Id doc
        algo_group_vec = []
        for id_doc in range(len(token_list_list)):
            topic_per_type = lda.get_document_topics(lda_corpus[id_doc],
                                                     per_word_topics=True)[1]
            type_list = []
            topic_list = []
            for type_topic_elem in topic_per_type:
                type_list.append(lda_voc.get(type_topic_elem[0]))
                topic_list.append(type_topic_elem[1][0])

            algo_group_vec.extend([
                topic_list[type_list.index(token)]
                for token in token_list_list[id_doc]
            ])

        nmi_vec.append(
            normalized_mutual_info_score(real_group_vec, algo_group_vec))

    # Writing results
    nmi_mean = np.mean(nmi_vec)
    nmi_std = np.std(nmi_vec)
    with open(results_file_name, "a") as output_file:
        # Assumed output format: mean and standard deviation of the NMI scores
        output_file.write(str(nmi_mean) + "," + str(nmi_std) + "\n")
Example #8
# Very basic preprocessing. Usually you would do more work here.
tokenized_corpus = [word_tokenize(doc.lower()) for doc in mycorpus]
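# A minimal sketch of what that extra work often looks like: stopword removal and
# lemmatization. The helper name `preprocess_more` and the use of NLTK's stopword
# list and WordNetLemmatizer are assumptions, not part of the original snippet.
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

_stops = set(stopwords.words('english'))
_lemmatizer = WordNetLemmatizer()

def preprocess_more(doc):
    # lowercase, tokenize, keep alphabetic tokens, drop stopwords, lemmatize
    return [_lemmatizer.lemmatize(t) for t in word_tokenize(doc.lower())
            if t.isalpha() and t not in _stops]

# e.g.: tokenized_corpus = [preprocess_more(doc) for doc in mycorpus]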

# Pass to gensim `Dictionary` class. This assigns to each token
# (e.g. word) a unique integer ID. Later on we will just work with
# those IDs instead of the tokens directly because it is
# computationally easier to handle (there is a one-to-one mapping
# between both, so we are not losing any information). The reason why
# we use a dictionary is that it gives us a list of words we are
# interested in examining further. If a word is not in the dictionary
# but occurs in a document, it will be ignored by gensim.
d = Dictionary(tokenized_corpus)
d.token2id  # Like dict(d); show mapping between tokens and their IDs.
d.token2id.get('awesome')  # What's the ID for 'awesome'?
d.get(0)  # What token has ID=0?
d.dfs  # In how many documents does each token appear? (Document frequency).

for i in d:
    print(i, d[i])
# For a single document, we can now calculate the token frequencies
# using the dictionary we just created. "Calculating token
# frequencies" means we're counting words.
#d.doc2bow(tokenized_corpus[2])
#print(d.doc2bow(tokenized_corpus[2]))
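# A quick sketch of both points above (the example document is made up for
# illustration): doc2bow counts each known token, and any token that is not in
# the dictionary is silently ignored.
example_doc = ['awesome', 'awesome', 'zzz_not_in_dictionary']
print(d.doc2bow(example_doc))
# -> [(d.token2id['awesome'], 2)] if 'awesome' is in the dictionary; the unseen
#    token contributes nothing.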

# Next, using the dictionary we just created, we build a gensim
# corpus, which is just a bag-of-words representation of the original
# corpus. This is a nested list (a list of lists), where each list
# corresponds to a document. Inside each list we have tuples in the
# form (token_ID, token_frequency). So all we are really doing here is
# counting how often each token ID appears in each document.
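# A short sketch of that step, reusing `d` and `tokenized_corpus` from above:
corpus = [d.doc2bow(doc) for doc in tokenized_corpus]
print(corpus[0][:10])  # first ten (token_ID, token_frequency) pairs of document 0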
Example #9
blob = tb(text)
print(blob.polarity)
print(blob.sentiment)

################4. Get Top 5 words #####################
from gensim.corpora.dictionary import Dictionary
words_top = [
    lmtzr.lemmatize(word.lower()) for word in word_tokenize(text)
    if word.isalpha() and word not in custom
]
dictnry = Dictionary([words_top])
corp = [dictnry.doc2bow(article) for article in [words_top]]
text_doc = corp[0]
sorted_doc = sorted(text_doc, key=lambda w: w[1], reverse=True)
for word_id, word_cnt in sorted_doc[:5]:
    print(dictnry.get(word_id), word_cnt)

#####OR#####
count_words = Counter(words_top)
top_count = count_words.most_common(5)
print(top_count)

############# Topics ##############
import gensim
Lda = gensim.models.ldamodel.LdaModel
ldamodel = Lda(corp, num_topics=3, id2word=dictnry, passes=50)
print(ldamodel.print_topics(num_topics=3, num_words=3))

############################5. #####################
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
Example #10
    print("Article " + str(i) + ": " + t[:70])
    for i, t in enumerate(articles_text)
]

# The 12 articles above are all related to business performance. For each of the articles we downloaded the HTML, extracted the text from HTML, cleaned the text, tokenized and lemmatized the tokens. Finally, we combined the result in a list variable. Next, we will create a corpus of these articles, and perform queries on the corpus.
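# A hedged sketch of that download/extract/clean/tokenize/lemmatize pipeline.
# The helper name `article_to_tokens`, the `article_urls` list, and the use of
# requests/BeautifulSoup/NLTK here are assumptions; the notebook's actual
# preprocessing code is not shown in this cell.
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

def article_to_tokens(url):
    # download the HTML, extract the visible text, then clean, tokenize and lemmatize
    html = requests.get(url).text
    text = BeautifulSoup(html, 'html.parser').get_text(separator=' ')
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(t) for t in word_tokenize(text.lower())
            if t.isalpha()]

# e.g.: articles = [article_to_tokens(url) for url in article_urls]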

# In[23]:

# Create a Dictionary from the articles
dictionary = Dictionary(articles)

# Select the id for "cost"
cost_id = dictionary.token2id.get("cost")

# Use cost_id with the dictionary to print the word
print(dictionary.get(cost_id))

# In[24]:

# Create a MmCorpus: corpus
corpus = [dictionary.doc2bow(article) for article in articles]

# Print the first 10 word ids with their frequency counts from the fourth document
print(corpus[3][:10])

# In[25]:

# Gensim bag-of-words

# Import modules
from collections import defaultdict
# This is a little different from normal: a corpus is usually a collection of documents.
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]
print(corpus)

"""
    gensim models can be easily saved, updated, and reused.
    Our dictionary can also be updated.
    This more advanced and feature rich bag-of-words can be use in future exercises.
"""


# Select the id for "computer": computer_id
computer_id = dictionary.token2id.get("movie")

# Use computer_id with the dictionary to print the word
print(dictionary.get(computer_id))

# Create a MmCorpus: corpus
corpus = [dictionary.doc2bow(article) for article in tokenized_docs]

# Print the first 10 word ids with their frequency counts from the fifth document
print(corpus[4][:10])

# Save the first document: doc
doc = corpus[0]

# Sort the doc for frequency: bow_doc
bow_doc = sorted(doc, key=lambda w: w[1], reverse=True)

# Print the top 5 words of the document alongside the count
for word_id, word_count in bow_doc[:5]:
    print(dictionary.get(word_id), word_count)
Example #12
    def news_analyze(self):
        with open('news.txt', 'r') as news_file:
            read_file = news_file.readlines()
        tokenizer = RegexpTokenizer(r'[a-zA-Z]{3,}', discard_empty=True)
        wordList = []
        for line in read_file:
            wordList.append(tokenizer.tokenize(line))

        dict = Dictionary(wordList)
        dict.filter_extremes(no_below=5, no_above=0.20)
        corpus = [dict.doc2bow(text) for text in wordList]

        max_score = 0
        for i in range(1, 25):
            model = ldamodel.LdaModel(corpus=corpus,
                                      num_topics=i,
                                      id2word=dict,
                                      iterations=50)

            coherence_c_v = CoherenceModel(model=model,
                                           texts=wordList,
                                           dictionary=dict,
                                           coherence='c_v',
                                           processes=1)
            model_score = coherence_c_v.get_coherence()
            print(model_score)
            if model_score > max_score:
                max_score = model_score
                temp_file = datapath('model')
                model.save(temp_file)
                number_of_topics = i

        model = ldamodel.LdaModel.load(temp_file)  # load() is a class method; use its return value
        print(temp_file)
        for i in range(number_of_topics):
            top_terms = []
            top_probs = []
            for term_id, prob in model.get_topic_terms(i, topn=15):
                if term_id in dict:
                    # print(dict.get(term_id))
                    top_terms.append(dict.get(term_id))
                    top_probs.append(prob)  # prob is already the term probability

            analysis_file = open(
                r'C:\Users\Melchior\PycharmProjects\ThesisV3\venv\analysis.txt',
                'a')
            print(top_terms, file=analysis_file)
            analysis_file.close()

        # Counts the number of dominant topics, given 1 article has 1 dominant topic
        topic_distributions = model.get_document_topics(bow=corpus)
        print(topic_distributions)
        dominant_topics = []
        for distribution in topic_distributions:
            print(distribution)
            if len(distribution) > 1:
                dominant_probability = 0
                for topic_id, probability in distribution:
                    if probability > dominant_probability:
                        dominant_probability = probability
                        dominant_topic = topic_id
                dominant_topics.append(dominant_topic)
            else:
                dominant_topics.append(distribution[0][0])
        print(Counter(dominant_topics))  # x articles have a dominant topic y
Example #13
# Apply tfidf model to our data
# Create a Dictionary 
dictionary = Dictionary(corpus_docs)
# Create a corpus from a bag of words
corpus = [dictionary.doc2bow(doc) for doc in corpus_docs]
# Create a defaultdict
total_word_count = defaultdict(int)
# populate the empty defaultdict with word count from the whole corpus
for word_id, word_count in itertools.chain.from_iterable(corpus):
    total_word_count[word_id] += word_count
# instantiate a tfidf model
tfidf = TfidfModel(corpus)
number_of_words_from_every_doc = 1 # you can change this to get more words out of every portion/doc
total_unique_words = {}
for i in range(len(corpus_docs)):
    # weigh a certain document against the corpus
    tfidf_weights = tfidf[corpus[i]]
    # Sort the unique words
    sorted_tfidf_weights = sorted(tfidf_weights, key=lambda w: w[1], reverse=True)
    # Print the top unique word in every portion
    for term_id, weight in sorted_tfidf_weights[:number_of_words_from_every_doc]:
        total_unique_words[dictionary.get(term_id)] = weight
"""
final result
"""
unique_words_descending_weights = sorted(total_unique_words.items(), key=lambda w: w[1], reverse=True)
unique_words = [t[0] for t in unique_words_descending_weights]
print(','.join(unique_words[0:50])) #first 50 words

Example #14
# Print the 10 most common tokens
print(bow.most_common(10))

# Creating and querying a corpus with gensim
# Import Dictionary
from gensim.corpora.dictionary import Dictionary

# Create a Dictionary from the articles: dictionary
dictionary = Dictionary(articles)

# Select the id for "computer": computer_id
computer_id = dictionary.token2id.get("computer")

# Use computer_id with the dictionary to print the word
print(dictionary.get(computer_id))

# Create a MmCorpus: corpus
corpus = [dictionary.doc2bow(article) for article in articles]

# Print the first 10 word ids with their frequency counts from the fifth document
print(corpus[4][:10])

# Gensim bag-of-words
# Save the fifth document: doc
doc = corpus[4]

# Sort the doc for frequency: bow_doc
bow_doc = sorted(doc, key=lambda w: w[1], reverse=True)

# Print the top 5 words of the document alongside the count
for word_id, word_count in bow_doc[:5]:
    print(dictionary.get(word_id), word_count)

cp = [word_tokenize(doc.lower()) for doc in cp]
cp = [[token for token in doc if token.isalnum() and len(token) > 1]
      for doc in cp]

# Pass to gensim `Dictionary` class. This assigns to each token
# (e.g. word) a unique integer ID. Later on we will just work with
# those IDs instead of the tokens directly because it is
# computationally easier to handle (there is a one-to-one mapping
# between both, so we are not losing any information). The reason why
# we use a dictionary is that it gives us a list of words we are
# interested in examining further. If a word is not in the dictionary
# but occurs in a document, it will be ignored by gensim.
d = Dictionary(cp)
d.token2id  # Like dict(d); show mapping between tokens and their IDs.
d.token2id.get('awesome')       # What's the ID for 'awesome'?
d.get(0)                        # Which token has ID=0?
d.dfs # In how many documents does each token appear? (Document frequency).

# For a single document, we can now calculate the token frequencies
# using the dictionary we just created. "Calculating token
# frequencies" means we're counting words.
d.doc2bow(cp[2])

# Next, using the dictionary we just created, we build a gensim
# corpus, which is just a bag-of-words representation of the original
# corpus. This is a nested list (a list of lists), where each list
# corresponds to a document. Inside each list we have tuples in the
# form
#
# (token_ID, token_frequency).
#
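# A short sketch completing that step for this snippet, reusing `d` and `cp`
# from above:
gensim_corpus = [d.doc2bow(doc) for doc in cp]
# Decode the first document's (token_ID, token_frequency) pairs back into words
print([(d.get(token_id), freq) for token_id, freq in gensim_corpus[0][:10]])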
Example #16
# # Select the id for "computer": computer_id
# computer_id = dictionary.token2id.get("computer")

# # Use computer_id with the dictionary to print the word
# print(dictionary.get(computer_id))

# Create a MmCorpus: corpus
corpus = [dictionary.doc2bow(article) for article in articles]

for item in corpus:
    print(item)

# Create a new TfidfModel using the corpus: tfidf
tfidf = TfidfModel(corpus)

doc = corpus[0]

# Calculate the tfidf weights of doc: tfidf_weights
tfidf_weights = tfidf[doc]

# Print the first five weights
print(tfidf_weights[:5])

# Sort the weights from highest to lowest: sorted_tfidf_weights
sorted_tfidf_weights = sorted(tfidf_weights, key=lambda w: w[1], reverse=True)

# Print the top 5 weighted words
for term_id, weight in sorted_tfidf_weights[:5]:
    print(dictionary.get(term_id), weight)
Example #17
############## Gensim Bag of words ####################################################
import itertools

#   save the second document
doc = corpus[1]
#print(doc)
#   Sort the doc for frequency: bow_doc
bow_doc = sorted(doc, key=lambda w: w[1], reverse=True)
print(bow_doc)

# print top 4 words of the document
for word_id, word_count in bow_doc[:4]:
    print(dictionary.get(word_id), word_count)

total_word_count = collections.defaultdict(int)
for word_id, word_count in itertools.chain.from_iterable(corpus):
    total_word_count[word_id] += word_count
print(total_word_count)
'''
############################ Gensim: Dictionary & Corpora ###################################################

from gensim.corpora.dictionary import Dictionary
scene_one = scene_one.lower()
Tokenization_1 = word_tokenize(scene_one)
stop_words_1 = [t for t in Tokenization_1 if t not in stopwords.words('english')]
#print(Tokenization_1)
#print(stop_words_1)
Example #18
# Import Dictionary
from gensim.corpora.dictionary import Dictionary

# Create a Dictionary from the articles: dictionary
dictionary = Dictionary(articles)

# Select the id for "computer": computer_id
computer_id = dictionary.token2id.get("computerv")

# Use computer_id with the dictionary to print the word
print(dictionary.get(computer_id))

# Create a MmCorpus: corpus
corpus = [dictionary.doc2bow(article) for article in articles]

# Print the first 10 word ids with their frequency counts from the fifth document
print(corpus[4][:10])

Example #19
myCorpus=[myStr]
print(myCorpus)
tokenized_corpus = [word_tokenize(doc.lower()) for doc in myCorpus]
"""
numOfWeight = 3  # change this to set the maximum number of weighted terms printed per sentence

tokenized_corpus = [word_tokenize(doc.lower()) for doc in corpus]
d = Dictionary(tokenized_corpus)
#print(d)
bowcorpus = [d.doc2bow(doc) for doc in tokenized_corpus]
#print(bowcorpus)
# All the above steps are standard, but now it gets interesting:
tfidf = TfidfModel(bowcorpus)  # Create new TfidfModel from BoW corpus.
#print(tfidf[bowcorpus[0]])
#tfidf_weights = tfidf[bowcorpus[0]] # Weights of first document.
for i in range(len(bowcorpus)):
    tfidf_weights = tfidf[bowcorpus[i]]
    #print(tfidf_weights)
    sorted_tfidf_weights = sorted(tfidf_weights,
                                  key=lambda x: x[1],
                                  reverse=True)
    #print(sorted_tfidf_weights)
    print("\n")
    for term_id, weight in sorted_tfidf_weights[:numOfWeight]:
        print(d.get(term_id), weight)
#tfidf_weights[:5]                   # First five weights (unordered).
# Print top five weighted words.
#sorted_tfidf_weights = sorted(tfidf_weights, key=lambda x: x[1], reverse=True)
#for term_id, weight in sorted_tfidf_weights[:5]:
#    print(d.get(term_id), weight)
Example #20
remarks = [i.split() for i in list(df['clean_text'].values)]

# Create and fit model to each statement
common_dictionary = Dictionary(remarks)
common_corpus = [common_dictionary.doc2bow(remark)
                 for remark in remarks]  # corpus to BoW
model = TfidfModel(common_corpus, id2word=common_dictionary)  # fit model

# Now get topics by briefing date
corpus_tfidf = model[common_corpus]
briefing_keywords = []
num_words = 6  # how many keywords we want to extract per date
for row, doc in enumerate(corpus_tfidf):
    d = []
    for idx, value in doc:
        word = common_dictionary.get(idx)
        score = value
        d.append((word, score))

    e = sorted(d, key=itemgetter(1))
    top = e[-num_words:][::-1]
    date = df.loc[[row]].date.values[0]
    #print(["%s, %.2f" % item for item in top])
    words = ["%s" % item[0] for item in top]
    briefing_keywords.append([date] + words)

keywords = pd.DataFrame(
    briefing_keywords,
    columns=['Date'] +
    ["Keyword " + str(word + 1) for word in range(num_words)])
keywords = keywords.set_index('Date')
Example #21
print('-----' * 8)
print(tfidf[corpus[1]], '\n')

# Test: getting the word inside a doc and its tf-idf weight
doc = corpus[1]
tfidf_weights = tfidf[doc]

# Sort the weights from highest to lowest: sorted_tfidf_weights
sorted_tfidf_weights = sorted(tfidf_weights, key=lambda w: w[1], reverse=True)

# Print the top 5 weighted words of doc
print('-----' * 8)
print('Top 5 Weighted Words for corpus[1]')
print('-----' * 8)
for term_id, weight in sorted_tfidf_weights[:5]:
    print(dictionary.get(term_id), weight)

# Get the TFIDF Weights of all terms found in corpus
#  print as list of tuples, in descending order
print('\n')

# Create a container for the list of tuples: tfidf_tuples
tfidf_tuples = []

# Loop over the cleaned articles
# Get the top-5 of tfidf weight
for i in range(len(articles_cleaned)):
    doc = corpus[i]
    tfidf_weights = tfidf[doc]
    sorted_tfidf_weights = sorted(tfidf_weights,
                                  key=lambda w: w[1],
                                  reverse=True)
    # Collect the top-5 (word, weight) pairs for this article
    # (assumed continuation of the truncated original loop)
    tfidf_tuples.append([(dictionary.get(term_id), weight)
                         for term_id, weight in sorted_tfidf_weights[:5]])
Example #22
# step 1: tokenization, remove stop words and lemmatizer:
sentences = sent_tokenize(raw_data)  # list of sentence strings
words = [
    words for words in word_tokenize(str(sentences).lower())
    if words.isalpha()
]
words_nsw = [nsw for nsw in words if nsw not in stopwords.words('spanish')]

wnlemmatizer = WordNetLemmatizer()  # class instantiation
words_nsw_lemm = [wnlemmatizer.lemmatize(w) for w in words_nsw]

# step 2: we create a dictionary and a collection (BOW = Bag-Of-Words) and a corpus:
dictionary = Dictionary([words_nsw_lemm])  # Instantiate
toboso_id = dictionary.token2id.get('toboso')
print('this is the id from a given word from the book (or any text): ',
      toboso_id)
toboso = dictionary.get(toboso_id)
print(toboso)

corpus = dictionary.doc2bow(
    words_nsw_lemm
)  # this is the collection (brutally connected with the dictionary)
print(corpus)

id714 = dictionary.get(714)
print(id714)

end_time = time.time()
print('\nExecution time: {:.2f}s'.format(end_time - start_time))
Example #23
# Create the bag-of-words: bow
bow = Counter(lemmatized)
# Print the 10 most common tokens
print(bow.most_common(10))


#_____________________________________________________________________

#Creating and querying a corpus with gensim
from gensim.corpora.dictionary import Dictionary
# Create a Dictionary from the articles: dictionary
dictionary = Dictionary(articles)
# Select the id for "computer": computer_id
computer_id = dictionary.token2id.get("computer")
# Use computer_id with the dictionary to print the word
print(dictionary.get(computer_id))
# Create a MmCorpus: corpus
corpus = [dictionary.doc2bow(article) for article in articles]
# Print the first 10 word ids with their frequency counts from the fifth document
print(corpus[4][:10])

#_____________________________________________________________________

# tf-idf: a word's weight is high when it appears frequently within a document (say 50
# times out of 100 words) but rarely in the rest of the corpus; words that appear in
# many or all documents get low weights.
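# A toy illustration of that intuition (the corpus below is made up): a token that
# occurs in every document gets an idf, and hence tf-idf weight, of 0, so gensim
# drops it from the transformed vector entirely.
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel

toy_docs = [["cat", "sat", "mat"],
            ["cat", "ate", "fish"],
            ["cat", "slept"]]          # "cat" occurs in every document
toy_dict = Dictionary(toy_docs)
toy_bow = [toy_dict.doc2bow(d) for d in toy_docs]
toy_tfidf = TfidfModel(toy_bow)
for term_id, weight in toy_tfidf[toy_bow[0]]:
    print(toy_dict.get(term_id), round(weight, 3))  # only "sat" and "mat" are printed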

from gensim.models.tfidfmodel import TfidfModel
# Create a new TfidfModel using the corpus: tfidf
tfidf = TfidfModel(corpus)
# Calculate the tfidf weights of one document (doc here is assumed to be a
# bag-of-words vector from the corpus, e.g. the fifth article)
doc = corpus[4]
tfidf_weights = tfidf[doc]
# Print the first five weights
print(tfidf_weights[:5])