def store_freqdists(self):
    """
    Build NLTK frequency distributions based on feature counts
    and store them to Redis.

    Reads the sorted sets ``positive_wordcounts`` and ``negative_wordcounts``
    through ``self.r`` and saves two pickled objects via
    ``self.pickle_store``: ``word_fd`` (counts over both labels) and
    ``label_fd`` (counts per label).

    :raises AssertionError: if either sorted set comes back empty.
    """
    # TODO: this step and the above may possibly be combined
    word_fd = FreqDist()
    label_word_freqdist = ConditionalFreqDist()

    pos_words = self.r.zrange('positive_wordcounts', 0, -1, withscores=True, desc=True)
    neg_words = self.r.zrange('negative_wordcounts', 0, -1, withscores=True, desc=True)

    assert pos_words and neg_words, 'Requires wordcounts to be stored in redis.'

    # Build a conditional freqdist with the feature counts per label.
    for label, scored_words in (('positive', pos_words), ('negative', neg_words)):
        label_fd = label_word_freqdist[label]
        for word, count in scored_words:
            if not count:
                # FreqDist.inc() silently ignored zero counts; keep that
                # behaviour so no zero-count samples are created.
                continue
            # Item assignment works on both NLTK 2 and NLTK 3, whereas
            # FreqDist.inc() no longer exists in NLTK 3.
            word_fd[word] += count
            label_fd[word] += count

    self.pickle_store('word_fd', word_fd)
    self.pickle_store('label_fd', label_word_freqdist)
def most_informative_words(corpus, categories=('dem', 'rep'), count=2500):
    """
    Return the ``count`` words with the highest summed chi-square score
    across ``categories``.

    Words are lower-cased and stripped of surrounding punctuation; tokens
    that are not purely alphabetic or that appear in the module-level
    ``stopset`` are ignored.

    :param corpus: object exposing ``words(categories=[...])``
    :param categories: category names to compare
    :param count: maximum number of words to return
    :return: set of the selected words
    """
    fd = FreqDist()
    cond_fd = ConditionalFreqDist()
    word_counts = {}

    for cat in categories:
        for word in corpus.words(categories=[cat]):
            word = word.lower().strip(".!?:,/ ")
            if not word.isalpha() or word in stopset:
                continue
            fd[word] += 1
            cond_fd[cat][word] += 1
        word_counts[cat] = cond_fd[cat].N()

    total_word_count = sum(word_counts.values())

    # A word's score is the sum of its chi-square score against each category.
    word_scores = collections.defaultdict(int)
    for word, freq in fd.items():
        for cat in categories:
            word_scores[word] += BigramAssocMeasures.chi_sq(
                cond_fd[cat][word], (freq, word_counts[cat]), total_word_count)

    informative_words = sorted(word_scores.items(),
                               key=lambda pair: pair[1],
                               reverse=True)[:count]
    return set(w for w, s in informative_words)
def text_to_vector(docs, metric):
    """
    Create frequency based feature-vector from text

    Metric must be either :attr:`FrequencyMetrics.TF` or
    :attr:`FrequencyMetrics.TF_IDF`.

    :param docs: sequence of raw documents (``len(docs)`` is used)
    :param metric: which weighting to apply
    :return: ``numpy`` array of shape (number of distinct tokens, number of docs)
    :raises ValueError: if ``metric`` is not one of the two supported values
    """
    # Validate up front so a bad metric is reported as a ValueError (the
    # original error path referenced an undefined name and raised NameError).
    if metric not in (FrequencyMetrics.TF, FrequencyMetrics.TF_IDF):
        raise ValueError("No such feature type: %s" % metric)

    doc_freqs = FreqDist()  # Distribution over how many documents each word appears in.
    tf_dists = []           # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        fd = FreqDist(preprocess.preprocess_text(doc))
        # Each distinct token of the document counts once towards its
        # document frequency.
        doc_freqs.update(list(fd.keys()))
        tf_dists.append(fd)

    all_tokens = list(doc_freqs.keys())
    num_docs = len(docs)
    num_features = len(all_tokens)

    # The IDF weight depends only on the token, so compute it once.
    if metric == FrequencyMetrics.TF_IDF:
        idf = [math.log(float(num_docs) / doc_freqs[word]) for word in all_tokens]
    else:
        idf = None

    # Build feature x document matrix
    matrix = np.zeros((num_features, num_docs))
    for i, fd in enumerate(tf_dists):
        v = [fd.freq(word) for word in all_tokens]
        if idf is not None:
            v = [tf * weight for tf, weight in zip(v, idf)]
        matrix[:, i] = v

    return matrix
def create_word_scores(posWords, negWords, posTag, negTag):
    """
    Score every word by how informative it is for the two labels.

    :param posWords: iterable of token lists for the positive texts
    :param negWords: iterable of token lists for the negative texts
    :param posTag: condition name used for the positive counts
    :param negTag: condition name used for the negative counts
    :return: dict mapping each word to the sum of its chi-square scores
        against the positive and the negative label
    """
    from nltk.probability import FreqDist, ConditionalFreqDist
    import itertools

    # Flatten the nested token lists into one flat list per label.
    posWords = list(itertools.chain.from_iterable(posWords))
    negWords = list(itertools.chain.from_iterable(negWords))

    word_fd = FreqDist()                  # frequency of every word overall
    cond_word_fd = ConditionalFreqDist()  # frequency of every word per label
    for tag, words in ((posTag, posWords), (negTag, negWords)):
        for word in words:
            word_fd[word] += 1
            cond_word_fd[tag][word] += 1

    pos_word_count = cond_word_fd[posTag].N()  # number of positive tokens
    neg_word_count = cond_word_fd[negTag].N()  # number of negative tokens
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # .items() works on Python 2 and 3; .iteritems() exists only on Python 2.
    for word, freq in word_fd.items():
        # Chi-square statistic per label; other association measures (for
        # example mutual information) could be substituted here.
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd[posTag][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd[negTag][word], (freq, neg_word_count), total_word_count)
        # A word's information value is the sum of both statistics.
        word_scores[word] = pos_score + neg_score

    return word_scores
def create_words_bigrams_scores():
    """
    Score single words and top bigrams from the positive and negative
    review spreadsheets.

    :return: dict mapping each word or bigram to the sum of its chi-square
        scores against the ``'pos'`` and ``'neg'`` labels
    """
    posdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/neg_review.xlsx", 1, 1)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # One finder per corpus. Previously a single variable was assigned twice,
    # so the "positive" bigrams were actually taken from the negative words.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for label, features in (('pos', pos), ('neg', neg)):
        for word in features:
            word_fd[word] += 1
            cond_word_fd[label][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """
    Collect the words whose score for at least one label reaches ``min_score``.

    :param labelled_words: iterable of ``(label, words)`` pairs
    :param score_fn: association measure called as
        ``score_fn(n_ii, (n_ix, n_xi), n_xx)``
    :param min_score: minimum score a word needs for some label
    :return: set of qualifying words
    """
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for label, words in labelled_words:
        for word in words:
            word_fd[word] += 1
            label_word_fd[label][word] += 1

    n_xx = label_word_fd.N()  # total number of counted tokens
    high_info_words = set()

    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()  # tokens seen under this label
        for word, n_ii in label_word_fd[label].items():
            n_ix = word_fd[word]  # occurrences of the word over all labels
            if score_fn(n_ii, (n_ix, n_xi), n_xx) >= min_score:
                high_info_words.add(word)

    return high_info_words
def summarize(self, input, num_sentences):
    """
    Pick up to ``num_sentences`` sentences that contain the most frequent
    non-stopword terms and return them, lower-cased, in document order.

    :param input: list of sentence strings (assumed already sentence-split;
        the sent_tokenize step is disabled in this code)
    :param num_sentences: maximum number of sentences to return
    :return: list of lower-cased sentences
    """
    punt_list = ['.', ',', '!', '?']
    sentences = input
    lowercase_sentences = [sentence.lower() for sentence in sentences]

    # Join with a space so words at sentence boundaries do not fuse, then
    # drop punctuation characters before splitting into words.
    joined = ' '.join(sentences)
    stripped = ''.join(ch for ch in joined if ch not in punt_list)

    # Build the stopword set once instead of reloading the list per word.
    stop_words = set(stopwords.words())
    words = [w for w in (t.lower() for t in stripped.split())
             if w not in stop_words]

    word_frequencies = FreqDist(words)
    # Sort explicitly: FreqDist only iterates in frequency order on NLTK 2.
    ranked = sorted(word_frequencies.items(), key=lambda pair: pair[1], reverse=True)
    most_frequent_words = [word for word, _ in ranked[:100]]

    num_sentences = min(num_sentences, len(sentences))

    # Add sentences containing the most frequent words until the quota is met.
    selected = []         # indices into lowercase_sentences
    chosen_texts = set()  # sentence texts already selected
    for word in most_frequent_words:
        if len(selected) >= num_sentences:
            break
        for i, sentence in enumerate(lowercase_sentences):
            if len(selected) >= num_sentences:
                break
            if sentence not in chosen_texts and word in sentence:
                selected.append(i)
                chosen_texts.add(sentence)

    # Restore document order using the sentence positions themselves.
    selected.sort()
    return [lowercase_sentences[i] for i in selected]
def get_top_words(directory, n, file):
    """
    Return the ``n`` highest TF-IDF terms of one file in a directory.

    Term frequency is normalised by the most frequent term of each file;
    the inverse document frequency is ``log(num_docs / docs_containing_term)``.

    :param directory: directory whose entries form the document collection
    :param n: number of terms to return
    :param file: key of the target document, in the form
        ``directory + '/' + filename``
    :return: list of terms, highest score first
    """
    tf_by_file = {}
    for f in os.listdir(directory):
        path = directory + '/' + f
        fdist = FreqDist(load_file_tokens(path))
        if fdist:
            peak = float(max(fdist.values()))
            tf_by_file[path] = dict((term, count / peak) for term, count in fdist.items())
        else:
            # An empty file has no terms; max() of nothing would raise.
            tf_by_file[path] = {}

    num_docs = float(len(tf_by_file))

    result = {}
    for key, tf in tf_by_file[file].items():
        num_appear = sum(1 for other in tf_by_file.values() if key in other)
        result[key] = tf * math.log(num_docs / num_appear)

    ranked = sorted(result.items(), key=operator.itemgetter(1), reverse=True)
    return [term for term, _ in ranked[:n]]
def train_supervised(self, labelled_sequences, **kwargs):
    """
    Supervised training maximising the joint probability of the symbol and
    state sequences. This is done via collecting frequencies of
    transitions between states, symbol observations while within each
    state and which states start a sentence. These frequency distributions
    are then normalised into probability estimates, which can be
    smoothed if desired.

    :return: the trained model
    :rtype: HiddenMarkovModelTagger
    :param labelled_sequences: the training data, a set of
        labelled sequences of observations
    :type labelled_sequences: list
    :param kwargs: may include an 'estimator' parameter, a function taking
        a FreqDist and a number of bins and returning a CProbDistI;
        otherwise a MLE estimate is used
    """
    # default to the MLE estimate
    estimator = kwargs.get('estimator')
    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    # count occurrences of starting states, transitions out of each state
    # and output symbols observed in each state
    known_symbols = set(self._symbols)
    known_states = set(self._states)

    starting = FreqDist()
    transitions = ConditionalFreqDist()
    outputs = ConditionalFreqDist()
    for sequence in labelled_sequences:
        lasts = None
        for token in sequence:
            state = token[_TAG]
            symbol = token[_TEXT]
            # Item assignment replaces FreqDist.inc(), which NLTK 3 removed.
            if lasts is None:
                starting[state] += 1
            else:
                transitions[lasts][state] += 1
            outputs[state][symbol] += 1
            lasts = state

            # update the state and symbol lists
            if state not in known_states:
                self._states.append(state)
                known_states.add(state)

            if symbol not in known_symbols:
                self._symbols.append(symbol)
                known_symbols.add(symbol)

    # create probability distributions (with smoothing)
    N = len(self._states)
    pi = estimator(starting, N)
    A = ConditionalProbDist(transitions, estimator, N)
    B = ConditionalProbDist(outputs, estimator, len(self._symbols))

    return HiddenMarkovModelTagger(self._symbols, self._states, A, B, pi)
class VocabBuilder:
    """
    Creates a vocabulary after scanning a corpus.
    """

    def __init__(self, lang="english", min_length=3, cut_first=100):
        """
        Set the minimum length of words and which stopword list (by
        language) to use.

        :param lang: language name passed to ``stopwords.words``
        :param min_length: shortest word length that is counted
        :param cut_first: number of top-ranked words ``vocab`` skips when
            enough words have been seen
        """
        self._counts = FreqDist()
        self._stop = set(stopwords.words(lang))
        self._min_length = min_length
        self._cut_first = cut_first

        print("Using stopwords: %s ... " % " ".join(list(self._stop)[:10]))

    def scan(self, words):
        """
        Add a list of words as observed.

        Words are counted lower-cased; stopwords and words shorter than the
        minimum length are skipped.
        """
        for word in words:
            lowered = word.lower()
            if lowered not in self._stop and len(word) >= self._min_length:
                self._counts[lowered] += 1

    def vocab(self, size=5000):
        """
        Return a list of the top words sorted by frequency.

        When more than ``cut_first + size`` distinct words were seen, the
        ``cut_first`` most frequent ones are skipped.
        """
        # Sort explicitly: only NLTK 2 returned keys() in frequency order.
        ranked = [word for word, _ in
                  sorted(self._counts.items(), key=lambda pair: pair[1], reverse=True)]
        if len(ranked) > self._cut_first + size:
            return ranked[self._cut_first:(size + self._cut_first)]
        return ranked[:size]
def make_summary(text, max_sentences=5, top_words=60):
    """
    Build an extractive summary from the sentences that mention the most
    frequent word stems.

    :param text: raw text to summarise
    :param max_sentences: maximum number of sentences to keep
    :param top_words: number of most frequent stems considered
    :return: whatever ``reorder(selected_sentences, text)`` returns
    """
    sentences = sent_tokenize(text)

    # Build the stopword set once; filter into a new list instead of removing
    # from the token list while iterating it, which skipped tokens.
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    stemmed = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if lowered not in stop_words:
            stemmed.append(stemmer.stem(lowered))

    word_freq = FreqDist(stemmed)
    # Sort explicitly: FreqDist only iterates in frequency order on NLTK 2.
    ranked = sorted(word_freq.items(), key=lambda pair: pair[1], reverse=True)
    most_freq_words = [word for word, _ in ranked[:top_words]]

    working_sent = [sentence.lower() for sentence in sentences]
    out_sent = []
    for word in most_freq_words:
        if len(out_sent) >= max_sentences:
            break
        # Take the first not-yet-selected sentence mentioning this stem.
        for original, lowered in zip(sentences, working_sent):
            if word in lowered and original not in out_sent:
                out_sent.append(original)
                break

    return reorder(out_sent, text)
def similar(self, word, num=20):
    """
    Distributional similarity: find other words which appear in the
    same contexts as the specified word; list most similar words first.

    :param word: The word used to seed the similarity search
    :type word: str
    :param num: The number of words to generate (default=20)
    :type num: int
    :seealso: ContextIndex.similar_words()
    """
    if '_word_context_index' not in self.__dict__:
        print('Building word-context index...')
        self._word_context_index = ContextIndex(self.tokens,
                                                filter=lambda x: x.isalpha(),
                                                key=lambda s: s.lower())

    word = word.lower()
    wci = self._word_context_index._word_to_contexts
    if word not in wci.conditions():
        print("No matches")
        return

    contexts = set(wci[word])
    # Count, for every other word, how many of its contexts are shared.
    fd = FreqDist(w for w in wci.conditions() for c in wci[w]
                  if c in contexts and not w == word)
    # Rank explicitly so "most similar first" does not depend on keys()
    # being frequency-sorted, which holds only for NLTK 2.
    ranked = sorted(fd.items(), key=lambda pair: pair[1], reverse=True)
    words = [w for w, _ in ranked[:num]]
    print(tokenwrap(words))
def most_frequent_words(path, top):
    """
    Collect the ``top`` most frequent bigrams of every writer folder under
    ``./<path>`` and write their union to ``./Features/Bigrams.csv``.

    Entries of the root folder whose name contains a dot are skipped.

    :param path: folder, relative to the current directory, holding one
        sub-folder per writer
    :param top: number of bigrams taken per writer
    """
    root_path = "./" + path
    word_set = set()

    for writer in os.listdir(root_path):
        if "." in writer:
            continue
        inside_folder = root_path + "//" + writer

        # Gather the formatted text of all the writer's files.
        parts = []
        for name in os.listdir(inside_folder):
            file_path = root_path + "//" + writer + "//" + name
            with open(file_path, "r", encoding="utf8") as handle:
                parts.append(" " + formatText(handle.read()))

        words = get_bigrams("".join(parts))
        fdist = FreqDist(w for w in words
                         if len(w) > 1 and isEnglish(w) == False and w != "``")
        word_set.update(key[0] for key in fdist.most_common(top))

    print(word_set)

    # Sorted so the output file is reproducible between runs.
    with open("./Features/Bigrams.csv", "w", encoding="utf8") as out:
        out.writelines(word + "\n" for word in sorted(word_set))
def mostCommWords(self, tag, pos_tag_pattern):
    """
    This is a help method for mostCommNouns and mostCommVerbs.

    Argument:
    tag -- the hashtag whose most commonly co-occurring words are wanted
    pos_tag_pattern -- regular expression matched against each POS tag

    return: a list of at most 20 words whose POS tag matches the pattern,
    most frequent first, taken from the lines that contain the hashtag
    """
    words = {}
    for line in self.lines:
        # Only lines mentioning the hashtag contribute.
        if not any(t == tag for t in self.tokenizer(line, hashtag_pattern)):
            continue
        tokens = self.tokenizer(line, word_pattern)
        for token, pos in nltk.pos_tag(tokens):
            if re.match(pos_tag_pattern, pos):
                words[token] = words.get(token, 0) + 1

    words_sorted_by_counts = sorted(words.items(), key=lambda x: x[1], reverse=True)
    # Slicing tolerates fewer than 20 distinct words; indexing 0..19 did not.
    return [word for word, _ in words_sorted_by_counts[:20]]
def train_MLT(self, tagged_train_data, untagged_training_data):
    """
    Builds the count table for a most-likely-tag tagger.

    :param tagged_train_data: iterable of ``(word, tag)`` pairs
    :param untagged_training_data: iterable of sentences, each an iterable
        of words; every such word gets an entry even if it was never tagged
    :return: dict of the form
        ``{word1: {(word1, tag1): count1, (word1, tag2): count2, ...}, ...}``
    """
    # Every word of the untagged data starts with an empty tag table.
    mlt_dict = {}
    for sent in untagged_training_data:
        for word in sent:
            mlt_dict.setdefault(word, {})

    # Count each (word, tag) pair; setdefault also covers tagged words that
    # never occur in the untagged data, which previously raised KeyError.
    tagged_words_fdist = FreqDist(tagged_train_data)
    for tagged_word, count in tagged_words_fdist.items():
        mlt_dict.setdefault(tagged_word[0], {})[tagged_word] = count

    print("Training is done!")
    return mlt_dict
def fun10():
    """frequency distribution

    Plot the cumulative counts of the 50 most frequent samples of ``text1``.
    """
    fdist1 = FreqDist(text1)
    fdist1.plot(50, cumulative=True)
def GetHighInformationWordsChi(num_bestwords):
    """
    Return the ``num_bestwords`` movie-review words with the highest summed
    chi-square score over the ``'pos'`` and ``'neg'`` categories.

    :param num_bestwords: number of words to keep
    :return: set of the selected (lower-cased) words
    """
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for label in ('pos', 'neg'):
        for word in movie_reviews.words(categories=[label]):
            token = word.lower()
            word_fd[token] += 1
            label_word_fd[label][token] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    # A plain key function replaces the tuple-unpacking lambda, which is a
    # syntax error on Python 3.
    best = sorted(word_scores.items(), key=lambda pair: pair[1], reverse=True)[:num_bestwords]
    bestwords = set(w for w, s in best)
    return bestwords
def ngram4All():
    """
    Write a 4-gram frequency table for every data folder of the current
    directory.

    For each entry of ``./`` whose name has no dot, reads ``data.doc`` from
    it and writes ``<folder>[Ngram_4_Freq].csv`` (one ``ngram,count`` row per
    line, most frequent first) below ``./#Ngram_4[.]/<folder>``.
    """
    to_save_folder = "./#Ngram_4[.]/"
    for folder in os.listdir("./"):
        if "." in folder:
            continue
        data_path = "./" + folder + "/" + "data.doc"
        # Close the input as soon as it is read, even if reading fails.
        with open(data_path, "r", encoding="utf8") as fw:
            text = fw.read()

        valid_word = [w for w in word_tokenize(text) if len(w) > 1 and w != "``"]
        nlist4 = [" ".join(valid_word[i:i + 4]) for i in range(len(valid_word) - 3)]

        fdist = FreqDist(nlist4)
        rows = [str(gram) + "," + str(count) + "\n"
                for gram, count in fdist.most_common()]

        make_sure_path_exists(to_save_folder + folder)
        out_path = to_save_folder + folder + "/" + folder + "[Ngram_4_Freq].csv"
        with open(out_path, "w+", encoding="utf8") as writer:
            writer.write("".join(rows))
def get_frequency(data_file, all_vocab):
    """
    Write the frequencies of in-vocabulary words to
    ``../Training/vocab_freq.txt``.

    Only the 4000 most common tokens of ``data_file`` are considered, and
    the tokens ``+`` and ``-`` are always skipped.

    :param data_file: path of the text file to analyse
    :param all_vocab: container of the words allowed in the output
    :return: always 1
    """
    # Context managers close both files even when an error occurs; the
    # input file used to stay open.
    with open(data_file, "r") as input_file:
        input_file_contents = input_file.read()

    words = nltk.tokenize.word_tokenize(input_file_contents, 'english')
    fdist = FreqDist(words)
    print(fdist)

    with open("../Training/vocab_freq.txt", "w") as output_file:
        for word, frequency in fdist.most_common(4000):
            if word in all_vocab and word != '+' and word != '-':
                output_file.write(word + " : " + str(frequency) + "\n")
    return 1

#data = "data.txt"
#stop_words = "stopwords.txt"
#accuracy= multinomial_naive_bayes_unigram(data, data, stop_words)
#print(accuracy)
#print("Separating Done!!")
def BigramAll():
    """
    Write a bigram frequency table for every data folder of the current
    directory.

    For each entry of ``./`` whose name has no dot, reads ``data.doc`` from
    it and writes ``<folder>[bigram_Freq].csv`` (one ``bigram,count`` row per
    line, most frequent first) below ``./#Bigram[.]/<folder>``.
    """
    to_save_folder = "./#Bigram[.]/"
    for folder in os.listdir("./"):
        if "." in folder:
            continue
        data_path = "./" + folder + "/" + "data.doc"
        # Close the input as soon as it is read, even if reading fails.
        with open(data_path, "r", encoding="utf8") as fw:
            text = fw.read()

        tokens = [w for w in word_tokenize(text) if len(w) > 1 and w != "``"]
        myBig = [first + " " + second for first, second in bigrams(tokens)]

        fdist = FreqDist(myBig)
        rows = [str(gram).strip() + "," + str(count).strip() + "\n"
                for gram, count in fdist.most_common()]

        make_sure_path_exists(to_save_folder + folder)
        out_path = to_save_folder + folder + "/" + folder + "[bigram_Freq].csv"
        with open(out_path, "w+", encoding="utf8") as writer:
            writer.write("".join(rows))
def trigramAll():
    """
    Write a trigram frequency table for every data folder of the current
    directory.

    For each entry of ``./`` whose name has no dot, reads ``data.doc`` from
    it and writes ``<folder>[Triram_Freq].csv`` (one ``trigram,count`` row
    per line, most frequent first) below ``./#Trigram[.]/<folder>``.
    """
    to_save_folder = "./#Trigram[.]/"
    for folder in os.listdir("./"):
        if "." in folder:
            continue
        data_path = "./" + folder + "/" + "data.doc"
        # Close the input as soon as it is read, even if reading fails.
        with open(data_path, "r", encoding="utf8") as fw:
            text = fw.read()

        valid_word = [w for w in word_tokenize(text) if len(w) > 1 and w != "``"]
        tri_list = [" ".join(valid_word[i:i + 3]) for i in range(len(valid_word) - 2)]

        fdist = FreqDist(tri_list)
        rows = [str(gram).strip() + "," + str(count).strip() + "\n"
                for gram, count in fdist.most_common()]

        make_sure_path_exists(to_save_folder + folder)
        # NOTE(review): "Triram" looks like a typo for "Trigram"; the name is
        # kept because other code may read this exact file name.
        out_path = to_save_folder + folder + "/" + folder + "[Triram_Freq].csv"
        with open(out_path, "w+", encoding="utf8") as writer:
            writer.write("".join(rows))
def buildCategoryDictionary(category):
    """
    Count the tokens of every tweet fetched for ``category``, save the
    counts to ``category + categoryDictFilePath`` and return them.

    :param category: class name passed to ``twitter_fetch.get_tweets_text``
    :return: FreqDist of token counts over all fetched tweets
    """
    token_counts = FreqDist()
    for tweet_text in twitter_fetch.get_tweets_text(classn=category):
        for token in tokenizeTweet(tweet_text):
            token_counts[token] += 1

    saveDictionaryToFile(token_counts, category + categoryDictFilePath)
    return token_counts
def classify(self, feats):
    """
    Return the label that most of the wrapped classifiers vote for.

    :param feats: feature set handed to every classifier in
        ``self._classifiers``
    :return: the label with the highest vote count
    """
    # Counting through the constructor avoids FreqDist.inc(), which no
    # longer exists in NLTK 3.
    counts = FreqDist(classifier.classify(feats) for classifier in self._classifiers)
    return counts.max()
def word_tag_model(words, tagged_words, limit=200):
    """
    Map each of the ``limit`` most frequent words to its most frequent tag.

    :param words: iterable of words used to rank words by frequency
    :param tagged_words: iterable of ``(word, tag)`` pairs
    :param limit: number of most frequent words to include
    :return: dict from word to its most common tag
    """
    word_counts = FreqDist(words)
    tags_by_word = ConditionalFreqDist(tagged_words)

    model = {}
    for word, _ in word_counts.most_common(limit):
        model[word] = tags_by_word[word].max()
    return model
def get_term_freq_dict(data):
    """
    Count the tokens of ``data`` after lower-casing and lemmatising them.

    Tokens that share a lemma have their counts added together.

    :param data: raw text
    :return: dict from lemma to total count
    """
    token_counts = FreqDist(word_tokenize(data.lower()))

    lemma_counts = {}
    for token in token_counts.keys():
        lemma = wordnet.lemmatize(token)
        count = token_counts.get(token)
        if lemma in lemma_counts:
            # Lemma already seen: accumulate.
            lemma_counts[lemma] = lemma_counts[lemma] + count
        else:
            lemma_counts[lemma] = count
    return lemma_counts
def create_word_bigram_scores(posWords, negWords):
    """
    Score single words plus the top 2000 bigrams of each label.

    :param posWords: list of tokens from the positive texts
    :param negWords: list of tokens from the negative texts
    :return: dict mapping ``str(feature)`` to the sum of its chi-square
        scores against the ``'pos'`` and ``'neg'`` labels
    """
    # One finder per corpus. Previously a single variable was assigned twice,
    # so the "positive" bigrams were actually taken from the negative words.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)

    pos = posWords + posBigrams  # single words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for label, features in (('pos', pos), ('neg', neg)):
        for word in features:
            key = str(word)
            word_fd[key] += 1
            cond_word_fd[label][key] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def choose_tag(self, tokens, index, history):
    """
    Return the tag that most of the wrapped taggers choose for a token.

    The three arguments are passed unchanged to ``choose_tag`` of every
    tagger in ``self._taggers``.

    :return: the tag with the highest vote count
    """
    # Counting through the constructor avoids FreqDist.inc(), which no
    # longer exists in NLTK 3.
    tags = FreqDist(tagger.choose_tag(tokens, index, history) for tagger in self._taggers)
    return tags.max()
def scores(self, docId):
    """
    Return the score from the given document to every other document in
    the index. Documents not listed are assumed to have no similarity
    detected by shared terms.

    :param docId: ID of doc to compare other docs to.
    :returns: A FreqDist mapping document ID to similarity score. Larger
        scores are better.
    """
    if not self._idf:
        self._computeIdfs()

    # Track the scores
    docScores = FreqDist()

    for termid, freq in self.termFrequencies[docId].items():
        # Find the frequency with which this term appears in other documents.
        inverseDocumentFrequency = self._idf[termid]
        for otherDocId in self.termsToDocuments[termid]:
            if otherDocId == docId:
                # Skip this document
                continue

            # Term frequency of the term in the *other* document. This used
            # to read docId's own frequency, so freq was squared instead.
            otherFreq = self.termFrequencies[otherDocId][termid]

            # Score proportional to product of frequencies times the inverse
            # of the document frequency.
            contribution = freq * otherFreq * inverseDocumentFrequency
            if contribution:
                # Zero contributions are skipped, as FreqDist.inc() did.
                docScores[otherDocId] += contribution

    return docScores
def create_word_scores(posWords, negWords):
    """
    Score every word by its chi-square association with the two labels and
    write the scores, highest first, to ``cn_sample_data/scores.txt``.

    :param posWords: iterable of tokens from the positive texts
    :param negWords: iterable of tokens from the negative texts
    :return: dict mapping ``str(word)`` to its summed chi-square score
    """
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for label, words in (('pos', posWords), ('neg', negWords)):
        for word in words:
            key = str(word)
            word_fd[key] += 1
            cond_word_fd[label][key] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][str(word)], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][str(word)], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    # The sorted result used to be thrown away, so the file came out in
    # dictionary order; write it in descending score order as intended.
    ranked = sorted(word_scores.items(), key=lambda pair: pair[1], reverse=True)
    with open("cn_sample_data/scores.txt", "w") as file_scores:
        for key, score in ranked:
            file_scores.write(str(key) + " : " + str(score) + "\n")

    return word_scores
def train(labeled_featuresets, estimator=ELEProbDist):
    """
    Train a naive Bayes classifier from labelled feature sets.

    :param labeled_featuresets: iterable of ``(featureset, label)`` pairs,
        where each featureset maps feature names to values
    :param estimator: callable turning a FreqDist into a probability
        distribution; called as ``estimator(freqdist)`` for labels and
        ``estimator(freqdist, bins=n)`` for features
    :return: a ``NaiveBayesClassifier`` built from the estimated
        distributions
    """
    label_freqdist = FreqDist()
    feature_freqdist = defaultdict(FreqDist)
    feature_values = defaultdict(set)
    fnames = set()

    # Count how often each label, and each (label, feature name, value)
    # combination, occurs.
    for featureset, label in labeled_featuresets:
        label_freqdist[label] += 1
        for fname, fval in featureset.items():
            feature_freqdist[label, fname][fval] += 1
            feature_values[fname].add(fval)
            fnames.add(fname)

    # A sample that lacks a feature is treated as having the value None for
    # it: add the number of such samples to the None count.
    for label in label_freqdist:
        num_samples = label_freqdist[label]
        for fname in fnames:
            # Looking the pair up also creates an (empty) distribution for
            # features never seen with this label.
            count = feature_freqdist[label, fname].N()
            # Only register None when some sample really lacks the feature;
            # otherwise None would inflate the bin count given to the
            # estimator for every feature.
            if num_samples - count > 0:
                feature_freqdist[label, fname][None] += num_samples - count
                feature_values[fname].add(None)

    # Turn the counts into probability estimates.
    label_probdist = estimator(label_freqdist)
    feature_probdist = {}
    for ((label, fname), freqdist) in feature_freqdist.items():
        probdist = estimator(freqdist, bins=len(feature_values[fname]))
        feature_probdist[label, fname] = probdist

    return NaiveBayesClassifier(label_probdist, feature_probdist)
#This step is needed to collapse runs of space characters into one text = ' '.join(text.split()) """ spans = TOKENIZER.span_tokenize(text) tokens = (text[begin : end] for (begin, end) in spans) """ tokens = TOKENIZER.tokenize(text) for ngram in ngrams(tokens, order): #Increment the count for the bigram. Automatically handles any #bigram not seen before. The join expression turns 2 separate #single‑character strings into one 2‑character string if ' ' not in ''.join(ngram): frequencies[''.join(ngram)] += 1 return if __name__ == '__main__': #Initialize the mapping frequencies = FreqDist() #The order of the ngrams is the first command line argument ngram_order = int(sys.argv[2]) #Pull the input data from the console count_ngrams(frequencies, ngram_order) outputfp = open(sys.argv[3], 'w') json.dump(dict(frequencies), outputfp) print('Stored frequencies of {} encountered N‑grams.'.format( len(frequencies)))
# Split the text into word tokens.
words = word_tokenize(text)

#############################
# Frequency of each token in the text.
from nltk.probability import FreqDist
fdist = FreqDist(words)

# The ten most common tokens.
mostCommon10 = fdist.most_common(10)

# Plotting support for the word distribution (e.g. fdist.plot(10)).
import matplotlib.pyplot as plot

############
# Drop tokens that are not purely alphabetic and lower-case the rest.
words_no_punc = [w.lower() for w in words if w.isalpha()]
def leaf(labeled_featuresets): label = FreqDist(label for (featureset, label) in labeled_featuresets).max() return DecisionTreeClassifier(label)
from nltk.tokenize import word_tokenize
Tokens = word_tokenize(dataset)

# Number of tokens in the dataset.
len(Tokens)

# Frequency of each distinct lower-cased token.
from nltk.probability import FreqDist
fdist = FreqDist(word.lower() for word in Tokens)
fdist
fdist.plot(20)

# ------------------------- Stemming ----------------------------------------
from nltk.stem import PorterStemmer
pst = PorterStemmer()
paragraph = input("Enter the paragraph \n")

# Remove punctuation characters from the paragraph.
para = "".join(ch for ch in paragraph if ch not in '.,!?():')
print(para+ "\n \n \n")

# Lower-case, split on whitespace and drop English stopwords.
stop = set(stopwords.words('english'))
listop = [i for i in para.lower().split() if i not in stop]
print(listop)
print("\n")

# Stem the remaining words.
ps = nltk.stem.PorterStemmer()
listopandstem = [ps.stem(i) for i in listop]
print(listopandstem)
print("\n \n")
fdist = FreqDist(listopandstem)

# Term frequencies of the words that start with a vowel.
vowel = [word for word in listop if word[0] in 'aeiou']
m = term_freq(vowel)
print(m)
print("\n")

x = input("Enter search word")
print(m[x], x)

# Materialise the views so the DataFrame receives plain lists.
li_term = list(m.keys())
li_freq = list(m.values())
df = pd.DataFrame({'li_freq':li_freq, 'li_term':li_term})

# The context manager saves and closes the workbook; without it the
# writer was never saved.
with ExcelWriter('abc.xlsx') as writer:
    df.to_excel(writer, sheet_name='Sheet1', index=False)
counter = Counter()
word = []

# Keep words of at least five characters that are in neither stopword
# collection. The former extra test `word != int` compared each word with
# the int type itself, which is always true, so it filtered nothing.
# NOTE(review): if numeric tokens were meant to be excluded, add an explicit
# check such as `not word.isdigit()`.
filtered = [word for word in text
            if len(word) >= 5 and word not in stopWords and word not in stops]
counts = Counter(filtered)

# len([filtered]) was always 1, so the bar has exactly one step.
pbar = pyprind.ProgBar(1, title='Counting word occurrences...')
word_counts = sorted(counts, key=counts.get, reverse=True)
word_to_int = {word: ii for ii, word in enumerate(word_counts, 1)}

# Update once and print the counter itself: Counter.update() returns None,
# so printing its result showed "None" and counted the text a second time.
counts.update(text)
print(counts)
pbar.update()

fdist = FreqDist(filtered)
# Rank explicitly: keys() is frequency-ordered and sliceable only on NLTK 2
# under Python 2.
vocab = [w for w, _ in sorted(fdist.items(), key=lambda pair: pair[1], reverse=True)]
print(vocab[:30])

mpl_fig = plt.figure(figsize=(12, 8))
ax = mpl_fig.add_subplot(111)
plt.title("Words with Highest Frequencies in Immunotherapy-related Tweets")
ax.set_xlabel('Term')
ax.set_ylabel('Frequency')
plt.xlabel("Term")
plt.ylabel("Frequency")
mpl_fig.tight_layout()
fdist.plot(40, cumulative=True)
mpl_fig.savefig("linechart.png")
import nltk
from nltk.corpus import gutenberg
from nltk.probability import FreqDist
import matplotlib
# Select the backend before pyplot is imported.
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt

# Count every word of every Gutenberg text.
fd = FreqDist()
for text in gutenberg.fileids():
    for word in gutenberg.words(text):
        fd[word] += 1

# Rank 1 belongs to the most frequent word. Sort explicitly, because
# iterating a FreqDist is frequency-ordered only on NLTK 2.
freqs = sorted(fd.values(), reverse=True)
ranks = list(range(1, len(freqs) + 1))

plt.loglog(ranks, freqs)
# Ranks are on the x axis and frequencies on the y axis; the two labels
# used to be swapped.
plt.xlabel('rank(r)', fontsize=14, fontweight='bold')
plt.ylabel('frequency(f)', fontsize=14, fontweight='bold')
plt.grid(True)
plt.show()
def get_frequency(wordsSelected):
    """Return a FreqDist counting each item of ``wordsSelected``."""
    distribution = FreqDist(wordsSelected)
    return distribution
# request the result of the url
response = requests.get(topStoriesURL).json()

# store the result in a json file and read the file contents
main_functions.save_to_file(response, "JSON_Files/topStories.json")
topStoriesOutput = main_functions.read_from_file("JSON_Files/topStories.json")

# the following block of code cleans up the list variable so only desirable
# words are left

# Join the abstracts with a space so the last word of one abstract does not
# fuse with the first word of the next.
toProcess = " ".join(story["abstract"] for story in topStoriesOutput["results"])

words = word_tokenize(toProcess)
fdist = FreqDist(words)

# Keep purely alphabetic tokens, lower-cased.
words_no_punc = [w.lower() for w in words if w.isalpha()]
fdist2 = FreqDist(words_no_punc)

# Drop the stopwords.
clean_words = [w for w in words_no_punc if w not in stopwords]
fdist3 = FreqDist(clean_words)
return [[word for word in headline if word not in stop_words] for headline in headlines] def export(data, data_name): csv_filename = data_name + ".csv" with open(csv_filename, 'w') as out: csv_out = csv.writer(out) csv_out.writerow(['name', 'count']) for row in data: csv_out.writerow(row) input_filename = "abcnews-date-text.csv" output_filename = "list_data_conflicts" df = pd.read_csv(input_filename) headlines = list(df["headline_text"].as_matrix()) keywords = ["refugee", "cyber"] tokenized_headlines = tokenize_headlines(headlines) for keyword in keywords: filtered_headlines = remove_stopwords( filter_headlines(tokenized_headlines, [keyword])) tokens = list(itertools.chain.from_iterable(filtered_headlines)) distribution = FreqDist(tokens) most_common = distribution.most_common(70) export(data=most_common, data_name=keyword)
import pandas as pd
import numpy as np
import nltk
import os
import nltk.corpus
from nltk.tokenize import RegexpTokenizer
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer

with open("C:/Users/David/Documents/GIT/ENRON/master/arnold-j_mails.txt", "r", encoding="utf-8") as file:
    text = file.read()

lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')

# Tokenise first and lemmatise each token. lemmatize() is given one word at
# a time here; it was previously called once on the whole text.
liste = [lemmatizer.lemmatize(token) for token in tokenizer.tokenize(text)]

fdist = FreqDist(liste)
top = fdist.most_common(5000)
# Count of the sixth most common token, when there are that many.
if len(top) > 5:
    print((top[5])[1])

with open("C:/Users/David/Documents/GIT/ENRON/master/nb_mots.txt", "w", encoding="utf-8") as file:
    file.write(str(top))
class TnT(TaggerI):
    '''
    TnT - Statistical POS tagger

    IMPORTANT NOTES:

    * DOES NOT AUTOMATICALLY DEAL WITH UNSEEN WORDS

      - It is possible to provide an untrained POS tagger to
        create tags for unknown words, see __init__ function

    * SHOULD BE USED WITH SENTENCE-DELIMITED INPUT

      - Due to the nature of this tagger, it works best when
        trained over sentence delimited input.
      - However it still produces good results if the training
        data and testing data are separated on all punctuation eg: [,.?!]
      - Input for training is expected to be a list of sentences
        where each sentence is a list of (word, tag) tuples
      - Input for tag function is a single sentence
        Input for tagdata function is a list of sentences
        Output is of a similar form

    * Function provided to process text that is unsegmented

      - Please see basic_sent_chop()

    TnT uses a second order Markov model to produce tags for
    a sequence of input, specifically:

      argmax [Proj(P(t_i|t_i-1,t_i-2)P(w_i|t_i))] P(t_T+1 | t_T)

    IE: the maximum projection of a set of probabilities

    The set of possible tags for a given word is derived
    from the training data. It is the set of all tags
    that exact word has been assigned.

    The probability of a tag for a given word is the linear
    interpolation of 3 markov models; a zero-order, first-order,
    and a second order model.

      P(t_i| t_i-1, t_i-2) = l1*P(t_i) + l2*P(t_i| t_i-1) +
                             l3*P(t_i| t_i-1, t_i-2)

    A beam search is used to limit the memory usage of the algorithm.
    The degree of the beam can be changed using N in the initialization.
    N represents the maximum number of possible solutions to maintain
    while tagging.

    It is possible to differentiate the tags which are assigned to
    capitalized words. However this does not result in a significant
    gain in the accuracy of the results.
    '''

    def __init__(self, unk=None, Trained=False, N=1000, C=False):
        '''
        Construct a TnT statistical tagger. Tagger must be trained
        before being used to tag input.

        :param unk: instance of a POS tagger, conforms to TaggerI
        :type  unk:(TaggerI)
        :param Trained: Indication that the POS tagger is trained or not
        :type  Trained: boolean
        :param N: Beam search degree (see above)
        :type  N:(int)
        :param C: Capitalization flag
        :type  C: boolean

        Initializer, creates frequency distributions to be used
        for tagging

        _lx values represent the portion of the tri/bi/uni taggers
        to be used to calculate the probability

        N value is the number of possible solutions to maintain
        while tagging. A good value for this is 1000

        C is a boolean value which specifies to use or
        not use the Capitalization of the word as additional
        information for tagging.
        NOTE: using capitalization may not increase the accuracy
        of the tagger
        '''
        self._uni = FreqDist()
        self._bi = ConditionalFreqDist()
        self._tri = ConditionalFreqDist()
        self._wd = ConditionalFreqDist()
        self._eos = ConditionalFreqDist()
        self._l1 = 0.0
        self._l2 = 0.0
        self._l3 = 0.0
        self._N = N
        self._C = C
        self._T = Trained
        self._unk = unk

        # statistical tools (ignore or delete me)
        self.unknown = 0
        self.known = 0

    def train(self, data):
        '''
        Uses a set of tagged data to train the tagger.
        If an unknown word tagger is specified and it has not been
        flagged as trained, it is trained on the same data.

        :param data: List of lists of (word, tag) tuples
        :type data: tuple(str)
        '''
        if self._unk is not None and not self._T:
            self._unk.train(data)

        for sent in data:
            history = ['BOS', 'BOS']
            has_words = False
            for w, t in sent:
                has_words = True

                # C marks a capitalised word, but only when capitalisation
                # was requested. Slicing keeps an empty token from raising.
                C = bool(self._C and w[:1].isupper())

                self._wd[w].inc(t)
                self._uni.inc((t, C))
                self._bi[history[1]].inc((t, C))
                self._tri[tuple(history)].inc((t, C))

                history.append((t, C))
                history.pop(0)

            # Record the sentence-final tag. An empty sentence has none;
            # previously it raised NameError or re-counted the final tag of
            # the preceding sentence.
            if has_words:
                self._eos[t].inc('EOS')

        # compute lambda values from the trained frequency distributions
        self._compute_lambda()

    def _compute_lambda(self):
        '''
        creates lambda values based upon training data

        NOTE: no need to explicitly reference C,
        it is contained within the tag variable :: tag == (tag,C)

        for each tag trigram (t1, t2, t3)
        depending on the maximum value of
        - f(t1,t2,t3)-1 / f(t1,t2)-1
        - f(t2,t3)-1 / f(t2)-1
        - f(t3)-1 / N-1

        increment l3,l2, or l1 by f(t1,t2,t3)

        ISSUES -- Resolutions:
        if 2 values are equal, increment both lambda values
        by (f(t1,t2,t3) / 2)

        If no trigram contributes at all (for instance every tag occurs
        only once in the training data) the three lambdas are set to 1/3
        each instead of dividing by zero.
        '''
        # temporary lambda variables
        tl1 = 0.0
        tl2 = 0.0
        tl3 = 0.0

        # for each t1,t2 in system
        for history in self._tri.conditions():
            (_h1, h2) = history

            # for each t3 given t1,t2 in system
            # (NOTE: tag actually represents (tag,C))
            # However no effect within this function
            for tag in self._tri[history].samples():

                # if there has only been 1 occurrence of this tag in the data
                # then ignore this trigram.
                if self._uni[tag] == 1:
                    continue

                count = self._tri[history][tag]

                # safe_div provides a safe floating point division
                # it returns -1 if the denominator is 0
                c3 = self._safe_div((count - 1),
                                    (self._tri[history].N() - 1))
                c2 = self._safe_div((self._bi[h2][tag] - 1),
                                    (self._bi[h2].N() - 1))
                c1 = self._safe_div((self._uni[tag] - 1),
                                    (self._uni.N() - 1))

                # if c1 is the maximum value:
                if (c1 > c3) and (c1 > c2):
                    tl1 += count

                # if c2 is the maximum value
                elif (c2 > c3) and (c2 > c1):
                    tl2 += count

                # if c3 is the maximum value
                elif (c3 > c2) and (c3 > c1):
                    tl3 += count

                # if c3, and c2 are equal and larger than c1
                elif (c3 == c2) and (c3 > c1):
                    tl2 += float(count) / 2.0
                    tl3 += float(count) / 2.0

                # if c1, and c2 are equal and larger than c3
                # this might be a dumb thing to do....(not sure yet)
                elif (c2 == c1) and (c1 > c3):
                    tl1 += float(count) / 2.0
                    tl2 += float(count) / 2.0

                # otherwise there might be a problem
                # eg: all values = 0
                else:
                    pass

        # Lambda normalisation:
        # ensures that l1+l2+l3 = 1
        total = tl1 + tl2 + tl3
        if total == 0:
            # No usable evidence: weight the three models equally.
            self._l1 = self._l2 = self._l3 = 1.0 / 3.0
            return

        self._l1 = tl1 / total
        self._l2 = tl2 / total
        self._l3 = tl3 / total

    def _safe_div(self, v1, v2):
        '''
        Safe floating point division function, does not allow division by 0
        returns -1 if the denominator is 0
        '''
        if v2 == 0:
            return -1
        return float(v1) / float(v2)

    def tagdata(self, data):
        '''
        Tags each sentence in a list of sentences

        :param data:list of list of words
        :type data: [[string,],]
        :return: list of list of (word, tag) tuples

        Invokes tag(sent) function for each sentence
        compiles the results into a list of tagged sentences
        each tagged sentence is a list of (word, tag) tuples
        '''
        return [self.tag(sent) for sent in data]

    def tag(self, data):
        '''
        Tags a single sentence

        :param data: list of words
        :type data: [string,]

        :return: [(word, tag),]

        Calls '_tagword' to produce a list of tags and associates the
        returned tags with the words of the input sequence.
        '''
        current_state = [(['BOS', 'BOS'], 1.0)]

        sent = list(data)

        tags = self._tagword(sent, current_state)

        # The first two entries of the winning history are the 'BOS'
        # markers; the rest are (tag, C) pairs, one per word. The C flag
        # is discarded.
        return [(word, t) for word, (t, _C) in zip(sent, tags[2:])]

    def _tagword(self, sent, current_states):
        '''
        :param sent : List of words remaining in the sentence
        :type sent  : [word,]
        :param current_states : List of possible tag combinations for
                                the sentence so far, and the probability
                                associated with each tag combination
        :type current_states  : [([tag, ],prob), ]
        :return: the tag history of the most probable state

        Tags the words of the sentence from left to right, keeping at
        most N candidate states after each word (beam search), and uses
        the formula specified above to calculate the probability of a
        particular tag.

        The words are processed in a loop rather than by recursion so
        that long (e.g. unsegmented) input cannot exceed the interpreter
        recursion limit.
        '''
        known_words = self._wd.conditions()

        for word in sent:
            new_states = []

            # if the Capitalisation is requested,
            # initalise the flag for this word
            C = bool(self._C and word[:1].isupper())

            # if word is known
            # compute the set of possible tags
            # and their associated probabilities
            if word in known_words:
                self.known += 1

                for (history, curr_sent_prob) in current_states:
                    for t in self._wd[word].samples():
                        tag = (t, C)
                        p_uni = self._uni.freq(tag)
                        p_bi = self._bi[history[-1]].freq(tag)
                        p_tri = self._tri[tuple(history[-2:])].freq(tag)
                        p_wd = float(self._wd[word][t]) / float(self._uni[tag])
                        p = self._l1 * p_uni + self._l2 * p_bi + self._l3 * p_tri
                        p2 = p * p_wd

                        # result of appending this tag to the history
                        new_states.append((history + [tag],
                                           curr_sent_prob * p2))

            # otherwise a new word, set of possible tags is unknown
            else:
                self.unknown += 1

                # since a set of possible tags,
                # and the probability of each specific tag
                # can not be returned from most classifiers:
                # specify that any unknown words are tagged with certainty
                # (the state probabilities are left unchanged)

                # if no unknown word tagger has been specified
                # then use the tag 'Unk'
                if self._unk is None:
                    tag = ('Unk', C)

                # otherwise apply the unknown word tagger
                else:
                    [(_w, t)] = list(self._unk.tag([word]))
                    tag = (t, C)

                # the histories are extended in place
                for (history, _prob) in current_states:
                    history.append(tag)

                new_states = current_states

            # now have computed a set of possible new_states

            # sort states by prob
            # set is now ordered greatest to least probability
            new_states.sort(reverse=True, key=itemgetter(1))

            # del everything after N (threshold)
            # this is the beam search cut
            if len(new_states) > self._N:
                new_states = new_states[:self._N]

            current_states = new_states

        # every word has been tagged: return the most probable history
        (h, _p) = current_states[0]
        return h
# Frequency Analysis Letter Word Bigrams Plots
# Input
# NLTK example: frequency analysis
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import gutenberg
from nltk.probability import FreqDist

# Character-level distribution over the raw text of
# "Sense and Sensibility" by Jane Austen.
raw = gutenberg.raw("austen-sense.txt")
fd_letters = FreqDist(raw)

# Word-level distribution over the same book.
words = gutenberg.words("austen-sense.txt")
fd_words = FreqDist(words)
sas = nltk.Text(words)

# Open a wide figure before plotting so the frequency plot is sized 20x5.
plt.figure(figsize=(20, 5))

# Frequency plot for the letters, limited to 100 samples.
fd_letters.plot(100)

# Output **Shows graph plot
from collections import defaultdict
from heapq import nlargest
from string import punctuation

from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize

# Split the text into sentences and into lower-cased word tokens.
sentencas = sent_tokenize(texto)
palavras = word_tokenize(texto.lower())

# Portuguese stop words plus punctuation marks (rebinds the imported name).
stopwords = set(stopwords.words('portuguese') + list(punctuation))
palavras_sem_stopwords = [
    palavra for palavra in palavras if palavra not in stopwords
]

# Frequency of every remaining word.
frequencia = FreqDist(palavras_sem_stopwords)

# Score each sentence by summing the frequencies of the counted words it
# contains; sentences without any counted word get no entry at all.
sentencas_importantes = defaultdict(int)
for i, sentenca in enumerate(sentencas):
    for palavra in word_tokenize(sentenca.lower()):
        if palavra not in frequencia:
            continue
        sentencas_importantes[i] += frequencia[palavra]

# Indices of the four highest-scoring sentences.
idx_sentencas_importantes = nlargest(
    4, sentencas_importantes, sentencas_importantes.get)
# Read the main text file (closed again as soon as it has been read).
with open('sampledata.txt', 'r') as file:
    filetext = file.read()

# Strip the sentence markers and hyphens; a regular expression would work
# too, but plain replace() calls are simpler here.
filetext = filetext.replace('<s>', '')
filetext = filetext.replace('</s>', '')
filetext = filetext.replace('-', '')
tokens = word_tokenize(filetext)
print(tokens)

# Read the vocabulary.
with open('sampledata.vocab.txt', 'r') as file:
    filetext = file.read()
vocab = word_tokenize(filetext)
print(vocab)

fr = FreqDist(tokens)
total_tokens = len(tokens)

# Relative frequency of every counted token that is listed in the vocabulary
# (one row per matching vocabulary entry).
print('X | P(X)')
print('___________________')
for s in fr.items():
    for d in vocab:
        if d == s[0]:
            print(d, ' | ', round(s[1] / total_tokens, 2))

# UNK is the share of tokens that are NOT in the vocabulary. The earlier code
# iterated over the distribution's keys and indexed a string, so it could not
# produce this value.
vocab_set = set(vocab)
unknown_count = sum(count for word, count in fr.items()
                    if word not in vocab_set)
UNK = round(unknown_count / total_tokens, 2) if total_tokens else 0
print('UNK | ', UNK)

print('== UNIGRAMS AFTER LAPLACE SMOOTHING ==')
lpt = LaplaceProbDist(fr)
def build_train_data():
    """Load the training corpus and rank every term by chi-square score.

    Resets the module-level training containers, reads the stop-word and
    white-list files, and then reads every ``*_p.txt`` file found under
    ``DATA_DIR`` (the file-name prefix is the tag). Each line becomes a list
    of distinct term ids; a term is counted once per line. Finally every term
    is scored by summing its chi-square association with each tag, and the
    (term, score) pairs are stored, best first, in ``sorted_word_scores``.
    """
    global train_word_id
    global train_data_single
    global train_info
    global train_tags
    global stop_words
    global sorted_word_scores

    train_word_id = []
    train_data_single = {}
    train_info = {}
    train_tags = ['NULL']
    stop_words = []

    word_fd = FreqDist()  # frequency of every term over all lines
    cond_word_fd = ConditionalFreqDist()  # frequency of every term per tag

    # Stop words: one per line; blank lines and '#' comments are skipped.
    with open(STOP_FILE, 'r') as fin:
        for line in fin:
            line = line.strip()
            if not line or line[0] == '#':
                continue
            stop_words.append(line)
    print("STOP WORD SIZE:%d\n" % (len(stop_words)))

    # White-listed single-character terms, same file format. They are
    # appended to the existing module-level ``white_words`` list.
    with open(WHITE_FILE, 'r') as fin:
        for line in fin:
            line = line.strip()
            if not line or line[0] == '#':
                continue
            white_words.append(line)
    print("WHITE WORD SIZE:%d\n" % (len(white_words)))

    for parent, dirname, filenames in os.walk(DATA_DIR):
        for filename in filenames:
            if filename[-6:] != '_p.txt':
                continue
            tag_name = filename[:-6]
            print("正在处理:%s" % (tag_name))
            train_tags.append(tag_name)
            tag_id = train_tags.index(tag_name)
            train_info[tag_id] = []
            line_num = 0
            # os.walk also yields files from subdirectories, so build the
            # path from the directory being visited rather than DATA_DIR.
            with open(os.path.join(parent, filename), 'r') as fin:
                for line in fin:
                    line_num += 1
                    if not line_num % 1000:
                        print('LINE:%d' % (line_num))
                    line = line.strip()
                    objs = []
                    seen = set()  # ids already recorded for this line
                    for item in line.split():
                        # Single characters count only when white-listed.
                        if len(item) == 1 and item not in white_words:
                            continue
                        item_id = term_to_id(item)
                        if item_id not in seen:
                            seen.add(item_id)
                            word_fd[item_id] += 1
                            cond_word_fd[tag_id][item_id] += 1
                            objs.append(item_id)
                    train_info[tag_id].append(objs)

    print('Randomize>>>')
    cond_word_sum = {}
    for tag in train_tags[1:]:
        tag_id = train_tags.index(tag)
        shuffle(train_info[tag_id])
        cond_word_sum[tag_id] = cond_word_fd[tag_id].N()
        print("SUM:%s->%d" % (tag, cond_word_sum[tag_id]))
    total_w_count = word_fd.N()
    print("TOTAL:%d" % (total_w_count))

    sorted_word_scores = {}
    word_scores = {}
    print("CALC CHI-SQUARE...")
    # The tag ids do not change while scoring; look them up once.
    tag_ids = [train_tags.index(tag) for tag in train_tags[1:]]
    for word, freq in word_fd.items():
        word_scores[word] = 0
        for tag_id in tag_ids:
            word_scores[word] += BigramAssocMeasures.chi_sq(
                cond_word_fd[tag_id][word],
                (freq, cond_word_sum[tag_id]),
                total_w_count)
    sorted_word_scores = sorted(word_scores.items(), key=lambda e: e[1], reverse=True)
    return
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import pandas as pd

text="""Hello Mr. Smith, how are you doing today? The weather is great, and city is awesome. The sky is pinkish-blue. You shouldn't eat cardboard"""

# Tokenise the sample text and count the tokens.
tokenized_word=word_tokenize(text)
# print(tokenized_word)
freqdist = FreqDist(tokenized_word)

# Drop the English stop words (case-sensitive comparison).
stop_w = set(stopwords.words("english"))
filtered_sent = [w for w in tokenized_word if w not in stop_w]
# print(filtered_sent)

# Instantiate the stemmer: binding the bare class would make ps.stem(w) fail
# because no instance is passed.
ps = PorterStemmer()
lem = nltk.WordNetLemmatizer()
stemmed_words = []
# for w in filtered_sent:
#     stemmed_words.append(ps.stem(w))
# lemmatized_words = []
# for w in filtered_sent:
#     lemmatized_words.append(lem.lemmatize(w,"v"))

# The tagging result is not stored.
nltk.pos_tag(tokenized_word)

data = pd.read_csv("train.tsv", sep='\t')
# Get rid of the stop words
stopwords = stopwords.words("english")
# Set copy of the list so each membership test is O(1).
stopword_set = set(stopwords)
clean_words = [w for w in words_no_punctuation if w not in stopword_set]

# Display the Frequency option
st.subheader("II - Frequency Distribution")
frequencyChk = st.checkbox("Click here to generate frequency distribution")

# Display frequency distribution graph
if frequencyChk:
    fdist = FreqDist(clean_words)
    most_common_words = fdist.most_common(10)

    # Split the (word, count) pairs into the two plot axes.
    top_words = [word for word, _count in most_common_words]
    word_count = [count for _word, count in most_common_words]

    # Keep a handle on the figure and pass it to Streamlit explicitly rather
    # than relying on pyplot's implicit global figure.
    fig = plt.figure(figsize=(10, 6))
    plt.plot(top_words, word_count, color='green', linewidth=2, marker='d')
    plt.xlabel('Words')
    plt.ylabel('Count')
    plt.grid()
    st.pyplot(fig)
# Break the training text into space-separated "word/tag" items.
train_text = train_text.strip().split(" ")

# Insert a space in front of the trailing tag of each item, then split on
# that space so every row is a [word, tag] list.
tag_suffix = re.compile(r'(\/[A-Z]*(\|?)[^\d|^/]+)$')
train_text2 = []
for item in train_text:
    separated = tag_suffix.sub(r' \1', item)
    train_text2.append(separated.strip().split(" "))

# Data frame for easy interpretation.
cols = ["words", "tags"]
df = pd.DataFrame(data=train_text2, columns=cols)

# Frequency of tags (tn)
fdist_tags = FreqDist(df["tags"])

# Frequency of tags corresponding to the words (wordn, t1...tn)
word_tag = df.groupby(["words", "tags"]).size()

# Frequency of each tag together with the tag that follows it (tn-1, tn)
tag_strings = [str(tag) for tag in df["tags"]]
two_tags = [first + " " + second
            for first, second in zip(tag_strings, tag_strings[1:])]
fdist_two_tags = FreqDist(two_tags)

# testing
# Tokenize test data
test_tokens = test_text.strip().split(" ")
def thongKeTop(text):
    """Count the occurrences of each item in ``text``.

    :param text: iterable of items (e.g. word tokens) to count
    :return: plain ``dict`` mapping each item to its number of occurrences
    """
    # Tally with FreqDist, then hand the counts back as an ordinary dict.
    return dict(FreqDist(text))
''' import nltk from nltk.tokenize import word_tokenize from nltk.corpus import brown from nltk.probability import FreqDist #text from Harper Lee, To Kill a Mockingbird text = ''' Atticus said to Jem one day, “I’d rather you shot at tin cans in the backyard, but I know you’ll go after birds. Shoot all the blue jays you want, if you can hit ‘em, but remember it’s a sin to kill a mockingbird.” That was the only time I ever heard Atticus say it was a sin to do something, and I asked Miss Maudie about it. “Your father’s right,” she said. “Mockingbirds don’t do one thing except make music for us to enjoy. They don’t eat up people’s gardens, don’t nest in corn cribs, they don’t do one thing but sing their hearts out for us. That’s why it’s a sin to kill a mockingbird. ''' text_tags = nltk.pos_tag(word_tokenize(text)) print(text_tags) frequent = FreqDist(tag for (word, tag) in text_tags) import collections word_counts = collections.Counter((words[0] for words in text_tags if len(words[0])>1)) # just words, not marks print('============================================================================================') print(f'The five most frequent words are: {word_counts.most_common(5)}') # Five more frequent print('============================================================================================')
def find_frequent_words(self, all_words):
    """Count the lower-cased words that belong to ``self._valid_words``.

    :param all_words: iterable of word strings
    :return: ``FreqDist`` over the lower-cased words found in
        ``self._valid_words``
    """
    valid_words = self._valid_words
    lowered_words = (word.lower() for word in all_words)
    return FreqDist(word for word in lowered_words if word in valid_words)
tagged_corpus = load_corpus_reader(args.corpus, reader=args.reader, fileids=args.fileids)

if not tagged_corpus:
    # Interpolate the corpus name; the message used to contain a literal '%s'.
    raise ValueError('%s is an unknown corpus' % args.corpus)

if args.trace:
    # Parenthesised single argument prints the same on Python 2 and 3.
    print('loading %s' % args.corpus)

##############
## counting ##
##############

wc = 0
tag_counts = FreqDist()
taglen = 7
word_set = set()

# For every corpus except conll2000 and switchboard the reader is asked to
# simplify the tags; for those two the tags are simplified inside the loop.
if args.simplify_tags and args.corpus not in ['conll2000', 'switchboard']:
    kwargs = {'simplify_tags': True}
else:
    kwargs = {}

for word, tag in tagged_corpus.tagged_words(fileids=args.fileids, **kwargs):
    # Track the longest tag seen so far (measured before simplification).
    if len(tag) > taglen:
        taglen = len(tag)

    if args.corpus in ['conll2000', 'switchboard'] and args.simplify_tags:
        tag = simplify_wsj_tag(tag)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 31 08:21:05 2016

@author: megan
"""

from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords

fname = 'actbacFB.txt'
frequency = 50

with open(fname, 'r', encoding="utf-8") as f:
    # Turn line breaks into spaces; deleting them fused the last word of a
    # line with the first word of the next one.
    data = f.read().replace('\n', ' ')

# Read the stop-word lists once. Calling stopwords.words() inside the
# comprehension re-read every list for every single token.
stop_words = set(stopwords.words())

# get list of most frequent words
words = word_tokenize(data)
# Compare the lower-cased form: the stop-word lists are lower case, so
# capitalised stop words such as "The" used to slip through.
lowercase_words = [word.lower() for word in words
                   if word.isalpha() and word.lower() not in stop_words]

word_frequencies = FreqDist(lowercase_words)
most_frequent_words = word_frequencies.most_common(frequency)

# print out the keywords more nicely
for pair in most_frequent_words:
    print(pair[0],":",pair[1])
paper_txt = lemmatizer.lemmatize(paper_txt) tokens = word_tokenize(paper_txt) # remove stop-word token stopwords.words('english') for token in tokens: token = token.lower() clean_tokens = tokens[:] for token in tokens: if token in stopwords.words('english') or token == "We" or token == "The": clean_tokens.remove(token) fdist = FreqDist(word.lower() for word in clean_tokens) print(fdist.most_common(20)) """ for key, val in fdist.items(): print(str(key) + ':' + str(val)) """ # fdist.plot(20, cumulative=False) """ nlu = NaturalLanguageUnderstandingV1( username='******', password='******', version='2018-03-16' ) response = nlu.analyze(
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from wordcloud import WordCloud

# Flatten the per-document token lists into one list of words.
allWords = [word for wordList in tokens for word in wordList]

# Keep only the words that are not English stop words.
stop_words = set(stopwords.words("english"))
allWordsFinal = [w for w in allWords if w not in stop_words]

# Frequency distribution of the remaining words, plotted for 80 samples.
fdist = FreqDist(allWordsFinal)
fdist.plot(80)
plt.show()

#last_75 = FreqDist(dict(fdist.most_common()[-480:]))
#last_75.plot()

# Build a word cloud image from the space-joined remaining words.
words = " ".join(allWordsFinal)
cloud_builder = WordCloud(width=1600, height=800, background_color="white",
                          max_words=200, contour_width=3)
wc = cloud_builder.generate(words)
def create_frequency_dist(words):
    """Build a frequency distribution over the lower-cased ``words``.

    :param words: iterable of word strings
    :return: ``FreqDist`` keyed by the lower-cased words
    """
    lowered_words = [word.lower() for word in words]
    return FreqDist(lowered_words)
# The discarded `txt.translate(...)` call that stood here has been removed:
# strings are immutable, so it never changed `txt`, and applying its result
# would have deleted the spaces that tokenisation relies on.

# Fold a few plural forms into their singular.
txt = txt.replace("gays", "gay").replace("lesbians", "lesbian").replace("seattles", "seattle").replace("citys", "city")
print(txt)

stopwords = set(STOPWORDS)
# Frequent but uninformative words to ignore. "example" and "well" were
# missing a comma between them and had been merged into "examplewell".
commonwords = {
    "time", "one", "began", "among", "another", "see", "part", "many",
    "day", "way", "times", "still", "news", "three", "came", "became",
    "made", "wanted", "seemed", "now", "society", "ing", "first", "new",
    "called", "said", "come", "two", "city", "group", "state", "year",
    "case", "member", "even", "later", "month", "years", "much", "week",
    "county", "name", "example", "well", "members", "us", "say", "s",
}
stopwords.update(commonwords)

# tokenize and calculate the word frequencies
tokens = nltk.tokenize.word_tokenize(txt)
fDist = FreqDist(tokens)
print(fDist.most_common(20))

# remove the stop words and common words
filtered_fDist = nltk.FreqDist(
    {word: freq for word, freq in fDist.items() if word not in stopwords})
print(filtered_fDist)
filtered_fDist.plot(20)

print("generating wordcloud...")
# The image is used as the mask for the cloud.
mask_array = npy.array(Image.open("img/cloud.jpg"))
wc = WordCloud(font_path='arial', background_color="white", max_words=50,
               prefer_horizontal=1, mask=mask_array, scale=3,
               stopwords=stopwords, collocations=False)
wc.generate_from_frequencies(filtered_fDist)
# wc.generate(txt)
wc.to_file(wcPath)
new2.append(word) text = "" for i in range(len(new2)): text += new2[i] + " " text = text.lower() #один регистр text = text.replace(".", "") text = text.replace(",", "") text = text.replace("!", "") text = text.replace("-", "") text = text.replace("_", "") text = text.replace("?", "") text = text.replace("[", "") text = text.replace("]", "") text = text.replace("'", "") text = text.replace(";", "") text = text.replace("''", "") text = text.replace(":", "") text = text.replace("``", "") text_tokens = word_tokenize(text) text = nltk.Text(text_tokens) fdist = FreqDist(text) print(fdist.most_common(10))
stopwords = stopwords.words('english')
# Frozen copy for O(1) membership tests; `stopwords` itself stays a list.
_stopword_set = frozenset(stopwords)


def remove_stopwords_and_punctuation(words):
    """Return the purely alphabetic words that are not English stop words.

    :param words: iterable of word strings
    :return: list of the words kept, in their original order
    """
    return [w for w in words if w.isalpha() and w not in _stopword_set]


def get_all_words(amazon_reviews):
    """Return one flat list with the words of every review, in order.

    Each review must provide a ``words()`` method. The list is built in a
    single pass; concatenating lists inside ``reduce`` copied the growing
    result once per review.

    :param amazon_reviews: iterable of review objects
    :return: list of all words of all reviews
    """
    return [word for review in amazon_reviews for word in review.words()]


# A frequency distribution over all words in positive book reviews
pos_freqdist = FreqDist(
    remove_stopwords_and_punctuation(get_all_words(pos_train)))
# ... and the same over the negative training reviews.
neg_freqdist = FreqDist(
    remove_stopwords_and_punctuation(get_all_words(neg_train)))


def most_frequent_words(freqdist, k):
    """Return the ``k`` most common words of ``freqdist``, most common first."""
    return [word for word, count in freqdist.most_common(k)]


def words_above_threshold(freqdist, k):
    """Return the words of ``freqdist`` whose count is strictly above ``k``."""
    return [word for word in freqdist if freqdist[word] > k]


top_pos = most_frequent_words(pos_freqdist, 100)
top_neg = most_frequent_words(neg_freqdist, 100)
above_pos = words_above_threshold(pos_freqdist, 100)