def __init__(self, words, sentences, language):
    self.num_words = len(words)
    self.unique_words = len(set(words))
    self.num_sentences = len(sentences)
    self.average_sentence_length = round(self.num_words / self.num_sentences)
    self.lexical_diversity = round(self.num_words / self.unique_words)
    fdist = FreqDist(words)
    stop_words = stopwords.words(language)
    not_stopwords = [w for w in words if w not in stop_words]
    fdist2 = FreqDist(not_stopwords)
    self.fifty_first_words = fdist.most_common(50)
    self.hundreds_nsw = fdist2.most_common(300)
    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(words)
    finder.apply_freq_filter(10)
    self.fifty_collocations = finder.nbest(bigram_measures.pmi, 50)
    trigram_measures = TrigramAssocMeasures()
    finder3 = TrigramCollocationFinder.from_words(words)
    finder3.apply_freq_filter(10)
    self.fifty_collocations3 = finder3.nbest(trigram_measures.pmi, 50)
    # Each sentence is a list of tokens, so join before lowercasing
    # (the original called .lower() on the token list, which would raise).
    self.stcs_width_words = [' '.join(sent) for sent in sentences
                             if "malheureusement" in ' '.join(sent).lower()]
def freqSingle(words):
    # Renamed the parameter from `list` to avoid shadowing the builtin;
    # print statements converted to Python 3 calls.
    global nouns
    global adjectives
    # fig, axs = plt.subplots(1, 2)
    tagged = nltk.pos_tag(words)

    # ALL LANGUAGE - FREQUENCY
    varAll = FreqDist(words)
    print("TOP TERMS")
    print(varAll.most_common(25))
    print("")
    # PLOT TOP TERMS
    # varAll.plot(25, cumulative=False, title='All Language')
    # plt.show()

    # NOUNS - FREQUENCY
    nouns = [word for word, pos in tagged if pos in ['NN', 'NNP']]
    varNouns = FreqDist(nouns)
    print("TOP NOUNS")
    print(varNouns.most_common(25))
    print("")
    # PLOT TOP NOUNS
    # varNouns.plot(25, cumulative=False, title='Nouns')
    # plt.show()

    # ADJECTIVES - FREQUENCY
    adjectives = [word for word, pos in tagged if pos in ['JJ', 'JJR', 'JJS']]
    varAdjectives = FreqDist(adjectives)
    print("TOP ADJECTIVES")
    print(varAdjectives.most_common(25))
    print("")
def _count(self, words):
    """
    >>> wordCounter()._count(['plain', 'word1', 'word2', 'word2', 'word3', 'word3', 'word3'])
    [('word3', 3), ('word2', 2), ('plain', 1), ('word1', 1)]
    >>> wordCounter()._count([])
    []
    >>> wordCounter(words_per_message=-1)._count(['plain', 'word1', 'word2', 'word2', 'word3', 'word3', 'word3'])
    [('word3', 3), ('word2', 2), ('plain', 1), ('word1', 1)]
    """
    fdist1 = FreqDist(words)
    if self.words_per_message > 0:
        return fdist1.most_common(self.words_per_message)
    return fdist1.most_common()
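# Hedged harness for the doctests above: the class that owns _count is not
# shown, so this minimal stand-in (an assumption, not the original class)
# supplies the one attribute _count reads. Run: python -m doctest file.py -v
class wordCounter:
    def __init__(self, words_per_message=0):
        self.words_per_message = words_per_message
    _count = _count  # borrow the module-level function above as the method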
def BigramAll():
    to_save_folder = "./#Bigram[.]/"
    folder_list = os.listdir("./")
    for folder in folder_list:
        if folder.find(".") != -1:
            continue
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc"
        fw = open(data_path, "r", encoding="utf8")
        text = fw.read()
        words = word_tokenize(text)
        big = list(bigrams(w for w in words if len(w) > 1 and w != "``"))
        myBig = [bi[0] + " " + bi[1] for bi in big]
        fdist = FreqDist(str(w) for w in myBig)
        keys = fdist.most_common(len(fdist.keys()))
        dataFreq = ""
        for key in keys:
            dataFreq += str(key[0]).strip() + "," + str(key[1]).strip() + "\n"
        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder + "[bigram_Freq].csv", "w+", encoding="utf8")
        writer.write(dataFreq)
        fw.close()
        writer.close()
def sentanceLenFrequency():
    to_save_folder = "./#SentanceLenFreq[.]/"
    folder_list = os.listdir("./")
    for folder in folder_list:
        if folder.find(".") != -1:
            continue
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc"
        fw = open(data_path, "r", encoding="utf8")
        text = fw.read()
        sents = getSentancesTokens(text)
        freq = []
        for sent in sents:
            sent_len = getSentanceLen(sent)
            if sent_len == 147:  # debug: inspect unusually long sentences
                print(sent)
            if sent_len > 0:
                freq.append(sent_len)
        fdist = FreqDist(freq)
        keys = fdist.most_common()
        dataFreq = "Sentence Len,Frequency\n"
        for key in sorted(keys):
            dataFreq += str(key[0]) + "," + str(key[1]) + "\n"
        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder + "[data][SentanceLen_Freq].csv", "w+", encoding="utf8")
        writer.write(dataFreq)
        fw.close()
        writer.close()
def ngram4All():
    to_save_folder = "./#Ngram_4[.]/"
    folder_list = os.listdir("./")
    for folder in folder_list:
        if folder.find(".") != -1:
            continue
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc"
        fw = open(data_path, "r", encoding="utf8")
        text = fw.read()
        words = word_tokenize(text)
        valid_word = [w for w in words if len(w) > 1 and w != "``"]
        nlist4 = []
        vlen = len(valid_word)
        for i in range(0, vlen - 3):
            nlist4.append(valid_word[i] + " " + valid_word[i + 1] + " "
                          + valid_word[i + 2] + " " + valid_word[i + 3])
        fdist = FreqDist(w for w in nlist4)
        keys = fdist.most_common(len(fdist.keys()))
        dataFreq = ""
        for key in keys:
            dataFreq += str(key[0]) + "," + str(key[1]) + "\n"
        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder + "[Ngram_4_Freq].csv", "w+", encoding="utf8")
        writer.write(dataFreq)
        fw.close()
        writer.close()
def get_frequency(data_file, all_vocab):
    input_file = open(data_file, "r")
    input_file_contents = input_file.read()
    words = nltk.tokenize.word_tokenize(input_file_contents, 'english')
    fdist = FreqDist(words)
    print(fdist)
    output_file = open("../Training/vocab_freq.txt", "w")
    for word, frequency in fdist.most_common(4000):
        if word in all_vocab and word != '+' and word != '-':
            output_file.write(word + " : " + str(frequency) + "\n")
    output_file.close()
    return 1
def wordLenFrequency():
    to_save_folder = "./#WordLenFreq[.]/"
    folder_list = os.listdir("./")
    for folder in folder_list:
        if folder.find(".") != -1:
            continue
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc"
        fw = open(data_path, "r", encoding="utf8")
        text = fw.read()
        words = word_tokenize(text)
        freq = []
        for word in words:
            word_len = getWordLen(word)
            if word_len == 20:  # debug: show the characters of very long words
                for char in word:
                    print(char, end=' ')
                print(word)
            if word_len > 0:
                freq.append(word_len)
        fdist = FreqDist(freq)
        keys = fdist.most_common()
        dataFreq = "Word Len,Frequency\n"
        for key in sorted(keys):
            dataFreq += str(key[0]) + "," + str(key[1]) + "\n"
        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder + "[WordLen_Freq].csv", "w+", encoding="utf8")
        writer.write(dataFreq)
        fw.close()
        writer.close()
def most_frequent_words(path, top):
    root_path = "./" + path
    writers = os.listdir(root_path)
    word_set = set()
    for writer in writers:
        if writer.find(".") != -1:
            continue
        inside_folder = root_path + "//" + writer
        files = os.listdir(inside_folder)
        formated_text = ""
        for file in files:
            file_path = root_path + "//" + writer + "//" + file
            fw = open(file_path, "r", encoding="utf8")
            article = fw.read()
            formated_text += " "
            formated_text += formatText(article)
            fw.close()
        words = get_bigrams(formated_text)
        fdist = FreqDist(w for w in words if len(w) > 1 and not isEnglish(w) and w != "``")
        keys = fdist.most_common(top)
        for key in keys:
            word_set.add(key[0])
    print(word_set)
    fw = open("./Features/Bigrams.csv", "w", encoding="utf8")
    for word in word_set:
        fw.write(word)
        fw.write("\n")
    fw.close()
def label_clusters(business_id, K, clusters):
    '''
    Label the clusters of a particular run specified by business_id as the
    most common noun in that cluster.
    '''
    base = '../Models/%s/Clusters/' % business_id
    sentence_count = FreqDist(clusters)
    total_sentences = len(clusters)
    labels = []
    for i in range(0, K):
        # Python 3: read as UTF-8 directly instead of decoding bytes
        f = open(base + 'Cluster_%d' % i, 'r', encoding='utf-8')
        text = f.read()
        f.close()
        tokens = nltk.word_tokenize(text)
        tokens = [w for w in tokens if w.isalpha() and len(w) > 3
                  and w not in stopwords.words()]
        fd = FreqDist(tokens)
        frequent = fd.most_common(5)
        label = "None"
        label_freq = 0
        for f in frequent:
            if is_noun(f[0]):
                label, label_freq = f
                break
        relative_score = float(label_freq) / len(tokens)
        cluster_score = float(sentence_count[i]) / total_sentences
        print("test label:", i, label)
        labels.append((i, label, label_freq, len(tokens), sentence_count[i],
                       total_sentences, relative_score * cluster_score))
    return labels
def word_tag_model(words, tagged_words, limit=200):
    fd = FreqDist(words)
    cfd = ConditionalFreqDist(tagged_words)
    most_freq = (word for word, count in fd.most_common(limit))
    return dict((word, cfd[word].max()) for word in most_freq)
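# Hedged usage sketch: build a most-likely-tag lookup from NLTK's bundled
# Penn Treebank sample (requires nltk.download('treebank'); the corpus
# choice is an assumption, not part of the original snippet).
from nltk.corpus import treebank

model = word_tag_model(treebank.words(), treebank.tagged_words())
# model maps each of the 200 most frequent words to its most likely tag,
# e.g. model['the'] == 'DT'.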
def trigramAll():
    to_save_folder = "./#Trigram[.]/"
    folder_list = os.listdir("./")
    for folder in folder_list:
        if folder.find(".") != -1:
            continue
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc"
        fw = open(data_path, "r", encoding="utf8")
        text = fw.read()
        words = word_tokenize(text)
        valid_word = [w for w in words if len(w) > 1 and w != "``"]
        tri_list = []
        vlen = len(valid_word)
        for i in range(0, vlen - 2):
            tri_list.append(valid_word[i] + " " + valid_word[i + 1] + " " + valid_word[i + 2])
        fdist = FreqDist(w for w in tri_list)
        keys = fdist.most_common(len(fdist.keys()))
        dataFreq = ""
        for key in keys:
            dataFreq += str(key[0]).strip() + "," + str(key[1]).strip() + "\n"
        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder + "[Trigram_Freq].csv", "w+", encoding="utf8")
        writer.write(dataFreq)
        fw.close()
        writer.close()
def experiments():
    f = open('classEvent_NEs.txt', 'r')
    # Displaying top occurring K NEs
    text = nltk.Text(f.read().split('\n'))
    freqd = FreqDist(text)
    most_common = freqd.most_common(15)
    # Sorting according to type of entity
    for el in most_common:
        print(el)
def most_common_bigrams(all_words, num_bigrams):
    bigram_finder = BigramCollocationFinder.from_words(all_words)
    # Keep only bigrams whose members are both relevant features.
    # (The original used Python 2's .viewitems() and deleted keys while
    # iterating; a dict comprehension does the same filtering safely.)
    bigram_freq = {
        k: v for k, v in bigram_finder.ngram_fd.items()
        if is_feature_relevant(k[0]) and is_feature_relevant(k[1])
    }
    fd = FreqDist(bigram_freq)
    return list(dict(fd.most_common(num_bigrams)).keys())
def one_by_four(path, top=50):
    root_path = "./" + path
    writers = os.listdir(root_path)
    word_set = set()
    temp_set = set()
    writer_table = dict()
    for writer in writers:
        if writer.find(".") != -1:
            continue
        inside_folder = root_path + "//" + writer
        files = os.listdir(inside_folder)
        formated_text = ""
        for file in files:
            file_path = root_path + "//" + writer + "//" + file
            fw = open(file_path, "r", encoding="utf8")
            article = fw.read()
            formated_text += " "
            formated_text += formatText(article)
            fw.close()
        words = getWordList(formated_text)
        fdist = FreqDist(w for w in words if len(w) > 1 and not isEnglish(w) and w != "``")
        keys = fdist.most_common(top)
        print(keys)
        writer_table[writer] = dict(keys)
        for key in keys:
            temp_set.add(key[0])
    writers = writer_table.keys()
    for word in temp_set:
        for writer1 in writers:
            freq1 = writer_table[writer1].get(word, 0)
            for writer2 in writers:
                if writer2 == writer1:
                    continue
                freq2 = writer_table[writer2].get(word, 0)
                if freq1 >= freq2 * 4:
                    print(writer1 + " " + writer2 + " " + str(freq1) + " " + str(freq2) + " " + word)
                    word_set.add(word)
    print(word_set)
    fw = open("./Features/Modified word frequency.csv", "w", encoding="utf8")
    for word in word_set:
        fw.write(word)
        fw.write("\n")
    fw.close()
def unigram_word_feats(self, words, top_n=None, min_freq=0):
    """
    Return most common top_n word features.

    :param words: a list of words/tokens.
    :param top_n: number of best words/tokens to use, sorted by frequency.
    :rtype: list(str)
    :return: A list of `top_n` words/tokens (with no duplicates) sorted by
        frequency.
    """
    # Stopwords are not removed
    unigram_feats_freqs = FreqDist(word for word in words)
    return [w for w, f in unigram_feats_freqs.most_common(top_n)
            if unigram_feats_freqs[w] > min_freq]
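# Usage sketch: this method matches nltk.sentiment.SentimentAnalyzer's
# unigram_word_feats, so it can be exercised through that class (the sample
# tokens below are illustrative only).
from nltk.sentiment import SentimentAnalyzer

tokens = ['good', 'good', 'good', 'bad', 'great']
feats = SentimentAnalyzer().unigram_word_feats(tokens, top_n=2, min_freq=1)
# top_n=2 keeps the two most frequent tokens, then min_freq=1 drops those
# seen only once, leaving ['good'].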
def lemmatised_words(debate):
    lemmatised_words = debate
    fdist = FreqDist(lemmatised_words)
    list_mc = fdist.most_common(20)
    most_common_lemmatised_list = []
    for list_tuple in list_mc:
        if list_tuple[1] > 5:
            most_common_lemmatised_list.append(list_tuple[0].encode('utf-8'))
    return most_common_lemmatised_list
def most_common_single_features(all_words, num_words):
    """
    Gets the :param num_words most frequent words from :param all_words,
    excluding words rejected by is_feature_relevant (e.g. stop words).

    :param all_words: iterable of candidate words
    :param num_words: number of most common features to return
    :return: list of most common features
    """
    words_in_x = [word for word in all_words if is_feature_relevant(word)]
    fd = FreqDist(words_in_x)
    return list(dict(fd.most_common(num_words)).keys())
def get_most_frequent_words(indirectory, outfile, count, **args):
    print(indirectory, outfile, count)
    corpus = PlaintextCorpusReader(indirectory, r'.*\.txt')
    tokens = corpus.words()
    fdist = FreqDist(tokens)
    most_freq = fdist.most_common(count)
    with open(outfile, 'wb') as of:
        of.write(bytes('<s>\n', 'UTF-8'))
        of.write(bytes('</s>\n', 'UTF-8'))
        for word, frequency in most_freq:
            if word.isalpha():
                of.write(bytes(word + "\n", 'UTF-8'))
def tagsTrend(json_data, number):
    taglist = []
    for item in json_data:
        if item != "statistics":
            for t in json_data[item]:
                for tag in t['tags']:
                    taglist.append(tag)
    # TODO: use similarity.findTags instead
    fdist = FreqDist(taglist)
    out = []
    for x in fdist.most_common(number):
        if len(x[0]) > 1:
            out.append(x[0])
    return out
def animal_frequency(self, input_list, raw_text):
    animals = [word.lower() for word in input_list]
    # http://www.nltk.org/howto/stem.html
    stemmer = nltk.PorterStemmer()
    singles = [stemmer.stem(word) for word in raw_text]
    animal_text = [word for word in singles if word in animals]
    fdist1 = FreqDist(animal_text)
    top_50 = fdist1.most_common(50)
    return top_50
def get_words(tweets):
    cleaned_listings = []
    words1 = []
    # The original nested loop re-processed every tweet len(tweets) times;
    # a single pass over the tweets is enough.
    for listing in tweets:
        tokens = word_tokenize(listing)
        lowercase_tokens = [w.lower() for w in tokens]
        no_punctuation = [x.translate(table) for x in lowercase_tokens]
        alphabetic = [word for word in no_punctuation if word.isalpha()]
        words = [w for w in alphabetic if w not in stop_words]
        cleaned_listings.extend(words)
    fdist = FreqDist(cleaned_listings)
    for word, frequency in fdist.most_common(5):
        words1.append('{} - {}'.format(word, frequency))
    return words1
def get_label_preferences(self, cluster_key):
    def exists(label_tree, key):
        return key in label_tree.nodes

    n_grams = self.cluster_processed_df[
        self.cluster_processed_df.labels_tree.apply(lambda tree: exists(tree, cluster_key))
    ].n_grams_emb
    if len(n_grams) == 0:
        logger.info('Error: ' + cluster_key + ' does not exist in label graph.')
        return
    label_name_distr = FreqDist()
    for n_gram in n_grams:
        label = self._apply_positions(cluster_key, n_gram)
        label_name_distr[label] += 1
    return label_name_distr.most_common()
def stemmed_snowball(debate):
    snowball = SnowballStemmer("english")
    stemmed_words_snowball = [snowball.stem(w) for w in debate]
    fdist2 = FreqDist(stemmed_words_snowball)
    list_mc2 = fdist2.most_common(20)  # adjust this too!!!
    most_common_snowball = []
    for list_tuple in list_mc2:
        if list_tuple[1] > 5:
            most_common_snowball.append(list_tuple[0].encode('utf-8'))
    return most_common_snowball
def train(self, bookset):
    self.agg = AgglomerativeClustering(n_clusters=len(bookset))
    bookX = []
    for b in bookset:
        databook = ngrams(b, self.gramn)
        fdist = FreqDist(databook)
        common = fdist.most_common(100)
        inputlist = []
        for c in common:
            # NOTE: c[0] is an n-gram tuple; scikit-learn expects numeric
            # features, so these would need encoding (e.g. hashing) before
            # fit() will accept them.
            inputlist.append(c[0])
            inputlist.append(c[1])
        bookX.append(inputlist)
    self.agg.fit(bookX)
def metrics(self):
    fdist = FreqDist(self.token_clean)
    top_word = fdist.most_common(10)
    idx = text.ContextIndex(self.token)
    list_similarity = []
    for word in self.token:
        list_similarity.append(idx.similar_words(word))
    print(
        '\nMetrics\n' +
        '# Original words: ' + str(len(self.newcorpus)) + '\n' +
        '# Clean words: ' + str(len(self.newcorpus_clean_token)) + '\n' +
        'Word diff: ' + str(len(self.token) - len(self.token_clean)) + '\n' +
        'Concordance: ' + str(Text(self.token).concordance('sancho')) + '\n' +
        'Similarity:', list_similarity,
        '\nWord set: ', set(self.token),
        '\nTop 10 word freq: ', top_word)
def create_vectorizers(data_dict):
    topic_list = list(data_dict.keys())
    vectorizer_dict = {}
    for topic in topic_list:
        text_array = data_dict[topic]
        text = " ".join(text_array)
        word_list = tokenize_nltk(text)
        word_list = [word for word in word_list if word not in stopwords]
        freq_dist = FreqDist(word_list)
        top_200 = freq_dist.most_common(200)
        vocab = [
            wtuple[0] for wtuple in top_200
            if wtuple[0] not in stopwords and wtuple[0] not in string.punctuation
        ]
        vectorizer_dict[topic] = CountVectorizer(vocabulary=vocab)
    return vectorizer_dict
def extract_most_frequent_words(fileids, num_most_frequent):
    '''
    Function to extract the most frequent words from a corpus.

    Args:
        fileids: file ids for the documents in the reuters corpus.
        num_most_frequent: number of most frequent words the user wishes to compute.
    '''
    fdist = FreqDist()
    for fileid in fileids:
        for word in reuters.words(fileid):
            fdist[word.lower()] += 1
    most_frequent_words = [k for k, v in fdist.most_common(num_most_frequent)
                           if k.isalpha() and len(k) > 2
                           and k not in MOST_COMMON_WORDS_ENGLISH]
    return most_frequent_words
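# Hedged usage sketch against NLTK's Reuters corpus (requires
# nltk.download('reuters'); MOST_COMMON_WORDS_ENGLISH is a module-level
# stoplist the function assumes, stubbed here purely for illustration).
from nltk.corpus import reuters

MOST_COMMON_WORDS_ENGLISH = {'the', 'and', 'said', 'for'}
top_words = extract_most_frequent_words(reuters.fileids()[:100], 25)
print(top_words)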
def get_stop_words_1(data, num_stop_words):
    total_words = []
    for d in data:
        total_words.extend(d["ques"])
        total_words.extend(d["answer1"])
        for d_i in d["summary"]:
            total_words.extend(d_i)
    fdist = FreqDist(total_words)
    stop_words = [t[0] for t in fdist.most_common(num_stop_words)]
    pronoun_list = ["he", "she", "him", "her", "his", "them", "their", "they"]
    filtered_stop_words = [p for p in stop_words if p not in pronoun_list]
    return filtered_stop_words
def correct_spell_word(incorrect_word, dict_byword):
    main_word = incorrect_word
    size = len(main_word)
    all_word_condid = []
    # Collect every candidate word that shares a character bigram with the
    # misspelled word (one occurrence per shared bigram).
    for i in range(size - 1):
        bigram = main_word[0] + main_word[1]
        if bigram in dict_byword:
            for x in dict_byword.get(bigram):
                all_word_condid.append(x)
        main_word = main_word[1:]
    fdist = FreqDist(all_word_condid)
    most_common = dict(fdist.most_common(30))
    # Jaccard similarity over bigram sets: shared / (candidate + query - shared)
    contain_jacard = {}
    for x in most_common:
        number_bigram = len(x) - 1
        jacard = most_common.get(x) / (number_bigram + size - most_common.get(x))
        contain_jacard.update({x: jacard})
    sort_jacard = {
        k: v for k, v in sorted(contain_jacard.items(),
                                key=lambda item: item[1], reverse=True)
    }
    # Among the Jaccard-ranked candidates, keep those with minimum edit distance.
    editDistanceDict = {}
    for x in sort_jacard:
        ED = editDistance(x, incorrect_word, len(x), len(incorrect_word))
        editDistanceDict.update({x: ED})
    editDistanceDict_sort = {
        k: v for k, v in sorted(editDistanceDict.items(), key=lambda item: item[1])
    }
    minval = min(editDistanceDict_sort.values())
    correct_words = [k for k, v in editDistanceDict_sort.items() if v == minval]
    return correct_words
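# `dict_byword` is assumed to map each character bigram to the vocabulary
# words containing it; a minimal builder sketch (the name and shape are
# assumptions, not from the original):
def build_bigram_index(vocabulary):
    index = {}
    for word in vocabulary:
        for i in range(len(word) - 1):
            index.setdefault(word[i:i + 2], set()).add(word)
    return index

# e.g. correct_spell_word("acress", build_bigram_index(["across", "actress", "acres"]))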
def sentiment_ana_exwords():
    sen = ''
    sid = SentimentIntensityAnalyzer()
    df = pd.read_csv("pkm-19-clean.csv")
    for i in range(20, 27):
        df = df.append(pd.read_csv(f'pkm-{str(i)}-clean.csv'), ignore_index=True)
    # Collect the text of tweets with a neutral compound score
    for i in df['text']:
        ss = sid.polarity_scores(i)
        if ss['compound'] > -0.05 and ss['compound'] < 0.05:
            sen += i
    sen = sen.lower()
    toker = RegexpTokenizer(r'\w+')
    words = toker.tokenize(sen)
    stop_words = set(stopwords.words('english'))
    filtered_sentence = [w for w in words if w not in stop_words]
    fdist = FreqDist(filtered_sentence)
    print(fdist.most_common(50))
def termFreq(text, options):
    # Stop words for the configured language
    swords = set(stopwords.words(options['language']))
    table = str.maketrans(dict.fromkeys(string.punctuation))
    cleanText = text.translate(table)
    # Change to lower case and tokenize, excluding punctuation and stopwords
    words = [i.lower() for i in wordpunct_tokenize(cleanText) if i.lower() not in swords]
    fdist = FreqDist(words)
    result = [list(i) for i in fdist.most_common(options['termLimit'])]
    return result
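# Hedged usage sketch; the options keys are inferred from the function body.
options = {'language': 'english', 'termLimit': 5}
print(termFreq("The quick brown fox jumps over the lazy dog.", options))
# -> e.g. [['quick', 1], ['brown', 1], ['fox', 1], ['jumps', 1], ['lazy', 1]]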
def evaluate(row):
    record, stop_words = row
    text = nltk.word_tokenize(record['text'])
    # Stemming
    stemmer = SnowballStemmer("english")
    for indx, word in enumerate(text):
        text[indx] = stemmer.stem(word)
    fdist = FreqDist(text)
    unfiltered_frequencies = fdist.most_common(fdist.B())
    frequencies = [
        t for t in unfiltered_frequencies
        if t[0] not in stop_words and t[1] > 5 and len(t[0]) > 2
    ]
    return frequencies
def summarize(self):
    tokens = tokenize(self.text)
    entities = self.ner.extract_entities(tokens)
    entities_text = list()
    for e in entities:
        token_range = e[0]  # renamed from `range` to avoid shadowing the builtin
        tag = e[1]
        score = e[2]
        score_text = "{:0.3f}".format(score)
        entity_text = " ".join(tokens[i].decode() for i in token_range)
        # print("  Score: " + score_text + ": " + tag + ": " + entity_text)
        entities_text.append(entity_text)
    frequency_distribution = FreqDist(entities_text)
    return frequency_distribution.most_common(5)
def my_unigram(words):
    freq = FreqDist(words)
    freqDistr = freq.most_common(50)
    # Add-one smoothing over the top-50 counts
    for i in range(len(freqDistr)):
        freqDistr[i] = list(freqDistr[i])
        freqDistr[i][1] = freqDistr[i][1] + 1
    rowSum = 0
    for i in range(len(freqDistr)):
        rowSum = rowSum + freqDistr[i][1]
    # Negative log probability for each of the top-50 words
    uni = []
    for i in range(len(freqDistr)):
        uni.append([freqDistr[i][0], (-1 * numpy.log(freqDistr[i][1] / rowSum))])
    return uni
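# Worked example (an illustration, not from the original): for the tokens
# ['a', 'a', 'b'], counts become 3 and 2 after add-one, rowSum = 5, so
# my_unigram returns approximately:
#   [['a', 0.5108], ['b', 0.9163]]   # i.e. -ln(3/5) and -ln(2/5)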
def plot_iz():
    pd_data = pd.read_csv(DATA_PATH + 'iz.csv')
    tokens = pd_data['text'].apply(str.lower).apply(nltk.tokenize.word_tokenize)
    all_tokens = [token for text in tokens for token in text]
    fd = FreqDist(all_tokens)
    data = {x[0]: x[1] for x in fd.most_common(10)}
    plt.xticks(rotation='vertical')
    print(list(data.keys()))
    print(list(data.values()))
    plt.bar(list(data.keys()), list(data.values()), 1, edgecolor='w', color='g')
    plt.savefig('static/plot.png')
    return render_template('iz.html')
def AccessTwitter(search_string):
    key = configurations.consumer_key
    secret = configurations.consumer_secret
    access_token = configurations.access_token
    access_secret = configurations.access_secret
    auth = tweepy.OAuthHandler(consumer_key=key, consumer_secret=secret)
    auth.set_access_token(access_token, access_secret)
    api = tweepy.API(auth)

    tweets = []
    for tweet in api.search(q=search_string, count=100, lang="en"):
        tweets.append(tweet)
    data = pd.DataFrame(data=[tweet.text for tweet in tweets], columns=['Tweets'])
    data['clean_tweets'] = data['Tweets'].apply(lambda x: cleanTweet(x))
    data['subjectivity'] = data['clean_tweets'].apply(getsubjectivity)
    data['polarity'] = data['clean_tweets'].apply(getpolarity)
    data['analysis'] = data['polarity'].apply(getanalysis)

    all_words = ' '.join(tweet for tweet in data['clean_tweets'])
    allwords = word_tokenize(all_words)
    stop_words = set(stopwords.words("english"))
    filtered_sent = [w for w in allwords if w not in stop_words]
    fdist = FreqDist(filtered_sent)
    fd = pd.DataFrame(fdist.most_common(10), columns=["Word", "Frequency"]).drop([0]).reindex()

    # sentiment bar
    sentiment_bar = data['analysis']
    # sentiment pie
    sentiment_pie = data['analysis']
    # word cloud
    word_cloud = WordCloud(width=800, height=400, random_state=1,
                           max_font_size=120).generate(all_words)
    # frequently used words
    n = fd['Word']
    s = fd['Frequency']
    return (sentiment_bar, sentiment_pie, word_cloud, n, s)
def get_trends(tag):
    print(tag)
    tkns_CE = ''
    with open('tweets.txt', 'r', encoding='UTF8') as file:
        for line in file:
            pat = re.compile(tag)
            if pat.search(line.lower()) is not None:
                # Strip non-ASCII and non-word characters
                seg = re.sub(r'[^\x00-\x7f]+', r' ', line)
                seg = re.sub(r'[^a-zA-Z0-9_\s]+', '', seg)
                result = TextBlob(seg.lower())
                # Keep adjectives with a positive sentiment score
                for word, pos in result.tags:
                    if pos == 'JJ':
                        if sid.polarity_scores(word)['compound'] >= 0.5:
                            tkns_CE += ' ' + word
    tkns_CE = tkns_CE.split()
    # Merge tokens whose WordNet synsets are similar enough
    for i in range(len(tkns_CE)):
        for syns in wordnet.synsets(tkns_CE[i]):
            if syns:
                lemmas = syns.lemmas()
                for l in lemmas:
                    for j in range(i, len(tkns_CE)):
                        syns1 = wordnet.synsets(tkns_CE[j])
                        syns2 = wordnet.synsets(l.name())
                        if syns1:
                            sim = syns2[0].wup_similarity(syns1[0])
                            if sim:
                                if sim > 0.4:
                                    tkns_CE[j] = tkns_CE[i]
    fdist = FreqDist(tkns_CE)
    print(fdist.most_common(10))
def frequencyDis(str1):
    words = word_tokenize(str1)
    words_no_punc = [w.lower() for w in words if w.isalpha()]
    notwords = stopwords.words("english")
    clean_words = [w for w in words_no_punc if w not in notwords]
    freqDist3 = FreqDist(clean_words)
    return freqDist3.most_common(10)
def assignment3():
    drop_words = []
    word_list = nltk.corpus.gutenberg.words('melville-moby_dick.txt')
    fd = FreqDist(word_list)
    most_common = sorted(fd.most_common(100))
    # The extra step exists because drop-words should have a high frequency:
    # a drop-word that only occurs once is not worth dropping. Without it,
    # the drop-words could simply be built from word_list instead of from
    # an fd of the most common words.
    for word in most_common:
        if len(word[0]) <= 3:
            drop_words.append(word[0])
    clean = [word for word in word_list if word not in drop_words]
    new_fd = FreqDist(clean)
    return new_fd
def build_vocabulary(data_file):
    input_file = open(data_file, "r")
    input_file_contents = input_file.read()
    words = nltk.tokenize.word_tokenize(input_file_contents, 'english')
    fdist = FreqDist(words)
    print(fdist)
    # print(fdist.most_common(2000))
    output_file = open("../Vocabulary/vocabulary.txt", "w")
    for word, frequency in fdist.most_common(4000):
        if frequency >= 2 and word != '+' and word != '-':
            output_file.write(word + "\n")
    output_file.close()
    return 1
def filter_sentences(sentences, fraction_words_to_use=1):
    stops = set(stopwords.words("english")).union(string.punctuation)
    stemmer = PorterStemmer()
    for i in range(len(sentences)):
        sentences[i] = word_tokenize(sentences[i])
        sentences[i] = [stemmer.stem(word) for word in sentences[i] if word not in stops]
    words = list(itertools.chain.from_iterable(sentences))
    freq_dist = FreqDist(words)
    target_num = int(freq_dist.B() * fraction_words_to_use)
    # A set makes the per-word membership test below O(1)
    targets = {target[0] for target in freq_dist.most_common(target_num)}
    for i in range(len(sentences)):
        sentences[i] = [word for word in sentences[i] if word in targets]
    return sentences
def CalculateBestWords(corpus):
    # Create frequency distributions for later
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    # For each document in the corpus
    for document in corpus:
        # Split out the words from the label
        words = document[0]
        label = document[1]
        # For each word in the document
        for word in words:
            # Split off the word and frequency
            token, frequency = word.split(":")
            # Add the word to the distribution equal to the number of times
            # it occurs in the document
            for i in range(int(frequency)):
                word_fd[token.lower()] += 1
                label_word_fd[label][token.lower()] += 1

    # Figure out the number of words that apply to each label
    pos_word_count = label_word_fd['positive'].N()
    neg_word_count = label_word_fd['negative'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # Compute the probability that a word is in a given class, for each class
    for word, freq in word_fd.most_common(word_fd.N()):
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['positive'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['negative'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score

    # Sort the words by their score and retrieve the 5000 best
    best = sorted(word_scores.items(), key=operator.itemgetter(1), reverse=True)[:5000]
    best_words = set([w for w, s in best])
    return best_words
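# The corpus format the function assumes: each document is a pair of
# "token:count" strings and a 'positive'/'negative' label. A tiny
# illustrative example (the data is made up for demonstration):
corpus = [
    (["great:2", "movie:1"], 'positive'),
    (["awful:1", "movie:1"], 'negative'),
]
best_words = CalculateBestWords(corpus)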
def home():
    if request.method == 'POST':
        textName = request.form['textName']
        text = request.form['text']
        devideSentences = request.form.get('devideSentences')
        freqDist = request.form.get('freqDist')
        number = 0
        if freqDist is not None:
            number = request.form['number']
        sentences = sent_tokenize(text)
        clean_sentences = []
        if devideSentences == "on":
            for num, sent in enumerate(sentences):
                clear_sentence = tokenizer.tokenize(sent)
                clear_sentence_str = ' '.join([str(elem) for elem in clear_sentence])
                clean_sentences.append(clear_sentence_str)
        freq_dist = ''
        if freqDist == "on":
            clean = []
            filtered_words = []
            for num, sent in enumerate(sentences):
                clear_sentence = tokenizer.tokenize(sent)
                clean.append(clear_sentence)
            for sent in clean:
                for word in sent:
                    words = lem.lemmatize(word, "v")
                    if words not in stopWords:
                        filtered_words.append(word)
            dist = FreqDist(filtered_words)
            freq_dist = dist.most_common(int(number))
        return render_template('analyse.html', textName=textName, text=text,
                               clean_sentences=clean_sentences, freq_dist=freq_dist)
    return render_template('home.html')
def mostFrequentWords(filePath):
    import re
    import json
    import nltk
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    from nltk.probability import FreqDist

    clean(filePath)
    stop_words = set(stopwords.words('english'))
    stop_words_new = ['haha', 'https', 'hahaha', 'u', 'btw', 'dr', 'pm', 'am', 'like', 'lol', 'one', 'na', 'yeah',
        "a","a's","able","about","above","according","accordingly","across","actually","after","afterwards","again","against","ain't","all","allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","an","and","another","any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are","aren't","around","as","aside","ask","asking","associated","at","available","away","awfully",
        "b","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","believe","below","beside","besides","best","better","between","beyond","both","brief","but","by",
        "c","c'mon","c's","came","can","can't","cannot","cant","cause","causes","certain","certainly","changes","clearly","co","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn't","course","currently",
        "d","definitely","described","despite","did","didn't","different","do","does","doesn't","doing","don't","done","down","downwards","during",
        "e","each","edu","eg","eight","either","else","elsewhere","enough","entirely","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly","example","except",
        "f","far","few","fifth","first","five","followed","following","follows","for","former","formerly","forth","four","from","further","furthermore",
        "g","get","gets","getting","given","gives","go","goes","going","gone","got","gotten","greetings",
        "h","had","hadn't","happens","hardly","has","hasn't","have","haven't","having","he","he's","hello","help","hence","her","here","here's","hereafter","hereby","herein","hereupon","hers","herself","hi","him","himself","his","hither","hopefully","how","howbeit","however",
        "i","i'd","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch","inc","indeed","indicate","indicated","indicates","inner","insofar","instead","into","inward","is","isn't","it","it'd","it'll","it's","its","itself",
        "j","just","k","keep","keeps","kept","know","knows","known",
        "l","last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked","likely","little","look","looking","looks","ltd",
        "m","mainly","many","may","maybe","me","mean","meanwhile","merely","might","more","moreover","most","mostly","much","must","my","myself",
        "n","name","namely","nd","near","nearly","necessary","need","needs","neither","never","nevertheless","new","next","nine","no","nobody","non","none","noone","nor","normally","not","nothing","novel","now","nowhere",
        "o","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","only","onto","or","other","others","otherwise","ought","our","ours","ourselves","out","outside","over","overall","own",
        "p","particular","particularly","per","perhaps","placed","please","plus","possible","presumably","probably","provides",
        "q","que","quite","qv",
        "r","rather","rd","re","really","reasonably","regarding","regardless","regards","relatively","respectively","right",
        "s","said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","she","should","shouldn't","since","six","so","some","somebody","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify","specifying","still","sub","such","sup","sure",
        "t","t's","take","taken","tell","tends","th","than","thank","thanks","thanx","that","that's","thats","the","their","theirs","them","themselves","then","thence","there","there's","thereafter","thereby","therefore","therein","theres","thereupon","these","they","they'd","they'll","they're","they've","think","third","this","thorough","thoroughly","those","though","three","through","throughout","thru","thus","to","together","too","took","toward","towards","tried","tries","truly","try","trying","twice","two",
        "u","un","under","unfortunately","unless","unlikely","until","unto","up","upon","us","use","used","useful","uses","using","usually","uucp",
        "v","value","various","very","via","viz","vs",
        "w","want","wants","was","wasn't","way","we","we'd","we'll","we're","we've","welcome","well","went","were","weren't","what","what's","whatever","when","whence","whenever","where","where's","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","who's","whoever","whole","whom","whose","why","will","willing","wish","with","within","without","won't","wonder","would","would","wouldn't",
        "x","y","yes","yet","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves","z","zero",
        "a",",",".","?","!","|",":","'",";","<NUM>","?","$","km","s","u","&","#","'s","/","dr."]
    for s in stop_words_new:
        stop_words.add(s)
    with open('new.json') as f:
        data = json.load(f)
    allText = ""
    for key in data:
        text = data[key]['text']
        text = re.sub(r'\W+', ' ', text)
        line = re.sub(r"(^|\W)\d+", "", text)  # note: computed but unused
        allText = allText + " " + text
    word_tokens = word_tokenize(allText)
    # Filter stop words from the lower-cased tokens
    filtered_sentence = []
    for w in word_tokens:
        w = w.lower()
        if w not in stop_words:
            filtered_sentence.append(w)
    freq = FreqDist(filtered_sentence)
    for w in freq.most_common(10):
        print("word: " + w[0] + " frequency: " + str(w[1]))
    print("***")
    freq.plot(10, cumulative=False)
def _make_capped_word_index(stanford, dataset, vocab_size=20000):
    len_stanford = len(stanford.vocab)
    print("Stanford vocab original length:", len_stanford)
    print("Capped vocab (20k) fraction:", vocab_size / len_stanford)

    # Combine all tweets into one big array of words
    flatten = lambda l: [item for sublist in l for item in sublist]
    dataset_corpus = flatten(dataset["train_tweets"])

    # Count most frequent words
    fd = FreqDist(dataset_corpus)
    top20k = fd.most_common(vocab_size)

    # Make a new word index (vocab) with PAD and UNK as the first two tokens
    # (ordering is important)
    top20k_words = [PAD, UNK]
    top20k_words += [x[0] for x in top20k]
    word_index_20k = {word: idx for idx, word in enumerate(top20k_words)}
    return word_index_20k
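# Hedged usage sketch: `stanford` is assumed to be a word-embedding model
# exposing a .vocab mapping (e.g. gensim KeyedVectors for GloVe), and
# PAD/UNK module-level sentinel strings; every name below is an assumption
# made only for illustration.
PAD, UNK = "<pad>", "<unk>"
dataset = {"train_tweets": [["good", "morning"], ["good", "game"]]}

class _FakeEmbeddings:  # stand-in for the Stanford GloVe model
    vocab = {"good": 0, "morning": 1, "game": 2}

index = _make_capped_word_index(_FakeEmbeddings(), dataset, vocab_size=3)
# -> {'<pad>': 0, '<unk>': 1, 'good': 2, 'morning': 3, 'game': 4}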
def __init__(self, file):
    self.tokenized_sentences = []
    # Open file and replace carriage returns with spaces
    brexit_text = file.read().replace('\n', ' ')
    # Initialize tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    # Initialize stemmer (not supported yet)
    ps = PorterStemmer()
    # Tokenize sentences
    tokenized_words = tokenizer.tokenize(brexit_text.lower())
    self.sentences = nltk.sent_tokenize(brexit_text.lower())
    filtered_tokenized_words = []
    filtered_tokenized_sentences = []
    # Remove stopwords from the text
    for word in tokenized_words:
        if word not in stopwords.words('english'):
            filtered_tokenized_words.append(ps.stem(word))
    for b_sentence in self.sentences:
        filtered_tokenized_sentences.append([
            ps.stem(word) for word in tokenizer.tokenize(b_sentence)
            if word not in stopwords.words('english')
        ])
    # Create the fdist dictionary
    fdist_words = FreqDist(filtered_tokenized_words)
    self.fdist_dict = dict(fdist_words.most_common(fdist_words.N()))
    for k in self.fdist_dict.keys():
        print(k + "," + str(self.fdist_dict[k]))
    i = 0
    for filtered_sentence in filtered_tokenized_sentences:
        self.tokenized_sentences.append(
            sentence(
                sorted(filtered_sentence, key=lambda x: self.fdist_dict.get(x), reverse=True),
                i, file))
        i += 1
def get_word_frequency(self, size):
    file_name = self.disease_type + '-word-freq-' + str(size)
    # Store the filenames on self: the original assigned locals but
    # returned self attributes, which would raise AttributeError.
    if 'training' in file_name:
        self.full_training_word_freq_filename = file_name + '.csv'
        file = csv.writer(open(self.full_training_word_freq_filename, 'w'))
    else:
        self.full_test_word_freq_filename = file_name + '.csv'
        file = csv.writer(open(self.full_test_word_freq_filename, 'w'))
    fd = FreqDist(self.word_set)
    # print(fd.most_common(200))
    # print(fd.hapaxes())
    # fd.plot(50, cumulative=False)
    # Print word counts to a CSV file
    for key, count in fd.most_common(size):
        file.writerow([key.encode('utf-8'), count])
    return self.full_training_word_freq_filename, self.full_test_word_freq_filename
def parseDataSet():
    file1 = open("text.txt", "r")  # Open file
    print("... file opened")
    data = file1.read()  # Read & assign entire dataset
    # data = file1.read(2000)  # Read & assign first 2000 chars of dataset
    print("... file read")

    # --TOKENIZE--
    tokenized_word = word_tokenize(data)  # Split data into word tokens
    print("... file tokenized")

    # --REMOVE STOPWORDS--
    stopWords = set(stopwords.words('english'))
    addWords = ['I', 'Im', 'Its', 'bc', 'www', 'http', 'com', ',']  # Additional meaningless words
    filteredData = []
    forbiddenBar = re.compile('\\|\\|\\|')  # regex for |||
    for word in tokenized_word:  # Filter stop words from token set
        if word not in stopWords and word not in addWords:
            if not word.startswith("http") and not forbiddenBar.search(word):
                filteredData.append(word)
    print("... stopwords removed")

    # --STEMMING--
    ps = PorterStemmer()
    stemmedData = []
    for word in filteredData:
        stemmedData.append(ps.stem(word))
    print("... data stemmed")
    # TODO: --LEMMATIZATION--
    # TODO: --POS TAGGING--

    # Note: the distribution is built from the unstemmed tokens;
    # stemmedData is currently unused.
    fdist = FreqDist(filteredData)  # Pass token set and return distribution
    print(fdist.most_common(50))
    file1.close()  # Close file
    print("... file closed")
    commonWords(fdist)
class VocabBuilder:
    """
    Creates a vocabulary after scanning a corpus.
    """

    def __init__(self, lang="english", min_length=3, cut_first=100):
        """
        Set the minimum length of words and which stopword list
        (by language) to use.
        """
        self._counts = FreqDist()
        self._stop = set(stopwords.words(lang))
        self._min_length = min_length
        self._cut_first = cut_first
        print("Using stopwords: %s ... " % " ".join(list(self._stop)[:10]))

    def scan(self, words):
        """
        Add a list of words as observed.
        """
        for ii in [x.lower() for x in words
                   if x.lower() not in self._stop and len(x) >= self._min_length]:
            self._counts[ii] += 1

    def vocab(self, size=5000):
        """
        Return a list of the top words sorted by frequency.
        """
        sorted_list = self._counts.most_common(len(self._counts))
        count_list = []
        for i in range(size):
            if i == len(self._counts):
                break
            # Skip the cut_first most common words when the vocabulary is
            # large enough to afford it.
            if len(self._counts) > self._cut_first + size:
                count_list.append(sorted_list[self._cut_first + i][0])
            else:
                count_list.append(sorted_list[i][0])
        return count_list
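# Hedged usage sketch for VocabBuilder (the sample text is illustrative only):
builder = VocabBuilder(lang="english", min_length=3, cut_first=100)
builder.scan("the quick brown fox jumps over the lazy dog".split())
builder.scan("the quick red fox".split())
vocab = builder.vocab(size=10)
# With fewer than cut_first + size distinct words, the top words are
# returned directly, e.g. ['quick', 'fox', ...]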
def pnl_module(df):
    content = ' '.join(df["text"])
    content = re.sub(r"http\S+", "", content)
    content = content.replace('RT ', ' ').replace('&', 'and')
    content = re.sub('[^A-Za-z0-9]+', ' ', content)
    content = content.lower()
    tokenized_word = word_tokenize(content)
    # Extra fine filter
    tokenized_word = [word for word in tokenized_word if len(word) > 3]
    stop_words = set(stopwords.words("spanish"))
    filtered_sent = [w for w in tokenized_word if w not in stop_words]
    fdist = FreqDist(filtered_sent)
    fd = pd.DataFrame(fdist.most_common(15), columns=["Word", "Frequency"]).drop([0]).reindex()
    return fd
def get_features(self, topk):
    # A good vocab size for just Twitter data is 2800
    fdist = FreqDist()
    for line in self.input:
        for word in word_tokenize(line):
            fdist[word] += 1
    print(len(fdist))
    common = fdist.most_common(topk)
    vocab = {}
    i = 0
    for pair in common:
        vocab[pair[0]] = i
        i += 1
    vocab["UNK"] = i
    return vocab
def unigramAGivenFolder(folder):
    to_save_folder = "./#Unigram[.]/"
    folder_name = "./" + folder + "/"
    data_path = folder_name + "data.doc"
    fw = open(data_path, "r", encoding="utf8")
    text = fw.read()
    print(len(text))
    # text = text[:10000]
    words = word_tokenize(text)
    fdist = FreqDist(w for w in words if len(w) > 1 and w != "``")
    keys = fdist.most_common(len(fdist.keys()))
    dataFreq = ""
    for key in keys:
        dataFreq += str(key[0]) + " , " + str(key[1]) + "\n"
    make_sure_path_exists(to_save_folder + folder)
    writer = open(to_save_folder + folder + "/" + folder + "[unigram].csv", "w+", encoding="utf8")
    writer.write(dataFreq)
    fw.close()
    writer.close()
def get_summarized(self, input, num_sentences):
    # TODO: allow the caller to specify the tokenizer they want
    # TODO: allow the user to specify the sentence tokenizer they want
    tokenizer = RegexpTokenizer(r'\w+')

    # Get the frequency of each word in the input
    base_words = [word.lower() for word in tokenizer.tokenize(input)]
    words = [word for word in base_words if word not in stopwords.words()]
    word_frequencies = FreqDist(words)

    # Now create a set of the most frequent words
    most_frequent_words = [pair[0] for pair in word_frequencies.most_common(100)]

    # Break the input up into sentences. working_sentences is used for the
    # analysis, but actual_sentences is used in the results so
    # capitalization will be correct.
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    actual_sentences = sent_detector.tokenize(input)
    working_sentences = [sentence.lower() for sentence in actual_sentences]

    # Iterate over the most frequent words, and add the first sentence that
    # includes each word to the result.
    output_sentences = []
    for word in most_frequent_words:
        for i in range(0, len(working_sentences)):
            if (word in working_sentences[i]
                    and actual_sentences[i] not in output_sentences):
                output_sentences.append(actual_sentences[i])
                break
            if len(output_sentences) >= num_sentences:
                break
        if len(output_sentences) >= num_sentences:
            break

    # Sort the output sentences back to their original order
    return self.reorder_sentences(output_sentences, input)
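# Hedged usage sketch, assuming this method lives on a summarizer class that
# also provides reorder_sentences(output_sentences, input); the class name
# below is an assumption:
# summarizer = SimpleSummarizer()
# print(summarizer.get_summarized(article_text, 3))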
    # (tail of parse_abstracts: filter tokens, build bigram counts)
    meaningful_words = [w for w in token_noun if w not in stops]
    pat_bigrams = bigrams(meaningful_words)
    pat_bigrams = [' '.join(x) for x in pat_bigrams]
    counter_final_words = Counter(pat_bigrams)
    string_words = ' '.join(meaningful_words)
    return counter_final_words, string_words

# The Counter creates a TDM for each corpus
counter_1, words_1 = parse_abstracts(name1 + '_uspc_2012.csv')
counter_2, words_2 = parse_abstracts(name2 + '_uspc_2012.csv')
# collac_1 = words_1.collocations()
# print(collac_1)

fdist1 = FreqDist(counter_1)
fdist1_top50 = fdist1.most_common(50)
print('\nTop 50 Bigram List for', name1)
print(fdist1_top50)
# fdist1.plot(25, cumulative=False)

fdist2 = FreqDist(counter_2)
fdist2_top50 = fdist2.most_common(50)
print('\nTop 50 Bigram List for', name2)
print(fdist2_top50)

# all_items is a set of all unique words used in both counters
all_items = set(counter_1.keys()).union(set(counter_2.keys()))

# Create a vector of the counts of all words in each corpus
vector_1 = [counter_1[k] for k in all_items]
# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred) * 100

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Frequency of words
from nltk.probability import FreqDist
import string

words = dataset.iloc[:, 0].values
a = ''.join(words)
# Strip punctuation before tokenizing
b = ''.join(['' if ch in string.punctuation else ch for ch in a])
tokens = nltk.tokenize.word_tokenize(b)
fdist = FreqDist(tokens)
mostWords = fdist.most_common(100)
def frequencyDistribution():
    # `text` is expected to be defined at module level
    t = word_tokenize(text)
    freq = FreqDist(t)
    print(freq.most_common(50))
def GET(self, jsoninput):
    wordlist = word_tokenize(jsoninput)
    # Drop bare punctuation tokens
    wordlist = [a for a in wordlist if a not in ('.', ',', '?', '!')]
    fd = FreqDist(wordlist)
    mostcom = fd.most_common(10)
    return jsonify('mostfreq', mostcom)
stop_words = nltk.corpus.stopwords.words('spanish')
non_alphabetic = re.compile(r"\W|\d")
words = []
tags = []

# Using TreeTagger
# 1) pip install treetaggerwrapper
# 2) put TreeTagger in %PYHOME%\Lib\site-packages\TreeTagger
# 3) put spanish-utf8.par and spanish-chunker.par in \TreeTagger\lib
# See http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/spanish-tagset.txt
# for tag meanings
tagger = treetaggerwrapper.TreeTagger(TAGLANG='es')
for sentence in article_corpus.sents():
    tagged_sentence = tagger.tag_text(sentence)
    tags.extend(treetaggerwrapper.make_tags(tagged_sentence))

# TODO: create a tagger script, save the tagged files
# TODO: look at alternate taggers, compare
# TODO: profile this and see which part is taking so long
for tag in tags:
    lemma = tag[2].lower()
    if lemma not in stop_words and not non_alphabetic.search(lemma):
        words.append(lemma)

freq_dist = FreqDist(words)
with open('./frequency_distribution.txt', 'w', encoding='utf-8') as f:
    f.write("word, number of occurrences\n")
    for word in freq_dist.most_common():
        f.write(word[0] + ", " + str(word[1]) + "\n")