def filter_words(words):
    """Build a FreqDist of *words* with Arabic stopwords and short tokens removed.

    Args:
        words: iterable of token strings.

    Returns:
        FreqDist keyed by the surviving words.
    """
    new_words = FreqDist(words)
    stopwords = get_stop_words('ar')
    # Iterate over a snapshot of the keys: popping while iterating the live
    # keys() view raises RuntimeError on Python 3.
    for word in list(new_words.keys()):
        # Single combined test — the original popped the same key twice
        # (KeyError) when a word was both a stopword and <= 2 chars long.
        if word in stopwords or len(word) <= 2:
            new_words.pop(word)
    return new_words
def worst_errors_many_wrong_decisions(self, k, feature_extractor):
    """Return the k most frequent (lowercased) features among misclassified docs.

    Args:
        k: number of top features to return.
        feature_extractor: callable mapping a word list to a feature dict.

    Returns:
        List of the k most frequent feature strings, most frequent first.
    """
    features = []
    wrong_docs = self.error_prediction_docs(self.maintest, self.testClassify)
    for doc in wrong_docs:
        feature_dic = feature_extractor(movie_reviews.words(fileids=[doc]))
        # `list + dict_keys` raises TypeError on Python 3; extend() accepts
        # any iterable, so this works on both 2 and 3.
        features.extend(feature_dic.keys())
    fd = FreqDist(feature.lower() for feature in features)
    worst_errors = []
    # Repeatedly take the current most frequent feature, k times.
    for _ in range(k):
        x = fd.max()
        fd.pop(x)
        worst_errors.append(x)
    return worst_errors
def worst_errors_many_wrong_decisions(self, k, feature_extractor):
    """Return the k most frequent (lowercased) features among misclassified docs.

    NOTE(review): this appears to be a duplicate of an identical method earlier
    in the file — consider removing one copy.

    Args:
        k: number of top features to return.
        feature_extractor: callable mapping a word list to a feature dict.

    Returns:
        List of the k most frequent feature strings, most frequent first.
    """
    features = []
    wrong_docs = self.error_prediction_docs(self.maintest, self.testClassify)
    for doc in wrong_docs:
        feature_dic = feature_extractor(movie_reviews.words(fileids=[doc]))
        # `list + dict_keys` raises TypeError on Python 3; extend() accepts
        # any iterable, so this works on both 2 and 3.
        features.extend(feature_dic.keys())
    fd = FreqDist(feature.lower() for feature in features)
    worst_errors = []
    # Repeatedly take the current most frequent feature, k times.
    for _ in range(k):
        x = fd.max()
        fd.pop(x)
        worst_errors.append(x)
    return worst_errors
def word_count(text, exclude_inputlist):
    """FreqDist of alphabetic words in *text*, minus excluded and rare words.

    Args:
        text: iterable of token strings.
        exclude_inputlist: extra words to exclude, on top of English stopwords.

    Returns:
        FreqDist of lowercased words occurring at least twice.
    """
    frequency = FreqDist(wd.lower() for wd in text if wd.isalpha())
    # Set gives O(1) membership tests instead of scanning a list per word.
    excludelist = set(stopwords.words('english')) | set(exclude_inputlist)
    # Iterate over a snapshot of the keys: popping while iterating the live
    # keys() view raises RuntimeError on Python 3. The original also re-tested
    # isalpha(), which is always true here since non-alpha tokens were
    # filtered out when `frequency` was built.
    for word in list(frequency.keys()):
        if word in excludelist or frequency[word] < 2:
            frequency.pop(word)
    return frequency
# removing numeric digits from list of words filteredStopwords = [i for i in filteredStopwords if not i.isdigit()] freqDist = FreqDist(filteredStopwords) print("In HHBD Hindi Bible") print(f"प्रेम appears for {freqDist['प्रेम']} times") print(f"डर appears for {freqDist['डर']} times") checkWords = ["यीशु", "मसीह", "उद्धारकर्ता", "उद्धार", "क्रूस"] checkWordFreq = {} for checkWord in checkWords: checkWordFreq[checkWord] = freqDist[checkWord] print(checkWordFreq) freqDist.pop("राजा", None) sents = [] for i in text.split("॥"): sents.append(i.split("।")) sents = [item for sublist in sents for item in sublist] from collections import defaultdict ranking = defaultdict(int) for i, sent in enumerate(sents): for token in tokenizeWord(sent): if token in freqDist: ranking[i] += freqDist[token] from heapq import nlargest
for x, y in itertools.zip_longest(f_list, f_list2, fillvalue=(0, 0)): rank_tup = tuple(['Rank:' + str(rank)]) new_element = tuple([rank_tup + x]) if x[1] > y[1]: ranked_list += new_element rank += 1 elif x[1] == y[1]: ranked_list += new_element return ranked_list print('Ranked frequency distribution in descending order of frequency', ranked_freq_dist(fdist)) # Removing stop words and punctuations from the words list. filtered_words = [w for w in words if not w in stop_words and w.isalnum()] # Creating a FreqDist object from the filtered list. filtered_fdist = FreqDist(filtered_words) # Plotting the top 10 words. I didnt plot the top 50 as the graph gets very hard to read on a small screen. #filtered_fdist.plot(10, title='Frequency Distribution') # Checking the number of occurences of the words 'America' and 'world'. print('Occurences of \'America\':', filtered_fdist.pop('America'), '\nOccurences of \'world\':', filtered_fdist.pop('world'))