def get_summarized(self, inputContent, num_sentences):
    base_words = [word.lower() for word in nltk.word_tokenize(inputContent)]
    words = [word for word in base_words if word not in stopwords.words()]
    word_frequencies = FreqDist(words)
    # most_common() keeps the frequency ordering the variable name implies
    most_frequent_words = [pair[0] for pair in word_frequencies.most_common()]
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    actual_sentences = sent_detector.tokenize(inputContent)
    working_sentences = [sentence.lower() for sentence in actual_sentences]
    output_sentences = []
    for word in most_frequent_words:
        for i in range(0, len(working_sentences)):
            if (word in working_sentences[i]
                    and actual_sentences[i] not in output_sentences):
                output_sentences.append(actual_sentences[i])
                break
            if len(output_sentences) >= num_sentences:
                break
        if len(output_sentences) >= num_sentences:
            break
    return output_sentences
def summarize(self, input, num_sentences):
    punt_list = [".", ",", "!", "?"]
    summ_sentences = []
    sentences = sent_tokenize(input)
    lowercase_sentences = [sentence.lower() for sentence in sentences]

    s = list(input)
    ts = "".join([o for o in s if o not in punt_list]).split()
    lowercase_words = [word.lower() for word in ts]
    words = [word for word in lowercase_words if word not in stopwords.words()]
    word_frequencies = FreqDist(words)
    # most_common(100) replaces the old NLTK 2 idiom word_frequencies.items()[:100]
    most_frequent_words = [pair[0] for pair in word_frequencies.most_common(100)]

    # add sentences with the most frequent words
    for word in most_frequent_words:
        for i in range(0, len(lowercase_sentences)):
            if len(summ_sentences) < num_sentences:
                if lowercase_sentences[i] not in summ_sentences and word in lowercase_sentences[i]:
                    summ_sentences.append(sentences[i])
                    break

    # reorder the selected sentences to their original order (key= replaces the old cmp-style sort)
    summ_sentences.sort(key=lambda s1: input.find(s1))
    return " ".join(summ_sentences)
def get_negative_grams(filePath, n):
    l = list()
    # Open the file and read the review text
    with codecs.open(filePath, 'r') as myfile:
        sentence = myfile.read()
    sentence = sentence.replace('points forts', ' ')
    sentence = sentence.replace('points faibles', ' ')
    sentence = sentence.replace('commentaires', ' ')
    n_grams = ngrams(sentence.split(), n)
    s = ''
    for grams in n_grams:
        if ('est pas' in grams or 'ai pas' in grams or 'pas' in grams or 'cher' in grams):
            s += str(grams) + '\n'
            l.append(grams)
    '''fe = open('negative-' + str(n) + '-gram.txt', 'w')
    fe.write(s)
    fe.close()'''
    Dict = FreqDist(l)
    Dict = sorted(Dict.items(), key=operator.itemgetter(1), reverse=True)
    t = ''
    for x in Dict:
        t += '(\'' + str(x[0]) + '\' , ' + str(x[1]) + ')\n'
    fe = open('stats/Freq_negative-' + str(n) + '-gram.txt', 'w')
    fe.write(t)
    fe.close()
def getBestWords(posWords, negWords):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word.lower()] += 1
        label_word_fd["pos"][word.lower()] += 1
    for word in negWords:
        word_fd[word.lower()] += 1
        label_word_fd["neg"][word.lower()] += 1
    pos_word_count = label_word_fd["pos"].N()
    neg_word_count = label_word_fd["neg"].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd["pos"][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd["neg"][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    # best = sorted(word_scores.iteritems(), key=lambda (w,s): s, reverse=True)[:10000]
    sorted_x = sorted(word_scores.items(), key=operator.itemgetter(1), reverse=True)
    bestwords = set([w for w, s in sorted_x])
    return bestwords
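# A minimal usage sketch for getBestWords (not part of the original snippet): it assumes
# small, hypothetical posWords/negWords token lists and feeds the returned set into a
# presence-based feature extractor of the kind used elsewhere in this collection. Note that
# getBestWords ranks every word but never slices the ranking, so callers that want only the
# top-N terms would truncate sorted_x before building the set.
def best_word_feats(words, bestwords):
    # keep only the informative tokens as boolean "presence" features
    return {word: True for word in words if word in bestwords}

# hypothetical token lists; in practice these come from tokenized, labelled reviews
pos_tokens = "great phone love the battery".split()
neg_tokens = "terrible battery hate the screen".split()
bestwords = getBestWords(pos_tokens, neg_tokens)
print(best_word_feats("love the battery".split(), bestwords))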
def bysegment(db):
    dist = FreqDist()
    total = 0
    while db.hasNext():
        fragments = db.nextPwd()
        pwd = fragments[0].password
        for f in fragments:  # iterate through fragments
            total += 1
            if total % 100000 == 0:
                print("{} segments processed...".format(total))
            if f.is_gap():
                dist['gap'] += 1
            else:
                raw_word = pwd[f.s_index:f.e_index]
                if raw_word.isupper():
                    dist['upper'] += 1
                elif raw_word.istitle():
                    dist['capitalized'] += 1
                elif raw_word.islower():
                    dist['lower'] += 1
                else:
                    dist['mangled'] += 1
    for k, v in dist.items():
        print("{}\t{}".format(k, v))
def train_MLT(self, tagged_train_data, untagged_training_data):
    """
    Builds a most likely tag tagger from the given tagged training data as WORDS
    :param tagged_train_data:
    :param untagged_training_data:
    :return: model
    """
    # find the set of words
    words = set()
    for sent in untagged_training_data:
        for word in sent:
            words.add(word)
    # mlt_dict maps each word to {(word, tag1): count1, (word, tag2): count2, ...}
    mlt_dict = dict()
    # Initialize keys and values to it
    for word in words:
        mlt_dict[word] = dict()
    # Compute the freq dist of tagged words
    tagged_words_fdist = FreqDist(tagged_train_data)
    for tagged_word, count in tagged_words_fdist.items():
        mlt_dict[tagged_word[0]][tagged_word] = count
    # Update the dict to contain the most likely tag for each word
    # for word, inside_dict in mlt_dict.items():
    #     max_val = max(inside_dict.values())
    print("Training is done!")
    return mlt_dict
def high_words(posids, negids, cutoff, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    pos = 0
    neg = 0
    for review in posids:
        pos += 1
        if pos != cutoff:
            for word in review['text'].split(' '):
                word_fd.update(token_helpers.tokenize_simple(word))
                label_word_fd['pos'].update(token_helpers.tokenize_simple(word))
    for review in negids:
        neg += 1
        if neg != cutoff:
            for word in review['text'].split(' '):
                word_fd.update(token_helpers.tokenize_simple(word))
                label_word_fd['neg'].update(token_helpers.tokenize_simple(word))
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    best = sorted(word_scores.items(), key=itemgetter(1), reverse=True)[:10000]
    bestwords = set([w for w, s in best])
    return bestwords
def make_summary(text):
    stemmed = []
    tokens = word_tokenize(text)
    sent = sent_tokenize(text)
    # filter stop words with a comprehension: removing items from a list
    # while iterating over it skips elements
    tokens = [token for token in tokens if token not in stopwords.words('english')]
    stemmer = PorterStemmer()
    for token in tokens:
        stemmed.append(stemmer.stem(token))
    # str.lower() returns a new string, so the original bare word.lower() call was a no-op
    stemmed = [word.lower() for word in stemmed]
    word_freq = FreqDist(stemmed)
    # most_common(60) replaces the old NLTK 2 idiom word_freq.items()[:60]
    most_freq_words = [pair[0] for pair in word_freq.most_common(60)]
    working_sent = [sentence.lower() for sentence in sent]
    out_sent = []
    for word in most_freq_words:
        for i in range(0, len(working_sent)):
            if word in working_sent[i] and sent[i] not in out_sent:
                out_sent.append(sent[i])
                break
            if len(out_sent) >= 5:
                break
        if len(out_sent) >= 5:
            break
    return reorder(out_sent, text)
def create_word_scores():
    posWords = pickle.load(open('pos_review.pkl', 'rb'))
    negWords = pickle.load(open('neg_review.pkl', 'rb'))

    posWords = list(itertools.chain(*posWords))  # flatten the nested lists into one list
    negWords = list(itertools.chain(*negWords))  # likewise

    word_fd = FreqDist()  # frequency of every word
    cond_word_fd = ConditionalFreqDist()  # word frequencies within the positive and negative texts
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()  # number of positive words
    neg_word_count = cond_word_fd['neg'].N()  # number of negative words
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        # chi-square statistic of the word in each class
        # (mutual information or another association measure could be used instead)
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        # a word's informativeness is the sum of its positive and negative chi-square scores
        word_scores[word] = pos_score + neg_score

    return word_scores  # every word together with its informativeness
def create_word_bigram_scores():
    posdata = pickle.load(open('pos_review.pkl', 'rb'))
    negdata = pickle.load(open('neg_review.pkl', 'rb'))

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    bigram_finder_pos = BigramCollocationFinder.from_words(posWords)
    bigram_finder_neg = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finder_pos.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = bigram_finder_neg.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams  # single words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
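# Several of the scoring functions in this collection return a word_scores dict; the usual
# next step is to keep only the top-N most informative entries. A minimal companion sketch
# (the name find_best_words and the cutoff of 1500 are assumptions, not part of the code above):
def find_best_words(word_scores, number):
    # rank by informativeness and keep the top `number` words as a set
    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:number]
    return set(w for w, s in best)

# hypothetical usage with the scorer defined above
# best_words = find_best_words(create_word_bigram_scores(), 1500)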
def summarize(self, input, num_sentences):
    punt_list = ['.', ',', '!', '?']
    summ_sentences = []
    sentences = input
    # sentences = sent_tokenize(input)
    lowercase_sentences = [sentence.lower() for sentence in sentences]

    saito = ' '.join(sentences)
    s = input
    ts = ''.join([o for o in s if o not in punt_list]).split()
    lowercase_words = [word.lower() for word in ts]
    words = [word for word in lowercase_words if word not in stopwords.words()]
    word_frequencies = FreqDist(words)
    # most_common(100) replaces the old NLTK 2 idiom word_frequencies.items()[:100]
    most_frequent_words = [pair[0] for pair in word_frequencies.most_common(100)]

    # add sentences with the most frequent words
    if len(s) < num_sentences:
        num_sentences = len(s)
    for word in most_frequent_words:
        for i in range(len(lowercase_sentences)):
            if len(summ_sentences) < num_sentences:
                if (lowercase_sentences[i] not in summ_sentences
                        and word in lowercase_sentences[i]):
                    summ_sentences.append(lowercase_sentences[i])
            else:
                break
        if len(summ_sentences) >= num_sentences:
            break

    # reorder the selected sentences to their order of appearance (key= replaces the old cmp-style sort)
    summ_sentences.sort(key=lambda s1: saito.find(s1))
    return summ_sentences
def create_word_bigram_scores(posWords, negWords, n=5000):
    # (posWords, negWords) = readwordarr()
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    bigramfinder = BigramCollocationFinder.from_words(posWords)
    posbigrams = bigramfinder.nbest(BigramAssocMeasures.chi_sq, n)
    bigramfinder = BigramCollocationFinder.from_words(negWords)
    negbigrams = bigramfinder.nbest(BigramAssocMeasures.chi_sq, n)

    posWords = posWords + posbigrams
    negWords = negWords + negbigrams

    wordscores = {}
    wordfd = FreqDist()
    conditionwordfd = ConditionalFreqDist()
    for word in posWords:
        wordfd[word] += 1
        conditionwordfd['pos'][word] += 1
    for word in negWords:
        wordfd[word] += 1
        conditionwordfd['neg'][word] += 1

    pos_word_count = conditionwordfd['pos'].N()
    neg_word_count = conditionwordfd['neg'].N()
    totalcount = pos_word_count + neg_word_count
    for word, freq in wordfd.items():
        pos_score = BigramAssocMeasures.chi_sq(conditionwordfd['pos'][word],
                                               (freq, pos_word_count), totalcount)
        neg_score = BigramAssocMeasures.chi_sq(conditionwordfd['neg'][word],
                                               (freq, neg_word_count), totalcount)
        wordscores[word] = pos_score + neg_score
    return wordscores
def get_words_frequency(string, top_values):
    """
    Gets the word frequencies in a corpus
    :param string: corpus
    :param top_values: maximum number of sorted values to return
    :return: list of frequencies of the words in their synset form
    """
    # import stop words from the nltk corpus
    stop_words_en_nltk = list(stopwords.words('english'))
    # create additional stop words for punctuation and others
    stop_words_en_custom = ['.', ',', '\'', '!', '(', ')', ':', ';', '?', '--', '*', '[', ']', '``',
                            str("''"), '&', '\'ll', '\'ve', '\'s', '\'re', 'a', 'b', 'c', 'i', '\'i',
                            'this', 'n\'t', 'a', 'could', 'should', 'would', 'can', 'will', 'shall',
                            'there', 'it', 'also', 'in', 'the', 'many', 'by', 'an', '1990s', 'the',
                            '+', '-', '...', '=', '%', '#', '[hide]', '[edit]', '.jpg', '/',
                            'be.v.01', 'have.v.01', 'use.v.01', 'besides.r.02', 'analysis.n.01',
                            'categorization.n.03', 'vitamin_e.n.01', 'vitamin_c.n.01', 'include.v.01',
                            'such.s.01', 'many.a.01', 'order.n.01', 'episode.n.01', 'show.n.01',
                            'not.r.01', 'standard.n.01', 'survey.n.01', 'factor.n.01', 'first.a.01']
    until_number = 300
    stop_words_en_custom_numbers = [str(i) for i in range(until_number + 1)]
    # add them together
    stop_words_en = stop_words_en_nltk + stop_words_en_custom + stop_words_en_custom_numbers

    words_list_tmp = word_tokenize(string.lower())
    words_list = []
    lemmatizer = WordNetLemmatizer()
    for word in nltk.pos_tag(words_list_tmp):
        tag = get_word_tag(word[1])
        if tag != '':
            try:
                synset_word = wordnet.synsets(lemmatizer.lemmatize(word[0], pos=tag), pos=tag)[0]
                words_list.append(synset_word.name())
            except IndexError:
                # no synset found for this word
                pass

    processed_word_list = [word for word in words_list if word not in stop_words_en]
    text_obj = nltk.Text(processed_word_list)
    fd = FreqDist(text_obj)
    result = list(fd.items())
    if top_values != 0:
        result.sort(key=lambda x: x[1], reverse=True)
        result = result[:top_values]
    return result
def probDist():
    # file pointers to the input documents
    f1 = open(os.path.join('allfiles', 'document01-finance.txt'), "r")
    f2 = open(os.path.join('allfiles', 'document02-finance.txt'), "r")
    f3 = open(os.path.join('allfiles', 'document03-finance.txt'), "r")
    f4 = open(os.path.join('allfiles', 'document04-ee.txt'), "r")
    f5 = open(os.path.join('allfiles', 'document05-ee.txt'), "r")

    # read the file contents
    line1 = f1.read()
    line2 = f2.read()
    line3 = f3.read()
    line4 = f4.read()
    line5 = f5.read()

    # document01-finance.txt is the writer document and the other files are
    # reader files, so the word list comes from the writer document
    words = line1.split()
    X_words = []
    # dictionary to store the relative frequency of each term
    dict_x1 = {}
    # use nltk to calculate the frequency of each word
    unigramWordList = FreqDist(words)
    datalen = len(unigramWordList)  # number of distinct words in the document
    for k, v in unigramWordList.items():
        X_words.append(k)
        dict_x1[k] = v / float(datalen)
        pd_x1.append(v / float(datalen))

    # create the probability distribution of the other files
    for word in X_words:
        pd_x2.append(line2.count(word) / float(datalen))
        pd_x3.append(line3.count(word) / float(datalen))
        pd_x4.append(line4.count(word) / float(datalen))
        pd_x5.append(line5.count(word) / float(datalen))

    # calculate the total probability distribution across all files
    line_S = line1 + line2 + line3 + line4 + line5
    for word in X_words:
        s.append(line_S.count(word) / float(datalen))
    print(s)
def opinion_tokens_Fr(liste):
    # Create the results folder in case it doesn't exist
    result = "stats"
    if not os.path.exists(result):
        os.mkdir(result, 0o777)
    i = 0
    comments = ''
    while i < len(liste):
        comments += liste[i] + '\n'
        i += 1
    comments = comments.lower()
    # Write the collected comments to a file
    f = open('opinions.txt', 'w')
    f.write(comments)
    f.close()
    # punctuation plus French function words (conjunctions, pronouns, articles, ...)
    w = ['"', '→', '–', '’', '»', '«', ',', '.', '[', ']', '|', '{', '}', ':', ';', '!', '?', '(', ')',
         '_', '-', '=', '/',
         ' qui ', ' cette ', ' mais ', ' ou ', ' où ', ' et ', ' donc ', ' or ', ' ni ', ' car ',
         ' la ', ' là ', ' le ', ' les ', ' de ', ' des ', ' du ', ' tout ', ' tous ', ' toutes ',
         ' que ', ' comme ', ' si ', ' quand ', ' je ', ' tu ', ' il ', ' elle ', ' nous ', ' vous ',
         ' ils ', ' elles ', ' un ', ' une ', ' au ', ' aux ', ' dans ', ' ce ', ' se ', ' ces ',
         ' ses ', ' on ', ' en ', ' leur ', ' leurs ', ' a ', ' à ', ' pour ', ' par ', ' sous ', ' sur ']
    # Read the file back and clean it up
    with codecs.open('opinions.txt', 'r') as myfile:
        content = myfile.read()
    content = content.replace('points forts', ' ')
    content = content.replace('points faibles', ' ')
    content = content.replace('commentaires', ' ')
    # remove numeric forms
    content = ''.join([i for i in content if not i.isdigit()])
    while w:
        # remove conjunctions, connectors, ...
        content = content.replace(w.pop(0), ' ')
    content = content.split()
    tokenDict = FreqDist(content)
    tokenDict = sorted(tokenDict.items(), key=operator.itemgetter(1), reverse=True)
    s = ''
    for x in tokenDict:
        s += '(\'' + x[0] + '\' , ' + str(x[1]) + ')\n'
    fe = open('stats/freq_tokens.txt', 'w')
    fe.write(s)
    fe.close()
    return tokenDict
def main():
    # Number of words to display
    count = 40

    # Open files as strings
    obama = open("obama.txt", "r").read()
    bush = open("bush.txt", "r").read()

    # Tokenize texts into words, then count frequencies for all words
    top_obama = FreqDist(word.lower() for word in word_tokenize(obama))
    top_bush = FreqDist(word.lower() for word in word_tokenize(bush))

    # Print the top {count} most occurring words
    print("No stoplist".upper())
    print("Obama/2009\t".upper(), " ".join(item[0] for item in top_obama.most_common(count)))
    print("Bush/2001\t".upper(), " ".join(item[0] for item in top_bush.most_common(count)))

    # Print the most occurring words that are not in the NLTK English stoplist
    print()
    print("Stoplisted".upper())
    print("Obama/2009\t".upper(), " ".join([item[0] for item in top_obama.most_common()
                                            if item[0] not in stopwords.words('english')][:count]))
    print("Bush/2001\t".upper(), " ".join([item[0] for item in top_bush.most_common()
                                           if item[0] not in stopwords.words('english')][:count]))
def CheckSSNStats(FileName):
    SSNList = []
    f = open(FileName, 'r')
    reader = csv.reader(f, delimiter='|')
    for line in reader:
        SSNList.append(line[3].strip())
    SSNStatsDict['SSNCount'] = len(SSNList) - 1
    fdist = FreqDist(SSNList)
    frequencies = OrderedDict(sorted(fdist.items(), key=lambda x: x[1], reverse=True))
    SSNStatsDict['DistinctSSNCount'] = len(frequencies)
    for k, v in list(frequencies.items())[:10]:
        SSNFDDict[k] = v
    SSNStatsDict['FreqDist'] = SSNFDDict
    print("Check SSN Stats: Complete")
def CheckRIDStats(FileName):
    RIDList = []
    f = open(FileName, 'r')
    reader = csv.reader(f, delimiter='|')
    for line in reader:
        RIDList.append(line[0].strip())
    RIDStatsDict['RIDCount'] = len(RIDList) - 1
    fdist = FreqDist(RIDList)
    frequencies = OrderedDict(sorted(fdist.items(), key=lambda x: x[1], reverse=True))
    RIDStatsDict['DistinctRIDCount'] = len(frequencies)
    for k, v in list(frequencies.items())[:10]:
        RIDFDDict[k] = v
    RIDStatsDict['FreqDist'] = RIDFDDict
    print("Check RID Stats: Complete")
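# CheckSSNStats and CheckRIDStats above differ only in the column index and the module-level
# dictionaries they populate. A hedged refactoring sketch (check_column_stats and its
# parameters are assumptions, not part of the original code) showing how both could share
# one helper that returns its results instead of writing to globals:
import csv
from collections import OrderedDict
from nltk.probability import FreqDist

def check_column_stats(file_name, column, top_n=10):
    """Return (row count, distinct count, top_n frequencies) for one '|'-delimited column."""
    values = []
    with open(file_name, 'r') as f:
        for row in csv.reader(f, delimiter='|'):
            values.append(row[column].strip())
    fdist = FreqDist(values)
    return len(values) - 1, len(fdist), OrderedDict(fdist.most_common(top_n))

# hypothetical usage mirroring the two functions above
# ssn_count, distinct_ssns, top_ssns = check_column_stats('records.psv', column=3)
# rid_count, distinct_rids, top_rids = check_column_stats('records.psv', column=0)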
def create_word_scores():
    posFile = directory + 'posWords.txt'
    negFile = directory + 'negWords.txt'
    if os.path.exists(posFile):
        posSentences = codecs.open(posFile, 'r', 'utf-8')
    else:
        print("posFile doesn't exist")
    if os.path.exists(negFile):
        negSentences = codecs.open(negFile, 'r', 'utf-8')
    else:
        print("negFile doesn't exist")

    # each line is a single comment
    posSentences = re.split(r'\n', posSentences.read())
    negSentences = re.split(r'\n', negSentences.read())

    posWords = []
    negWords = []
    for i in posSentences:
        posWord = re.findall(r"[\w']+|[.,!?;]", i)
        posWords.append(posWord)
    for i in negSentences:
        negWord = re.findall(r"[\w']+|[.,!?;]", i)
        negWords.append(negWord)
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_scores():
    posWord_score = []
    negWord_score = []
    for i in short_pos.split('\n'):
        posWords = word_tokenize(i)
        posWord_score.append(posWords)
    for i in short_neg.split('\n'):
        negWords = word_tokenize(i)
        negWord_score.append(negWords)

    word_scores = {}
    posWord_score = list(itertools.chain(*posWord_score))
    negWord_score = list(itertools.chain(*negWord_score))

    # build a frequency distribution of all words and then frequency
    # distributions of words within the positive and negative labels
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWord_score:
        word_fd[word.lower()] += 1
        cond_word_fd["pos"][word.lower()] += 1
    for word in negWord_score:
        word_fd[word.lower()] += 1
        cond_word_fd["neg"][word.lower()] += 1

    # find the number of positive and negative words, as well as the total number of words
    pos_word_count = cond_word_fd["pos"].N()
    neg_word_count = cond_word_fd["neg"].N()
    total_word_count = pos_word_count + neg_word_count

    # chi-squared information gain
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd["pos"][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd["neg"][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def simhash(words):
    fdist = FreqDist(words)
    v = [0] * 32
    for (token, freq) in fdist.items():
        token_hash = [int(val) for val in word_dict[token]]
        for i in range(freq):
            for index in range(len(v)):
                if token_hash[index] == 1:
                    v[index] += 1
                else:
                    v[index] -= 1
    simhash = ['0'] * 32
    for j in range(len(v)):
        if v[j] > 0:
            simhash[j] = '1'
    return ''.join(simhash)
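# Fingerprints like the 32-bit strings returned by simhash above are normally compared by
# Hamming distance. A minimal sketch, assuming two fingerprints produced by the function
# above (the threshold of 3 differing bits is only an illustration):
def hamming_distance(fp_a, fp_b):
    # count the bit positions where the two fingerprints differ
    return sum(1 for a, b in zip(fp_a, fp_b) if a != b)

# hypothetical usage: near-duplicate documents tend to have a small distance
# if hamming_distance(simhash(tokens_a), simhash(tokens_b)) <= 3:
#     print("probable near-duplicate")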
def get_summarized(self, input, num_sentences, mustinclude):
    # TODO: allow the caller to specify the tokenizer they want
    # TODO: allow the user to specify the sentence tokenizer they want
    tokenizer = RegexpTokenizer(r'\w+')

    # get the frequency of each word in the input
    base_words = [word.lower() for word in tokenizer.tokenize(input)]
    words = [word for word in base_words if word not in stopwords.words()]
    word_frequencies = FreqDist(words)

    # now create a set of the most frequent words
    most_frequent_words = [pair[0] for pair in word_frequencies.most_common(100)]

    # break the input up into sentences.  working_sentences is used
    # for the analysis, but actual_sentences is used in the results
    # so capitalization will be correct.
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    actual_sentences = sent_detector.tokenize(input)
    working_sentences = [sentence.lower() for sentence in actual_sentences]
    mustinclude = mustinclude.lower()

    # iterate over the most frequent words, and add the first sentence
    # that includes each word to the result.
    output_sentences = []
    for word in most_frequent_words:
        for i in range(0, len(working_sentences)):
            if (mustinclude in working_sentences[i]
                    and word in working_sentences[i]
                    and actual_sentences[i] not in output_sentences):
                output_sentences.append(actual_sentences[i])
                break
            if len(output_sentences) >= num_sentences:
                break
        if len(output_sentences) >= num_sentences:
            break

    # If we came up empty, just find a sentence containing the word that must be included
    if len(output_sentences) == 0:
        for i in range(0, len(working_sentences)):
            if mustinclude in working_sentences[i]:
                output_sentences.append(actual_sentences[i])
                break

    # sort the output sentences back to their original order
    return self.reorder_sentences(output_sentences, input)
def freq_words(string):
    print("\n\n\n\t\t\tReading from file")
    # tokenize the text
    raw_word_list = word_tokenize(string)
    # remove stop words
    processed_word_list = [word for word in raw_word_list if word not in total_stop_words]
    # create an nltk text object
    text_obj = nltk.Text(processed_word_list)
    print("\n\n\n\t\t\tProcessing")
    # build the frequency distribution: (word, frequency) pairs sorted by decreasing frequency
    fd = FreqDist(text_obj)
    result = fd.most_common()
    # select the 100 most frequent words; if fewer words are available, adjust accordingly
    if len(result) < 100:
        chosen_words = result[:len(result) // 2]
    else:
        chosen_words = result[:100]
    print("\n\n\n\t\t\tDrawing cloud")
    # specify the canvas measurements and draw the cloud
    elements = wordcloud.fit_words(chosen_words, width=500, height=500)
    wordcloud.draw(elements, path.join(d, 'frequent_words.png'), width=500, height=500, scale=2)
    print("\n\n\n\t\t\tWord cloud generated in frequent_words.png file")
    return
def ComputeVocabulary():
    # Get the data
    csvFile = open("data/comments_study.csv", "r")
    csvReader = csv.reader(csvFile, delimiter=",", quotechar='"')
    comments = {}
    for row in csvReader:
        # don't read 1st line
        if csvReader.line_num > 1:
            comments[row[0]] = row

    # Compute vocabulary and output it for later
    tokens = []
    n = 0
    nDocuments = len(comments)
    for c in comments:
        n = n + 1
        if n % 100 == 0:
            print(n)
        ct = CleanAndTokenize(comments[c][2])
        ct = [w for w in ct if w not in stopword_list]
        stemmed_tokens = [porter.stem(t) for t in ct]
        tokens.extend(stemmed_tokens)
        for t in stemmed_tokens:
            if t not in doc_frequency:
                doc_frequency[t] = 1
            else:
                doc_frequency[t] = doc_frequency[t] + 1

    fd = FreqDist(tokens)

    # find the cutoff: keep terms that occur at least 10 times
    # (most_common() replaces the frequency-sorted items()/keys() of old NLTK versions)
    unigram_cutoff = 0
    for (i, (key, val)) in enumerate(fd.most_common()):
        if val < 10:
            unigram_cutoff = i - 1
            break
    print("unigram cutoff: " + str(unigram_cutoff))
    word_features.extend([key for key, val in fd.most_common()[:unigram_cutoff]])

    fileWriter = csv.writer(open("data/vocab.csv", "w+"), delimiter=",")
    for w in word_features:
        row = [w, doc_frequency[w]]
        fileWriter.writerow(row)
def create_word_scores(posWords, negWords, presense=False):
    # (posWords, negWords) = readwordarr()
    wordscores = {}
    wordfd = FreqDist()
    conditionwordfd = ConditionalFreqDist()
    if not presense:
        posWords = list(itertools.chain(*posWords))
        negWords = list(itertools.chain(*negWords))
        for word in posWords:
            wordfd[word] += 1
            conditionwordfd['pos'][word] += 1
        for word in negWords:
            wordfd[word] += 1
            conditionwordfd['neg'][word] += 1
    else:
        for wordarr in posWords:
            flag = dict()
            for word in wordarr:
                if word in flag:
                    continue
                flag[word] = 1
                wordfd[word] += 1
                conditionwordfd['pos'][word] += 1
        for wordarr in negWords:
            flag = dict()
            for word in wordarr:
                if word in flag:
                    continue
                flag[word] = 1
                wordfd[word] += 1
                conditionwordfd['neg'][word] += 1

    pos_word_count = conditionwordfd['pos'].N()
    neg_word_count = conditionwordfd['neg'].N()
    totalcount = pos_word_count + neg_word_count
    for word, freq in wordfd.items():
        pos_score = BigramAssocMeasures.chi_sq(conditionwordfd['pos'][word],
                                               (freq, pos_word_count), totalcount)
        neg_score = BigramAssocMeasures.chi_sq(conditionwordfd['neg'][word],
                                               (freq, neg_word_count), totalcount)
        wordscores[word] = pos_score + neg_score
    return wordscores
def bypassword(db):
    dist = FreqDist()
    total = 0
    # regex_word_capitalized = r'^[A-Z][a-z]*'
    regex_pwd_capitalized = r'^[A-Z][^A-Z]*$'
    while db.hasNext():
        fragments = db.nextPwd()
        pwd = fragments[0].password
        total += 1
        if total % 100000 == 0:
            print("{} passwords processed...".format(total))
        pattern = None
        if all([f.is_gap() for f in fragments]):
            pattern = 'gap'
        elif re.match(regex_pwd_capitalized, pwd):
            pattern = 'title'
        else:
            bag = set()
            for f in fragments:
                if f.is_gap():
                    continue
                raw_word = pwd[f.s_index:f.e_index]
                if raw_word.isupper():
                    bag.add('upper')
                elif raw_word.istitle():
                    bag.add('captlzd')
                elif raw_word.islower():
                    bag.add('lower')
                else:
                    bag.add('mangled')
            pattern = ', '.join(sorted(bag))
            if pattern == 'captlzd, upper':
                print(pwd)
        dist[pattern] += 1
    for k, v in dist.items():
        print("{}\t{}".format(k, v))
def get_summarized(inputt, num_sentences):
    # A tokenizer splits a string using a regular expression, which
    # matches either the tokens or the separators between tokens.
    tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')

    # get the frequency of each word in the input
    base_words = [word.lower() for word in tokenizer.tokenize(inputt)]
    # read the Dutch stop word list once and strip the trailing newlines
    dutch_stopwords = set(line.strip() for line in open("Stopwords/dutch"))
    words = [word for word in base_words if word not in dutch_stopwords]
    word_frequencies = FreqDist(words)

    # now create a set of the most frequent words
    most_frequent_words = [pair[0] for pair in word_frequencies.most_common(100)]

    # break the input up into sentences.  working_sentences is used
    # for the analysis, but actual_sentences is used in the results
    # so capitalization will be correct.
    sent_detector = pickle.load(open("Pickles/dutch.pickle", "rb"))
    actual_sentences = sent_detector.tokenize(inputt)
    working_sentences = [sentence.lower() for sentence in actual_sentences]

    # iterate over the most frequent words, and add the first sentence
    # that includes each word to the result.
    output_sentences = []
    for word in most_frequent_words:
        for i in range(0, len(working_sentences)):
            if (word in working_sentences[i]
                    and actual_sentences[i] not in output_sentences):
                output_sentences.append(actual_sentences[i])
                break
            if len(output_sentences) >= num_sentences:
                break
        if len(output_sentences) >= num_sentences:
            break

    # sort the output sentences back to their original order
    return reorder_sentences(output_sentences, inputt)
def ComputeVocabulary(comment_filename, vocab_filename):
    # Get the data
    csvFile = open(comment_filename, 'r')
    csvReader = csv.reader(csvFile, delimiter=',', quotechar='"')
    comments = {}
    for row in csvReader:
        # don't read 1st line
        if csvReader.line_num > 1:
            comments[row[0]] = row

    # Compute vocabulary and output it for later
    tokens = []
    n = 0
    nDocuments = len(comments)
    for c in comments:
        n = n + 1
        if n % 100 == 0:
            print("vocabulary : " + str(n))
        ct = CleanAndTokenize(comments[c][2])
        ct = [w for w in ct if w not in stopword_list]
        stemmed_tokens = [porter.stem(t) for t in ct]
        tokens.extend(stemmed_tokens)
        for t in stemmed_tokens:
            if t not in doc_frequency:
                doc_frequency[t] = 1
            else:
                doc_frequency[t] = doc_frequency[t] + 1

    fd = FreqDist(tokens)
    # keep terms that occur at least 10 times
    for key, val in fd.items():
        if val >= 10:
            word_features.append(key)

    fileWriter = csv.writer(open(vocab_filename, "w+"), delimiter=",")
    for w in word_features:
        row = [w, doc_frequency[w]]
        fileWriter.writerow(row)
def summarize(self, input, num_sentences):
    # TODO: allow the caller to specify the tokenizer they want
    # TODO: allow the user to specify the sentence tokenizer they want
    tokenizer = RegexpTokenizer(r'\w+')

    # get the frequency of each word in the input
    base_words = [word.lower() for word in tokenizer.tokenize(input)]
    words = [word for word in base_words if word not in stopwords.words()]
    word_frequencies = FreqDist(words)

    # now create a set of the most frequent words
    most_frequent_words = [pair[0] for pair in word_frequencies.most_common(100)]

    # break the input up into sentences.  working_sentences is used
    # for the analysis, but actual_sentences is used in the results
    # so capitalization will be correct.
    actual_sentences = nltk.tokenize.sent_tokenize(input)
    working_sentences = [sentence.lower() for sentence in actual_sentences]

    # iterate over the most frequent words, and add the first sentence
    # that includes each word to the result.
    output_sentences = []
    for word in most_frequent_words:
        for i in range(0, len(working_sentences)):
            if (word in working_sentences[i]
                    and actual_sentences[i] not in output_sentences):
                output_sentences.append(actual_sentences[i])
                break
            if len(output_sentences) >= num_sentences:
                break
        if len(output_sentences) >= num_sentences:
            break

    # sort the output sentences back to their original order
    output_sentences = self.reorder_sentences(output_sentences, input)

    # concatenate the sentences into a single string
    return ' '.join(output_sentences)
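# Several of the summarizers above call a reorder_sentences helper that is not shown in
# these snippets. A minimal sketch of what such a helper could look like, assuming the
# selected sentences appear verbatim in the original text (an assumption, not the original
# implementation):
def reorder_sentences(output_sentences, input_text):
    # sort the selected sentences by their position of first appearance in the source text
    return sorted(output_sentences, key=lambda sentence: input_text.find(sentence))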
def load_data(self, data_set):
    """
    Loads the given data set. Makes the data set case insensitive.
    Removes words that appear less than 5 times.
    :return updated data set
    """
    print("Started Loading the Data")
    tagged_tokens = data_set.tagged_words()
    tokens = untag(tagged_tokens)
    # Get the list of words that appear less than 5 times in the corpus
    print("Get LT5's")
    tokens = [token.lower() for token in tokens]  # Convert to lower case
    freq_dist = FreqDist(tokens)  # Compute the freq dist
    tokens_lt_5 = [word for word, count in freq_dist.items() if count < 5]
    # Delete words that appear less than 5 times and make the corpus case insensitive
    print("Making data case insensitive")
    token_range = range(len(tagged_tokens))
    indexed_tokens = OrderedDict(zip(token_range, tagged_tokens))
    updated_tagged_tokens = OrderedDict()
    # iterate over a copy of the items: deleting from the dict while
    # iterating over it directly would raise a RuntimeError
    for tagged_token_id, tagged_token in list(indexed_tokens.items()):
        if tagged_token[0].lower() in tokens_lt_5:
            del indexed_tokens[tagged_token_id]
        else:
            updated_tagged_tokens[tagged_token_id] = (tagged_token[0].lower(), tagged_token[1])
    tagged_tokens = list(updated_tagged_tokens.values())
    # Pickle the data for future use
    print("Pickling the Updated Corpus")
    if data_set == brown:
        file_name = "q5_brown_updated.pkl"
    else:
        file_name = "q5_treebank_updated.pkl"
    pkl.dump((tagged_tokens, tokens_lt_5), open(file_name, 'wb'))
    return tagged_tokens, tokens_lt_5
def update_word_freqeuncy(soup: BeautifulSoup, url, report): #tokenize the content tokenizer = nltk.RegexpTokenizer(r"[a-zA-Z.\'\-_]+") #define a tokenizer url_context = "" for string in soup.stripped_strings: url_context = url_context + string word_list = tokenizer.tokenize(url_context) low_information_flag = False longest = len(word_list) #logestpage update by looking at len(word_list) if longest <= 10: #if word numbers <= 10, it is low information low_information_flag = True #move stop words out stop_words_list = ['a', 'able', 'about', 'above', 'abst', 'accordance', 'according', 'accordingly', 'across', 'act', 'actually', 'added', 'adj', 'affected', 'affecting', 'affects', 'after', 'afterwards', 'again', 'against', 'ah', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'an', 'and', 'announce', 'another', 'any', 'anybody', 'anyhow', 'anymore', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apparently', 'approximately', 'are', 'aren', 'arent', 'arise', 'around', 'as', 'aside', 'ask', 'asking', 'at', 'auth', 'available', 'away', 'awfully', 'b', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'begin', 'beginning', 'beginnings', 'begins', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'between', 'beyond', 'biol', 'both', 'brief', 'briefly', 'but', 'by', 'c', 'ca', 'came', 'can', 'cannot', "can't", 'cause', 'causes', 'certain', 'certainly', 'co', 'com', 'come', 'comes', 'contain', 'containing', 'contains', 'could', 'couldnt', 'd', 'date', 'did', "didn't", 'different', 'do', 'does', "doesn't", 'doing', 'done', "don't", 'down', 'downwards', 'due', 'during', 'e', 'each', 'ed', 'edu', 'effect', 'eg', 'eight', 'eighty', 'either', 'else', 'elsewhere', 'end', 'ending', 'enough', 'especially', 'et', 'et-al', 'etc', 'even', 'ever', 'every', 'everybody', 'everyone', 'everything', 'everywhere', 'ex', 'except', 'f', 'far', 'few', 'ff', 'fifth', 'first', 'five', 'fix', 'followed', 'following', 'follows', 'for', 'former', 'formerly', 'forth', 'found', 'four', 'from', 'further', 'furthermore', 'g', 'gave', 'get', 'gets', 'getting', 'give', 'given', 'gives', 'giving', 'go', 'goes', 'gone', 'got', 'gotten', 'h', 'had', 'happens', 'hardly', 'has', "hasn't", 'have', "haven't", 'having', 'he', 'hed', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'heres', 'hereupon', 'hers', 'herself', 'hes', 'hi', 'hid', 'him', 'himself', 'his', 'hither', 'home', 'how', 'howbeit', 'however', 'hundred', 'i', 'id', 'ie', 'if', "i'll", 'im', 'immediate', 'immediately', 'importance', 'important', 'in', 'inc', 'indeed', 'index', 'information', 'instead', 'into', 'invention', 'inward', 'is', "isn't", 'it', 'itd', "it'll", 'its', 'itself', "i've", 'j', 'just', 'k', 'keep\tkeeps', 'kept', 'kg', 'km', 'know', 'known', 'knows', 'l', 'largely', 'last', 'lately', 'later', 'latter', 'latterly', 'least', 'less', 'lest', 'let', 'lets', 'like', 'liked', 'likely', 'line', 'little', "'ll", 'look', 'looking', 'looks', 'ltd', 'm', 'made', 'mainly', 'make', 'makes', 'many', 'may', 'maybe', 'me', 'mean', 'means', 'meantime', 'meanwhile', 'merely', 'mg', 'might', 'million', 'miss', 'ml', 'more', 'moreover', 'most', 'mostly', 'mr', 'mrs', 'much', 'mug', 'must', 'my', 'myself', 'n', 'na', 'name', 'namely', 'nay', 'nd', 'near', 'nearly', 'necessarily', 'necessary', 'need', 'needs', 'neither', 'never', 'nevertheless', 'new', 'next', 'nine', 'ninety', 'no', 'nobody', 'non', 'none', 'nonetheless', 'noone', 'nor', 
'normally', 'nos', 'not', 'noted', 'nothing', 'now', 'nowhere', 'o', 'obtain', 'obtained', 'obviously', 'of', 'off', 'often', 'oh', 'ok', 'okay', 'old', 'omitted', 'on', 'once', 'one', 'ones', 'only', 'onto', 'or', 'ord', 'other', 'others', 'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', 'over', 'overall', 'owing', 'own', 'p', 'page', 'pages', 'part', 'particular', 'particularly', 'past', 'per', 'perhaps', 'placed', 'please', 'plus', 'poorly', 'possible', 'possibly', 'potentially', 'pp', 'predominantly', 'present', 'previously', 'primarily', 'probably', 'promptly', 'proud', 'provides', 'put', 'q', 'que', 'quickly', 'quite', 'qv', 'r', 'ran', 'rather', 'rd', 're', 'readily', 'really', 'recent', 'recently', 'ref', 'refs', 'regarding', 'regardless', 'regards', 'related', 'relatively', 'research', 'respectively', 'resulted', 'resulting', 'results', 'right', 'run', 's', 'said', 'same', 'saw', 'say', 'saying', 'says', 'sec', 'section', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', 'seen', 'self', 'selves', 'sent', 'seven', 'several', 'shall', 'she', 'shed', "she'll", 'shes', 'should', "shouldn't", 'show', 'showed', 'shown', 'showns', 'shows', 'significant', 'significantly', 'similar', 'similarly', 'since', 'six', 'slightly', 'so', 'some', 'somebody', 'somehow', 'someone', 'somethan', 'something', 'sometime', 'sometimes', 'somewhat', 'somewhere', 'soon', 'sorry', 'specifically', 'specified', 'specify', 'specifying', 'still', 'stop', 'strongly', 'sub', 'substantially', 'successfully', 'such', 'sufficiently', 'suggest', 'sup', 'sure\tt', 'take', 'taken', 'taking', 'tell', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that', "that'll", 'thats', "that've", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'thered', 'therefore', 'therein', "there'll", 'thereof', 'therere', 'theres', 'thereto', 'thereupon', "there've", 'these', 'they', 'theyd', "they'll", 'theyre', "they've", 'think', 'this', 'those', 'thou', 'though', 'thoughh', 'thousand', 'throug', 'through', 'throughout', 'thru', 'thus', 'til', 'tip', 'to', 'together', 'too', 'took', 'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying', 'ts', 'twice', 'two', 'u', 'un', 'under', 'unfortunately', 'unless', 'unlike', 'unlikely', 'until', 'unto', 'up', 'upon', 'ups', 'us', 'use', 'used', 'useful', 'usefully', 'usefulness', 'uses', 'using', 'usually', 'v', 'value', 'various', "'ve", 'very', 'via', 'viz', 'vol', 'vols', 'vs', 'w', 'want', 'wants', 'was', 'wasnt', 'way', 'we', 'wed', 'welcome', "we'll", 'went', 'were', 'werent', "we've", 'what', 'whatever', "what'll", 'whats', 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'wheres', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whim', 'whither', 'who', 'whod', 'whoever', 'whole', "who'll", 'whom', 'whomever', 'whos', 'whose', 'why', 'widely', 'willing', 'wish', 'with', 'within', 'without', 'wont', 'words', 'world', 'would', 'wouldnt', 'www', 'x', 'y', 'yes', 'yet', 'you', 'youd', "you'll", 'your', 'youre', 'yours', 'yourself', 'yourselves', "you've", 'z', 'zero'] fdic = FreqDist(word_list) for stop_word in stop_words_list: if stop_word in fdic.keys(): del fdic[stop_word] if len(fdic.keys()) <= 10: #if word in word frequency dict <= 10, it is low information low_information_flag = True #update longest page to json file if "----------Longest_Num----------" not in report: report["----------Longest_Num----------"] = 0 report["----------Longest_Url----------"] = [] if 
longest > report["----------Longest_Num----------"]: report["----------Longest_Num----------"] = longest report["----------Longest_Url----------"] = url if not low_information_flag: # if it has high information value, update word frequency to json file for key, value in fdic.items(): if key in report.keys(): report[key] += fdic[key] else: report[key] = fdic[key]
def create_word_scores(posWords, negWords, objWords):
    word_fd = FreqDist()  # counts the frequency of every word
    print(type(word_fd))
    cond_word_fd = ConditionalFreqDist()  # word frequencies within the positive, negative and objective texts
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    for word in objWords:
        word_fd[word] += 1
        cond_word_fd['obj'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()  # number of positive words
    neg_word_count = cond_word_fd['neg'].N()  # number of negative words
    obj_word_count = cond_word_fd['obj'].N()  # number of objective words
    total_word_count = pos_word_count + neg_word_count + obj_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        # chi-square statistic of the word in each class
        # (mutual information or another association measure could be used instead)
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        obj_score = BigramAssocMeasures.chi_sq(cond_word_fd['obj'][word],
                                               (freq, obj_word_count), total_word_count)
        # a word's informativeness is the sum of its per-class chi-square scores
        word_scores[word] = pos_score + neg_score + obj_score
    return word_scores  # every word together with its informativeness
""" Throwaway script that processes the olac classification results on June 29th to see how many iso codes were identified for each record and for how many records were that number of iso codes identified. """ from operator import itemgetter from nltk.probability import FreqDist fd = FreqDist() results_file = open('olac_iso_identification_results').readlines() # this took like 15 minutes to get. # not too bad, considering it's like all of olac. num_records = len(results_file)+0.0 for line in results_file: record = line.strip().split('\t') iso_list = record[-1].split() fd.inc(len(iso_list)) print "num\tfreq\tpercentage of records" for num, freq in sorted(fd.items(), key=itemgetter(1), reverse=True): print str(num)+'\t'+str(freq)+'\t'+str(freq/num_records) print '' print 'Number of records: '+str(num_records)
"new", "called", "said", "come", "two", "city", "group", "state", "year", "case", "member", "even", "later", "month", "years", "much", "week", "county", "name", "example" "well", "members", "us", "say", "s" } stopwords.update(commonwords) # tokenize and calculate the word frequencies tokens = nltk.tokenize.word_tokenize(txt) fDist = FreqDist(tokens) # print(fDist.most_common(20)) # remove the stop words and common words filtered_fDist = nltk.FreqDist( dict( (word, freq) for word, freq in fDist.items() if word not in stopwords)) # print(words) # words.remove("example") # words.remove("told") # words.remove("become") # words.remove("well") # words.remove("may") # words.remove("june") # words.remove("homosexuals") print('loading model...') model = Word2Vec.load("assets/gay-seattle.w2v") g = nx.DiGraph() items = filtered_fDist.most_common(50) for item in items:
def save_codex_hist_info(self, codex_type, codex_id, constraint=None): """Сохранение частотности слов во всем корпусе""" raw_articles_info = self.parser.sorted_articles_info[codex_type] articles_tokens = list() for article_info in tqdm(raw_articles_info): text = self.parser.get_article_text_by_id(article_info.id) text = text.lower() text = self.remove_chars_from_text(text, self.spec_chars) article_tokens = word_tokenize(' '.join( self.mystem.lemmatize(text))) for stop_word in self.stop_words: while stop_word in article_tokens: article_tokens.remove(stop_word) articles_tokens.extend(article_tokens) text = Text(articles_tokens) f_dist = FreqDist(text) if not constraint: if os.path.exists( generate_file_name_with_postfix( self.config['articles_frequency_info_file'], str(codex_id))): os.remove( generate_file_name_with_postfix( self.config['articles_frequency_info_file'], str(codex_id))) with open(generate_file_name_with_postfix( self.config['articles_frequency_info_file'], str(codex_id)), mode='w') as articles_frequency_info_file: articles_frequency_info_writer = csv.writer( articles_frequency_info_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) articles_frequency_info_writer.writerow(['word', 'frequency']) for frequency_info in f_dist.most_common(100): articles_frequency_info_writer.writerow([ frequency_info[0], frequency_info[1] / len(articles_tokens) ]) else: if os.path.exists( generate_file_name_with_postfix( self. config['articles_frequency_info_file_with_constraint'], str(codex_id))): os.remove( generate_file_name_with_postfix( self. config['articles_frequency_info_file_with_constraint'], str(codex_id))) with open(generate_file_name_with_postfix( self. config['articles_frequency_info_file_with_constraint'], str(codex_id)), mode='w') as articles_frequency_info_file: articles_frequency_info_writer = csv.writer( articles_frequency_info_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) articles_frequency_info_writer.writerow(['word', 'frequency']) f_dist = list( filter(lambda item: item[1] > constraint, f_dist.items())) for frequency_info in f_dist: articles_frequency_info_writer.writerow([ frequency_info[0], frequency_info[1] / len(articles_tokens) ])
print(lexical_diversity(text3))
print(lexical_diversity(text5))
print(percentage(4, 5))
print(percentage(text4.count('a'), len(text4)))

# %%
fdist1 = FreqDist(text1)
fdist1
vocabulary1 = fdist1.keys()
print(vocabulary1)
print(fdist1['whale'])

# %%
fdist1.plot(50, cumulative=True)

# %%
list(fdist1.items())[0:5]

# %%
fdist1.freq('monstrous')

# %%
# Total number of samples
fdist1.N()

# %%
fdist1

# %%
pos_dist = FreqDist(pos_list)
pos_dist.plot(title="Parts of Speech")
for pos, frequency in pos_dist.most_common(pos_dist.N()):
    print('{:<15s}:{:>4d}'.format(pos, frequency))

# Removing stop words
stop = stopwords.words('english') + list(string.punctuation)
stop_tokens = [word for word in tagged_tokens if word[0] not in stop]
# Removing single character words and simple punctuation
# (word is a (token, tag) pair, so the length check must look at word[0])
stop_tokens = [word for word in stop_tokens if len(word[0]) > 1]
# Removing numbers and possessive "'s"
stop_tokens = [word for word in stop_tokens
               if (not word[0].replace('.', '', 1).isnumeric()) and word[0] != "'s"]

token_dist = FreqDist(stop_tokens)
print("\nCorpus contains", len(token_dist.items()), "unique terms after removing stop words.\n")
for word, frequency in token_dist.most_common(20):
    print('{:<15s}:{:>4d}'.format(word[0], frequency))

# Lemmatization - Stemming with POS
# WordNet lemmatization stems using POS
stemmer = SnowballStemmer("english")
wn_tags = {'N': wn.NOUN, 'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV}
wnl = WordNetLemmatizer()
stemmed_tokens = []
for token in stop_tokens:
    term = token[0]
    pos = token[1]
    pos = pos[0]
    try:
def cal_word_count():
    global train_word_id
    global pos_info
    global neg_info
    pos_info = []
    neg_info = []
    train_word_id = []
    word_fd = FreqDist()  # frequency of every word
    cond_word_fd = ConditionalFreqDist()  # word frequencies within the positive and negative texts

    print('Loading POS>>>')
    line_num = 0
    with open(pos_file, 'r') as fin:
        for line in fin:
            line_num += 1
            if not line_num % 10000:
                print('LINE:%d' % (line_num))
            items = line.split()
            tmp_col = []
            for item in items:
                item_id = term_to_id(item)
                word_fd[item_id] += 1
                cond_word_fd['pos'][item_id] += 1
                tmp_col.append(item_id)
            pos_info.append(tmp_col)

    print('Loading NEG>>>')
    line_num = 0
    with open(neg_file, 'r') as fin:
        for line in fin:
            line_num += 1
            if not line_num % 10000:
                print('LINE:%d' % (line_num))
            items = line.split()
            tmp_col = []
            for item in items:
                item_id = term_to_id(item)
                word_fd[item_id] += 1
                cond_word_fd['neg'][item_id] += 1
                tmp_col.append(item_id)
            neg_info.append(tmp_col)

    print('Randomize>>>')
    shuffle(pos_info)
    shuffle(neg_info)

    pos_w_count = cond_word_fd['pos'].N()
    neg_w_count = cond_word_fd['neg'].N()
    total_w_count = pos_w_count + neg_w_count

    # compute the chi-square statistics
    global word_scores
    word_scores = {}
    print("CALC CHI-SQUARE...")
    for word, freq in word_fd.items():
        # chi-square statistic of the word in the positive class
        # (mutual information or another association measure could be used instead)
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_w_count), total_w_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_w_count), total_w_count)
        # a word's informativeness is the sum of its positive and negative chi-square scores
        word_scores[word] = pos_score + neg_score
    del word_fd
    del cond_word_fd
    return
def train_classifier(self, dataset, feature_fn_name='word', train_ratio=0.8, verbose=False, token_column='text', target_column='category', best_ratio=0.8, pos_target_val=1, neg_target_val=-1): def word_feats(words): return dict([(word, True) for word in words]) def best_word_feats(words): return dict([(word, True) for word in words if word in bestwords]) def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200): bigram_finder = BigramCollocationFinder.from_words(words) bigrams = bigram_finder.nbest(score_fn, n) d = dict([(bigram, True) for bigram in bigrams]) d.update(best_word_feats(words)) return d def best_trigram_word_feats(words, score_fn=TrigramAssocMeasures.chi_sq, n=200): tcf = TrigramCollocationFinder.from_words(words) trigrams = tcf.nbest(score_fn, n) d = dict([(trigram, True) for trigram in trigrams]) d.update(best_bigram_word_feats(words)) d.update(best_word_feats(words)) return d if verbose: print( '\nSelected feature function: {}, token column: {}, train ratio: {}' .format(feature_fn_name, token_column, train_ratio)) df = dataset.sample(frac=1).reset_index(drop=True) negids = df[df[target_column] == neg_target_val].index posids = df[df[target_column] == pos_target_val].index feats = df[token_column] if feature_fn_name in ['best_word', 'best_bigram', 'best_trigram']: word_fd = FreqDist() label_word_fd = ConditionalFreqDist() for tokens in df[df[target_column] == pos_target_val][token_column]: for word in tokens.split(): word_fd[word] += 1 label_word_fd[self._positive_label][word] += 1 for tokens in df[df[target_column] == neg_target_val][token_column]: for word in tokens.split(): word_fd[word] += 1 label_word_fd[self._negative_label][word] += 1 pos_word_count = label_word_fd[self._positive_label].N() neg_word_count = label_word_fd[self._negative_label].N() total_word_count = pos_word_count + neg_word_count word_scores = {} for word, freq in word_fd.items(): pos_score = BigramAssocMeasures.chi_sq( label_word_fd[self._positive_label][word], (freq, pos_word_count), total_word_count) neg_score = BigramAssocMeasures.chi_sq( label_word_fd[self._negative_label][word], (freq, neg_word_count), total_word_count) word_scores[word] = pos_score + neg_score best_cnt = int(len(word_scores) * best_ratio) best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:best_cnt] bestwords = set([w for w, s in best]) if feature_fn_name == 'best_trigram_word_feats': feat_fn = best_trigram_word_feats elif feature_fn_name == 'best_bigram': feat_fn = best_bigram_word_feats else: feat_fn = best_word_feats else: feat_fn = word_feats negfeats = [(feat_fn(feats[i].split()), self._negative_label) for i in negids] posfeats = [(feat_fn(feats[i].split()), self._positive_label) for i in posids] if verbose: print('No. 
of samples: {}, Pos: {}, Neg: {}'.format( len(feats), len(posfeats), len(negfeats))) negcutoff = int(len(negfeats) * train_ratio) poscutoff = int(len(posfeats) * train_ratio) trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff] testfeats = negfeats[negcutoff:] + posfeats[poscutoff:] classifier = NaiveBayesClassifier.train(trainfeats) refsets = defaultdict(set) testsets = defaultdict(set) for i, (feats, label) in enumerate(testfeats): refsets[label].add(i) observed = classifier.classify(feats) testsets[observed].add(i) metrics = { 'Accuracy': nltk.classify.util.accuracy(classifier, testfeats), 'Pos precision': precision(refsets[self._positive_label], testsets[self._positive_label]), 'Pos recall': recall(refsets[self._positive_label], testsets[self._positive_label]), 'Neg precision': precision(refsets[self._negative_label], testsets[self._negative_label]), 'Neg recall': recall(refsets[self._negative_label], testsets[self._negative_label]) } if verbose: print(metrics) return classifier, metrics
def setKeywords(self,method='tfidfNoPro',wordCount=10,startCount=0): ''' function to automatically assign keywords if manual ones have not been assigned Inputs ====== method: string Method used to pick automatically defined keywords. Choose from: adjAdv- picks most common adj and adv in text (default and catch if other method doesn't exist) judgement-Under development wordCount: int Number of keywords returned (default 10) startCount: int Index where keywords are extracted (default 0 i.e. start of list) Attributes ========== keywords: list List of keywords automatically generated (can also be assigned outside of function manually) ''' #Save input values self.keywordCount=wordCount self.keywordStar=startCount #Default to 'adjAdv' if method=='adjAdv': #Get total text string txtString=''.join([x for x in self.rawText.values()]) #Get total tag list tagList=tagger.tag(nltk.word_tokenize(txtString)) #Define target dict targetDict={} #Loop through each tag in list and get count of tag and word for tag in tagList: if tag[1] in tagFilterList: word=str.lower(''.join([c for c in tag[0] if c not in string.punctuation])) #Filter out codecerrors if word != 'codecerror': try: targetDict[word]=targetDict[word]+1 except: targetDict[word]=1 #Create data frame with counts and sort targetDF=pd.DataFrame([[k,v] for k,v in targetDict.items()],columns=['word','count']) targetDF.sort(['count'],inplace=True,ascending=False) #Create keywords based on startCount and wordCount ##self.keywords=list(targetDF['word'])[startCount:wordCount+startCount] ### keyRaw=list(targetDF['word'])[startCount:wordCount+startCount] #print(keyRaw) keyStem=[stemmer.stem(word) for word in keyRaw] #print(keyStem) self.keywords = keyStem elif method=='tfidf': # get all tokens for the fileList all_words = [] for toke in self.tokens.values(): all_words = all_words + toke ## create FreqDF with word frequencies from fileList freq = FreqDist(all_words) columns_obj = ["term", "freq"] freqDF = pd.DataFrame(freq.items(), columns=columns_obj) # convert it to a data frame freqDF = freqDF.set_index('term') ## merge freqDF with idf data frame freqit = freqDF.join(self.idf[['idf', 'logidf']]) # replace null values with max (i.e. 
if word isn't found, give it the value of the most unique word in IDF) maxidf = max(freqit['idf'].dropna()) maxlogidf = max(freqit['logidf'].dropna()) freqit.loc[pd.isnull(freqit['idf']), 'idf'] = maxidf freqit.loc[pd.isnull(freqit['logidf']), 'logidf'] = maxlogidf ## create tfidf columns freqit['tfidf'] = freqit['freq'] * freqit['idf'] freqit['logtfidf'] = freqit['freq'] * freqit['logidf'] ## order by tfidf weight freqit = freqit.sort_values(by='tfidf', ascending=False) #filter out codecerror keyslist = freqit.iloc[startCount:wordCount+startCount].index.tolist() keywords = [] for word in keyslist: if (word != 'codecerror') & (word != ''): keywords = keywords + [word] ## self.keywords = keywords elif method=='tfidfNoPro': # get all tokens for the fileList all_words = [] for toke in self.tokens.values(): all_words = all_words + toke ## create FreqDF with word frequencies from fileList freq = FreqDist(all_words) columns_obj = ["term", "freq"] freqDF = pd.DataFrame(freq.items(), columns=columns_obj) # convert it to a data frame #freqDF = freqDF.set_index('term') ## drop the pronouns terms = freqDF['term'].values.tolist() #noPro = drop_pronouns(terms[1:]) noPro = drop_pronouns(terms) freqDF = freqDF.set_index('term') freqDF = freqDF.ix[noPro] ## merge freqDF with idf data frame freqit = freqDF.join(self.idf[['idf', 'logidf']]) # replace null values with max maxidf = max(freqit['idf'].dropna()) maxlogidf = max(freqit['logidf'].dropna()) freqit.loc[pd.isnull(freqit['idf']), 'idf'] = maxidf freqit.loc[pd.isnull(freqit['logidf']), 'logidf'] = maxlogidf ## create tfidf columns freqit['tfidf'] = freqit['freq'] * freqit['idf'] freqit['logtfidf'] = freqit['freq'] * freqit['logidf'] ## order by tfidf weight freqit = freqit.sort_values(by='tfidf', ascending=False) #filter out codecerror #keyslist = freqit.iloc[startCount:wordCount+startCount].index.tolist() keyslist = freqit.index.tolist() keywords = [] for word in keyslist: if (word != 'codecerror') & (word != '') & (len(word)>2): keywords = keywords + [word] ## self.keywords = keywords[startCount:wordCount+startCount] elif method=='manual': # Pull data from the csv file #filepath = "/Users/samanthagarofalo/Documents/Data Science/Capstone/Keywords.csv" # Create dataframe with manually entered keywords targetDF = pd.read_csv(manualKeywordFilePath) # User input to select the group that we are looking at keywords for #group = input("Which group would you like to look at? 
") try: keywords = list(targetDF.Keywords[targetDF['Group'] == self.group]) #print(keywords) except: keywords = ['this didnt work'] print(keywords) for element in keywords: keywords = element.split(' ') if len(keywords) == 0: print('%%%%\nNO KEYWORDS FOUND: using tfidf by default\n%%%%') self.setKeywords(method='tfidf',wordCount=wordCount,startCount=startCount) else: self.keywords = keywords #Judgement method elif method=='judgement': posList=nounList+tagFilterList #Define target dict targetDict={} for fileName in self.fileList: for judgementStr in self.judgements[fileName]: tagList=tagger.tag(nltk.word_tokenize(judgementStr)) #Loop through each tag in list and get count of tag and word for tag in tagList: if tag[1] in posList: word=str.lower(''.join([c for c in tag[0] if c not in string.punctuation])) #Stem words if useStem True newStopWords=stopWords if self.useStem: word=stemmer.stem(word) newStopWords=[stemmer.stem(x) for x in stopWords] #Remove stopwords if useStopwords ==False if not self.useStopwords: newStopWords.append("") #Filter out codecerrors if word not in ['codecerror']+[' ']+newStopWords: try: targetDict[word]=targetDict[word]+1 except: targetDict[word]=1 #Create data frame with counts and sort targetDF=pd.DataFrame([[k,v] for k,v in targetDict.items()],columns=['word','count']) targetDF.sort(['count'],inplace=True,ascending=False) #Create keywords based on startCount and wordCount #self.keywords=list(targetDF['word'])[startCount:wordCount+startCount] ### keyRaw=list(targetDF['word'])[startCount:wordCount+startCount] #print(keyRaw) keyStem=[stemmer.stem(word) for word in keyRaw] #print(keyStem) self.keywords = keyStem else: print('ERROR: Method not found')
from nltk import word_tokenize
from nltk.util import ngrams
from nltk.corpus import inaugural
from nltk.probability import FreqDist

file_content = open("input_text.txt").read()
tokens = word_tokenize(file_content)
print('\nTokens List:\n')
print(tokens)
get_ngram(tokens, 3)

obwords = word_tokenize(inaugural.raw('2009-Obama.txt'))
waswords = word_tokenize(inaugural.raw('1789-Washington.txt'))

print('\n\nOBAMA')
ob = FreqDist(obwords)
print('No. of words:', len(obwords))
print('No. of distinct words:', len(ob.keys()))
sortob = sorted(ob.items(), key=lambda x: x[1])
print('\n\nOBAMA50-', sortob[-50:])

was = FreqDist(waswords)
sortwas = sorted(was.items(), key=lambda x: x[1])
# report the Washington distribution here, not the Obama one
print('\n\nWASHINGTON50-', sortwas[-50:], '\n\n')

obuni = FreqDist(list(ngrams(obwords, 1)))
obbi = FreqDist(list(ngrams(obwords, 2)))
obtri = FreqDist(list(ngrams(obwords, 3)))
sortobuni = sorted(obuni.items(), key=lambda x: x[1])
sortobbi = sorted(obbi.items(), key=lambda x: x[1])
sortobtri = sorted(obtri.items(), key=lambda x: x[1])
print("Unigrams: ", sortobuni[-10:])
print("Bigrams: ", sortobbi[-10:])
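get_ngram is called at the top of this snippet but never defined in it. A minimal sketch of what such a helper might look like (the name and behaviour are assumptions, not the original implementation): it counts the n-grams of the token list and prints the most common ones.

from nltk.util import ngrams
from nltk.probability import FreqDist

def get_ngram(tokens, n):
    # Count n-grams over the token list and show the 10 most frequent.
    dist = FreqDist(ngrams(tokens, n))
    for gram, count in dist.most_common(10):
        print(gram, count)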
"When used in this sense, the term adopts a meaning reminiscent of receptive fields in actual biological nervous systems." ] ##for s in sentences: ## local = s ## local = re.sub('[^a-zA-Z0-9- ]', '', local) ## local = local.split() ## print(local) tokens = word_tokenize(sentences[0]) frequencies = FreqDist(tokens) #for key, value in frequencies.items(): # print(key, '-> ', value) tuples = list(frequencies.items()) print(tuples) tuples.sort(key=lambda x: x[0]) print(tuples) from nltk.stem import WordNetLemmatizer lemma = WordNetLemmatizer() print(lemma.lemmatize('apples')) print(lemma.lemmatize('dies')) from nltk.stem import PorterStemmer # LancasterStemmer stem = PorterStemmer()
print '%d tags' % len(tag_counts) print '%d IOBs\n' % len(iob_counts) if args.sort == 'tag': sort_key = lambda (t, c): t elif args.sort == 'count': sort_key = lambda (t, c): c else: raise ValueError('%s is not a valid sort option' % args.sort) line1 = ' Tag Count ' line2 = '======= =========' iobs = sorted(iob_counts.keys()) for iob in iobs: line1 += ' %s ' % iob line2 += ' ==%s==' % ('=' * len(iob)) print line1 print line2 for tag, count in sorted(tag_counts.items(), key=sort_key, reverse=args.reverse): iob_counts = [ str(tag_iob_counts[tag][iob]).rjust(4 + len(iob)) for iob in iobs ] print ' '.join([tag.ljust(7), str(count).rjust(9)] + iob_counts) print line2
from nltk.probability import FreqDist
from nltk.corpus import treebank

fd = FreqDist()
for word, tag in treebank.tagged_words():
    fd[tag] += 1

# Python 3 lambdas cannot unpack tuple parameters, so sort on the first element explicitly.
tags = sorted(fd.items(), key=lambda item: item[0])
for tag, freq in tags:
    print('{0}\t\t\t{1}'.format(tag, freq))
# Count every word overall and per label. The snippet begins mid-loop; the header of the
# positive-sentence loop is restored here to mirror the negative-sentence loop below.
for f in pos_sentence_list:
    for word in f:
        # word_fd counts each word in the corpus: the key is the word, the value its number of occurrences.
        word_fd[word] += 1
        label_word_fd['pos'][word] += 1
for f in neg_sentence_list:
    for word in f:
        word_fd[word] += 1
        label_word_fd['neg'][word] += 1

pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count

word_scores = {}
for word, freq in word_fd.items():
    # Use BigramAssocMeasures.chi_sq to compute a positive and a negative score for each word.
    pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                           (freq, pos_word_count), total_word_count)
    neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                           (freq, neg_word_count), total_word_count)
    word_scores[word] = pos_score + neg_score

best = sorted(word_scores.items(), key=lambda s: s[1], reverse=True)[:1000]
bestwords = set([w for w, s in best])

def high_information_feats(words):
    return dict([(word, True) for word in words if word in bestwords])
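A small usage sketch (not part of the original) showing how high_information_feats reduces a tokenised sentence to the chi-square-selected feature dict that NLTK classifiers expect; the token list here is made up.

review_tokens = ['the', 'plot', 'was', 'gripping', 'and', 'the', 'acting', 'was', 'superb']
feats = high_information_feats(review_tokens)
# Only tokens that made it into bestwords survive as boolean features.
print(feats)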
def create_word_scores(): angerWords, disgustWords, fearWords, joyWords, surpriseWords = [], [], [], [], [] with open(ANGER_FILE, 'r', errors="ignore", encoding="utf-8") as angerSentence: for i in angerSentence: angerWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip()) angerWords.append(angerWord) with open(DISGUST_FILE, 'r', errors="ignore", encoding="utf-8") as disgustSentence: for i in disgustSentence: disgustWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip()) disgustWords.append(disgustWord) with open(FEAR_FILE, 'r', errors="ignore", encoding="utf-8") as fearSentence: for i in fearSentence: fearWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip()) fearWords.append(fearWord) with open(JOY_FILE, 'r', errors="ignore", encoding="utf-8") as joySentence: for i in joySentence: joyWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip()) joyWords.append(joyWord) with open(SURPRISE_FILE, 'r', errors="ignore") as surpriseSentence: for i in surpriseSentence: surpriseWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip()) surpriseWords.append(surpriseWord) angerWords = list(itertools.chain(*angerWords)) disgustWords = list(itertools.chain(*disgustWords)) fearWords = list(itertools.chain(*fearWords)) joyWords = list(itertools.chain(*joyWords)) surpriseWords = list(itertools.chain(*surpriseWords)) word_fd = FreqDist() cond_word_fd = ConditionalFreqDist() for word in angerWords: word_fd[word.lower()] += 1 cond_word_fd['anger'][word.lower()] += 1 for word in disgustWords: word_fd[word.lower()] += 1 cond_word_fd['disgust'][word.lower()] += 1 for word in fearWords: word_fd[word.lower()] += 1 cond_word_fd['fear'][word.lower()] += 1 for word in joyWords: word_fd[word.lower()] += 1 cond_word_fd['joy'][word.lower()] += 1 for word in surpriseWords: word_fd[word.lower()] += 1 cond_word_fd['surprise'][word.lower()] += 1 anger_word_count = cond_word_fd['anger'].N() disgust_word_count = cond_word_fd['disgust'].N() fear_word_count = cond_word_fd['fear'].N() joy_word_count = cond_word_fd['joy'].N() surprise_word_count = cond_word_fd['surprise'].N() total_word_count = anger_word_count + disgust_word_count + fear_word_count + joy_word_count + surprise_word_count word_scores = {} for word, freq in word_fd.items(): anger_score = BigramAssocMeasures.chi_sq(cond_word_fd['anger'][word], (freq, anger_word_count), total_word_count) disgust_score = BigramAssocMeasures.chi_sq( cond_word_fd['disgust'][word], (freq, disgust_word_count), total_word_count) fear_score = BigramAssocMeasures.chi_sq(cond_word_fd['fear'][word], (freq, fear_word_count), total_word_count) joy_score = BigramAssocMeasures.chi_sq(cond_word_fd['joy'][word], (freq, joy_word_count), total_word_count) surprise_score = BigramAssocMeasures.chi_sq( cond_word_fd['surprise'][word], (freq, surprise_word_count), total_word_count) word_scores[ word] = anger_score + disgust_score + fear_score + joy_score + surprise_score return word_scores
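create_word_scores only returns the per-word chi-square totals. A small hedged helper (not in the original) that turns those scores into a feature-word set, mirroring the pos/neg snippets above; running it requires the ANGER_FILE ... SURPRISE_FILE corpora to exist.

def find_best_words(word_scores, number):
    # Keep the `number` highest-scoring words as the feature vocabulary.
    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:number]
    return set(word for word, score in best)

best_words = find_best_words(create_word_scores(), 1000)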
# 1
from nltk.corpus import cess_esp
# 2
print("2.", len(cess_esp.words()))
# 3
print("3.", len(cess_esp.sents()))
# 4
from nltk.probability import FreqDist
first_file = cess_esp.fileids()[0]
cess_freq0 = FreqDist(cess_esp.words(first_file))
print("4.", cess_freq0.most_common(20))
# 5
print("5.", [w for w, k in cess_freq0.most_common()])
# 6
print("6.", [w for w, k in cess_freq0.items() if len(w) > 7 and k > 2])
# 7
print("7.", [k for w, k in cess_freq0.most_common()])
print("7b. Frequency of the preposition 'a':", cess_freq0.get("a", 0))
# 8
print("8. Number of words that appear only once:", len([w for w, k in cess_freq0.items() if k == 1]))
# 9
print("9. The most frequent word is", cess_freq0.max())
# 10
from nltk.corpus import PlaintextCorpusReader
mycorpus = PlaintextCorpusReader("../res/", ".*")
# 11
print("11.")
for doc in mycorpus.fileids():
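Item 8 counts the hapaxes by hand; FreqDist has a hapaxes() method that returns exactly the words occurring once, so the same answer can be read off directly:

print("8b.", len(cess_freq0.hapaxes()))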
def removeStopWords(self, corpus): ''' Remove Stop Words, Punctuations, Special Characters, Rare Words Fill data structures for plotting the data distribution Stop words list is taken from NLTK ''' from nltk.corpus import stopwords from nltk.tokenize import word_tokenize from nltk.tokenize.treebank import TreebankWordDetokenizer nltk.download('words') nltk.download('stopwords') nltk.download('punkt') stop_words = set(stopwords.words('english')) word_tokens = word_tokenize(corpus) filtered_sentence = [] stop_word_count = 0 punctuation_cnt = 0 digit_cnt = 0 punc = set(string.punctuation) words = set(nltk.corpus.words.words()) nonEnglish = 0 for w in word_tokens: # Calculate non english words if w.lower() not in words: nonEnglish += 1 # Calculate Punctuation Count if w in punc: punctuation_cnt += 1 # Calculate Nuymeric Digits Count elif w.isnumeric(): digit_cnt += 1 elif w not in stop_words: filtered_sentence.append(w) else: stop_word_count += 1 # Remove rare words if REMOVE_RARE_WORDS: fdist = FreqDist(filtered_sentence) # Get list of words with frequency less than 5 rare_words = list(filter(lambda x: x[1] <= 5, fdist.items())) if DEBUG_PRINT: print(rare_words) for word in filtered_sentence: for entry in rare_words: if word in entry: filtered_sentence.remove(word) # Update internal data structure used in plotting corpus_attributes.update({"Stop_Words_Count": (stop_word_count)}) corpus_attributes.update({"Punctuation_Count": (punctuation_cnt)}) corpus_attributes.update({"No_Of_Words": (len(word_tokens))}) corpus_attributes.update({"Number_Count": (digit_cnt)}) corpus_attributes.update({"Non_english_word_Count": (nonEnglish)}) if DEBUG_PRINT: print('stop_word_count', stop_word_count) print('punctuation_cnt', punctuation_cnt) print('No_Of_Words', (len(word_tokens))) print('Number_Count', digit_cnt) print('Non_english_word_Count', nonEnglish) return TreebankWordDetokenizer().detokenize(filtered_sentence)
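The rare-word loop above calls filtered_sentence.remove() while iterating over the same list, which silently skips elements and rescans rare_words for every token. A minimal non-mutating rewrite of that step, assuming the same FreqDist and the same <= 5 threshold used above:

fdist = FreqDist(filtered_sentence)
rare_words = {word for word, count in fdist.items() if count <= 5}
filtered_sentence = [word for word in filtered_sentence if word not in rare_words]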
def topwords(): """ inspired by http://www.huffingtonpost.com/brian-honigman/the-100-most-popular-hash_b_2463195.html http://editd.com/features/monitor/ used these resources for understanding nltk usage http://www.laurentluce.com/posts/twitter-sentiment-analysis-using-python-and-nltk/ http://text-processing.com/demo/sentiment/ http://ravikiranj.net/drupal/201205/code/machine-learning/how-build-twitter-sentiment-analyzer http://streamhacker.com/2010/05/24/text-classification-sentiment-analysis-stopwords-collocations/ http://fashionweekdates.com/world-fashion-week-dates-schedule.html """ ## place tweets into morning and afternoon bins ru = db.GqlQuery("SELECT * FROM Tweets where iso!=:1", 'en').fetch(limit=1000) en = db.GqlQuery("SELECT * FROM Tweets where iso=:1", 'en').fetch(limit=1000) #this is used because nltk.corpus.stopwords.words('english') doesnt work in GAE # from https://github.com/arc12/Text-Mining-Weak-Signals/wiki/Standard-set-of-english-stopwords stop = "a, about, above, across, after, again, against, all, almost, alone, along, already, also, although, always, am, among, an, and, another, any, anybody, anyone, anything, anywhere, are, area, areas, aren't, around, as, ask, asked, asking, asks, at, away, b, back, backed, backing, backs, be, became, because, become, becomes, been, before, began, behind, being, beings, below, best, better, between, big, both, but, by, c, came, can, cannot, can't, case, cases, certain, certainly, clear, clearly, come, could, couldn't, d, did, didn't, differ, different, differently, do, does, doesn't, doing, done, don't, down, downed, downing, downs, during, e, each, early, either, end, ended, ending, ends, enough, even, evenly, ever, every, everybody, everyone, everything, everywhere, f, face, faces, fact, facts, far, felt, few, find, finds, first, for, four, from, full, fully, further, furthered, furthering, furthers, g, gave, general, generally, get, gets, give, given, gives, go, going, good, goods, got, great, greater, greatest, group, grouped, grouping, groups, h, had, hadn't, has, hasn't, have, haven't, having, he, he'd, he'll, her, here, here's, hers, herself, he's, high, higher, highest, him, himself, his, how, however, how's, i, i'd, if, i'll, i'm, important, in, interest, interested, interesting, interests, into, is, isn't, it, its, it's, itself, i've, j, just, k, keep, keeps, kind, knew, know, known, knows, l, large, largely, last, later, latest, least, less, let, lets, let's, like, likely, long, longer, longest, m, made, make, making, man, many, may, me, member, members, men, might, more, most, mostly, mr, mrs, much, must, mustn't, my, myself, n, necessary, need, needed, needing, needs, never, new, newer, newest, next, no, nobody, non, noone, nor, not, nothing, now, nowhere, number, numbers, o, of, off, often, old, older, oldest, on, once, one, only, open, opened, opening, opens, or, order, ordered, ordering, orders, other, others, ought, our, ours, ourselves, out, over, own, p, part, parted, parting, parts, per, perhaps, place, places, point, pointed, pointing, points, possible, present, presented, presenting, presents, problem, problems, put, puts, q, quite, r, rather, really, right, room, rooms, s, said, same, saw, say, says, second, seconds, see, seem, seemed, seeming, seems, sees, several, shall, shan't, she, she'd, she'll, she's, should, shouldn't, show, showed, showing, shows, side, sides, since, small, smaller, smallest, so, some, somebody, someone, something, somewhere, state, states, still, such, sure, t, take, taken, than, 
that, that's, the, their, theirs, them, themselves, then, there, therefore, there's, these, they, they'd, they'll, they're, they've, thing, things, think, thinks, this, those, though, thought, thoughts, three, through, thus, to, today, together, too, took, toward, turn, turned, turning, turns, two, u, under, until, up, upon, us, use, used, uses, v, very, w, want, wanted, wanting, wants, was, wasn't, way, ways, we, we'd, well, we'll, wells, went, were, we're, weren't, we've, what, what's, when, when's, where, where's, whether, which, while, who, whole, whom, who's, whose, why, why's, will, with, within, without, won't, work, worked, working, works, would, wouldn't, x, y, year, years, yes, yet, you, you'd, you'll, young, younger, youngest, your, you're, yours, yourself, yourselves, you've, z" stopwordsenglish = re.findall(r'\w+', stop, flags = re.UNICODE | re.LOCALE) stopwordstwitter = ['http', '#', '@', '!', ':', ';', '&', '\'', '-', 't', 'co', 'rt'] stopwords_list = stopwordsenglish + stopwordstwitter freq1 = FreqDist() freq2 = FreqDist() for t in ru: #We only want to work with lowercase for the comparisons sentence = t.tweet.lower() #remove punctuation and split into seperate words words = re.findall(r'\w+', sentence, flags=re.UNICODE | re.LOCALE) #corpus = nltk.word_tokenize(words) for a in words: if a not in stopwords_list: freq1.inc(a) for t in en: #We only want to work with lowercase for the comparisons sentence = t.tweet.lower() #remove punctuation and split into seperate words words = re.findall(r'\w+', sentence, flags=re.UNICODE | re.LOCALE) #corpus = nltk.word_tokenize(t.tweet) for a in words: if a not in stopwords_list: freq2.inc(a) #display results #bins = freq1.B() # Returns: The total number of sample bins with counts > 0 f1 = freq1.items()[:90] # Returns: List of all items in tuple format f2 = freq2.items()[:90] context = {'one': f1, 'two': f2, 'stop': stopwords_list } return render_template('topwords.html', **context)
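freq1.inc(a) and the freq1.items()[:90] slice are NLTK 2 / Python 2 idioms; under NLTK 3, FreqDist supports item assignment and most_common(). A sketch of how the same counting and top-90 lookup would read today (outside the original GAE handler, same words and stopwords_list assumed):

freq1 = FreqDist()
for a in words:
    if a not in stopwords_list:
        freq1[a] += 1            # replaces freq1.inc(a)
f1 = freq1.most_common(90)       # replaces freq1.items()[:90]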
# ANALYSE SPEECH FROM TRANSCRIPTS # This algorithm reads the speech text collected from the webscraping # algorithms and uses NLTK functions to analyse the most frequent words used. from nltk.tokenize import word_tokenize from nltk.probability import FreqDist import matplotlib.pyplot as plt import csv # Read in the speech data with open("speechData.txt", "r") as words: text = words.read() text = str(text).replace("][", ", ") # Retokenise words and create frequency distribution words = word_tokenize(str(text)) fdist = FreqDist(words) # Write the results into a csv file with open("frequency.csv", "w") as fp: writer = csv.writer(fp, quoting=csv.QUOTE_ALL) writer.writerows(fdist.items())
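matplotlib.pyplot is imported above but never used. A short follow-up that would plot the 30 most frequent words from the same fdist, assuming an interactive backend is available:

fdist.plot(30, cumulative=False)
plt.show()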
def save_unique_words_analysis(self, uniqueness_threshold): """Сохраняем информацию о количестве уникальных слов и количестве статей, в которых эти слова встречаются, а также информацию о заданном количестве уникальных слов""" articles_tokens = list() articles_words_info = dict() for (codex_type, _) in tqdm(self.parser.codex_urls): raw_articles_info = self.parser.sorted_articles_info[codex_type] for article_info in tqdm(raw_articles_info): text = self.parser.get_article_text_by_id(article_info.id) text = text.lower() text = self.remove_chars_from_text(text, self.spec_chars) article_tokens = word_tokenize(' '.join( self.mystem.lemmatize(text))) for stop_word in self.stop_words: while stop_word in article_tokens: article_tokens.remove(stop_word) articles_words_info[self.get_unique_article_identifier( codex_type, article_info.id)] = list(set(article_tokens)) articles_tokens.extend(article_tokens) text = Text(articles_tokens) f_dist = FreqDist(text) f_dist = list( filter(lambda item: item[1] <= uniqueness_threshold, f_dist.items())) unique_words_info = dict() # Сохраняем информацию в виде: 'уникальное слово': ['количество во всем корпусе', 'количество статей, в котром встретилось это слово'] for word_info in f_dist: if word_info[0] not in unique_words_info: unique_words_info[word_info[0]] = [word_info[1], 0] for article_id in tqdm(articles_words_info): if word_info[0] in articles_words_info[article_id]: unique_words_info[word_info[0]][1] += 1 if os.path.exists(self.config['articles_unique_words_info_file']): os.remove(self.config['articles_unique_words_info_file']) with open(self.config['articles_unique_words_info_file'], mode='w') as articles_unique_words_info_file: articles_unique_words_info_writer = csv.writer( articles_unique_words_info_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) articles_unique_words_info_writer.writerow( ['word', 'word_count', 'articles_count']) for info in unique_words_info.items(): articles_unique_words_info_writer.writerow( [info[0], info[1][0], info[1][1]]) unique_words_metrics = dict() # Сохраняем информацию в виде: 'заданное количество слова во всем корпусе': 'количество таких слов во всем корпусе' for value in unique_words_info.values(): if value[0] not in unique_words_metrics: unique_words_metrics[value[0]] = value[1] else: unique_words_metrics[value[0]] += value[1] if os.path.exists(self.config['articles_unique_words_analysis_file']): os.remove(self.config['articles_unique_words_analysis_file']) with open(self.config['articles_unique_words_analysis_file'], mode='w') as articles_unique_words_analysis_file: articles_unique_words_analysis_writer = csv.writer( articles_unique_words_analysis_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) articles_unique_words_analysis_writer.writerow([ 'count_unique_words_frequency', 'count_unique_words_in_corpus' ]) for info in unique_words_metrics.items(): articles_unique_words_analysis_writer.writerow( [info[0], info[1]]) if os.path.exists( self. config['articles_unique_words_analysis_file_with_frequency']): os.remove( self. config['articles_unique_words_analysis_file_with_frequency']) with open(self. 
config['articles_unique_words_analysis_file_with_frequency'], mode='w') as articles_unique_words_analysis_file: articles_unique_words_analysis_writer = csv.writer( articles_unique_words_analysis_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) articles_unique_words_analysis_writer.writerow([ 'count_unique_words_frequency', 'count_unique_words_in_corpus_frequency' ]) for info in unique_words_metrics.items(): articles_unique_words_analysis_writer.writerow( [info[0], info[1] / len(articles_tokens)])
from warnings import simplefilter
import pandas as pd
from nltk.probability import FreqDist

simplefilter(action='ignore', category=FutureWarning)

df = pd.read_csv("chatbot_classes.csv")
ques_words = []
list_sent = list(df['question'])
for sentence in list_sent:
    for words in sentence.split():
        ques_words.append(words)

fd = FreqDist()
for word in ques_words:
    fd[word.lower()] += 1

# Collect the 10 most common words and their counts
# (the original also unpacked zip(*fd.items()) here, but immediately overwrote the result).
labels = []
keys = []
for T in fd.most_common(10):
    labels.append(T[0])
    keys.append(T[1])

# function to remove punctuation and lowercase the words
def lower_punc(text):
    w = []
    for word in text.split():
        w.append(word.lower())
    wd = []
def subsetManual(c_uncleaned, c_global): """ Esta función genera dos diccionarios en orden ascendente e inverso respecto del conteo de términos a nivel global utilizando el corpus_lookup sin curar y ejecutando la técnica lookup entre este corpus y el global curado para así obtener este análisis. Esta estrategia permite también obtener una selección de los 500 términos con menos frecuencia para utilizarlos para el mapeo manual a HPO.Los parámetros son los path hacia el corpus_lookup sin curar y el corpus global generado con la función mergeCorpus. """ # parte 1: texto sin curar -> texto with open(c_uncleaned, 'r') as f: texto = [] lines = f.readlines() for line in lines: texto.append(line) del (lines) f.closed # parte 2: corpus global -> texto_p with open(c_global, 'r') as f: texto_p = [] lines = f.readlines() for line in lines: texto_p.append(line) del (lines) f.closed # parte 3: lookup para el conteo -> mapped mapped = [] for line in texto: found = [] for item in texto_p: if item in line: found.append(item) #print('item found:', item) mapped.append(found) # compactar mapped -> mapped_flat mapped_flat = list(itertools.chain.from_iterable(mapped)) # eliminar saltos de página -> mapped_final mapped_final = [] for ele in mapped_flat: mapped_final.append(ele.replace('\n', '')) # parte 4: conteo de términos counter = collections.Counter(mapped_final) # diccionario ordenado por valor d = counter.items() # diccionario en orden ascendente al conteo de términos d_ord_increasing = collections.OrderedDict( sorted(sorted(d), key=lambda t: t[1])) # diccionario en orden descendente al conteo de términos d_ord_decreasing = collections.OrderedDict( sorted(sorted(d), key=lambda t: t[1], reverse=True)) # DataFrame to write excel by ascending or descending order df = pd.DataFrame(data=d_ord_decreasing, index=['count']).T df1 = pd.DataFrame(data=d_ord_increasing, index=['count']).T df.to_excel(excel_writer='conteo_terminos_increasing.xlsx') df1.to_excel(excel_writer='conteo_terminos_decreasing.xlsx') # parte 5: extracción 500 términos para validación manual fdist = FreqDist(mapped_final) all_items = fdist.items() terms = fdist.most_common()[-500:] terminos = [] for i in terms: terminos.append(i[0]) # parte 6: guardar los términos seleccionados para el mapeo manual outfile = 'términos_mapeo_manual.txt' with open(outfile, 'w') as w: w.write('\n'.join(terminos)) return all_items
def lexicalDiversity( text): #this function shows how many times on average a word is used return len(text) / len(set( text)) #larger results mean less diversity, lower is high diversity # print(lexicalDiversity(text3)) ''' Frequency distributions show the tallies of each word used ''' from nltk.probability import FreqDist fdist = FreqDist( text3 ) #this gives the frequency distribution of every word. i.e. tallies of each word used vocab = fdist.items() #list/dict of words from freq dist with keys and values # print(vocab) # hapaxes=fdist.hapaxes() #hapaxes are words that only appear once # fdist.plot(25) #plot 25 most common tokens # fdist.tabulate() #now we can try to filter out and only get important words of a critical length and occurrence uniqueWord = set(text3) importantWords = [wd for wd in uniqueWord if len(wd) > 5 and fdist[wd] > 10] # print(sorted(importantWords)) #Collocations - show words that appear together most often c = text3.collocations() # print(c) #Extract num chars, words and sents
import nltk
from nltk.probability import FreqDist

sense = nltk.corpus.gutenberg.words('austen-sense.txt')
fdist = FreqDist(sense)
rank = 0
# most_common() yields (word, count) pairs in descending frequency order,
# so the running counter really is the word's rank.
for word, count in fdist.most_common():
    rank = rank + 1
    result = str(rank) + " " + str(count) + " " + str(word)
    print(result)
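The rank and frequency columns printed above are exactly the input of a Zipf plot. A short sketch with matplotlib (assumed installed), reusing the same fdist:

import matplotlib.pyplot as plt

freqs = [count for word, count in fdist.most_common()]
ranks = range(1, len(freqs) + 1)
plt.loglog(ranks, freqs)
plt.xlabel('rank')
plt.ylabel('frequency')
plt.show()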
# Read the files, tokenise them and get the frequencies of the word lengths.
# I used two Philip K. Dick texts: "La segunda variedad" and "El hombre variable".
# Swap them for whatever texts you have available :)
pkdsv = read_file('PKD_Segunda_variedad.txt')
text_sv = nltk.word_tokenize(pkdsv)
fdist_sv = FreqDist(len(palabra) for palabra in text_sv)

pkdhv = read_file('PKD_Hombre_variable.txt')
text_hv = nltk.word_tokenize(pkdhv)
fdist_hv = FreqDist(len(palabra) for palabra in text_hv)

# Sort the word lengths together with their frequencies.
sorted_by_first_hv = sorted(fdist_hv.items(), key=lambda tup: tup[0])
sorted_by_first_sv = sorted(fdist_sv.items(), key=lambda tup: tup[0])

# Convert the lists of tuples into dictionaries.
dict_hv = dict(sorted_by_first_hv)
dict_sv = dict(sorted_by_first_sv)

# Build a new dictionary with the keys common to both and the values from the first.
dict_hv_2 = {}
for key in dict_hv:
    if key in dict_sv:
        dict_hv_2.update({key: dict_hv[key]})

# Build a new dictionary with the keys common to both and the values from the second.
dict_sv_2 = {}
for key in dict_sv:
def answer_three(): wordfreq = FreqDist(text1) wordfreq_desc = sorted(wordfreq.items(), key=lambda x: x[1], reverse=True) return wordfreq_desc[:20] # Your answer here
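For reference, FreqDist already exposes this top-N view directly, so the same answer can be written without the manual sort (same assumed text1 from nltk.book):

def answer_three():
    return FreqDist(text1).most_common(20)   # identical output to the sorted items() slice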
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.probability import FreqDist
from heapq import nlargest
from collections import defaultdict
import pandas as pd

# Read the text as UTF-8 (Python 3 returns str directly, so no .decode() step)
# and normalise curly quotes to plain apostrophes.
with open('shengine.txt', encoding='utf-8', errors='ignore') as f1:
    data = f1.read().replace("\n", " ").replace(u"\u2019", "'").replace(u"\u2018", "'")

sents = sent_tokenize(data)
words = word_tokenize(data.lower())

mystops = set(stopwords.words('english') + list(punctuation))
words = [word for word in words if word not in mystops]
freq = FreqDist(words)
print(pd.DataFrame(list(freq.items()), columns=["Word", "Frequency"]))
#print(nlargest(10, freq, key=freq.get))

# score each sentence by the frequencies of the words it contains
ranking = defaultdict(int)
for i, sent in enumerate(sents):
    for w in word_tokenize(sent.lower()):
        if w in freq:
            ranking[i] += freq[w]

final_numbers = nlargest(3, ranking, key=ranking.get)
print(final_numbers)
print([sents[j] for j in sorted(final_numbers)])
def TrainingSynonymCheck(new_words, all_words): realwords = [] for i in all_words: if i.isalpha(): realwords.append(i) lemmatized_words = [] for i in realwords: n = lemmatizer.lemmatize(i) lemmatized_words.append(str(n)) FreqDictionary = FreqDist(lemmatized_words) l = sorted(FreqDictionary.items(), key=operator.itemgetter(1), reverse=True) #print l unique_words = [] for i in l: unique_words.append(i[0]) # Taking first words from all synsets # synDict={} # flag=0 # for word in unique_words: # if word.isalpha(): # syns=wn.synsets(word) # lst=[] # for s in syns: # a=s.lemmas()[0].name() # if a!=word and (a not in lst): # lst.append(str(a)) # if lst!=[]: # synDict[word]=(lst) # Taking the first synset synDict = {} flag = 0 for word in unique_words: if word.isalpha(): syns = wn.synsets(word) lst = [word] if syns != []: #for s in syns: s = syns[0] a = s.lemmas() for i in a: f = i.name() if f != word and (f not in lst): lst.append(str(f)) if lst != []: synDict[word] = (lst) synunique_words = copy.deepcopy( unique_words ) ## synunique_words : copy that contains all the unique words in the questions initially for word in unique_words: i = unique_words.index(word) if word in synDict.keys(): for syn in synDict[word]: for j in unique_words[i + 1:]: if syn == j: idx = unique_words.index(j) unique_words[idx] = word return [ unique_words, synunique_words ] # unique words : changed words , synunique_words : original words unique
def train_and_test(reviews_pos, reviews_neg):
    """
    Train and test.
    :param reviews_pos: list of positive reviews
    :param reviews_neg: list of negative reviews
    :return:
    """
    # Count the frequency of every word in the positive and negative reviews.
    tot_poswords = [val for l in [r.words for r in reviews_pos] for val in l]
    tot_negwords = [val for l in [r.words for r in reviews_neg] for val in l]
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for word in tot_poswords:
        word_fd[word.lower()] += 1
        label_word_fd['pos'][word.lower()] += 1
    for word in tot_negwords:
        word_fd[word.lower()] += 1
        label_word_fd['neg'][word.lower()] += 1
    pos_words = len(tot_poswords)
    neg_words = len(tot_negwords)
    tot_words = pos_words + neg_words

    # Score each word.
    word_scores = {}
    for word, freq in iter(word_fd.items()):
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_words), tot_words)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_words), tot_words)
        word_scores[word] = pos_score + neg_score
    print('total: ', len(word_scores))

    # Keep only the 10,000 highest-scoring words.
    best = sorted(iter(word_scores.items()), key=lambda args: args[1], reverse=True)[:10000]
    bestwords = set([w for w, s in best])
    negfeatures = [(best_words_features(r.words, bestwords), 'neg') for r in reviews_neg]
    posfeatures = [(best_words_features(r.words, bestwords), 'pos') for r in reviews_pos]

    # Split into 80% training and 20% test sets. Each class is sliced by its own
    # cut-off so that train and test never overlap, even when the classes differ in size.
    portionpos = int(len(posfeatures) * 0.8)
    portionneg = int(len(negfeatures) * 0.8)
    print(portionpos, '-', portionneg)
    trainfeatures = negfeatures[:portionneg] + posfeatures[:portionpos]
    print(len(trainfeatures))

    # Train.
    classifier = NaiveBayesClassifier.train(trainfeatures)

    # Test.
    testfeatures = negfeatures[portionneg:] + posfeatures[portionpos:]
    shuffle(testfeatures)
    err = 0
    print('test on: ', len(testfeatures))
    for r in testfeatures:
        sent = classifier.classify(r[0])
        # print(r[1], '-pred: ', sent)
        if sent != r[1]:
            err += 1.
    print('error rate: ', err / float(len(testfeatures)))
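best_words_features is called above but never defined in this snippet. A minimal sketch consistent with its call sites (a list of words in, an NLTK boolean feature dict out); the original implementation may differ.

def best_words_features(words, bestwords):
    # Keep only the chi-square-selected vocabulary as boolean features.
    return {word: True for word in words if word in bestwords}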