def process(f, return_tokens=True, return_freqdist=True):
    """
    Function to process deals data.

    Reads the file one deal per line, lower-cases each line and tokenizes it
    with PunktWordTokenizer.  The original author chose that tokenizer
    because it is regexp-based and deals often mention domain names that
    should not be split up.  English stopwords and single punctuation
    characters are discarded.

    :param f: Input file with a deal per line
    :param return_tokens: when true, collect the kept tokens of every line
    :param return_freqdist: when true, count every kept token in the FreqDist
    :returns: ``(fd, sentences, tokens)`` -- the frequency distribution
        (left empty when ``return_freqdist`` is false), the stripped input
        lines, and one token list per line (each left empty when
        ``return_tokens`` is false)
    :rtype: FreqDist, list() of str, list() of list() of str
    """
    fd = FreqDist()
    tokens = []
    # The context manager closes the file even when reading fails.
    with open(f, 'r') as fh:
        sentences = [line.strip() for line in fh]

    # Loop invariants: build the exclusion set and the tokenizer once instead
    # of once per word.
    excluded = set(stopwords.words('english')) | set(string.punctuation)
    tokenizer = PunktWordTokenizer()

    for line in sentences:
        kept = []
        for word in tokenizer.tokenize(line.lower()):
            if word in excluded:
                continue
            if return_tokens:
                kept.append(word)
            if return_freqdist:
                # NOTE(review): FreqDist.inc is the NLTK 2 API -- confirm the
                # NLTK version in use before upgrading.
                fd.inc(word)
        tokens.append(kept)
    return fd, sentences, tokens
def get_word_features(wordlist):
    """Return the distinct samples of *wordlist* as given by ``FreqDist.keys()``."""
    return FreqDist(wordlist).keys()
def __init__(
    self,
    unk_cutoff,
    jm_lambda=0.6,
    dirichlet_alpha=0.1,
    katz_cutoff=5,
    kn_discount=0.1,
    kn_concentration=1.0,
    tokenize_function=TreebankWordTokenizer().tokenize,
    normalize_function=lower,
):
    """Store the smoothing hyper-parameters and create empty count tables.

    Every argument is kept on a private attribute of the same name.  Three
    empty frequency distributions are created (vocabulary, grams, contexts)
    and the start/end markers are pre-counted in the vocabulary.
    """
    # Text pre-processing callables.
    self._tokenizer = tokenize_function
    self._normalizer = normalize_function

    # Smoothing hyper-parameters.
    self._unk_cutoff = unk_cutoff
    self._jm_lambda = jm_lambda
    self._dirichlet_alpha = dirichlet_alpha
    self._katz_cutoff = katz_cutoff
    self._kn_discount = kn_discount
    self._kn_concentration = kn_concentration

    # The vocabulary starts out open.
    self._vocab_final = False

    # Count tables.
    self._vocab_freq = FreqDist()
    self._gram_freq = FreqDist()
    self._context_freq = FreqDist()

    # Pre-count the boundary markers one above the module-level cutoff.
    # NOTE(review): this uses the global kUNK_CUTOFF, not the unk_cutoff
    # argument -- confirm that is intended.
    boundary_count = kUNK_CUTOFF + 1
    for marker in (kSTART, kEND):
        self._vocab_freq[marker] += boundary_count
def test():
    """Run the Gettysburg Address through the network and print five topics.

    The address is tokenized, lower-cased and filtered against
    ENGLISH_STOP_WORDS and ``punctuation``; the relative frequency of every
    entry of the global ``words`` becomes the network input.  Output
    activations are then visited from strongest to weakest and the first
    five words that also occur among the address tokens are printed.

    Fewer than five topics are printed when the vocabulary does not contain
    five matching words.
    """
    global N, words, network
    print('In testing.')
    gettysburg = """Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. 
It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth."""
    tokenizer = RegexpTokenizer(r'\w+')
    gettysburg_tokens = tokenizer.tokenize(gettysburg)

    samples = []
    for token in gettysburg_tokens:
        word = token.lower()
        if word not in ENGLISH_STOP_WORDS and word not in punctuation:
            samples.append(word)
    dist = FreqDist(samples)

    V = Vol(1, 1, N, 0.0)
    for i, word in enumerate(words):
        V.w[i] = dist.freq(word)
    pred = network.forward(V).w

    # Rank output positions by activation without touching `pred`.  Deleting
    # entries from `pred` (as the previous version did) shifted every later
    # activation one slot to the left, so it no longer lined up with `words`.
    # The sort is stable, so ties keep the lowest index first.
    ranked = sorted(range(len(pred)), key=lambda idx: pred[idx], reverse=True)

    topics = []
    for idx in ranked:
        topic = words[idx]
        # NOTE(review): membership is tested against the original-case
        # tokens, as before -- confirm `words` uses matching case.
        if topic in gettysburg_tokens:
            topics.append(topic)
            if len(topics) == 5:
                break

    print('Topics of the Gettysburg Address:')
    print(topics)
def top_words_from_corpus(self, num_words, test_name):
    """Collect up to ``num_words`` words from the corpus for a given test.

    The tokens of every document listed in
    ``self.corpus_vars["corpus_member_ids"]`` are pooled into one FreqDist
    and its items are walked in the order ``items()`` returns them.

    * ``"Function Word PCA"``: words are accumulated; each time the list
      reaches ``num_words`` it is POS-tagged and every word whose tag is not
      in ``function_pos`` is removed, then accumulation continues until the
      list still holds ``num_words`` words after filtering.
    * ``"Burrows's Delta"``: the first ``num_words`` words are taken as-is.

    Any other ``test_name`` yields an empty list.
    """
    corpus_tokens = []
    for i in self.corpus_vars["corpus_member_ids"]:
        title = 'document_' + str(i)
        doc_tokens = Library.document_instances[title].metadata["tokenized_doc"]
        corpus_tokens += doc_tokens
    top_words = []
    fdist_corpus = FreqDist(corpus_tokens)
    # NOTE(review): presumably items() is expected to be ordered by
    # decreasing frequency (NLTK 2 behaviour) -- confirm the NLTK version.
    fdist_list = fdist_corpus.items()
    if test_name == "Function Word PCA":
        function_pos = ['IN', 'TO', 'CC', 'DT', 'PDT', 'WDT']
        for i in fdist_list:
            top_words.append(i[0])
            if len(top_words) == num_words:
                # Re-tag the whole candidate list and drop the words that
                # are not tagged as function words.
                tagged_top = nltk.pos_tag(top_words)
                for j,k in tagged_top:
                    if k not in function_pos:
                        top_words.remove(j)
                # Stop only when nothing was removed in this round.
                if len(top_words) == num_words:
                    break
    elif test_name == "Burrows's Delta":
        for i in fdist_list:
            top_words.append(i[0])
            if len(top_words) == num_words:
                break
    return top_words
def generate_ngrams_profile(self, text, profile_size, min_size=2, max_size=3):
    """
    Sanitize the incoming text, generate every N-gram whose size lies
    between min_size and max_size (inclusive), and count how often each
    one occurs.

    Parameters
    ----------
    text : unicode
    profile_size : int
        Number of most frequent N-grams to keep.
    min_size : int, optional (default=2)
    max_size : int, optional (default=3)

    Returns
    -------
    ngram_profile : the result of ``FreqDist.most_common(profile_size)``
        for the joined N-grams
    """
    clean_text = self.sanitize_text(text)
    # Each N-gram is flattened into one unicode string before counting.
    joined = [
        ''.join(unicode(part) for part in gram)
        for size in range(min_size, max_size + 1)
        for gram in ngrams(clean_text, size)
    ]
    return FreqDist(joined).most_common(n=profile_size)
def make_cutOff(flatList, bottomCutOff, topCutOff):
    '''
    INPUT:
    flatList is a 1-d list of all tokens in a set of tweets;
    bottomCutOff and topCutOff are integers

    OUTPUT:
    newVocab = a 1-d list of the tokens to keep
    thrownOut = a 1-d list of the discarded FreqDist items: the first
                topCutOff items, plus every later item whose count is not
                above bottomCutOff
    '''
    ranked = FreqDist(flatList).items()
    head, tail = ranked[:topCutOff], ranked[topCutOff:]

    # The leading items are discarded unconditionally.
    thrownOut = list(head)
    newVocab = []
    for entry in tail:
        if entry[1] > bottomCutOff:
            newVocab.append(entry[0])
        else:
            thrownOut.append(entry)

    print('Cutoffs made...')
    return newVocab, thrownOut
def main():
    """Parse the training files, publish the 500 most frequent words, and
    print the first ten aggregated feature rows.

    Side effect: sets the module-level ``top_words`` list, which is assigned
    before ``featureAggregator`` is called.
    """
    global top_words

    userInput = parser.getInput()
    fileList = parser.getFiles(userInput['train'])
    pdata = parser.parseFiles(fileList)

    # Join once instead of growing a string inside a loop.
    allsent = ''.join(f[3] for f in pdata)

    # Load the stopword list a single time and compare against the
    # lower-cased token.  The old test ran before lower-casing, so
    # capitalised stopwords such as "The" slipped through.
    english_stops = set(stopwords.words('english'))
    lowered = (w.lower() for w in word_tokenize(allsent))
    all_words = FreqDist(w for w in lowered if w not in english_stops)

    # Rank explicitly by count so the result does not depend on the NLTK
    # version's key ordering.
    top_words = sorted(all_words, key=all_words.get, reverse=True)[:500]

    # pdata = getParseData()
    featdata = featureAggregator(pdata)
    print(featdata[:10])
def get_hosts(year):
    '''Hosts is a list of one or more strings. Do NOT change the name
    of this function or what it returns.

    Reads ``gg<year>.json``, looks for "<name> and <name>" pairs in every
    tweet text that contains "and", and returns the two names of the most
    frequent pair.  When the first name of the top pair is a single word,
    the second most frequent pair is used instead (if there is one).
    Returns an empty list when no pair was found at all.'''
    file_name = 'gg%s.json' % year
    with open(file_name, 'r') as data:
        db = json.load(data)

    def _clean(name):
        # Lower-case, turn single quotes into double quotes, trim spaces.
        return name.lower().replace('\'', '\"').strip(' ')

    pairs = []
    for f in db:
        e = f['text']
        if 'and' not in e.lower():
            continue
        for proper in strip_proper_pairs(normalize_str(e).split()):
            pair = proper.split('and')
            if len(pair) == 2 and pair[0] != ' ' and pair[1] != ' ':
                pairs.append((_clean(pair[0]), _clean(pair[1])))

    # Rank once; the old code recomputed most_common() for every lookup and
    # raised IndexError when fewer than two distinct pairs existed.
    ranked = FreqDist(pairs).most_common(10)
    if not ranked:
        return []

    best = ranked[0][0]
    if len(best[0].split(' ')) < 2 and len(ranked) > 1:
        best = ranked[1][0]
    return [best[0], best[1]]
def BootstrapFD(samp):
    """Yield bootstrap replicates of the frequency counts of *samp*.

    The first value yielded is the observed counts (``fd.values()``).  Every
    later value is the counts of a resample of size ``N``: ``ns2`` draws are
    assigned to unseen categories (labelled ``B`` .. ``B + f0 - 1``) and the
    remaining ``ns1`` draws are taken from the observed categories with the
    corrected probabilities.

    This is an infinite generator; the caller decides how many replicates
    to take.
    """
    fd = FreqDist(samp)
    f1 = float(fd.Nr(1))
    f2 = float(fd.Nr(2))
    N = float(fd.N())
    B = fd.B()
    # Undetected species & Coverage
    if f2 > 0.0:
        f0 = ceil(((N - 1.0) / N) * (f1 ** 2.0) / (2.0 * f2))
        C = 1.0 - f1 / N * (N - 1.0) * f1 / ((N - 1.0) * f1 + 2.0 * f2)
    else:
        f0 = ceil(((N - 1.0) / N) * f1 * (f1 - 1.0) / 2.0)
        C = 1.0 - f1 / N * (N - 1.0) * f1 / ((N - 1.0) * f1 + 2.0)
    # Correct abundances
    probs = array(fd.values()) / N
    lambdah = (1 - C) / sum(probs * (1 - probs) ** N)
    probs = probs * (1 - lambdah * (1 - probs) ** N)
    # P for unseen
    # paux = (1-C)/f0
    yield fd.values()
    popO = arange(B)
    dist = binom(n=N, p=1 - C)
    probsA = probs / sum(probs)
    while True:
        ns2 = dist.rvs()
        ns1 = int(N) - ns2
        if ns1 > 0:
            samp1 = list(choice(popO, size=ns1, replace=True, p=probsA))
        else:
            # Was `samp2 = []`, which left samp1 undefined on the first
            # pass or stale from the previous one.
            samp1 = []
        if ns2 > 0:
            samp2 = list(random_integers(B, B + int(f0) - 1, ns2))
        else:
            samp2 = []
        yield FreqDist(samp1 + samp2).values()
def setUpOwnSubjectStopWords():
    """Fill ``built_topic_stop_words`` with per-topic stop words and print them.

    For every topic the ten most frequent description entries and the ten
    most frequent title entries (only entries longer than five characters
    are counted) are stored as one flat list under that topic.
    """
    for topic in topics_table_noun_only_title:
        # only limiting it to a specified length
        # might want to look into the numeric part
        all_description = [ds for ds in topics_table_noun_only_description[topic]
                           if len(ds) > 5]
        all_topics = [title for title in topics_table_noun_only_title[topic]
                      if len(title) > 5]

        fdist_description = FreqDist(all_description)
        fdist_topics = FreqDist(all_topics)
        ten_most_common_descr = fdist_description.most_common(10)
        # Query the title distribution here; the old code asked the
        # description distribution a second time.
        ten_most_common_topic = fdist_topics.most_common(10)

        stop_words = [word for word, freq in ten_most_common_descr]
        # extend, not append, so the result stays a flat list of words
        stop_words.extend(word for word, freq in ten_most_common_topic)
        built_topic_stop_words[topic] = stop_words

    # here we set up the top 5-10 words (we need to look into the data more
    # to find the hard margin of the good numerical value to stop, but for
    # simplicity's sake we pick a fixed number for now and see how accuracy
    # changes when the most frequent words change)
    for topic in built_topic_stop_words:
        print(built_topic_stop_words[topic])
        print("\n")
def category_by_movie():
    """Train a Naive Bayes sentiment classifier on the movie_reviews corpus
    and print its accuracy on the held-out documents.

    Features are ``contains(<word>)`` booleans for the 2000 most frequent
    lower-cased corpus words.  The first 100 shuffled documents form the
    test set; the rest are used for training.
    """
    from nltk.corpus import movie_reviews as mr
    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify
    import random

    documents = [(list(mr.words(f)), c)
                 for c in mr.categories()
                 for f in mr.fileids(c)]
    random.shuffle(documents)

    all_words = FreqDist(w.lower() for w in mr.words())
    # Rank explicitly by count so the selection does not depend on the NLTK
    # version's key ordering.
    word_features = sorted(all_words, key=all_words.get, reverse=True)[:2000]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    features = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = features[100:], features[:100]
    classifier = NaiveBayesClassifier.train(train_set)
    # Evaluate on the held-out documents; scoring the training set (as
    # before) overstates accuracy and left test_set unused.
    print(classify.accuracy(classifier, test_set))
def main():
    """Pickle the words and tweets from the search output, reload them, and
    print a frequency / lexical-diversity report.
    """
    keyword_list = ["Top Secret", "Secret Service", "Classified", "Targeted",
                    "Assassination", "Kill Program", "NSA", "wire", "CIA",
                    "FBI", "DEA", "DOJ", "hackers", "hacker", "exploit code",
                    "Defense", "Intelligence", "Agency"]
    file_name = "tweets_output.txt"
    pickle_words_file = "words.pickle"
    pickle_words(file_name, pickle_words_file, keyword_list)
    pickle_tweets_file = "tweets.pickle"
    pickle_tweets(file_name, pickle_tweets_file)

    # Reuse the file-name variables and close the handles once loaded.
    # NOTE(review): the files are opened in the default (text) mode, as
    # before -- confirm against how pickle_words / pickle_tweets write them.
    with open(pickle_words_file) as handle:
        words = load(handle)
    with open(pickle_tweets_file) as handle:
        tweets = load(handle)

    freq_dist = FreqDist(words)
    # Rank explicitly by count so "top" and "bottom" do not depend on the
    # NLTK version's key ordering.
    ranked_words = sorted(freq_dist, key=freq_dist.get, reverse=True)

    print(tweets)
    print("===")
    print("Conducting Frequency and Lexical Diversity Analysis of Twitter Search Space: ")
    print("===")
    print("Number of words within the twitter search space: ")
    print(len(words))
    print("Number of unique words within twitter search space: ")
    print(len(set(words)))
    print("Lexical Diversity of unique words within twitter search space: ")
    print(lexical_diversity(words))
    print("===")
    print("Conducting Native Language Processing Analysis Utilizing Python NLTK")
    print("===")
    print("Top 50 Frequent Words within the Twitter Search Space: ")
    print(ranked_words[:50])
    print("===")
    print("Bottom 50 Frequent Words within the Twitter Search Space: ")
    print(ranked_words[-50:])
    print("===")
def find_names(self):
    """Return the 50 most common tokens of ``self.tokens`` that appear in
    LIST_OF_NAMES, as produced by ``FreqDist.most_common``."""
    known_names = LIST_OF_NAMES
    mentions = FreqDist([tok for tok in self.tokens if tok in known_names])
    return mentions.most_common(50)
class Index:
    """
    The Index class stores an index for a document.

    The term-frequency table is built lazily: the accessors build it on
    first use from ``self.terms()``, which this class does not define and
    must be supplied elsewhere (e.g. by a subclass).
    """

    def __init__(self):
        self._freq_dist = None
        self._document = None

    def index(self, document=None):
        """Remember *document* (when given) and build the frequency table.

        ``document`` is optional so that the lazy accessors below can call
        ``index()`` for a document that was stored earlier; previously those
        calls raised TypeError because the argument was required.  The table
        is only built when it does not exist yet; call ``reset()`` first to
        force a rebuild.
        """
        if document is not None:
            self._document = document
        if self._freq_dist is None:
            self._freq_dist = FreqDist()
            for term in self.terms():
                self._freq_dist.inc(term)

    def reset(self):
        "Reset the index"
        self._freq_dist = None

    def freq_dist(self):
        """Return the frequency table, building it on first use."""
        if self._freq_dist is None:
            self.index()
        return self._freq_dist

    def freq(self, term):
        """Return the number of times *term* appears in this document."""
        # `is None` rather than truthiness: an empty table is still a
        # built table and must not trigger a rebuild.
        if self._freq_dist is None:
            self.index()
        return self._freq_dist[term]

    def tf(self, term):
        """Return the count of *term* divided by the total count.

        Returns 0.0 when the table holds no samples at all.
        """
        if self._freq_dist is None:
            self.index()
        total = self._freq_dist.N()
        if not total:
            return 0.0
        return float(self._freq_dist[term]) / float(total)
def palavrasChaves(self):
    """Return up to four keywords taken from ``self.titulo``.

    The title is lower-cased, stripped of a few punctuation marks and split
    on whitespace.  Words found in the Portuguese, English or Spanish NLTK
    stopword lists, and words shorter than three characters, are dropped.
    The keywords are the first four entries of the resulting FreqDist.
    """
    # NLTK stopword lists for English, Portuguese and Spanish
    stopE = stopwords.words('english')
    stop = stopwords.words('portuguese')
    stopS = stopwords.words('spanish')

    # remove punctuation from the text, then split it into words
    titulo = self.titulo.lower()
    for mark in (',', '.', '-', '(', ')'):
        titulo = titulo.replace(mark, '')

    # Drop stopwords of all three languages; also ignore words with fewer
    # than 3 characters (this handles very short words such as one-letter
    # verbs).
    textoArtigo = [
        palavra for palavra in titulo.split()
        if palavra not in stop
        and palavra not in stopE
        and palavra not in stopS
        and len(palavra) > 2
    ]

    # frequency of each remaining word
    freq = FreqDist(textoArtigo)
    # keep the first four (word, count) items and return only the words
    return [par[0] for par in freq.items()[:4]]
def __init__(self, num_topics, alpha_topic = 1.0, alpha_word = 1.0, max_tables = 50000, sanity_check=False, initialize=False, report_filename="topic_history.txt"):
    """Set up empty count tables and, on request, initialise the topics.

    When ``report_filename`` is truthy the file is opened for writing and
    kept on ``self._report``; otherwise ``self._report`` is None.  When
    ``initialize`` is true the topics are seeded, deterministically if
    ``sanity_check`` is also set (testing only).
    """
    self.max_tables = max_tables

    self._alphabet = FreqDist()
    # store all words seen in a list so they are associated with a unique ID.
    self.initialize_index()
    self._words = FreqDist()

    self.alpha_topic = alpha_topic
    self.alpha_word = alpha_word
    self._num_updates = 0

    self._report = open(report_filename, 'w') if report_filename else None

    self.num_topics = num_topics
    self._topics = [FreqDist() for _ in xrange(num_topics)]

    # the sanity_check flag is for testing only.
    if initialize:
        if sanity_check == True:
            self.deterministic_seed()
        else:
            self.initialize_topics()
def get_top_followings(screen_name):
    """Find the accounts *screen_name* retweets most and describe them.

    Downloads the user's timeline page by page, counts the account named in
    every tweet that starts with ``RT``, prints the 20 most frequent ones
    and returns the result of ``follow_description`` for them.
    """
    # authorize twitter, initialize tweepy
    api = TwitterGrabber.initialise_api(0)
    print(api.get_status)

    # all tweepy Tweets collected so far
    all_tweets = []

    # first request for the most recent tweets (200 is the maximum allowed
    # count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)

    # Keep grabbing tweets until a page comes back empty.  The loop test was
    # `len(new_tweets) < 0`, which is never true, so only the first page was
    # ever read; an empty timeline also raised IndexError.
    while len(new_tweets) > 0:
        all_tweets.extend(new_tweets)
        # id of the oldest tweet so far, less one; max_id prevents
        # duplicates in the next page
        oldest = all_tweets[-1].id - 1
        print("...%s tweets downloaded so far" % (len(all_tweets)))
        new_tweets = api.user_timeline(screen_name=screen_name, count=200,
                                       max_id=oldest)

    retweets = []
    for tweet in all_tweets:
        words = word_tokenize(tweet.text, 'english')
        # The third token of a tweet starting with "RT" is taken as the
        # retweeted account; tweets with fewer than three tokens are skipped
        # instead of raising IndexError.
        if len(words) > 2 and words[0] == 'RT':
            retweets.append(words[2])

    tweet_distribution = FreqDist(retweets)
    top_retweeted = tweet_distribution.most_common(20)
    print(top_retweeted)
    return follow_description(api, top_retweeted, screen_name)
def cleaner(filename):
    """Write the 100 most common words of an uploaded chat log to
    ``wordcloud.csv``.

    Each line is expected to look like ``<date>,<time> - <user>: <text>``.
    The text after the first colon of the chat part is split on single
    spaces and counted.  Lines without a ``-`` separator or without a comma
    in the timestamp part are skipped.
    """
    words_list = []
    path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    with open(path, 'r') as textfile:
        for line in textfile:
            # Split at the first '-' only: messages may contain hyphens,
            # which previously made the two-way unpacking raise ValueError.
            stamp, sep, chat = line.partition('-')
            if not sep or ',' not in stamp:
                continue
            loc = chat.find(':')
            # skip the ": " that follows the user name
            text = chat[loc+2:].replace("\n", '')
            words_list.extend(text.split(' '))

    fdist = FreqDist(words_list)
    f1 = fdist.most_common(100)
    create_csv('wordcloud.csv', f1)
def train_wordfrequency(n_dims = 50):
    """Load the cached data, build normalised word-frequency tables for the
    positive and negative classes, and print the first 100 keys of each.

    ``n_dims`` is accepted but not used by the code shown here.
    """
    def _flatten(mapping):
        # Concatenate every value list of the mapping into one list.
        return sum(mapping.viewvalues(), [])

    ### Load data
    dataloader = csv_dataloader()
    dataloader.load('output/data_cache.pk')
    print("Read in finished")

    train_id = dataloader.id
    _, pos_id, neg_id = dataloader.balance(train_id, 'full')
    train_data_pos = dataloader.data_retrieve(pos_id)
    train_data_neg = dataloader.data_retrieve(neg_id)

    tokens = _flatten(dataloader.data)
    tokens_pos = _flatten(train_data_pos['data'])
    tokens_neg = _flatten(train_data_neg['data'])

    # Class distributions are normalised against the whole-corpus one.
    fdist_base = FreqDist(tokens)
    fdist_pos = normalize(FreqDist(tokens_pos), fdist_base)
    fdist_neg = normalize(FreqDist(tokens_neg), fdist_base)

    print(list(fdist_pos.viewkeys())[:100])
    print(list(fdist_neg.viewkeys())[:100])

    # One label per token: 1 for positive, 0 for negative.
    labels = [1] * len(tokens_pos) + [0] * len(tokens_neg)
    corpus = tokens_pos + tokens_neg
def follow_description(api, friend_list, screen_name):
    """Return the 20 most common words in the names of the lists that the
    given accounts are members of.

    ``friend_list`` holds ``(username, frequency)`` pairs.  For every
    username up to 50 list memberships are fetched and their lower-cased
    names split on spaces.  The first TweepError stops the fetching; words
    gathered so far are still counted.
    """
    all_tags = []
    for friend in friend_list:
        username = friend[0]
        print(username)
        try:
            user = api.get_user(screen_name=username)
            memberships = user.lists_memberships(screen_name=username, count=50)
            for list_obj in memberships:
                all_tags.extend(list_obj.name.lower().split(" "))
        except TweepError as err:
            print(err.reason)
            break

    top_tags = FreqDist(strip_words(all_tags)).most_common(20)
    print(top_tags)
    return top_tags
def analyzeTitles():
    """Print two frequency tables for the titles in ``../top100clean.csv``.

    The first table counts how many titles have each number of
    space-separated words; the second lists the 25 most common alphabetic
    title words, ignoring "the" and "a".  A title containing "..." loses
    its last space-separated word first.
    """
    fulltitles = []
    titles = []
    with open('../top100clean.csv', 'rb') as bookfile:
        for row in csv.reader(bookfile):
            if "..." in row[0]:
                row[0] = " ".join(row[0].split(" ")[:-1])
            title = row[0]
            titles.extend(
                w.lower() for w in nltk.word_tokenize(title)
                if w.isalpha() and w.lower() not in ['the', 'a']
            )
            fulltitles.append(title)

    titleset = nltk.Text(titles)

    # words-in-title histogram
    wit_fd = FreqDist([len(f.split(" ")) for f in fulltitles])
    print("\nw.i.t.\tfreq")
    print("--------------------")
    for numwords, times in wit_fd.iteritems():
        print(str(numwords) + "\t" + str(times))
    print("\n")

    # most common title words
    print("\nword\t\tfreq")
    print("--------------------")
    fd = FreqDist(titleset)
    for k, v in fd.most_common(25):
        print(str(k) + "\t\t" + str(v))
def top(self, tokens, lowest_rank=50):
    ''' A list of the most frequent (non-stopword) tokens.

    At most ``lowest_rank`` entries of the FreqDist key list are folded
    together case-insensitively into ``self.lower_words``; the lower-cased
    words are returned ordered by their combined count, highest first.
    '''
    from operator import itemgetter
    content = self.words(tokens)
    fdist = FreqDist(content)

    # Forget all previous ranking
    self.lower_words = {}
    for position, word in enumerate(fdist.keys()):
        if position >= lowest_rank:
            break
        key = word.lower()
        self.lower_words[key] = self.lower_words.get(key, 0) + fdist[word]

    ranked = sorted(self.lower_words.items(), key=itemgetter(1), reverse=True)
    return [pair[0] for pair in ranked]
def bag_of_words(data, label_codebook, feature_codebook, theta):
    """Turn labelled documents into binary bag-of-words vectors.

    Every key of ``data`` is added to ``label_codebook``.  The distinct
    non-stopword tokens of all documents are collected, the first ``theta``
    entries of the FreqDist key list built from them are skipped, and the
    remaining words are added to ``feature_codebook``.  Each document then
    becomes a vector with a 1. at the index of every feature word it
    contains.

    Returns a dict mapping each label to its list of vectors.
    """
    word_dict = Alphabet()
    stopset = set(stopwords.words('english'))
    for key, value in data.items():
        label_codebook.add(key)
        for doc in value:
            for word in set(nltk.regexp_tokenize(doc, pattern="\w+")):
                if word not in stopset:
                    word_dict.add(word)

    all_words = word_dict._label_to_index.keys()
    fdict = FreqDist([w for w in all_words])
    selected = frozenset(fdict.keys()[theta:])
    for word in all_words:
        if word in selected:
            feature_codebook.add(word)

    instance_list = {}
    for label, document_list in data.items():
        vectors = []
        for document in document_list:
            vector = np.zeros(feature_codebook.size())
            for word in set(nltk.regexp_tokenize(document, pattern="\w+")):
                if feature_codebook.has_label(word):
                    vector[feature_codebook.get_index(word)] = 1.
            vectors.append(vector)
        instance_list[label] = vectors
    return instance_list
def create_word_freq(db):
    """Rebuild the ``concepted`` edges between concepts and posts.

    All existing ``concepted`` edges are deleted.  Then, for every post in
    ``db.Posts``, the cleaned body is tokenized and each token whose
    lower-cased form is a known concept name gets an edge from that concept
    to the post, with the token's count as ``strength``.  A failing edge
    creation is skipped (best effort).
    """
    db = getattr(db, "Posts")
    client.command("DELETE EDGE concepted")
    data = db.find().batch_size(50)
    concept = client.command("SELECT name FROM concept")
    # A set gives constant-time membership tests inside the token loop.
    concept_names = set(row.name for row in concept)

    for d in data:
        if 'Body' not in d:
            display = ''
        else:
            display = cleanhtml(
                d['Body'].replace('\n', ' ').replace('\r', '').replace('\\', ''))
        tokens = nltk.word_tokenize(display)
        fdist = FreqDist(tokens)
        for token, count in fdist.most_common():
            name = token.lower()
            if name not in concept_names:
                continue
            # NOTE(review): the statement is built by string formatting; a
            # concept name containing a quote would break it -- confirm how
            # names are constrained.
            statement = (
                "CREATE EDGE concepted FROM (SELECT FROM concept WHERE name = '{0}') TO (SELECT FROM Content WHERE PostId = {1}) SET strength = {2}"
            ).format(name, d['_id'], count)
            try:
                client.command(statement)
            except Exception:
                # Still best effort, but no longer swallows
                # KeyboardInterrupt / SystemExit as the bare except did.
                continue
def process_tweets(hashtag, addl_stops=None):
    """Collect the non-stopword words of all tweets stored for *hashtag*.

    Every file in the current directory whose name starts with ``hashtag``
    is read and split into tweets; tweets whose text contains a ``\\u``
    escape are skipped.  A frequency plot of the 50 most common words is
    shown and a summary line is printed.

    :param hashtag: file-name prefix of the chunked tweet files
    :param addl_stops: optional extra stopwords, added to the module-level
        ``stops``
    :returns: list of the lower-cased words that were kept
    """
    count = 0
    good_count = 0
    words_to_plot = []
    # Build the exclusion set once; it used to be rebuilt for every word.
    excluded = set(stops) | set(addl_stops or ())

    # Iterate through all chunked files with relevant hashtag
    for fname in os.listdir(os.getcwd()):
        if not fname.startswith(hashtag):
            continue
        with open(fname, 'r') as data_file:
            data = data_file.read()
        # Parse raw string since json.load() approach wasn't working
        for tweet in data.split("\n\x00,"):
            count += 1
            # Tweets have a well-defined structure, so we can parse them
            # manually (even though the JSON approach would be cleaner)
            text = tweet[tweet.find("text\":")+7:tweet.find(",\"source\"")-1]
            # Skip tweets that contain Unicode escapes
            if text.find('\\u') >= 0:
                continue
            good_count += 1
            # Tokenize and keep non-stopwords, ignoring case
            for w in word_tokenize(text):
                lowered = w.lower()
                if lowered not in excluded:
                    words_to_plot.append(lowered)

    # Create frequency histogram of 50 most common words and print summary
    # of activity
    fdist = FreqDist(words_to_plot)
    fdist.plot(50)
    print("for "+hashtag+' we collected '+str(count)+' tweets out of which '+str(good_count)+" will be analyzed")
    return words_to_plot
def featureset(sample):
    """Build the feature dict for one ``(comment, label)`` sample.

    ``comment`` is a sequence of statements, each a sequence of
    ``(word, tag)`` pairs.  The features are the average statement length
    in words, the average word length, the total character count, the
    relative frequency of every entry of EN_STOPWORDS, a constant
    ``alwayson`` feature, and word / tag / character similarity scores for
    every language in LANGUAGES and every gram size from 1 to the global
    ``n``.

    Returns ``(features, label)``.
    """
    comment, label = sample

    # Flatten the statements into one list of words, dropping the tags.
    words = [w for statement in comment for (w, t) in statement]
    size_ = sum(len(word) for word in words)

    features = {}
    features['stmt_len'] = len(words)/float(len(comment))
    features['word_len'] = size_/float(len(words))
    features['size'] = size_

    dist = FreqDist([word.lower() for word in words])
    word_total = float(len(words))
    for word in EN_STOPWORDS:
        features[word] = dist.get(word, 0)/word_total
    features['alwayson'] = 1.0

    for language in LANGUAGES:
        for i in range(1, n+1):
            word_sim, tag_sim, char_sim, w_s_sim = comment_similarity(
                GRAMS[language], comment, i)
            features['w_sim_%d_%s' % (i, language)] = word_sim
            features['t_sim_%d_%s' % (i, language)] = tag_sim
            features['c_sim_%d_%s' % (i, language)] = char_sim
    return (features, label)
def getTopNFreqWords(textArr,N):
    """Return the N most common items of textArr, in ``most_common`` order."""
    return [entry[0] for entry in FreqDist(textArr).most_common(N)]
def posAnalysis(collection):
    """Store a POS-tag histogram on every review of *collection*.

    Each review's text is sentence- and word-tokenized and tagged; the tag
    counts are written back to the document's ``tags`` field.  Reviews whose
    existing ``tags`` already contain an ``NN`` entry are skipped.
    """
    reviews = collection.find(timeout=False)
    __reportProgress.counter = 0
    skip = 1
    for rev in reviews:
        # Progress output every time the skip counter hits a multiple of 200.
        if skip%200 == 0:
            print 'skip'+str(skip)
        __reportProgress()
        if rev.has_key('tags'):
            skip += 1
            # Already tagged with at least one NN: nothing to do.
            if rev['tags'].has_key('NN'):
                continue
        sents = sent_tokenize(rev['text'])
        tokens = [word for sent in sents for word in word_tokenize(sent)]
        # Tokens that are substrings of ',.-$" ' are dropped before tagging.
        pos = tagger.tag([tok for tok in tokens if tok not in ',.-$\" '])
        tag_fd = FreqDist(tag for (word, tag) in pos)
        tags = dict()
        for (key,value) in tag_fd.items():
            # '$' is replaced in the stored key (presumably because it is
            # not accepted in field names -- confirm).
            k = key.replace('$','S')
            # Tags made up only of punctuation are not stored.
            out = key.translate(string.maketrans("",""), string.punctuation)
            if len(out)>0:
                tags[k] = value
        collection.update({'_id':rev['_id']},{"$set": {"tags": tags}})
def transmit_vocabulary(t_token, t_lang):
    """Compute the vocabulary of ``csv/<t_token>.csv`` and post it.

    The second tab-separated column of every line is tokenized and
    lower-cased.  The 1000 most common words, minus the stopwords of
    ``t_lang`` (when that language is supported), are serialised as a JSON
    array of ``[word, frequency]`` pairs and sent to the result endpoint.
    The CSV file is deleted afterwards.  Lines with no second column are
    skipped.
    """
    languages = ['danish', 'dutch', 'english', 'finnish', 'french', 'german',
                 'hungarian', 'italian', 'norwegian', 'portuguese', 'russian',
                 'spanish', 'swedish', 'turkish']
    voc_stopwords = set()
    if t_lang in languages:
        voc_stopwords = set(stopwords.words(t_lang))

    csv_path = 'csv/' + t_token + '.csv'
    all_tweets = []
    # Close the file before it is removed below; it used to stay open.
    with codecs.open(csv_path, 'r', 'utf-8') as i_f:
        for line in i_f:
            row = line.split('\t')
            if len(row) < 2:
                continue
            all_tweets.extend(w.lower() for w in word_tokenize(row[1]))

    freq_distribution = FreqDist(all_tweets)
    # json.dumps escapes quotes and backslashes inside words; the previous
    # hand-built string produced invalid JSON for such tokens.
    cats_vocabulary_elements = [
        json.dumps([word, frequency], ensure_ascii=False)
        for word, frequency in freq_distribution.most_common(1000)
        if word not in voc_stopwords
    ]
    cats_vocabulary = '[' + ','.join(cats_vocabulary_elements) + ']'
    print(cats_vocabulary)

    result_data = {'token': t_token, 'result': cats_vocabulary}
    json_data = json.dumps(result_data)
    results_request = urllib2.Request('http://mediamining.univ-lyon2.fr/cats/module/resultFile')
    results_request.add_header('Content-Type', 'application/json')
    results_request.data = json_data.encode('utf-8')
    urllib2.urlopen(results_request)
    print('Transmitted vocabulary for token '+t_token)
    os.remove(csv_path)
# Read and tokenize the two input files.  The context managers close each
# file as soon as it has been read; the handles were previously left open.
with open("Preprocessed_BNC.txt", "r") as BNC_file:
    BNC_words = word_tokenize(BNC_file.read())
with open("Preprocessed_CS.txt", "r") as CS_file:
    CS_words = word_tokenize(CS_file.read())

# filter words to remove punctuation
filter_words = [
    ' ', '?', '!', ',', ';', ':', '-', '--', '---', '(', ')', '{', '}', '[',
    ']', "'", '"', '.', '`', '·', '``', '~', "''"
]

# Filter both datasets to remove punctuation
BNC_filtered_word = [w for w in BNC_words if w not in filter_words]
CS_filtered_word = [w for w in CS_words if w not in filter_words]

# compute the frequency distribution of both datasets
BNC_frequency_Dist = FreqDist(BNC_filtered_word)
CS_frequency_Dist = FreqDist(CS_filtered_word)

BNC = BNC_frequency_Dist.most_common()
CS = CS_frequency_Dist.most_common()

BNC_final_list = find_range_word(BNC)
CS_final_list = find_range_word(CS)

print_list_word(BNC_final_list, 'B_bnc_output.txt')
print_list_word(CS_final_list, 'B_cs_output.txt')
def good_turing_trigram_model(data):
    """Return a SimpleGoodTuringProbDist built from the frequency
    distribution of the samples in *data*."""
    return SimpleGoodTuringProbDist(FreqDist(data))
# Collect, for every n-gram size from 1 to max_n, the most common n-grams of
# "Emma" that are not entirely made of words already covered by common
# shorter n-grams.
stopwords = set(stopwords.words('english'))
common_count = 100
max_n = 3
sentences = [cleaner(s) for s in gutenberg.sents('austen-emma.txt')]
join_item = lambda x: ' '.join(x)
common_items = {}
for n in range(1, max_n+1):
    # Gather every n-gram of every sentence longer than one token.
    all_text = []
    for sentence in [s for s in sentences if len(s)>1]:
        grams = ngrams(sentence, n)
        for gram in grams:
            all_text.append(gram)
    cur_common = [item[0] for item in FreqDist(all_text).most_common(common_count)]
    if n==1:
        common_items[n] = [join_item(item) for item in cur_common]
    else:
        common_items[n] = []
        for item in cur_common:
            # tmp flags each word of the n-gram that belongs to a common
            # shorter n-gram.
            tmp = {x : 0 for x in item}
            for x in range(1,n):
                for gram in ngrams(item, x):
                    if join_item(gram) in common_items[x]:
                        for word in gram:
                            tmp[word] = 1
            # The product is zero when at least one word is not flagged;
            # only such n-grams are kept.
            if not reduce(lambda x, y: x*y, tmp.values()):
                common_items[n].append(join_item(item))
        common_items[n] = sorted(common_items[n])
# (the body of this loop lies outside the visible excerpt)
for n, val in enumerate(common_items.values()):
'ADJ_SAT': 's', 'ADV': 'r', 'NOUN': 'n', 'VERB': 'v' } # We'll use the reuters corpus in NLTK. # The same steps of preprocessing can be done on documents read in from external files. # How many files are there in the corpus? # What are their categories? Single or multiple categories for one file? len(reuters.fileids()) # cats = [ reuters.categories(f) for f in reuters.fileids() ] # for every file in retuers file reuter list show categories for each of the articles and keep in cats cat_num = [len(c) for c in cats] fd_num = FreqDist(cat_num) fd_num.plot() # How many documents are there in each category? # FreqDist() can be used to find the answer, but we need to flatten the list of categories first. cats_flat = [ c for l in cats for c in l ] # cats contains list of lists ,so flattens each list inside the list fd_cat = FreqDist(cats_flat) fd_cat fd_cat.most_common(20) # Let's pick two categories and visualize the articles in each category using word cloud grain = reuters.fileids('grain') trade = reuters.fileids('trade') #tokenised document
# Tokenize the pre-processed story and peek at the first tokens.
preprocessedStory = preprocess(storytext)
tokens = nltk.word_tokenize(preprocessedStory)
print(tokens[:20])


def lexical_diversity(text):
    """Return the number of distinct items in *text* divided by its length."""
    distinct = set(text)
    return len(distinct) / len(text)


# Notebook-style inspections (the values are not stored).
lexical_diversity(tokens)
len(tokens)
len(set(tokens))

# Frequency distribution of all tokens, with a cumulative plot of the top 50.
fdist1 = FreqDist(tokens)
print(fdist1)
fdist1.plot(50, cumulative=True)

# Same inspections after removing English stopwords.
from nltk.corpus import stopwords
stop = stopwords.words('english')
remstop = [tok for tok in tokens if tok not in stop]
remstop[:20]
len(remstop)
len(set(remstop))
lexical_diversity(remstop)
def takeSecond(elem):
    """Return the second item of *elem* (usable as a sort key)."""
    return elem[1]


if __name__ == '__main__':
    # list of sub-lists, one per sentence, each holding the words of the
    # sentence padded with <s> and </s> for sentence start and end
    preprocessed = [
        pad_both_ends(s.strip(' ').split(' '), n=2) for s in sentences('book')
    ]
    # flatten() pulls the elements out of the sub-lists of preprocessed to
    # build one flat list
    tokens = list(flatten(preprocessed))
    fd = FreqDist(tokens)  # unigram (dict with each word and its frequency)
    model = bigrams(tokens)  # bigram model
    cfd = ConditionalFreqDist(
        model
    )  # bigram (dict of sub-dicts: keys = next words, values = their frequency)
    corpus = {}  # dict of corpus
    traincorpus(
    )  # make a json file of training of the bigram model from the corpus file (book)
    # [print(s, o) for s, o in fd.items()]
    nex = '<s>'
    sentence = []
    # (the body of this loop lies outside the visible excerpt)
    while nex != '</s>':
from sklearn.metrics import r2_score, make_scorer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist
from nltk.corpus import brown, wordnet
from scipy.stats import pearsonr, spearmanr
from matplotlib import pyplot as plt

# Shared NLP resources used by the rest of the module.
wordnet_lemmatizer = WordNetLemmatizer()
stopwords = set(nltk.corpus.stopwords.words("english"))
tagger = nltk.tag.pos_tag

# Lower-cased word frequencies over the Brown corpus and their grand total.
frequency_list = FreqDist(i.lower() for i in brown.words())
all_words_count = sum(frequency_list.values())


def get_words(sentence):
    """Split *sentence* on single spaces and strip '.', ',' and ' ' from both ends of each piece."""
    pieces = sentence.split(' ')
    return [piece.strip('., ') for piece in pieces]


# Load the embeddings: the first field of each line is the key, the remaining
# fields are parsed as floats.
with open('word_to_vec', 'r') as f:
    embeddings = {}
    for line in f:
        args = get_words(line.strip("\n\t "))
        key, values = args[0], args[1:]
        embeddings[key] = [float(value) for value in values]
# Pass every sample tweet through remove_noise and collect the cleaned token
# lists separately for the positive and the negative tweets.
positive_cleaned_tokens_list = [
    remove_noise(tweet, stop_words) for tweet in positive_tweet_tokens
]
negative_cleaned_tokens_list = [
    remove_noise(tweet, stop_words) for tweet in negative_tweet_tokens
]

all_pos_words = get_all_words(positive_cleaned_tokens_list)

# Count the words of the positive sample tweets and print the most common
# ones; this gives a quick visual idea of the features the algorithms see.
freq_dist_pos = FreqDist(all_pos_words)
print("The most common 10 words:")
print(freq_dist_pos.most_common(10))

positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

# Attach the class label to every tweet feature dict.
positive_dataset = [
    (features, "Positive") for features in positive_tokens_for_model
]
negative_dataset = [
    (features, "Negative") for features in negative_tokens_for_model
]
dataset = positive_dataset + negative_dataset

# shuffle the data set to avoid a biased train and test set; this ensures that we run our algorithm on a random data set
sentence_list_nights.append((file, len(corpus_nightsII.sents(file)))) sentence_dic_nights[file] = len(corpus_nightsII.sents(file)) sentence_dic_nights = collections.OrderedDict(sentence_dic_nights) # we make sure that the order of the data stays the same # Which night has the most sentences? for file, characters in sentence_list_nights: if characters == max(sentence_dic_nights.values()): print(file, characters) # the Eight Hundred and Forty-fifth.txt => 399 # In the following block of code, we calculate what the average word length is in each night dict_word_length = {} for file in corpus_nightsII.fileids(): text = corpus_nightsII.words(file) x = [len(words) for words in text] fdist = FreqDist(x) dict_word_length[file] = fdist.max() print(dict_word_length) # We now calculate the readability for each file. We do this by using the Automated Readability Index (ARI). stat_list = [] x = word_dic_nights.keys() for name in x: n_char = char_dict_night[name] n_words = word_dic_nights[name] n_sents = sentence_dic_nights[name] stat_list.append((name, n_char, n_words, n_sents)) print(stat_list) def ARI(n_char, n_words, n_sents):
output.append([token.lemma_ for token in doc if token.pos_ in tags]) return output tokenized_reviews = pd.Series(review).apply(lambda x: x.split()) reviews_2 = lemmatization(tokenized_reviews) reviews_3 = [] for i in range(len(reviews_2)): reviews_3.append(' '.join(reviews_2[i])) review = reviews_3 # 提取所有单词 list_data = [] for i in review: list_data += i.lower().split() # 词频统计 fdist = FreqDist(list_data) words_df = pd.DataFrame({ 'word': list(fdist.keys()), 'count': list(fdist.values()) }) # 词频可视化 d = words_df.nlargest(columns="count", n=20) plt.figure(figsize=(15, 5)) ax = sns.barplot(data=d, x="word", y="count") ax.set(ylabel='Count') plt.show() # LDA主题建模 import pyLDAvis import pyLDAvis.gensim import gensim from gensim import corpora
allwords.append(j.lower()) # print allwords for i in range(len(allwords)): allwords = [re.sub(r'[^\w\s]', '', s) for s in allwords] allwords = set(allwords) allwords = list(allwords) y = np.zeros(len(allwords)) # print allwords for i in range(len(allwords)): try: y[i] = int((complexity[allwords[i]])) except: y[i] = 0 # print y fdist = FreqDist(brown.words()) x = [] for i in range(len(allwords)): x.append([]) for i in range(len(allwords)): x[i].append(fdist.freq(allwords[i])) x[i].append(len(allwords[i])) x[i].append(synobj.synCount(allwords[i])) x[i].append(ww.wdweight(allwords[i])) x[i].append(vc.vCount(allwords[i])) x[i].append(synobj.len_of_synonyms(allwords[i])) classifier = DecisionTreeClassifier() classify = classifier.fit((x[0:int(len(x) * 0.8)]), y[0:int(len(y) * .8)]) ypred = classifier.predict(XTest)
for index, item in market_basket_0.loc[i].items(): if item != 0: temp_set.add(item) transactions.append(temp_set) # print(transactions) # 提取所有产品并按照索引依次堆叠组成list,索引组成list indexid = [] item = [] for m in range(len(transactions)): for x in transactions[m]: indexid.append(m) item.append(x) # 计算词频 fre = FreqDist(item) print(fre.most_common(10)) # 生成list list_fre = list(fre.most_common(10)) list_item = [] list_count = [] for x in list_fre: list_item.append(x[0]) list_count.append(x[1]) # 频率分布图 fre.tabulate(10) fre.plot(10) # 饼图..
from nltk import FreqDist
from common.books import text1

# Distribution of word lengths over the tokens returned by text1().
word_lengths = (len(w) for w in text1())
fdist = FreqDist(word_lengths)

print(fdist)
# print(fdist.keys())
# print(fdist.items())

# Every (length, count) pair, the most frequent length, then the count and
# relative frequency of length 3.
print(fdist.most_common())
print(fdist.max())
print(fdist[3])
print(fdist.freq(3))
# In[15]:

tokens = nltk.word_tokenize(raw)
type(tokens)

# In[16]:

# Lower-case every token (list comprehension)
words1 = [w.lower() for w in tokens]
# only keep text words, no numbers
words2 = [w for w in words1 if w.isalpha()]

# In[17]:

# Word counts, sorted from most to least frequent.
freq = FreqDist(words2)
sorted_freq = sorted(freq.items(), key=lambda k: k[1], reverse=True)
sorted_freq

# In[31]:

freq.plot(30)

# In[32]:

# Import the corpus reader under its own name: rebinding `stopwords` to the
# word list would otherwise shadow the reader and make this cell fail with
# AttributeError when it is run a second time.
from nltk.corpus import stopwords as stopwords_corpus

stopwords = stopwords_corpus.words('english')

# In[33]:

# Test membership against a set (O(1)) rather than the list itself.
stopword_set = set(stopwords)
words_nostopwords = [w for w in words2 if w not in stopword_set]
import nltk
from nltk import FreqDist, NaiveBayesClassifier
from nltk.corpus import movie_reviews
import random
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# One (word list, category) pair per review in the movie_reviews corpus.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

all_words = FreqDist(w.lower() for w in movie_reviews.words())
# Take the 2000 MOST FREQUENT words. Iterating a FreqDist (a Counter subclass
# in NLTK 3) yields keys in insertion order, so `list(all_words)[:2000]` would
# pick the first 2000 distinct words encountered instead.
word_features = [word for word, _count in all_words.most_common(2000)]

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


def document_features(document):
    """Build the feature dict for one document.

    :param document: iterable of the document's words.
    :return: dict with a ``'contains(<word>)'`` boolean for every entry of
        ``word_features`` and, for feature words longer than two characters,
        a ``'synset(<lemma>)'`` boolean for every single-word WordNet lemma
        of that word, telling whether the lemma occurs in the document.
    """
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
        if len(word) <= 2:
            continue
        for synset in wn.synsets(word):
            for lemma in synset.lemma_names():
                # Multi-word lemmas are joined with "_" and cannot match a
                # single token, so they are skipped.
                if "_" not in lemma:
                    features['synset({})'.format(lemma)] = (
                        lemma in document_words)
    return features
def create_training_examples(statement_list: List[List[str]],
                             trees,
                             training_spans,
                             training: bool,
                             max_span_length,
                             num_sentence_words,
                             args,
                             pos_tags,
                             constituents,
                             k=1):
    """
    Define a training set for span prediction.

    Computes features (inputs) for every candidate span. Multiple spans are
    extracted for a given statement, and a feature vector is computed for each.

    :input statement_list: list of token lists.
    :input trees: parse tree list, aligned with statement_list
    :input training_spans: per-statement positive span, compared with ``==``
        against each candidate span (only read when ``training`` is True).
    :training: bool, indicates whether annotation (True/False) is computed.
    :input max_span_length, num_sentence_words: passed to ``legal_spans`` to
        enumerate candidate spans.
    :input args: object whose ``data_path`` is passed to ``get_science_terms``.
    :input pos_tags: list of POS tags; the position of a tag is its feature index.
    :input constituents: list of constituent labels; position is the feature index.
    :k: int, number of negative span examples per positive. Default: 1 (1:1 ratio)
    :return: tuple ``(all_examples, all_labels, examples_per_sentence,
        span_indexes)``: sampled false + true feature vectors, their 0/1
        labels, every candidate example per sentence (``(vector, bool)`` pairs
        when training, bare vectors otherwise), and the candidate spans per
        sentence.
    """
    print('building features...')
    word_frequencies = FreqDist([w.lower() for w in reuters.words()])

    # list of science tokens & science multiword expressions
    science_tokens = get_science_terms(args.data_path)
    science_expressions = get_science_terms(args.data_path, False)

    stop_words = set(stopwords.words('english'))

    true_examples, false_examples, examples_per_sentence = [], [], []
    span_indexes = []

    # loop over all statements
    for i, statement in enumerate(statement_list):
        print(i)
        span_index = []  # enumerating all spans for this sentence.
        pos_tags_this_statement = pos_tag(statement)
        sentence_word_frequencies = [
            word_frequencies.freq(token.lower()) for token in statement
        ]
        tree = trees[i]
        false_examples_this_instance = []
        sentence_candidate_span_examples = []  # when training.

        # loop across different spans for given sentence
        for span in legal_spans(num_sentence_words,
                                max_span_length):  # globally legal
            if span[1] > len(statement):
                continue
            if span[0] > len(statement):
                break
            span_index.append(span)

            # extract scalar features for this span. [position will not be used]
            f_bias = 1
            f_length = span[1] - span[0]
            #f_begin = span[0]
            #f_end = span[1]
            #f_dist_to_end0 = len(statement) - span[0]
            #f_dist_to_end1 = len(statement) - span[1]

            # list of tokens of this span
            span_tokens = statement[span[0]:span[1]]

            # feature: span contains at least one science token
            f_science_token = bool(
                set(span_tokens).intersection(science_expressions))

            f_science_token_count = 0  # counting # of science tokens in span
            max_token_length = 0  # in this span.
            for token in span_tokens:
                f_science_token_count += int(token in science_tokens)
                max_token_length = max(max_token_length, len(token))
            f_max_token_length = np.log(max_token_length)

            # feature: relative word frequency average
            # with numerical stability/ avoiding -inf
            f_avg_word_frequency = 1e-10 + np.mean(
                sentence_word_frequencies[span[0]:span[1]])
            f_avg_word_frequency = np.log(f_avg_word_frequency)

            # feature: begin with stop word?
            f_stop_word_begin = bool(span_tokens[0] in stop_words)

            # POS indicator (one-hot)
            f_pos = np.zeros([len(pos_tags)])
            # Bag-of-POS-tags for this span.
            for token, tag in pos_tags_this_statement[span[0]:span[1]]:
                f_pos[pos_tags.index(tag)] += 1.0

            # feature: POS indicator for span beginning
            f_pos_beginning = np.zeros([len(pos_tags)])
            f_pos_beginning[pos_tags.index(
                pos_tags_this_statement[span[0]][1])] = 1.0

            # feature: POS indicator for span end
            f_pos_end = np.zeros([len(pos_tags)])
            f_pos_end[pos_tags.index(
                pos_tags_this_statement[span[1] - 1][1])] = 1.0

            # feature: POS bigram indicator
            # define extended POS tag set with additional begin and end symbols for bigrams.
            # pos_tags_bigram = pos_tags + ["POS_BEGIN", "POS_END"] # for POS bigrams.
            # pos_tags_square = [x for x in product(pos_tags_bigram, pos_tags_bigram)]
            # f_pos_bigram = np.zeros([len(pos_tags_square)])
            # obtaining the POS bigram
            # for position in range(-1, f_length):
            # boundary cases: start of span and end of span.
            #     if position == -1:
            #         tag1 = 'POS_BEGIN'
            #         _, tag2 = pos_tags_this_statement[span[0]]
            #     elif position == f_length -1:
            #         _, tag1 = pos_tags_this_statement[span[0]+position]
            #         tag2 = 'POS_END'
            #     #normal case: inside span.
            #     else:
            #         _, tag1 = pos_tags_this_statement[span[0] + position]
            #         _, tag2 = pos_tags_this_statement[span[0] + position + 1]
            #
            #     f_pos_bigram[pos_tags_square.index( ( tag1, tag2 ) )] += 1.0

            # constituent tree features
            tree_position = tree.treeposition_spanning_leaves(span[0], span[1])

            # smallest subtree in constituent parse, containing this span.
            smallest_subtree = tree[tree_position[:-1]]
            constituent_tag = smallest_subtree.label()

            # feature: is this span a constituent parse subtree span?
            f_span_match = bool(span[1] - span[0] == len(smallest_subtree))

            # constituency parse label indicator
            f_span_constituent = np.zeros([len(constituents)])
            f_span_constituent[constituents.index(constituent_tag)] = 1.0

            # constituency parse label indicator with indication for large spans.
            f_span_constituent_big = np.zeros([len(constituents)])
            f_span_constituent_big[constituents.index(constituent_tag)] = (
                f_length > 2)

            # leave out position features:
            #### f_begin, f_end, f_dist_to_end0, f_dist_to_end1,

            # now collect all features:
            f_scalars = np.array([
                f_bias, f_span_match, f_length, f_science_token,
                f_avg_word_frequency, f_stop_word_begin, f_max_token_length,
                f_science_token_count
            ])

            # these are all features for this span, in a np array.
            feature_vector = np.concatenate(
                (f_scalars, f_pos, f_pos_beginning, f_pos_end,
                 f_span_constituent, f_span_constituent_big))

            # provide True/False annotation in case the data is used for training.
            if training:
                if span == training_spans[i]:
                    # positive example
                    true_examples.append(feature_vector)
                    sentence_candidate_span_examples.append(
                        (feature_vector, True))
                else:
                    # negative example
                    false_examples_this_instance.append(feature_vector)
                    sentence_candidate_span_examples.append(
                        (feature_vector, False))
            else:
                sentence_candidate_span_examples.append(feature_vector)

        span_indexes.append(span_index)
        examples_per_sentence.append(sentence_candidate_span_examples)

        # select at random k negative spans as training examples. default 1:1
        # Skip sentences without any negative candidate: randint(0, 0, k)
        # would raise ValueError because the sampling range is empty.
        if training and false_examples_this_instance:
            for random_index in np.random.randint(
                    0, len(false_examples_this_instance), k):
                false_examples.append(
                    false_examples_this_instance[random_index])

    print(len(true_examples), 'True span examples.')
    print(len(false_examples), 'False span examples.')

    # collect true and false examples [inputs]
    # An empty list converts to a 1-D array that cannot be concatenated with
    # the 2-D array of the other class, so only non-empty groups are joined.
    example_groups = [
        np.asarray(group) for group in (false_examples, true_examples) if group
    ]
    if example_groups:
        all_examples = np.concatenate(example_groups)
    else:
        all_examples = np.asarray([])

    # collect annotations for each example (True/False target outputs)
    false_span_labels = np.zeros([len(false_examples)])
    true_span_labels = np.ones([len(true_examples)])
    all_labels = np.concatenate((false_span_labels, true_span_labels))

    return all_examples, all_labels, examples_per_sentence, span_indexes
print(vocab["barber"])

# Keep only the most frequent words and number them from 1 in that order.
vocab_size = 5
vocab = vocab.most_common(vocab_size)
vocab

word_to_index = {
    word: rank for rank, (word, frequency) in enumerate(vocab, start=1)
}
print(word_to_index)

#%% Using NLTK's FreqDist
from nltk import FreqDist
import numpy as np

# np.hstack flattens the list of sentences into one sequence of words.
vocab = FreqDist(np.hstack(sentences))
print(vocab["barber"])

vocab_size = 5
vocab = vocab.most_common(vocab_size)
vocab

word_to_index = {}
for index, word in enumerate(vocab):
    word_to_index[word[0]] = index + 1
print(word_to_index)

#%% Text preprocessing with Keras
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
print(tokenizer.word_index)
print(tokenizer.word_counts)
nlz_words3 = [ word for word in nlz_words2 if not (len(word) == 1 and not is_num(word)) ] return nlz_words3 text = data_collector.load_data('apple', 'keyword') label = stock_data.stock_price_label('AAPL', 14, 5) inputs = data_collector.merge_price_text(text, label).values nlz_inputs = [([word for word in normalizing(words)], tuple(label)) for (words, label) in inputs] inputs = [(words, tuple(label)) for (words, label) in inputs] all_words = list(itertools.chain(*[words for (words, _) in nlz_inputs])) fd = FreqDist(all_words) word_features = [word for (word, _) in fd.most_common(2000)] def features_contain(words): ''' A feature extractor whose features indicate whether or not individual words are present in a given words. return example: {'contain(apple)': True, 'contain(banana)': False, ...} :param words: A list of words :type words: List :return: Features that indicate whether or not individual words are present in a given words
from nltk import FreqDist

text = (
    'Hello ! This is a course designed for people who are interested in learning the core concepts of NLP and '
    'utilising those concepts to make applications to perform sentiment analysis analysis'
)

# FreqDist takes a list of tokens as input: split the text on single spaces.
text_list = text.split(' ')
print(text_list)

freqDist = FreqDist(text_list)

# The distinct tokens, then the count of one specific token.
words = list(freqDist)
print(words)
print(freqDist['analysis'])
def frequent(context):
    """Return a ``FreqDist`` built from the items of *context*."""
    return FreqDist(context)
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json') negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json') positive_cleaned_tokens = [] negative_cleaned_tokens = [] for tokens in positive_tweet_tokens: positive_cleaned_tokens.append(remove_noise(tokens, stop_words)) for tokens in negative_tweet_tokens: negative_cleaned_tokens.append(remove_noise(tokens, stop_words)) all_pos_words = get_all_words(positive_cleaned_tokens) # Get the frequency of words. freq_dist_pos = FreqDist(all_pos_words) def get_tweets_for_model(cleaned_tokens_list): for tweet_tokens in cleaned_tokens_list: yield dict([token, True] for token in tweet_tokens) # Convert list of words to dictionary with words as keys and True as values positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens) negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens) # Attatch a positive or negative label to each tweet. positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model] negative_dataset = [(tweet_dict, "Negative")
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

# In[29]:

# Clean both token collections (normalisation + stop word removal)
positive_cleaned_tokens = remove_noise(positive_tweet_tokens)
negative_cleaned_tokens = remove_noise(negative_tweet_tokens)

# In[30]:

# Frequency distribution of the words of each class, ten most common printed
positive_freq_dist = FreqDist(get_all_words(positive_cleaned_tokens))
negative_freq_dist = FreqDist(get_all_words(negative_cleaned_tokens))
for class_freq_dist in (positive_freq_dist, negative_freq_dist):
    print(class_freq_dist.most_common(10))

# In[31]:

# Put the data into the format NLTK requires and merge both classes
positive_dataset = get_dataset_from_tokens(positive_cleaned_tokens, "Positive")
negative_dataset = get_dataset_from_tokens(negative_cleaned_tokens, "Negative")
dataset = positive_dataset + negative_dataset

# In[32]:
def cptj(x):
    """Word-frequency statistics: return a ``FreqDist`` counting the items of *x*."""
    from nltk import FreqDist

    return FreqDist(x)
#!/usr/bin/python3 # coding: utf-8 import nltk from nltk.corpus import gutenberg # 导入 gutenberg 集 ################################################################## ## FreqDist 跟踪分布中的采样频率 (sample frequencies) from nltk import FreqDist # 导入 FreqDist 类 fd = FreqDist( gutenberg.words('austen-persuasion.txt')) # 频率分布实例化, 统计文本中的 Token print( fd ) # <FreqDist with 51156 samples and 2621613 outcomes>; 可以得到 51156 个 不重复值, 2621613 个 token print(type(fd)) # <class 'nltk.probability.FreqDist'> print(fd['the']) # 3120; 查看 word 出现次数; 默认 FreqDist 是一个字典 print(fd.N()) # 98171; 是单词, 不是字母, 有重复的 print(fd.B() ) # 6132; number of bins or unique samples; 唯一单词, bins 表示相同的会在一个 bin 中 print(len(fd.keys()), type(fd.keys())) # 6132 <class 'dict_keys'> print(fd.keys()) # fd.B() 只是输出个数, 这个是把所有词汇表输出 print(fd.max()) # 频率最高的一个词 print(fd.freq('the')) # 0.03178127960395636; 出现频率 3120 / 98171 print(fd.hapaxes()) # ['[', 'Persuasion', 'Jane', ...] 只出现一次的罕用词 # 出现频率最高的大多是一些"虚词", 出现频率极低的(hapaxes)又只能靠上下文来理解; 文本中出现频率最高和最低的那些词往往并不能反映这个文本的特征 for idx, word in enumerate(fd): # 可以用 enumerate 来遍历, 是按出现顺序排的 if idx == 5: break print(idx, word) # 0 [; 1 Persuasion; 2 by; 3 Jane; 4 Austen ################################################################## ## 统计词的长度频率 fdist = FreqDist(len(w) for w in gutenberg.words('austen-persuasion.txt')) print(fdist) # <FreqDist with 16 samples and 98171 outcomes> print(
from nltk.corpus import brown

# A few ways of selecting material from the Brown corpus.
print(brown.categories())
print(brown.words(categories='news'))
print(brown.words(fileids=['cg22']))
print(brown.sents(categories=['news', 'editorial', 'reviews']))

from nltk import FreqDist

# Count the lower-cased words of the news category ...
news = brown.words(categories='news')
fdist = FreqDist(w.lower() for w in news)

# ... and report how often each modal verb occurs.
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    count = fdist[m]
    print('{0}: {1}'.format(m, count))
def lesk_text(t, s):
    """Return the number of distinct words shared by two texts.

    LESK applied between two texts, so that it works generically: both texts
    go through ``prep_text``, are split on whitespace and counted; the result
    is the size of the intersection of the two frequency distributions.
    """
    first_counts = FreqDist(prep_text(t).split())
    second_counts = FreqDist(prep_text(s).split())
    shared = first_counts & second_counts
    return len(shared)
texts[:300] # 불용어 제거 with open('./stopwords.txt', 'r', encoding='UTF-8') as f: stopwords = f.read() stopwords = stopwords.split(' ') stopwords[:10] texts = [text for text in texts if text not in stopwords] # 원본에서 불용어 파일에 존재하지 않는 단어들만 추출하라 import pandas as pd from nltk import FreqDist freqtxt = pd.Series(dict(FreqDist(texts))).sort_values(ascending=False) freqtxt[:25] # 판다스를 활용하여 상위 빈도 단어를 추출한다 from konlpy.tag import Okt # stem 어간.. 의미를 가지는 단어 # tag 문법.. 명사, 동사, ... okt.pos('가치창출') okt.pos('갤럭시') # 워드 클라우드 출력 from wordcloud import WordCloud wcloud = WordCloud(ctx + "D2Coding.ttf", relative_scaling=0.2,
def load_data(source, dist, vocab_size=10000, limit=None):
    """Read a parallel corpus and convert both sides to word-index sequences.

    :param source: path of the source-side text file, one sequence per line.
    :param dist: path of the destination-side text file, line-aligned with
        ``source``.
    :param vocab_size: upper bound on each vocabulary, including the entries
        of ``EXTRA_SYMBOLS``.
    :param limit: if not None, keep only the first ``limit`` CHARACTERS of
        each raw text (the slice is applied to the raw string, not to lines).
    :return: ``(X, X_vocab_len, X_word_to_ix, X_ix_to_word,
        y, y_vocab_len, y_word_to_ix, y_ix_to_word)`` where ``X`` / ``y`` are
        lists of index sequences and words outside the vocabulary map to the
        index of ``'<UNK>'``.
    """
    from itertools import chain

    # Context managers close the files even if reading fails.
    with open(source, 'r') as f:
        X_data = f.read()
    with open(dist, 'r') as f:
        y_data = f.read()
    print('raw data read')

    if limit is not None:
        X_data = X_data[:limit]
        y_data = y_data[:limit]

    # Splitting raw text into array of sequences; a line pair is kept only
    # when both sides are non-empty so X and y stay aligned.
    line_pairs = [
        (x_line, y_line)
        for x_line, y_line in zip(X_data.split('\n'), y_data.split('\n'))
        if len(x_line) > 0 and len(y_line) > 0
    ]
    X = [text_to_word_sequence(x_line) for x_line, _ in line_pairs]
    y = [text_to_word_sequence(y_line) for _, y_line in line_pairs]

    def _build_vocab(sequences):
        # Most common words, leaving room for the extra symbols. Flattening
        # with chain (instead of np.hstack) also works for an empty corpus
        # and for sequences that tokenise to an empty list.
        counts = FreqDist(chain.from_iterable(sequences))
        vocab = counts.most_common(vocab_size - len(EXTRA_SYMBOLS))
        # Index-to-word list with the extra symbols at the beginning, and the
        # matching word-to-index dictionary.
        ix_to_word = EXTRA_SYMBOLS + [entry[0] for entry in vocab]
        word_to_ix = {word: ix for ix, word in enumerate(ix_to_word)}
        return vocab, ix_to_word, word_to_ix

    def _encode_in_place(sequences, word_to_ix):
        # Replace each word by its index; unknown words become '<UNK>'.
        for sentence in sequences:
            for j, word in enumerate(sentence):
                if word in word_to_ix:
                    sentence[j] = word_to_ix[word]
                else:
                    sentence[j] = word_to_ix['<UNK>']

    X_vocab, X_ix_to_word, X_word_to_ix = _build_vocab(X)
    _encode_in_place(X, X_word_to_ix)

    y_vocab, y_ix_to_word, y_word_to_ix = _build_vocab(y)
    _encode_in_place(y, y_word_to_ix)

    # NOTE(review): the "+2" is hard-coded while the vocabulary is trimmed by
    # len(EXTRA_SYMBOLS); if EXTRA_SYMBOLS does not have exactly two entries
    # these sizes differ from len(X_ix_to_word) / len(y_ix_to_word) - confirm.
    return X, len(X_vocab)+2, X_word_to_ix, X_ix_to_word, \
        y, len(y_vocab)+2, y_word_to_ix, y_ix_to_word
def suggest():
    """Score the readability of a web page and render the result page.

    Query parameters (read from ``request.args``):

    * ``lang`` - ``'en'`` (default) or ``'fr'``; any other value is treated
      as ``'en'`` so the view always produces a response.
    * ``url`` - page to analyse, default ``https://www.canada.ca/en.html``.

    The total score (0-100) is the sum of up to 60 points for the
    Flesch-Kincaid score of the adjusted text, up to 20 points for the average
    number of words per heading and up to 20 points for the average number of
    words per paragraph / bulleted list.

    :return: the rendered ``read_score_en.html`` or ``read_score_fr.html``.
    """
    import nltk
    import requests
    from bs4 import BeautifulSoup
    from nltk import FreqDist
    from nltk.tokenize import RegexpTokenizer
    from readability import Readability

    # get language
    lang = request.args.get('lang', 'en')
    if lang not in ('en', 'fr'):
        lang = 'en'
    if lang == 'en':
        word_column_names = ['Count', 'Word']
    else:
        word_column_names = ['Nombre', 'Mot']

    # NOTE: runs on every request, as before.
    nltk.download('punkt')
    nltk.download('stopwords')

    # get url
    url = request.args.get('url', 'https://www.canada.ca/en.html')

    # get the html from the URL
    # NOTE(security): `url` is caller-supplied and fetched server-side without
    # any allow-list; restrict the accepted hosts if this endpoint is exposed.
    # The timeout keeps an unresponsive site from blocking the worker forever.
    html = requests.get(url, timeout=30).text

    # get the html content as text - get content from the "main" tag,
    # falling back to the whole document when the page has no <main>.
    original_soup = BeautifulSoup(html, features="lxml")
    original_main = original_soup.find('main')
    if original_main is not None:
        original_soup = original_main
    original_text = _clean_page_text(original_soup.get_text())

    # get initial readability total_score
    original_fk = Readability(original_text).flesch_kincaid()
    original_score = format(original_fk.score, '.2f')

    # add periods after bullet points and headings so that the Flesch Kincaid
    # score considers them as sentences
    revised_html = html
    for tag in ('li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        closing = '</' + tag + '>'
        revised_html = revised_html.replace(closing, '.' + closing)

    # get adjusted readability total_score (tables are left out)
    revised_soup = BeautifulSoup(revised_html, features="lxml")
    revised_main = revised_soup.find('main')
    if revised_main is not None:
        revised_soup = revised_main
    for table in revised_soup.select('table'):
        table.extract()
    revised_text = _clean_page_text(revised_soup.get_text())
    final_fk = Readability(revised_text).flesch_kincaid()

    # tokenize the text for processing
    tokens = RegexpTokenizer(r'\w+').tokenize(revised_text)
    words = [word.lower() for word in tokens]

    # remove stop words from the tokens to get only the meaningful words,
    # then get the 20 most used words in the text (English stop words)
    sw_en = set(nltk.corpus.stopwords.words('english'))
    words_ns_en = [word for word in words if word not in sw_en]
    most_common_en = FreqDist(words_ns_en).most_common(20)
    mc_en = pd.DataFrame(most_common_en, columns=['Word', 'Count'])
    mc_en = mc_en[['Count', 'Word']]

    # same with the French stop words; the columns are put in the same
    # count-then-word order as word_column_names so cells match their headers
    sw_fr = set(nltk.corpus.stopwords.words('french'))
    words_ns_fr = [word for word in words if word not in sw_fr]
    most_common_fr = FreqDist(words_ns_fr).most_common(20)
    mc_fr = pd.DataFrame(most_common_fr, columns=['Mot', 'Nombre'])
    mc_fr = mc_fr[['Nombre', 'Mot']]

    total_words = len(words)

    # get all headings and calculate how many words on average between
    # headings; a page without headings counts as one single section
    headings = original_soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    len_headings = len(headings)
    hratio = total_words / max(len_headings, 1)

    # get all paragraphs and all bulleted list, and calculate how many words
    # per paragraph on average; no paragraphs counts as one single paragraph
    paragraphs = original_soup.find_all(['p', 'ul'])
    len_par = len(paragraphs)
    pratio = total_words / max(len_par, 1)

    # calculate points for readability, for number of words between headings
    # and for number of words per paragraph
    fkpoints = _sliding_points(final_fk.score, 6, 18, 60, 5)
    hpoints = _sliding_points(hratio, 40, 200, 20, 0.125)
    ppoints = _sliding_points(pratio, 30, 80, 20, 0.4)

    # add all points; the total is rounded to two decimals before banding
    total_score = float(format(fkpoints + hpoints + ppoints, '.2f'))

    fkpoints = format(fkpoints, '.2f')
    final_fk_score = format(final_fk.score, '.2f')
    hpoints = format(hpoints, '.2f')
    hratio = format(hratio, '.2f')
    ppoints = format(ppoints, '.2f')
    pratio = format(pratio, '.2f')

    # (minimum total score, English label, French label), best band first
    score_bands = (
        (90, 'Outstanding!', 'Excellent!'),
        (80, 'Very good!', 'Très bien!'),
        (70, 'Not too bad', 'Pas mal'),
        (60, 'Needs work', 'À travailler'),
        (50, 'Needs a lot of work', 'Besoin de beaucoup de travail'),
    )
    if lang == 'en':
        score = "Please don't do this to people..."
    else:
        score = "S'il vous plaît, il faut faire quelque chose..."
    for minimum, label_en, label_fr in score_bands:
        if total_score >= minimum:
            score = label_en if lang == 'en' else label_fr
            break

    templates = {'en': "read_score_en.html", 'fr': "read_score_fr.html"}
    return render_template(
        templates[lang],
        total_score=total_score,
        fkpoints=fkpoints,
        final_fk_score=final_fk_score,
        hpoints=hpoints,
        hratio=hratio,
        ppoints=ppoints,
        pratio=pratio,
        total_words=total_words,
        url=url,
        lang=lang,
        word_column_names=word_column_names,
        row_data_word_en=mc_en.values.tolist(),
        row_data_word_fr=mc_fr.values.tolist(),
        zip=zip,
        score=score,
        len_headings=len_headings,
        len_par=len_par,
        original_score=original_score)


def _clean_page_text(text):
    """Normalise page text: fix period spacing, cut at the footer marker, drop line breaks and tabs."""
    text = text.replace('..', '.')
    text = text.replace('.', '. ')
    # str.find returns -1 when the marker is absent; slicing with -1 would
    # silently drop the last character, so only cut when the marker exists.
    marker = text.find("defPreFooter")
    if marker != -1:
        text = text[:marker]
    for whitespace in ('\n', '\t', '\r'):
        text = text.replace(whitespace, '')
    return text


def _sliding_points(value, best, worst, max_points, slope):
    """Return ``max_points`` at or below ``best``, 0 at or above ``worst``, and a linear value in between."""
    if value <= best:
        return max_points
    if value >= worst:
        return 0
    return max_points - ((value - best) * slope)
def wordcloud_generator(text):
    """Build and display a word cloud of the adjectives found in *text*.

    Steps: download the NLTK resources, tokenise, drop stop words and short
    tokens, lemmatise, run three successive POS-tagging passes that each keep
    only the 'JJ' tokens longer than three characters, hand the 50 most
    common survivors to ``word_polarity``, colour the words by polarity and
    show the cloud with matplotlib. Progress is printed along the way.
    """
    for resource in ('stopwords', 'wordnet', 'averaged_perceptron_tagger',
                     'movie_reviews', 'punkt'):
        nltk.download(resource)

    tokens = RegexpTokenizer(r'\w+').tokenize(text)
    print("tokens created...")

    stop_words = stopwords.words('english')
    filtered_token = [
        w for w in tokens if w not in stop_words and len(w) > 3
    ]
    print("stop words removed...")

    lemmatizer = WordNetLemmatizer()
    lemmatized_filtered_token = [
        lemmatizer.lemmatize(w) for w in filtered_token if len(w) > 3
    ]

    def _keep_adjectives(candidates):
        # Tag the candidates and keep the 'JJ' ones longer than three characters.
        return [
            word for word, tag in nltk.pos_tag(candidates)
            if tag == 'JJ' and len(word) > 3
        ]

    adjective_tokens_0 = _keep_adjectives(lemmatized_filtered_token)
    print("Level 1 Adjective sorting done...")
    adjective_tokens_1 = _keep_adjectives(adjective_tokens_0)
    print("Level 2 Adjective sorting done...")
    adjective_tokens_2 = _keep_adjectives(adjective_tokens_1)
    print("Level 3 Adjective sorting done...")

    freq_dist = FreqDist(adjective_tokens_2)
    max_freq_list = [entry[0] for entry in freq_dist.most_common(50)]
    print(
        "50 most common words selected for colour sorting... Polarity Finding function called..."
    )
    word_polarity(max_freq_list)

    # pos_word_list / neg_word_list are read only after word_polarity has run.
    color_to_words = {'#00ff00': pos_word_list, 'red': neg_word_list}
    default_color = 'grey'
    print("Colours associated with given words...")
    grouped_color_func = GroupedColorFunc(color_to_words, default_color)

    print("Calling Wordcloud Creator...")
    myimage = calc_freq(adjective_tokens_2, grouped_color_func)

    print("DISPLAYING THE WORDCLOUD !!")
    plt.figure(figsize=(20, 10), facecolor='k')
    plt.imshow(myimage)
    plt.axis('off')
    plt.show()