def get_word_features(wordlist):
    wordlist = FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

def main():
    keyword_list = ["Top Secret", "Secret Service", "Classified", "Targeted",
                    "Assassination", "Kill Program", "NSA", "wire", "CIA", "FBI",
                    "DEA", "DOJ", "hackers", "hacker", "exploit code", "Defense",
                    "Intelligence", "Agency"]
    file_name = "tweets_output.txt"
    pickle_words_file = "words.pickle"
    pickle_words(file_name, pickle_words_file, keyword_list)
    pickle_tweets_file = "tweets.pickle"
    pickle_tweets(file_name, pickle_tweets_file)
    words = load(open("words.pickle"))
    tweets = load(open("tweets.pickle"))
    freq_dist = FreqDist(words)
    print(tweets)
    print("===")
    print("Conducting Frequency and Lexical Diversity Analysis of Twitter Search Space: ")
    print("===")
    print("Number of words within the twitter search space: ")
    print(len(words))
    print("Number of unique words within twitter search space: ")
    print(len(set(words)))
    print("Lexical Diversity of unique words within twitter search space: ")
    print(lexical_diversity(words))
    print("===")
    print("Conducting Natural Language Processing Analysis Utilizing Python NLTK")
    print("===")
    print("Top 50 Frequent Words within the Twitter Search Space: ")
    print(freq_dist.keys()[:50])
    print("===")
    print("Bottom 50 Frequent Words within the Twitter Search Space: ")
    print(freq_dist.keys()[-50:])
    print("===")

def work_1():
    txt_file = open("trabalho1.txt", "r+")
    csv_file = open("trabalho1.csv", "w+")
    csv_manage = csv.writer(csv_file, delimiter=";", quoting=csv.QUOTE_MINIMAL)
    base_text = txt_file.read()
    tokens = word_tokenize(base_text)
    frequency = FreqDist(tokens)
    print("Text: {0}".format(base_text))
    print("Total words: {0}".format(frequency.N()))
    print("Total terms: {0}".format(len(frequency.keys())))
    print("")
    print("Term frequency table")
    print("")
    for key in frequency.keys():
        csv_manage.writerow([key, str(frequency.get(key))])
        print("Term: {0} Count: {1}".format(key, str(frequency.get(key))))
    pdfOutput = PdfOutput(frequency, frequency.N(), len(frequency.keys()), base_text)
    servicePdfManager = ServiceManagerPdf()
    servicePdfManager.writePdf(pdfOutput)
    txt_file.close()
    csv_file.close()

def prepare_pos_features(Language_model_set, output_file):
    corpus_root = '/home1/c/cis530/data-hw2/' + Language_model_set
    texts = PlaintextCorpusReader(corpus_root, '.*')
    text = texts.words()
    tagged_text = nltk.pos_tag(text)
    merged_tag_text = mergeTags(tagged_text)
    lists = seperate_pos(merged_tag_text)
    nouns_dist = FreqDist(lists[0])
    top_nouns = nouns_dist.keys()[:200]
    verbs_dist = FreqDist(lists[1])
    top_verbs = verbs_dist.keys()[:200]
    advs_dist = FreqDist(lists[2])
    top_advs = advs_dist.keys()[:100]
    prep_dist = FreqDist(lists[3])
    top_preps = prep_dist.keys()[:100]
    adjs_dist = FreqDist(lists[4])
    top_adjs = adjs_dist.keys()[:200]
    out = open(output_file, 'w')
    for n in top_nouns:
        out.write('NN' + n + '\n')
    for v in top_verbs:
        out.write('VV' + v + '\n')
    for av in top_advs:
        out.write('ADV' + av + '\n')
    for p in top_preps:
        out.write('PREP' + p + '\n')
    for aj in top_adjs:
        out.write('ADJ' + aj + '\n')

def frequent_words(x, terms=30):
    all_words = ' '.join([text for text in x])
    all_words = all_words.split()
    freq_dist = FreqDist(all_words)
    x = transformer.transform(
        word.replace("_", " ") for word in freq_dist.keys())
    words_df = pd.DataFrame({
        'word': list(freq_dist.keys()),
        'count': list(freq_dist.values()),
        'vector': list(x)
    })
    good = []
    bad = []
    for i in range(1, len(words_df)):
        if (nb.predict(words_df.at[i, 'vector']) == 5):
            good.append([
                words_df.at[i, 'count'],
                words_df.at[i, 'word'].replace(" ", "_")
            ])
        else:
            bad.append([
                words_df.at[i, 'count'],
                words_df.at[i, 'word'].replace(" ", "_")
            ])
    good = sorted(good, key=lambda x: x[0], reverse=True)
    bad = sorted(bad, key=lambda x: x[0], reverse=True)
    return format_result(good, bad, terms)

def generate_vocab(tokens: list, min_token_len: int = 2, threshold: int = 2, remove_numbers=True):
    freq_dist = FreqDist(tokens)
    if remove_numbers:
        remove_digit_tokens(freq_dist)
    tokens = preprocess_tokens(tokens=list(freq_dist.keys()),
                               min_token_len=min_token_len)
    removed_tokens = set(freq_dist.keys()).difference(tokens)
    for t in removed_tokens:
        freq_dist.pop(t, None)
    for t in tokens:
        if freq_dist[t] < threshold:
            freq_dist.pop(t, None)
    return freq_dist

def extract_most_common_words(self, words, sentiment):
    word_freq = FreqDist(words)
    print("for the sentiment", sentiment)
    print("there are", len(word_freq.keys()), "different words")
    print("that were used", sum(word_freq.values()), "times")
    df = pd.DataFrame({
        f'{sentiment}_words': list(word_freq.keys()),
        f'{sentiment}_counts': list(word_freq.values())
    })
    df = df.nlargest(self.n_words, columns=f'{sentiment}_counts')
    df.reset_index(drop=True, inplace=True)
    return df, len(word_freq.keys()), sum(word_freq.values())

def handle(self, *args, **options):
    fdist = FreqDist()
    print "Analyzing raw data"
    limit = 10
    if args:
        raw_datas = RawData.objects.filter(pk__in=args)
    else:
        raw_datas = RawData.objects.all()[:limit]
    tagged_data = []
    for raw_data in raw_datas:
        words = nltk.word_tokenize(raw_data.data)
        tagged_data.extend(nltk.pos_tag(words))
        for word in words:
            word = word.strip()
            if word:
                fdist.inc(word)
    print "Analyzed %s items" % len(raw_datas)
    print
    print "Top word: %s" % fdist.max()
    print
    print "Top 10 words"
    for word in fdist.keys()[:10]:
        times = fdist[word]
        print " -- %s occurred %s times" % (word, times)
    print
    print "Bottom 10 words"
    for word in fdist.keys()[-10:]:
        times = fdist[word]
        print " -- %s occurred %s times" % (word, times)
    print
    print "Words occurring between 50-100 times"
    words = [word for word in fdist.keys()
             if fdist[word] >= 50 and fdist[word] <= 100]
    print ", ".join(words)
    cfdist = ConditionalFreqDist()
    for (word, tag) in tagged_data:
        cfdist[tag].inc(word)
    print "Most popular noun: %s" % cfdist["NN"].max()
    print
    print "Top 50 nouns"
    for word in cfdist["NN"].keys()[:50]:
        times = cfdist["NN"][word]
        print " -- %s occurred %s times" % (word, times)
    print

def term_ratio(tf1: FreqDist, tf2: FreqDist, c=None, normalize=False):
    if normalize:
        if c is None:
            c = 1e-4
        return {
            word: (tf1[word] / tf1.N()) / (tf2[word] / tf2.N() + c)
            for word in tf1.keys()
        }
    else:
        if c is None:
            c = 1
        return {word: tf1[word] / (tf2[word] + c) for word in tf1.keys()}

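# A minimal usage sketch for term_ratio (assumed, not part of the original
# source): the two toy token lists below are made up for illustration.
from nltk import FreqDist

foreground = FreqDist("the cat sat on the mat the cat".split())
background = FreqDist("the dog sat on the log".split())
ratios = term_ratio(foreground, background, normalize=True)
# tokens over-represented in the foreground ("cat", "mat") get the largest ratios
print(sorted(ratios.items(), key=lambda kv: kv[1], reverse=True)[:3])
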
def entropy(tokens):
    """
    Get the Shannon entropy of a document using its token distribution.

    :param tokens: A document represented as a list of tokens.
    :return: The Shannon entropy (in bits) of the token distribution.
    """
    doc_len = len(tokens)
    frq = FreqDist(tokens)
    for key in frq.keys():
        frq[key] /= doc_len
    ent = 0.0
    for key in frq.keys():
        ent += frq[key] * math.log(frq[key], 2)
    ent = -ent
    return ent

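# A minimal usage sketch for entropy (assumed, not part of the original source):
# two equally frequent tokens give exactly 1 bit; a skewed distribution gives less.
print(entropy(["red", "blue", "red", "blue"]))   # -> 1.0
print(entropy(["red", "blue", "blue", "blue"]))  # -> roughly 0.811
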
def parse(filename):
    outfilename = filename + ".freq"
    entry_string = open(filename, 'r').read()
    # convert to lower case
    entry_string = entry_string.lower()
    # remove punctuation
    for c in string.punctuation:
        entry_string = entry_string.replace(c, " ")
    # remove everything except letters and spaces
    entry_string = re.sub("[^a-z ]", " ", entry_string)
    # strip out multiple spaces
    entry_string = re.sub(r'\s+', r' ', entry_string)
    # make the string into a list and remove stopwords from it
    entry_string_split = entry_string.split()
    entry_string_no_stopwords = remove_stopwords(entry_string_split)
    fd = FreqDist(entry_string_no_stopwords)
    fout = open(outfilename, "w")
    sys.stdout.write(outfilename + "\n")
    fout.write(" ".join(fd.keys()))
    fout.close()

def features(word_list):
    freq = FreqDist(word_list)
    # the eight most frequent words, ordered by decreasing frequency
    f = [word for word, _ in freq.most_common(8)]
    return {
        'biology': 'biolog' in word_list,
        'engineering': 'engin' in word_list,
        'animal': 'anim' in word_list,
        'behavior': 'behavy' in word_list,
        'chemistry': 'chem' in word_list,
        'health': 'heal' in word_list,
        'physics': 'phys' in word_list,
        'math': 'math' in word_list,
        'plant': 'plant' in word_list,
        'earth': 'earth' in word_list,
        'biochemistry': 'biochem' in word_list,
        'social': 'soc' in word_list,
        'planet': 'planet' in word_list,
        'temperature': 'temperature' in word_list,
        'blood': 'blood' in word_list,
        'tube': 'tube' in word_list,
        'psychology': 'psycholog' in word_list,
        'protein': 'protein' in word_list,
        'gene': 'gen' in word_list,
        'most_0': f[0],
        'most_1': f[1],
        'most_2': f[2],
        'most_3': f[3],
        'most_4': f[4],
        'most_5': f[5],
        'most_6': f[6],
        'most_7': f[7],
    }

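# A hypothetical usage sketch (assumed, not part of the original source):
# feed a small list of already-stemmed tokens; note that at least eight
# distinct tokens are needed to fill the most_0..most_7 slots.
sample = ['biolog', 'anim', 'cell', 'gen', 'protein', 'plant', 'earth', 'water', 'biolog']
print(features(sample)['biology'])  # True
print(features(sample)['most_0'])   # 'biolog' (it appears twice)
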
def generate_corpus(folder_name, top, n):
    '''Corpus of words generated to be used as the vocabulary.
    The function takes topn into account and will create a corpus with the
    topn amount of tokens if topn is True.'''
    lower = True  # activates lowercase tokens
    subfolders = [i for i in os.listdir(folder_name)]  # iterates through subfolders
    corpus_list = []
    for i in subfolders:
        for v in os.listdir(folder_name + "/" + i):
            text = open_text(i, folder_name, v, lower)
            corpus_list += [i for i in text]
    corpus_freqs = FreqDist(corpus_list)
    sorted_x = sorted(corpus_freqs.items(), key=operator.itemgetter(1), reverse=True)
    if top == True:
        # empty topn dictionary to be used to populate vectors, and vocabulary for columns
        topn_words = {}
        for i in sorted_x[:n]:
            topn_words[i[0]] = 0
        vocabulary = list(sorted(topn_words.keys()))
        return topn_words, vocabulary
    else:
        vocabulary = list(sorted(corpus_freqs.keys()))
        corpus = {str(i): 0 for i in sorted(vocabulary)}
        return corpus, vocabulary

def analyze(inputfile):
    file = open(inputfile, "rt")
    text = file.read()
    file.close()
    # split into words
    tokens = word_tokenize(text)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if w not in stop_words]
    junk_words = ['nt']
    words = [w for w in words if w not in junk_words]
    print(words[:100])
    freqDist = FreqDist(words)
    words = list(freqDist.keys())
    print(freqDist.plot(50))

def compress_term_matrix(matrix, words):
    initials = [item[0] for item in words]
    fdist = FreqDist(initials)
    letterindices = []
    for letter in sorted(fdist.keys()):
        letterindices.append((letter, fdist[letter]))
    indexmatrix = []
    start = 0
    for letter, occ in letterindices:
        newocc = occ / 5
        print letter, " ", occ
        print " range: ", start, " ", start + occ, " ", newocc
        indexes = np.random.random_integers(start, start + occ, newocc)
        indexmatrix.append((letter, indexes.tolist()))
        start = start + occ
    allindices = []
    for _, v in indexmatrix:
        allindices.extend(v)
    smatrix = matrix[allindices, :]
    return indexmatrix, smatrix

def preprocess_data(train_file):
    # Read the trainset
    data_train = pd.read_csv(train_file, header=0)
    X_train = data_train.mr.tolist()   # convert the mr column to a list
    y_train = data_train.ref.tolist()  # convert the ref column to a list
    # Preprocess
    # convert train and test sets into lists of slots and values
    X_train_seq, y_train_seq, dico_ = mr2oh(X_train, y_train)
    y_train = [proc_text(y) for y in y_train_seq]  # process text
    X_train = [proc_text(y) for y in X_train_seq]  # process text
    dist = FreqDist(np.concatenate(y_train + X_train))
    i_to_w = list(dist.keys())  # create a list to convert index to words
    i_to_w.insert(0, '-PADDING-')
    i_to_w.insert(2, '<STOP>')
    # dictionary that converts words to their corresponding index
    w_to_i = {word: idx for idx, word in enumerate(i_to_w)}
    # convert words in ref sentences into their indexes
    X_train_oh = ref2oh(X_train, w_to_i)
    y_train_oh = ref2oh(y_train, w_to_i)
    return X_train_oh, y_train_oh, i_to_w, dico_

def getUniqueWords(subredditname):
    wordfile_path = datadirectory + "/ProcessedData/" + subredditname + "_words" + ".txt"
    set_of_words = set()
    freq_subreddit = FreqDist()
    if not path.exists(wordfile_path):
        for datafile in getTextFileNames(subredditname):
            if path.exists(datafile):
                print("reading " + datafile)
                freq_subreddit = collectFreqData(datafile) + freq_subreddit
            else:
                print("no data for " + datafile)
        for i in freq_subreddit.most_common(20):
            print(i)
        with open(wordfile_path, "a+") as wordfile:
            for word in freq_subreddit.keys():
                word = word.strip()
                word = word.lower()
                set_of_words.add(word)
                wordfile.write(word + "\n")
        return set_of_words
    else:
        with open(wordfile_path, "r") as wordfile:
            # read line by line
            print("reading " + wordfile_path)
            for word in wordfile:
                word = word.strip()
                word = word.lower()
                set_of_words.add(word)
        return set_of_words

def doNLTK(play):
    # Initialize NLTK objects:
    toks = word_tokenize(play)
    full_text = nltk.Text(toks)
    context = nltk.text.ContextIndex(toks)  # Yes, this has similar_words, this is what we need!
    allwords = []
    # print(full_text.concordance('madness'))  # No need to print. Returns None, like similar().
    # print(full_text.similar('death'))
    fdist = FreqDist(full_text)
    # commons = fdist.most_common(250)
    commons = [f for f in fdist.keys() if fdist[f] > 8]  # Can also check it's not a stop word here
    commons_str = ' '.join(commons)
    commons_toks = word_tokenize(commons_str)
    commons_tags = nltk.pos_tag(commons_toks)
    # Hideous -- figure out regex:
    commons_imp = [(c[0]) for c in commons_tags
                   if (c[1] == 'NN') or (c[1] == 'NNP') or ('VB' in c[1]) or ('JJ' in c[1])]
    commons_imp_nostop = [c for c in commons_imp if c.lower() not in stop_words]
    # print(commons_imp_nostop)
    for w in commons_imp_nostop:
        # x = full_text.ContextIndex.similar_words(w)  # What? Why does this return None but just print?
        x = context.similar_words(w)
        # print(x)
        for idx, x in enumerate(context.similar_words(w)):
            if x.lower() not in stop_words and x not in allwords:
                allwords.append(x.lower())
                print('{} is similar to {} by degree {}'.format(w, x.lower(), idx))

def find_abbreviations():
    import db
    from tokenizers import es
    from nltk import FreqDist

    corpus = db.connect()
    #text = '\n'.join([a['text'] for a in corpus.articles.find().limit(10)])
    text = '\n'.join([a['text'] for a in corpus.articles.find()])
    tokens = es.tokenize(text, ignore_abbreviations=True)
    fd = FreqDist()
    fd_abbr = FreqDist()
    fd_n_abbr = FreqDist()
    n_tokens = len(tokens)
    for i in range(n_tokens):
        fd.inc(tokens[i])
        if i < (n_tokens - 1) and tokens[i + 1] == u'.':
            fd_abbr.inc(tokens[i])
        else:
            fd_n_abbr.inc(tokens[i])
    adjusted = {}
    f_avg = len(fd.keys()) / fd.N()
    for t, n in fd_abbr.iteritems():
        f = fd.get(t, 0) / fd.N()
        deviation = 1 + (f - f_avg)
        adjusted[t] = n * deviation / fd_n_abbr.get(t, 1) / len(t)
    items = adjusted.items()
    items.sort(key=lambda i: i[1], reverse=True)
    for t, n in items[:100]:
        print u'%s. %f (%d, %d)' % (t, n, fd_abbr[t], fd_n_abbr.get(t, 0))

class FrequenceVocabulary:
    """
    Vocabulary that contains word frequencies estimated from word counts
    in the files specified.
    """

    def __init__(self, miss_f):
        """
        Construct a new vocabulary with a function that computes the word
        probability for words which are absent from the vocabulary.

        Example usage:
        >>> miss_f = lambda key, N: 10. / (N * 10 ** len(key))

        :param miss_f: function for estimating the probability of missing words.
        """
        self.vocab = FreqDist()
        self._miss_f = miss_f

    def load_vocab(self, root='.', files='.*'):
        """
        Load a new vocabulary.

        :param root: the root directory for the corpus.
        :param files: a list or regexp specifying the files in this corpus.
        """
        voc = PlaintextCorpusReader(root, files)
        for word in voc.words():
            self.vocab[word.lower()] += 1

    def p(self, key):
        """
        :param key: word to compute the probability of.
        :return: the probability estimated for key.
        """
        if key in self.vocab.keys():
            return 1. * self.vocab[key] / self.vocab.N()
        return self._miss_f(key, self.vocab.N())

def resume_skills(input_skills):
    # Bigrams and trigrams identifier
    ''.join(input_skills)
    bigrams_present = []
    trigrams_present = []
    s1 = []
    s1.append(input_skills)
    for phrase in s1:
        bigrams_present.extend([" ".join(bi) for bi in ngrams(phrase.lower().split(), 2)])
        trigrams_present.extend([" ".join(tri) for tri in ngrams(phrase.lower().split(), 3)])
    all_grams = set(bigrams_present).union(set(trigrams_present))
    soft_skills_present = soft_skills.intersection(all_grams)
    # print(soft_skills_present)
    tokenized_data = word_tokenize(input_skills)
    tokenized_data = [word for word in tokenized_data if word not in stop_words]
    tagged = pos_tag(tokenized_data)
    nouns = [word for word, pos in tagged
             if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS'
                 or pos == 'JJ' or pos == 'VBP')]
    nouns.extend(list(soft_skills_present))
    test_data_freq = FreqDist(nouns)
    # print(' '.join(test_data_freq.keys()))
    skills_present = test_data_freq.keys()
    return skills_present

def count_difficult_items(items, min_length=4, min_freq=2):
    freq_dist = FreqDist(items)
    keys = freq_dist.keys()
    return len([
        key for key in keys
        if len(key) >= min_length and freq_dist[key] <= min_freq
    ])

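# A minimal usage sketch (assumed, not part of the original source):
# "difficult" items here are those at least min_length characters long
# that occur no more than min_freq times.
tokens = ["cat", "cat", "elephant", "giraffe", "giraffe", "giraffe", "ox"]
print(count_difficult_items(tokens))              # only "elephant" -> 1
print(count_difficult_items(tokens, min_freq=3))  # "elephant" and "giraffe" -> 2
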
def GetAllWords(self, content):
    '''
    Get all words that appear in content.
    - content: input string
    - Returns a set of all words
    '''
    rawTokens = nltk.word_tokenize(content)
    alphabeticalTokens = [w for w in rawTokens if w.isalpha()]
    del rawTokens
    lowerTokens = [w.lower() for w in alphabeticalTokens]
    del alphabeticalTokens
    stopwords = nltk.corpus.stopwords.words('english')
    tokens = [w for w in lowerTokens if w not in stopwords]
    del lowerTokens
    del stopwords
    lemmatizer = nltk.WordNetLemmatizer()
    lemmatizedTokens = [lemmatizer.lemmatize(t) for t in tokens]
    tokenDist = FreqDist(lemmatizedTokens)
    allWords = set(tokenDist.keys())
    if allWords is None:
        return set()
    return allWords

def term_frequency():
    for id in doc_id:
        print("start tf : ", id)
        # separate the corpus on the basis of id
        text = soup.find(id=id).get_text()
        # basic pre-processing using the clear_text method
        words = clear_text(text)
        unigram = get_ngram(words, 1)
        doc_token = FreqDist(unigram)
        structure[id] = doc_token.keys()
        for x in list(unique.keys()):
            if x in list(doc_token.keys()):
                tf[id, x] = doc_token[x] / len(doc_token)
            else:
                tf[id, x] = 0
    return tf, structure

def freq_words(x, terms=30):
    all_words = ' '.join([text for text in x])
    all_words = all_words.split()
    fdist = FreqDist(all_words)
    words_df = pd.DataFrame({'word': list(fdist.keys()),
                             'count': list(fdist.values())})
    return words_df

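# A hypothetical usage sketch (assumed, not part of the original source):
# build the word/count DataFrame from a couple of short strings and keep
# the most frequent entries. pandas and nltk are assumed to be installed.
import pandas as pd
from nltk import FreqDist

docs = ["the cat sat on the mat", "the dog chased the cat"]
df = freq_words(docs)
print(df.nlargest(columns="count", n=3))  # "the" and "cat" dominate
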
def draw_word2vec():
    ### Load data
    dataloader = csv_dataloader()
    dataloader.load("output/data_cache.pk")
    print "Read in finished"

    ### Load pre-trained word2vec model
    word2vec = get_word2vec(model="data/GoogleNews-vectors-negative300.bin", binary=True, size=300)
    print "Pretrained word2vec loaded"

    all_tokens = sum(dataloader.data.viewvalues(), [])
    print "#Tokens: " + str(len(all_tokens))
    fdist = FreqDist(all_tokens)
    tokens = fdist.keys()[1:500]
    print tokens

    tokens_has_vectors = []
    for token in tokens:
        if word2vec[token] is not None:
            tokens_has_vectors.append(token)
    print "#Unique Tokens \w Vectors: " + str(len(tokens_has_vectors))

    vectors = word2vec.encode(tokens_has_vectors)
    print "#Unique Vectors: " + str(len(vectors))

    print ("Computing MDS embedding")
    clf = manifold.MDS(n_components=2, n_init=1, max_iter=2000)
    # clf = manifold.Isomap(n_components=2, max_iter=100)
    vectors_mds = clf.fit_transform(vectors)
    print ("Done. Stress: %f" % clf.stress_)

    plot_embedding(vectors_mds, tokens_has_vectors, "MDS embedding of the words")

def append_terms(doc, terms, data, minterm_doc, vector):
    tf = FreqDist(terms)
    max_tf = max(tf.values() if len(tf) > 0 else [0])
    for term in tf.keys():
        normalize_tf = tf[term] / max_tf
        new_doc = {'tf': normalize_tf, 'weight': 0, 'minterm': minterm_doc}
        in_data = False
        for term_data in data:
            if term == term_data['key']:
                # update the existing posting for this term
                term_data['value']['documents'][doc] = new_doc
                in_data = True
                break
        if not in_data:
            # add a new posting for this term
            data.append({'key': term,
                         'value': {'idf': 0,
                                   'documents': {doc: new_doc},
                                   'index_in_vector': vector[term]}})

def answer_four():
    from nltk import FreqDist
    dist = FreqDist(text1)
    vocab = dist.keys()
    freqwords = sorted([w for w in vocab if len(w) > 5 and dist[w] > 150])
    return freqwords

def category_by_movie():
    from nltk.corpus import movie_reviews as mr
    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.corpus import names
    from nltk.classify import apply_features
    import random

    documents = [(list(mr.words(f)), c) for c in mr.categories()
                 for f in mr.fileids(c)]
    random.shuffle(documents)

    all_words = FreqDist(w.lower() for w in mr.words())
    word_features = all_words.keys()[:2000]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    #print document_features(mr.words('pos/cv957_8737.txt'))
    #print documents[0]
    features = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = features[100:], features[:100]
    classifier = NaiveBayesClassifier.train(train_set)
    print classify.accuracy(classifier, train_set)

def top(self, tokens, lowest_rank=50):
    '''A list of the most frequent (non-stopword) tokens'''
    from operator import itemgetter
    content = self.words(tokens)
    fdist = FreqDist(content)
    vocab = iter(fdist.keys())
    # Forget all previous ranking
    self.lower_words = {}
    frequency = 0
    while frequency < lowest_rank:
        try:
            word = vocab.next()
        except StopIteration:
            break
        word_lower = word.lower()
        if word_lower in self.lower_words:
            self.lower_words[word_lower] = self.lower_words[word_lower] + fdist[word]
        else:
            self.lower_words[word_lower] = fdist[word]
        frequency = frequency + 1
    # return sorted(self.lower_words, key=itemgetter(1), reverse=True)
    return map(itemgetter(0),
               sorted(self.lower_words.items(), key=itemgetter(1), reverse=True))

def analysis(dataset, topic_list):
    '''
    Start with some data analysis on Review Text and Review Title,
    applying the bag of words approach first.
    '''
    # remove stopwords, punctuation and symbols
    dataset['Review Text'] = dataset['Review Text'].str.replace("[^a-zA-Z#]", " ")
    # remove short words (length < 3)
    dataset['Review Text'] = dataset['Review Text'].apply(
        lambda x: ' '.join([w.lower() for w in x.split() if len(w) > 2]))
    all_reviews = [
        remove_stop_words(words.split(" ")) for words in dataset['Review Text']
    ]
    lemmatizer = WordNetLemmatizer()
    all_words = ' '.join([lemmatizer.lemmatize(word) for word in all_reviews]).split()

    # Plot the top 30 words of highest frequency
    freq_dist = FreqDist(all_words)
    words_distribution = pd.DataFrame({
        'word': list(freq_dist.keys()),
        'count': list(freq_dist.values())
    })
    top_words_distribution = words_distribution.nlargest(columns='count', n=30)
    # plot the output
    plt.figure(figsize=(50, 10))
    ax = sns.barplot(data=top_words_distribution, x="word", y="count")
    ax.set(ylabel='Count')
    plt.show()
    return top_words_distribution, dataset

def featureExtraction(sentence):
    # feature 1 -> tagged input
    features = {'taggedInput': tagInput(sentence), 'bow': {}}
    # bag of words
    # words not to include
    exclude_words = stopwords.words('english')
    for c in [".", "?", "!", ","]:
        exclude_words.append(c)
    arr_all_words = list(
        set([w for w in word_tokenize(sentence) if w not in exclude_words]))
    if os.path.isfile(CONSTANTS.BOW_PATH):
        for w in joblib.load(CONSTANTS.BOW_PATH):
            if w not in arr_all_words and w is not None:
                arr_all_words.append(w)
        joblib.dump(arr_all_words, CONSTANTS.BOW_PATH)
    else:
        joblib.dump(arr_all_words, CONSTANTS.BOW_PATH)
    all_words = FreqDist(w.lower() for w in arr_all_words)
    # keep the 2000 most frequent words as the bag-of-words feature set
    word_features = [w for w, _ in all_words.most_common(2000)]
    document_words = set(word_tokenize(sentence))
    bow = {}
    for word in word_features:
        bow['contains(%s)' % word] = (word in document_words)
    features['bow'] = bow
    # add other features here...
    return bow

def build_distribution_matrix(self, stems):
    distrib_matrix_filename = '{0}_distrib_matrix.txt'.format(self.db_name)
    if os.path.isfile(distrib_matrix_filename):
        # load matrix from file
        self.log(
            'Loading existing distribution matrix from {0}'.format(
                distrib_matrix_filename), logging.INFO)
        distrib_matrix = dict()
        with open(distrib_matrix_filename, 'rt') as f:
            csvrreader = csv.DictReader(f, delimiter=' ',
                                        lineterminator=self.linesep)
            for row in csvrreader:
                distrib_matrix.update({row['w']: row['P(w|M)']})
            f.close()
    else:
        # create matrix and save file
        self.log(
            'Creating new distribution matrix into {0}. Please wait, this may take some time'
            .format(distrib_matrix_filename), logging.INFO)
        distrib_matrix = FreqDist(stems)
        with open(distrib_matrix_filename, 'wt') as f:
            writer = csv.DictWriter(f, fieldnames=['w', 'P(w|M)'],
                                    delimiter=' ',
                                    lineterminator=self.linesep)
            writer.writeheader()
            for k in distrib_matrix.keys():
                writer.writerow({'w': k, 'P(w|M)': distrib_matrix[k]})
            f.close()
    distrib_matrix = Discretizer.reduce_distribution_matrix(distrib_matrix, cutoff=1)
    return distrib_matrix

class BrownDataset(object):
    def __init__(self, include_start=True):
        self.words = brown.words()
        self.words = map(lambda x: x.lower(), self.words)
        # include START and END
        self.total_word_cnt = len(self.words) + 2 * len(brown.sents())
        if include_start:
            self.words.append(u'START')
            self.words.append(u'END')
        self.vocab = set(self.words)
        self.vocab_len = len(self.vocab)
        self.word_to_idx = dict(zip(list(self.vocab), range(self.vocab_len)))
        self.sentences = []
        self.bigrams = []
        self.unigrams = []
        for sent in brown.sents():
            sentence = map(lambda x: x.lower(), sent)
            if include_start:
                sentence.insert(0, u'START')
                sentence.append(u'END')
            self.sentences.append(sentence)
            self.bigrams.extend(list(ngrams(sentence, 2)))
            self.unigrams.extend(sentence)
        self.unigram_freq = dict(Counter(self.unigrams))
        self.num_sentences = len(self.sentences)
        self.bigram_cnt = FreqDist(self.bigrams)
        self.bigram_len = len(self.bigram_cnt)
        self.bigram_idx = dict(
            zip(self.bigram_cnt.keys(), range(self.bigram_len)))
        self.bigram_freq = np.asarray(self.bigram_cnt.values())
        self.num_bigrams = len(self.bigram_cnt)

def bag_of_words(data, label_codebook, feature_codebook, theta):
    """Build binary bag-of-words vectors for each document in data."""
    word_dict = Alphabet()
    stopset = set(stopwords.words('english'))
    for key, value in data.items():
        label_codebook.add(key)
        for doc in value:
            doc_tokens = set(nltk.regexp_tokenize(doc, pattern="\w+"))
            for word in doc_tokens:
                if word not in stopset:
                    word_dict.add(word)
    all_words = word_dict._label_to_index.keys()
    fdict = FreqDist([w for w in all_words])
    word_feature = fdict.keys()[theta:]
    for word in all_words:
        if word in word_feature:
            feature_codebook.add(word)

    instance_list = {}
    for label, document_list in data.items():
        instance_list[label] = []
        for document in document_list:
            vector = np.zeros(feature_codebook.size())
            tokens = set(nltk.regexp_tokenize(document, pattern="\w+"))
            indice = 0
            for word in tokens:
                if feature_codebook.has_label(word):
                    indice = feature_codebook.get_index(word)
                    vector[indice] = 1.
            instance_list[label].append(vector)
    return instance_list

def bigram(DATA):
    bigramWordBook = []
    bigramDict = {}
    for comment in DATA:
        if 'text' in comment:
            # stop_words = set(stopwords.words('english'))
            # clean_text = [w for w in comment["text"] if not w in stop_words]
            # bigrams = ngrams(clean_text, 2)
            bigrams = ngrams(comment["word"], 2)
            bigramDist = FreqDist(bigrams)
            bigramWordBook.append(bigramDist)
            for key in bigramDist.keys():
                if key in bigramDict.keys():
                    bigramDict[key] = bigramDict[key] + bigramDist[key]
                else:
                    bigramDict[key] = bigramDist[key]
    bigramDict = {
        key: value for key, value in bigramDict.items() if value > 100
    }
    # print(len(bigramDict))
    for key in bigramDict:
        bigramDict[key] = 0
    for comment, bigrams in zip(DATA, bigramWordBook):
        Copy_BiWB = copy.deepcopy(bigramDict)
        for key in bigrams:
            if key in Copy_BiWB:
                Copy_BiWB[key] += 1
        comment['BigramWordBook'] = Copy_BiWB
    return DATA

def main():
    userInput = parser.getInput()
    fileList = parser.getFiles(userInput['train'])
    pdata = parser.parseFiles(fileList)
    allsent = ''
    for f in pdata:
        allsent += f[3]
    all_words = FreqDist(w.lower() for w in word_tokenize(allsent)
                         if w not in stopwords.words('english'))
    global top_words
    top_words = all_words.keys()[:500]
    # pdata = getParseData()
    featdata = featureAggregator(pdata)
    print featdata[:10]

def plot_postives(s=0, e=50):
    all_words = ' '.join([text for text in df['review']])
    all_words = all_words.split()
    fdist = FreqDist(all_words)
    words_df = pd.DataFrame({
        'word': list(fdist.keys()),
        'count': list(fdist.values())
    })
    # select the most frequent words
    d = words_df.nlargest(columns="count", n=len(df['review']))
    d.reset_index(inplace=True)
    d['pos_perc'] = np.nan
    for tag in d['word'].values:
        ret = df[df['review'].str.contains(tag)]
        pos_perc = ret[ret['prediction'] == 'pos'].shape[0] / ret.shape[0] * 100
        neg_perc = 100 - pos_perc
        d.loc[(d['word'] == tag), 'pos_perc'] = pos_perc
    d = d.sort_values('pos_perc', ascending=False)
    plt.figure(figsize=(20, 5))
    sns.barplot(data=d[s:e], x='word', y='pos_perc')
    if (e - s > 60):
        plt.xticks(rotation=90)
    else:
        plt.xticks(rotation=45)
    plt.xticks()
    plt.title('Percentage of Positive Reviews per tag.')
    plt.show()

def getLongTermsRanked(self, minLen=7.0, numberMostCommons=30, display=False):
    result = []
    resultDocuments = {}
    for seoDocument in self.seoLibrary.seoDocuments:
        tokenList = list(
            set(seoDocument.getTextTokens(removeSplitter=True, lemmatize=True)))
        for token in tokenList:
            if len(token) > minLen:
                result.append(token)
                if token not in resultDocuments:
                    resultDocuments[token] = [seoDocument.order]
                else:
                    resultDocuments[token].append(seoDocument.order)
    fdist = FreqDist(result)
    for token in fdist.keys():
        fdist[token] = fdist[token] * self.getRankingModifier(
            numpy.mean(resultDocuments[token])) * self.getLengthModifier(
                len(token), minLen)
    maxValue = max(fdist.values())
    return [(word, int(metric * 100.00 / maxValue))
            for word, metric in fdist.most_common(numberMostCommons)]

def train_finder(self, all_listings):
    """Train the product identification algorithm with example data."""
    logging.info("Start training of recognizer for product: {0}"
                 .format(self.product_id))
    self.classifier = None

    # select example listings for the finder's product
    listings, n_pos, n_neg = self.filter_trainig_samples(all_listings)
    logging.info("Number listings: {l}, positive: {p}, negative: {n}; "
                 "features: {f}"
                 .format(l=len(listings), p=n_pos, n=n_neg, f=self.n_features))
    if len(listings) < 30:
        logging.warn("Product {0}. Can't compute classifier. "
                     "Too few listings.".format(self.product_id))
        return
    elif n_pos < 10:
        logging.warn("Product {0}. Can't compute classifier. "
                     "Too few positive listings.".format(self.product_id))
        return
    elif n_neg < 10:
        logging.warn("Product {0}. Can't compute classifier. "
                     "Too few negative listings.".format(self.product_id))
        return

    # Create list of most common words, and put it into the feature extractor
    # TODO: remove stop-words
    self.feature_extractor = FeatureExtractor()
    word_freqs = FreqDist()
    for _, listing in listings.iterrows():
        words = self.feature_extractor.extract_words(listing)
        word_freqs.update(words)
    common_words = word_freqs.keys()[:self.n_features]
    self.feature_extractor = FeatureExtractor(common_words)
    logging.debug("Number individual words: {0}; hapaxes: {1}"
                  .format(len(word_freqs), len(word_freqs.hapaxes())))
    logging.debug("Most common words: {}".format(word_freqs.keys()[:100]))

    # Train the classifier
    train_set = self.create_labeled_features(listings)
    self.classifier = nltk.NaiveBayesClassifier.train(train_set)
    self.classifier.show_most_informative_features(20)

def understand_text(self, source):
    output = open(
        "Analytics_for_" + source +
        '_{:%Y_%m_%d_%H%M%S}.txt'.format(datetime.datetime.now()), "w")
    main = self.combine_articles_from_source(source)
    puncts = list(string.punctuation)
    article_tokens = word_tokenize(main)
    clean_tokens = []
    stop_words = set(stopwords.words("english"))
    # Remove punctuation and stop words
    for token in article_tokens:
        if token not in puncts and token not in stop_words and token != "'s" and token != "``" and token != "''":
            clean_tokens.append(token)
    print("************ANALYSING************")
    print(main)
    output.write("#########################################################")
    output.write("#Analysis of all cached posts by " + source + " #")
    output.write("#########################################################")
    output.write("# Concatenated text: #")
    output.write("#########################################################")
    output.write(main.encode('utf-8', 'ignore'))
    output.write("#########################################################")
    output.write("\n\n")
    print("*********************************")
    output.write("############Detected tokens:#############################\n\n")
    fdist = FreqDist(clean_tokens)
    print("*************STATS:*****************")
    print("Detected words: ")
    words = ""
    for key in fdist.keys():
        words += key + ", "
    print(words)
    output.write(words.encode('utf-8', 'ignore') + "\n")
    output.write("\n\n#######################Top 25 words:#####################\n\n")
    print("\n\n***25 Most common***:")
    for common in fdist.most_common(n=25):
        print("\"" + common[0] + "\"" + " occurrences " + str(common[1]))
        output.write("\"" + common[0].encode('utf-8', 'ignore') + "\"" +
                     " occurrences " + str(common[1]) + "\n")
    output.write("######################COMPLETE############################")
    output.close()
    text = Text(clean_tokens)
    # text.plot(25)
    print("************/STATS*****************")

def load_book_features(file_name):
    with open(file_name, 'r') as file_handler:
        text = file_handler.read()

    morph = pymorphy2.MorphAnalyzer()
    sentence_list = sent_tokenize(text)

    usual_book_words = []
    sentences_length_dist = []
    words_length_dist = []
    pron_dist = []
    conj_dist = []

    for sentence in sentence_list:
        if sentence != ".":
            pron_count = 0
            conj_count = 0
            sentence_words = re.findall(r"[\w]+", sentence)
            sentences_length_dist.append(len(sentence_words))
            for word in sentence_words:
                words_length_dist.append(len(word))
                if word in NOMINATIVE_PRONOUNS:
                    pron_count += 1
                if morph.parse(word)[0].tag.POS == 'CONJ':
                    conj_count += 1
                if word not in STOPWORDS:
                    usual_book_words.append(word)
            conj_dist.append(conj_count)
            pron_dist.append(pron_count)

    sentence_length_freq_dist = FreqDist(sentences_length_dist)
    sentences_length_dist = [sentence_length_freq_dist.freq(i) for i in range(1, RANGE + 1)]
    sentences_length_dist.append(1 - sum(sentences_length_dist))

    words_length_freq_dist = FreqDist(words_length_dist)
    words_length_dist = [words_length_freq_dist.freq(i) for i in range(1, RANGE + 1)]
    words_length_dist.append(1 - sum(words_length_dist))

    pron_freq_dist = FreqDist(pron_dist)
    pron_dist = [pron_freq_dist.freq(i) for i in range(0, RANGE + 1)]
    pron_dist.append(1 - sum(pron_dist))

    conj_freq_dist = FreqDist(conj_dist)
    conj_dist = [conj_freq_dist.freq(i) for i in range(0, RANGE + 1)]
    conj_dist.append(1 - sum(conj_dist))

    words_freq_dist = FreqDist(usual_book_words)
    num_unique_words = len(words_freq_dist.keys())
    num_total_words = len(usual_book_words)

    hapax = len(words_freq_dist.hapaxes()) / num_unique_words
    dis = len([item for item in words_freq_dist if words_freq_dist[item] == 2]) / num_unique_words
    richness = num_unique_words / num_total_words

    return [hapax, dis, richness,
            *sentences_length_dist, *words_length_dist, *pron_dist, *conj_dist]

def get_word_features(self, wordlist):
    """
    wordlist - list of words (duplicates allowed).
    Takes the distinct words in the list and returns them as a list.
    """
    wordlist = FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

def get_top_n_words(n, category=''):
    # return the most frequent n words from a category (or the entire corpus)
    if category == '':
        text = brown.words()  # get the text from the entire corpus
    else:
        text = brown.words(categories=category)  # get the text from the given category
    fdist = FreqDist(text)
    top_words = [word for word, _ in fdist.most_common(n)]
    return top_words

def FreqDisk(self):
    fd = file('full_title_set', 'r')
    title_set = pickle.load(fd)
    fd.close()
    fdist = FreqDist(title_set)
    print "===> best 20", repr(fdist.keys()[:20])
    print "==========================="

def docTF_over_corpusTF(dirPath, lang, searchTerm):
    '''
    Returns a dictionary with tokens as keys and relative frequencies as values.
    Given a search term, 'token', the function groups all tweets with that token
    together to form a document. Then it returns the log of the frequency of
    tokens in that document over the frequency of those tokens outside the
    document. The logic is meant to be similar to tf-idf.
    '''
    searchTerm = codecs.decode(searchTerm, 'utf-8')
    docTokens = []
    corpusTokens = []
    for tweetFile in os.listdir(dirPath):
        # make sure tweetFile is a file, not a dir
        if os.path.isfile(dirPath + tweetFile):
            # using codecs for encoding issues, not sure if needed
            rawFile = codecs.open(dirPath + tweetFile, 'r', 'utf-8')
            # my tweet files have one tweet per line
            for rawTweet in rawFile:
                try:
                    # just look at one language
                    if rawTweet.split('\t')[4] == lang:
                        tweetText = rawTweet.split('\t')[1].lower()
                        tokens = tokenize(tweetText)
                        # look for the search term in the tweet text
                        if re.compile(searchTerm).search(tweetText):
                            docTokens.append(tokens)
                        else:
                            corpusTokens.append(tokens)
                # issues with windows ^M newline
                except:
                    pass
    # make lists of vocab for each set
    docVocab = [token for doc in docTokens for token in doc]
    corpusVocab = [token for doc in corpusTokens for token in doc]
    # make frequency distributions with nltk, excluding hapaxes from the document
    docFD = {key: value for key, value in FreqDist(docVocab).items() if value > 1}
    corpusFD = FreqDist(corpusVocab)
    # calculate relative frequency for each token
    docOverCorpusTF = {}
    for key in docFD.keys():
        if key in corpusFD.keys():
            docOverCorpusTF[key] = log(docFD[key] / corpusFD[key])
        else:
            docOverCorpusTF[key] = log(docFD[key] / 1)
    f = codecs.open('results.txt', 'w', 'utf-8')
    f.write(str(len(docTokens)) + " tweets were found *with* the search term: " + searchTerm + "\n")
    f.write(str(len(corpusTokens)) + " tweets were found *without* the search term: " + searchTerm + "\n")
    for item in sorted(docOverCorpusTF, key=docOverCorpusTF.get):
        f.write("%s\n" % item)
    f.close()

def entity_helper(catname, catnum):
    document = load_files(str(catnum))
    print "Extracting Chunks"
    chunks = chunk_document(document)
    print "Extracting Entities"
    entities = extract_entities(chunks)
    fdist = FreqDist(entities)
    print "10 most common entities ({0})".format(catname)
    for i in fdist.keys()[:10]:
        print i

def similar(self, context, word, num=20):
    word = word.lower()
    wci = context._word_to_contexts
    if word in wci.conditions():
        contexts = set(wci[word])
        fd = FreqDist(w for w in wci.conditions() for c in wci[w]
                      if c in contexts and not w == word)
        words = fd.keys()[:num]
        del fd
        return words
    else:
        return []

def end_reuters(self):
    """Write out the contents to a file and reset all variables."""
    from textwrap import fill
    import re
    import string

    filename = "/dev/null"
    if self.reuters_lewis_split == "TRAIN" and self.reuters_topics == "YES":
        directory = "C:\\Users\\JeffT\\University Work\\phd\\corpora\\reuters-21578\\" + category
        filename = self.doc_id
    elif self.reuters_lewis_split == "TEST" and self.reuters_topics == "YES":
        directory = "C:\\Users\\JeffT\\University Work\\phd\\corpora\\reuters-21578\\" + category
        filename = self.doc_id
    elif self.reuters_lewis_split == "NOT-USED" and (self.reuters_topics == "YES"
                                                     or self.reuters_topics == "NO"
                                                     or self.reuters_topics == "BYPASS"):
        filename = "junk"

    if filename != "junk" and filename != "/dev/null":
        if category in self.topics:
            fullfilepath = directory + "\\" + filename
            sys.stdout.write(fullfilepath + "\n")
            doc_file = open(fullfilepath, "w")
            # we're only interested in the title and body, so just combine them
            all_content = self.title + self.body
            # convert to lowercase
            all_content = all_content.lower()
            # remove everything except letters and spaces
            all_content = re.sub("[^a-z ]", " ", all_content)
            # strip out multiple spaces
            all_content = re.sub(r'\s+', r' ', all_content)
            # make the string into a list and remove stopwords from it
            all_content_split = all_content.split()
            all_content_no_stopwords = remove_stopwords(all_content_split)
            fd = FreqDist(all_content_no_stopwords)
            doc_file.write(" ".join(fd.keys()))
            doc_file.close()

    # Reset variables
    self.in_topics = 0
    self.in_title = 0
    self.in_body = 0
    self.reuters_lewis_split = ""
    self.reuters_topics = ""
    self.doc_id = 0
    self.topics = []
    self.title = ""
    self.body = ""

def main():
    userInput = parser.getInput()
    fileList = parser.getFiles(userInput['train'])
    parsedata = parser.parseFiles(fileList)
    allsent = ''
    for f in parsedata:
        allsent += f[3]
    all_words = FreqDist(w.lower() for w in word_tokenize(allsent)
                         if w not in stopwords.words('english'))
    global top_words
    top_words = all_words.keys()[:500]
    featdata = extractor.featureAggregator(parsedata)
    # print featdata[20]
    print "Sample Data Item:\n\n"
    print "%20s %4s %4s %20s" % ("FILENAME", "LINENUM", "VOTE", "SENTENCE")
    print "-" * 79
    print "%10s %4s %4s %20s" % (featdata[20][0], featdata[20][1],
                                 featdata[20][2], featdata[20][3])
    print "\n\nFeatures of this Data Item"
    print "-" * 79
    for key, val in featdata[20][4].items():
        print "%50s : %10s" % (key, val)
    # print "A sample feature: %s" % (featdata[20][4])
    allacc = splitfeatdata(featdata)
    print "\n\n"
    print "-" * 60
    print "Accuracy Values: %s" % (allacc)
    print "==" * 60
    print "Overall Classifier Accuracy %4.4f " % (sum(allacc) / len(allacc))

def nltk_test_1():
    fd = FreqDist()
    # for each token in the relevant text, increment its counter
    for word in gutenberg.words('austen-persuasion.txt'):
        fd[word.lower()] += 1
    print fd.N()  # total number of samples
    print fd.B()  # number of bins or unique samples
    # Get a list of the top 10 words sorted by frequency
    l = []
    for word in fd.keys():
        tp = (word, fd[word])
        l.append(tp)
    l.sort(key=lambda x: x[1], reverse=True)
    for itr in l[:10]:
        print itr[0], itr[1]

def feature_selection_freq(self, instance_list, limits):
    """Select the `limits` most frequent words as features."""
    # to store all the tokens
    all_words = []
    # populate the label codebook and collect tokens
    for i in instance_list:
        if self.label_codebook.has_label(i.label) == False:
            self.label_codebook.add(i.label)
        all_words += i.raw_data
    # select the 'limits' most frequent words as features
    fdict = FreqDist([w for w in all_words])
    word_feature = fdict.keys()[:limits]
    for wd in word_feature:
        self.feature_codebook.add(wd)

def transfer(fileDj, vocabulary):
    fo = open(fileDj, "r")
    content = fo.read()
    tokens = nltk.word_tokenize(content)
    # st = [SBStemmer.stem(t) for t in tokens]
    st = tokens
    fo.close()
    fdist = FreqDist(st)
    BOWDj = []
    for key in vocabulary:
        if key in fdist.keys():
            BOWDj.append(fdist.get(key))
        else:
            BOWDj.append(0)
    return BOWDj

def accuracy(classifier, calc):
    print("====== ESTIMATING CLASSIFIER ACCURACY ======")
    ast = Normalizer.normalize(astronomic_test)
    rel = Normalizer.normalize(religion_test)
    cou = Normalizer.normalize(countries_test)

    print("Test set generation")
    test_set = ([(x, "astronomy") for x in ast] +
                [(x, "religion") for x in rel] +
                [(x, "country") for x in cou])
    del ast
    del rel
    del cou

    vocabulary = FreqDist(chain(*[n for n, tag in test_set]))
    vocabulary = list(vocabulary.keys())[:100]
    feature_set = [({i: (i in sentence) for i in vocabulary}, tag)
                   for sentence, tag in test_set]
    print("Trained classifier estimated accuracy:",
          classify.accuracy(classifier, feature_set))
    if (calc):
        calculate(classifier, feature_set)

def sort_tfidf(infilename, outfilename):
    inputfile = open(infilename)
    outputfile = open(outfilename, 'w')
    freqdist = FreqDist()
    for line in inputfile:
        line = line.strip()
        words = line.split("\t")
        freqdist[words[0]] = float(words[1])
    for word in freqdist.keys():
        tmp = word + "\t" + str(freqdist[word]) + "\n"
        outputfile.write(tmp)
    inputfile.close()
    outputfile.close()

def get_word_features(lines):
    """Create a reference word feature."""
    wordlist = []
    for line in lines:
        wordlist += word_tokenize(line)
    # remove stopwords
    wordlist = [w for w in wordlist if w not in stopwords.words('english')]
    # remove proper nouns
    taglist = pos_tag(wordlist)
    wordlist = [w for (w, tag) in taglist if tag != "NP"]
    wordlist = FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

def category_by_pos():
    from nltk.corpus import brown
    from nltk import FreqDist
    from nltk import DecisionTreeClassifier
    from nltk import NaiveBayesClassifier
    from nltk import classify

    suffix_fdist = FreqDist()
    for word in brown.words():
        word = word.lower()
        suffix_fdist.inc(word[-1:])
        suffix_fdist.inc(word[-2:])
        suffix_fdist.inc(word[-3:])
    common_suffixes = suffix_fdist.keys()[:100]
    # print common_suffixes

    def pos_features(word):
        features = {}
        for suffix in common_suffixes:
            features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
        return features

    tagged_words = brown.tagged_words(categories='news')
    featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
    # classifier = DecisionTreeClassifier.train(train_set)
    # print 'Decision Tree %f' % classify.accuracy(classifier, test_set)
    classifier = NaiveBayesClassifier.train(train_set)
    print 'NaiveBayes %f' % classify.accuracy(classifier, test_set)