def calc_hapax_index(text: List[str]) -> float:
    """
    Compute the hapax index.

    Description:
        A hapax is a word that occurs in a text only once.
        An author's hapaxes are often used to attribute another work,
        in which such words appear, to that author.

    References:
        https://ru.wikipedia.org/wiki/Гапакс
        https://en.wikipedia.org/wiki/Hapax_legomenon

    Arguments:
        text (list[str]): List of words

    Returns:
        float: Value of the index
    """
    n_words = len(text)
    n_lexemes = len(set(text))
    num = 100 * log10(n_words)
    freqs = FreqDist(text)
    hapaxes = len(freqs.hapaxes())
    den = 1 - (safe_divide(hapaxes, n_lexemes))
    hapax_index = safe_divide(num, den)
    return hapax_index
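# A minimal usage sketch for calc_hapax_index above. The imports and the
# safe_divide helper are assumptions (the snippet does not show them); in the
# original module they would appear before the function definition, and
# safe_divide is assumed to return 0.0 instead of raising on a zero denominator.
from math import log10
from typing import List
from nltk import FreqDist


def safe_divide(numerator, denominator):
    # assumed behavior of the helper used by calc_hapax_index
    return numerator / denominator if denominator else 0.0


words = "the cat sat on the mat with the hat".split()
print(calc_hapax_index(words))  # most of these words are hapaxes, so the index is high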
def hapaxes(entrada):
    '''
    Hapaxes are words that occur only once in the text.
    This function is responsible for returning a list of those words.
    '''
    fdist = FreqDist(entrada)
    return fdist.hapaxes()
def load_book_features(file_name):
    with open(file_name, 'r') as file_handler:
        text = file_handler.read()

    morph = pymorphy2.MorphAnalyzer()
    sentence_list = sent_tokenize(text)

    usual_book_words = []
    sentences_length_dist = []
    words_length_dist = []
    pron_dist = []
    conj_dist = []

    for sentence in sentence_list:
        if sentence != ".":
            pron_count = 0
            conj_count = 0
            sentence_words = re.findall(r"[\w]+", sentence)
            sentences_length_dist.append(len(sentence_words))
            for word in sentence_words:
                words_length_dist.append(len(word))
                if word in NOMINATIVE_PRONOUNS:
                    pron_count += 1
                if morph.parse(word)[0].tag.POS == 'CONJ':
                    conj_count += 1
                if word not in STOPWORDS:
                    usual_book_words.append(word)
            conj_dist.append(conj_count)
            pron_dist.append(pron_count)

    sentence_length_freq_dist = FreqDist(sentences_length_dist)
    sentences_length_dist = [sentence_length_freq_dist.freq(i) for i in range(1, RANGE + 1)]
    sentences_length_dist.append(1 - sum(sentences_length_dist))

    words_length_freq_dist = FreqDist(words_length_dist)
    words_length_dist = [words_length_freq_dist.freq(i) for i in range(1, RANGE + 1)]
    words_length_dist.append(1 - sum(words_length_dist))

    pron_freq_dist = FreqDist(pron_dist)
    pron_dist = [pron_freq_dist.freq(i) for i in range(0, RANGE + 1)]
    pron_dist.append(1 - sum(pron_dist))

    conj_freq_dist = FreqDist(conj_dist)
    conj_dist = [conj_freq_dist.freq(i) for i in range(0, RANGE + 1)]
    conj_dist.append(1 - sum(conj_dist))

    words_freq_dist = FreqDist(usual_book_words)
    num_unique_words = len(words_freq_dist.keys())
    num_total_words = len(usual_book_words)

    hapax = len(words_freq_dist.hapaxes()) / num_unique_words
    dis = len([item for item in words_freq_dist if words_freq_dist[item] == 2]) / num_unique_words
    richness = num_unique_words / num_total_words

    return [hapax, dis, richness, *sentences_length_dist, *words_length_dist, *pron_dist, *conj_dist]
def hapaxes(self, words=False, filtrate=False):
    '''Method that extracts the words occurring only once in the text.'''
    if not words:  # search among the lemmas
        res = self._vocab
    else:
        res = FreqDist(self.words(filtrate=filtrate))
    return res.hapaxes()
def replaceUnique(self):
    """ Replaces unique words with the UNK label """
    word_frequencies = FreqDist([word for (word, _) in self.tagged_sents])
    self.lexicon_size = len(word_frequencies)
    hap = set(word_frequencies.hapaxes())
    res = [(UNK, tag) if word in hap else (word, tag)
           for (word, tag) in self.tagged_sents]
    self.tagged_sents = res
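# The same idea as a self-contained sketch: the UNK label and the flat
# (word, tag) list are assumptions mirroring the class method above.
from nltk import FreqDist

UNK = "<UNK>"  # assumed placeholder label

tagged = [("the", "DT"), ("dog", "NN"), ("barks", "VBZ"), ("the", "DT"), ("cat", "NN")]
hapax_words = set(FreqDist(word for word, _ in tagged).hapaxes())

# replace every word that occurs only once with the UNK label, keeping its tag
tagged_unk = [(UNK if word in hapax_words else word, tag) for word, tag in tagged]
print(tagged_unk)  # [('the', 'DT'), ('<UNK>', 'NN'), ('<UNK>', 'VBZ'), ('the', 'DT'), ('<UNK>', 'NN')]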
def processText(text):
    print("Processing...")
    text = '<s> ' + text
    text = text.replace('\r\n', ' </s>\r\n<s> ')
    text = ' </s>\r\n'.join(text.rsplit(' </s>\r\n<s> ', 1))
    textTkns = nltk.word_tokenize(text)
    textFD = FreqDist(textTkns)
    textSingles = textFD.hapaxes()
    for word in textSingles:
        text = text.replace(" " + word + " ", " <UNK> ")
    print("Done")
    return text
def make_vocabs(normalized_data, is_pickle=True):
    """
    Build the bag-of-words/vocabulary.

    The vocabulary is used to build feature vectors from the normalized data.
    Several treatments are applied when building it, among them:
    (1) remove hapaxes: words that occur only once in the whole corpus
    (2) keep only verbs
    (3) remove hapaxes and keep only words longer than 2 characters

    return: all_words = the resulting vocabulary/bag-of-words
    parameters: normalized_data = text data that has already been preprocessed/normalized
    """
    all_words = [
        word for sentence in normalized_data for word in sentence.split()
    ]
    fd = FreqDist(all_words)  # build the FreqDist object before converting to a set

    all_words = list(sorted(set(all_words)))
    print('initial number of features:\t\t', len(all_words))

    # (1)
    hapaxes = fd.hapaxes()
    # all_words = [word for word in all_words if word not in hapaxes]

    # (2)
    # with open('../experiment/pos_tag_indo.pkl', 'rb') as file:
    #     jj = pickle.load(file)
    # all_words_adj = [word for word in all_words if word in jj]
    # all_words = all_words_adj

    all_words = [
        word for word in all_words if len(word) > 2 and word not in hapaxes
    ]

    file_path = os.getcwd() + '/data/dinamics/vocabs.pkl'
    if is_pickle:
        with open(file_path, 'wb') as data:
            pickle.dump(all_words, data)

    return all_words
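# A possible call of make_vocabs above with a toy normalized_data list
# (illustrative strings only); is_pickle=False so nothing is written to disk.
normalized_data = [
    "harga murah barang bagus",
    "barang bagus pengiriman cepat",
    "pengiriman cepat harga murah",
]
vocab = make_vocabs(normalized_data, is_pickle=False)
print(vocab)  # every remaining word occurs at least twice and is longer than 2 characters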
def preprocess(train_data):
    all_words = []
    for article in train_data:
        article_words = list(set(article.split(',')))
        for word in article_words:
            all_words.append(word)
    dist = FreqDist(all_words)
    least_frequent_words = dist.hapaxes()
    for word in least_frequent_words:
        if word in dist:
            del dist[word]
    vocab = set(dist.keys())
    return vocab
def train_finder(self, all_listings):
    """
    Train the product identification algorithm with example data.
    """
    logging.info("Start training of recognizer for product: {0}"
                 .format(self.product_id))
    self.classifier = None

    # Select example listings for the finder's product
    listings, n_pos, n_neg = self.filter_trainig_samples(all_listings)
    logging.info("Number listings: {l}, positive: {p}, negative: {n}; "
                 "features: {f}"
                 .format(l=len(listings), p=n_pos, n=n_neg, f=self.n_features))
    if len(listings) < 30:
        logging.warning("Product {0}. Can't compute classifier. "
                        "Too few listings.".format(self.product_id))
        return
    elif n_pos < 10:
        logging.warning("Product {0}. Can't compute classifier. "
                        "Too few positive listings.".format(self.product_id))
        return
    elif n_neg < 10:
        logging.warning("Product {0}. Can't compute classifier. "
                        "Too few negative listings.".format(self.product_id))
        return

    # Create list of most common words, and put it into the feature extractor
    # TODO: remove stop-words
    self.feature_extractor = FeatureExtractor()
    word_freqs = FreqDist()
    for _, listing in listings.iterrows():
        words = self.feature_extractor.extract_words(listing)
        word_freqs.update(words)
    common_words = [word for word, _ in word_freqs.most_common(self.n_features)]
    self.feature_extractor = FeatureExtractor(common_words)
    logging.debug("Number individual words: {0}; hapaxes: {1}"
                  .format(len(word_freqs), len(word_freqs.hapaxes())))
    logging.debug("Most common words: {}"
                  .format([word for word, _ in word_freqs.most_common(100)]))

    # Train the classifier
    train_set = self.create_labeled_features(listings)
    self.classifier = nltk.NaiveBayesClassifier.train(train_set)
    self.classifier.show_most_informative_features(20)
def getUniqueWords():
    raw_text_all = ''
    for csv in allCsvs:
        filename = 'csvs/' + csv + '.csv'
        df = pd.read_csv(filename, index_col=0)
        for line in df['Lines']:
            raw_text_all += line + '\n'
    tokens = word_tokenize(raw_text_all)
    text = nltk.Text(tokens)
    # text.collocations()
    fdist = FreqDist(text)
    unique = fdist.hapaxes()
    sort_unique = sorted(unique)
    print(sort_unique)
def get_news_features(headline, text):
    nlp = es_core_news_md.load()

    ## headline ##
    headline = re.sub(r"http\S+", "", headline)
    headline = re.sub(r"http", "", headline)
    headline = re.sub(r"@\S+", "", headline)
    headline = re.sub("\n", " ", headline)
    headline = re.sub(r"(?<!\n)\n(?!\n)", " ", headline)
    headline = headline.replace(r"*NUMBER*", "número")
    headline = headline.replace(r"*PHONE*", "número")
    headline = headline.replace(r"*EMAIL*", "email")
    headline = headline.replace(r"*URL*", "url")
    headline_lower = headline.lower()
    doc_h = nlp(headline_lower)

    list_tokens_h = []
    list_tags_h = []
    for sentence_h in doc_h.sents:
        for token in sentence_h:
            list_tokens_h.append(token.text)

    fdist_h = FreqDist(list_tokens_h)
    syllables_h = get_nsyllables(headline)
    words_h = len(list_tokens_h)

    # headline complexity features
    avg_word_size_h = round(sum(len(word) for word in list_tokens_h) / words_h, 2)
    avg_syllables_word_h = round(syllables_h / words_h, 2)
    unique_words_h = round((len(fdist_h.hapaxes()) / words_h) * 100, 2)
    mltd_h = round(ld.mtld(list_tokens_h), 2)
    ttr_h = round(ld.ttr(list_tokens_h) * 100, 2)

    ## text content ##
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"http", "", text)
    text = re.sub("\n", " ", text)
    text = text.replace(r"*NUMBER*", "número")
    text = text.replace(r"*PHONE*", "número")
    text = text.replace(r"*EMAIL*", "email")
    text = text.replace(r"*URL*", "url")

    # to later calculate upper case letters ratio
    alph = list(filter(str.isalpha, text))
    text_lower = text.lower()
    doc = nlp(text_lower)

    list_tokens = []
    list_pos = []
    list_tag = []
    list_entities = []
    sents = 0

    for entity in doc.ents:
        list_entities.append(entity.label_)

    for sentence in doc.sents:
        sents += 1
        for token in sentence:
            list_tokens.append(token.text)
            list_pos.append(token.pos_)
            list_tag.append(token.tag_)

    # Calculate entities, pos, tag, freq, syllables, words and quotes
    entities = len(list_entities)
    n_pos = nltk.Counter(list_pos)
    n_tag = nltk.Counter(list_tag)
    fdist = FreqDist(list_tokens)
    syllables = get_nsyllables(text)
    words = len(list_tokens)
    quotes = n_tag['PUNCT__PunctType=Quot']

    # complexity features
    avg_word_sentence = round(words / sents, 2)
    avg_word_size = round(sum(len(word) for word in list_tokens) / words, 2)
    avg_syllables_word = round(syllables / words, 2)
    unique_words = round((len(fdist.hapaxes()) / words) * 100, 2)
    ttr = round(ld.ttr(list_tokens) * 100, 2)

    # readability spanish test
    huerta_score = round(206.84 - (60 * avg_syllables_word) - (1.02 * avg_word_sentence), 2)
    szigriszt_score = round(206.835 - ((62.3 * syllables) / words) - (words / sents), 2)

    # stylometric features
    mltd = round(ld.mtld(list_tokens), 2)
    upper_case_ratio = round(sum(map(str.isupper, alph)) / len(alph) * 100, 2)
    entity_ratio = round((entities / words) * 100, 2)
    quotes_ratio = round((quotes / words) * 100, 2)
    propn_ratio = round((n_pos['PROPN'] / words) * 100, 2)
    noun_ratio = round((n_pos['NOUN'] / words) * 100, 2)
    pron_ratio = round((n_pos['PRON'] / words) * 100, 2)
    adp_ratio = round((n_pos['ADP'] / words) * 100, 2)
    det_ratio = round((n_pos['DET'] / words) * 100, 2)
    punct_ratio = round((n_pos['PUNCT'] / words) * 100, 2)
    verb_ratio = round((n_pos['VERB'] / words) * 100, 2)
    adv_ratio = round((n_pos['ADV'] / words) * 100, 2)
    sym_ratio = round((n_tag['SYM'] / words) * 100, 2)

    # create df_features
    df_features = pd.DataFrame({
        'text': text_lower,
        'headline': headline_lower,
        'words_h': words_h,
        'word_size_h': [avg_word_size_h],
        'avg_syllables_word_h': [avg_syllables_word_h],
        'unique_words_h': [unique_words_h],
        'ttr_h': ttr_h,
        'mltd_h': [mltd_h],
        'sents': sents,
        'words': words,
        'avg_words_sent': [avg_word_sentence],
        'avg_word_size': [avg_word_size],
        'avg_syllables_word': avg_syllables_word,
        'unique_words': [unique_words],
        'ttr': [ttr],
        'huerta_score': [huerta_score],
        'szigriszt_score': [szigriszt_score],
        'mltd': [mltd],
        'upper_case_ratio': [upper_case_ratio],
        'entity_ratio': [entity_ratio],
        'quotes': quotes,
        'quotes_ratio': [quotes_ratio],
        'propn_ratio': [propn_ratio],
        'noun_ratio': [noun_ratio],
        'pron_ratio': [pron_ratio],
        'adp_ratio': [adp_ratio],
        'det_ratio': [det_ratio],
        'punct_ratio': [punct_ratio],
        'verb_ratio': [verb_ratio],
        'adv_ratio': [adv_ratio],
        'sym_ratio': [sym_ratio]
    })
    return df_features
# print('text_stemmed\n', text_stemmed)
# print()
# print('texts_normalized', texts_normalized)
# print()

all_words = [
    word for sentence in texts_normalized for word in sentence.split()
]

from nltk import FreqDist

fd = FreqDist(all_words)  # build the FreqDist object before converting to a set
all_words = list(sorted(set(all_words)))

hapaxes = fd.hapaxes()
# print('hapaxes', hapaxes)

all_words = [
    word for word in all_words if len(word) > 2 and word not in hapaxes
]

# print('features:')
# print(len(all_words), all_words)

from vectorizers import binary_vectorizer, count_vectorizer, tfidf_vectorizer

# biner = binary_vectorizer(texts_normalized, all_words)
# count = count_vectorizer(texts_normalized, all_words)
# tfidf = tfidf_vectorizer(texts_normalized, all_words)
'''
for word in gutenberg.words('austen-persuasion.txt'):
    fd[word] += 1
'''
'''
fd = FreqDist(gutenberg.words('austen-persuasion.txt'))
print(fd.N())
print(fd.B())
for word in sorted(fd.keys()):
    print(word, fd[word])
'''
# text1.concordance("monstrous")
# text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])
# text3.generate()  # not supported in NLTK 3
# print(len(text3) / len(set(text3)))


def lexical_diversity(text):
    return len(text) / len(set(text))


def percentage(count, total):
    return 100 * count / total


fdist1 = FreqDist(text1)
print(fdist1)
print(fdist1.most_common(50))
# fdist1.plot(50, cumulative=True)
print(len(fdist1.hapaxes()))
# print(books.fileids())
text1 = books.words(fileids=['essays_and_wisdom_of_the_ancients.txt'])
text2 = books.words(fileids=['new_atlantis.txt'])
text3 = books.words(fileids=['novum_organum.txt'])
text4 = books.words(fileids=['of_gardens.txt'])
text5 = books.words(fileids=['shakespeare.txt'])
text6 = books.words(fileids=['the_advancement_of_learning.txt'])

fdist1 = FreqDist(text1)
fdist2 = FreqDist(text2)
fdist3 = FreqDist(text3)
fdist4 = FreqDist(text4)
fdist5 = FreqDist(text5)
fdist6 = FreqDist(text6)

shakespeare_list = fdist5.hapaxes()
bacon_list1 = fdist1.hapaxes()
bacon_list2 = fdist2.hapaxes()
bacon_list3 = fdist3.hapaxes()
bacon_list4 = fdist4.hapaxes()
bacon_list6 = fdist6.hapaxes()

bacon_set = set(bacon_list1 + bacon_list2 + bacon_list3 + bacon_list4 + bacon_list6)
bacon_list = []
for i in bacon_set:
    bacon_list.append(i)

shake_numless = []
bacon_numless = []
print(stopwords)
nostopwords = [word for word in allTokens if word not in stopwords]

# NLTK.Text
text = nltk.Text(nostopwords)

# Collocations (words that frequently appear together)
colos = text.collocations()
print(colos)

# count
print(text.count('inlet'))

# words in similar contexts
print(text.similar('ship'))

text.dispersion_plot(['north', 'south', 'east', 'west'])
text.dispersion_plot(['ship', 'dock', 'boat', 'canoe', 'steamboat'])

# Frequency distributions!
from nltk import FreqDist

fdist = FreqDist(text)
print(fdist.hapaxes())  # words that occur only once
print(fdist.most_common(50))
fdist.plot(30)
return len(set(text)) / len(text)


# List
nltk.book.sent1

# Frequency Distributions
from nltk import FreqDist

fdist1 = FreqDist(text1)

# How to generate descriptors of the text: the most repeated words
# Cumulative frequency distribution plot
fdist1.plot(50, cumulative=True)

# The least repeated words
fdist1.hapaxes()

# Long Words using len(w)
# "the set of all w such that w is an element of V (the vocabulary) and w has property P".
V = text1
long_words = [w for w in V if len(w) > 15]
sorted(long_words)

# Long and frequent words to typify the text
fdist5 = FreqDist(nltk.book.text5)
sorted(w for w in set(nltk.book.text5) if len(w) > 7 and fdist5[w] > 7)

# Bigrams
from nltk.util import bigrams

list(bigrams(['more', 'is', 'said', 'than', 'done']))
def part1():
    # Files: input (read as a single string) and output files (write mode)
    inf = open("microblog2011.txt").read()
    outa = open("microblog2011_tokenized.txt", 'w')
    outb = open("Tokens.txt", 'w')

    # Initialize the tweet tokenizer and write the tokens to the output file.
    tknzr = TweetTokenizer()
    a = tknzr.tokenize(inf)
    outa.writelines(str(a))

    # How many tokens did you find in the corpus? How many types (unique tokens) did you have?
    # What is the type/token ratio for the corpus?
    print('Total number of tokens found in the corpus: ' + str(len(a)))
    print('Total number of unique token types: ' + str(len(set(a))))
    print('Type/Token Ratio for a (tokenized original) (Lexical Diversity): ' +
          str(len(set(a)) / len(a)))

    # For each token, print the token and its frequency in a file called Tokens.txt
    # (from the most frequent to the least frequent) and include the first 100 lines in your report.
    fdist1 = FreqDist(a)
    outb.write(str(fdist1.most_common(10000000)))

    # How many tokens appeared only once in the corpus?
    print('Total number of tokens found in the corpus only once: ' +
          str(len(fdist1.hapaxes())))

    # From the list of tokens, extract only words, by excluding punctuation and other symbols.
    # How many words did you find?
    # List the top 100 most frequent words in your report, with their frequencies.
    # What is the type/token ratio when you use only word tokens (called lexical diversity)?
    b = []
    for tokens in a:
        a_withoutsymbols = strip_all_entities(strip_links(tokens))
        b.append(a_withoutsymbols)
    print('After stripping the tokens of all symbols, words found: ' + str(len(b)))
    fdist2 = FreqDist(b)
    print('The top 100 most common tokens with their frequencies: ' +
          str(fdist2.most_common(100)))
    print('Type/Token Ratio for b (only words) (Lexical Diversity): ' +
          str(len(set(b)) / len(b)))

    # From the list of words, exclude stopwords. List the top 100 most frequent words and their frequencies.
    # You can use this list of stopwords (or any other that you consider adequate, or NLTK stopwords [recommended!]).
    stop_words = set(stopwords.words('english'))
    filtered_sentence = []
    for word in b:
        if word not in stop_words:
            filtered_sentence.append(word)
    fdist3 = FreqDist(filtered_sentence)
    print('The top 100 most common tokens with their frequencies: ' +
          str(fdist3.most_common(100)))
    print('Type/Token Ratio for filtered_sentence (no stopwords, only words) (Lexical Diversity): ' +
          str(len(set(filtered_sentence)) / len(filtered_sentence)))

    # Compute all the pairs of two consecutive words (excluding stopwords and punctuation).
    # List the most frequent 100 pairs and their frequencies in your report.
    # Also compute the type/token ratio when you use only word tokens without stopwords (called lexical density).
    bigram_list = list(bigrams(filtered_sentence))
    fdist4 = FreqDist(bigram_list)
    print("The top 100 most common tokens with their frequencies: " +
          str(fdist4.most_common(100)))
    print('Type/Token Ratio for bigram_list (no stopwords, only words) (Lexical Diversity): ' +
          str(len(set(bigram_list)) / len(bigram_list)))

    # Extract multi-word expressions (composed of two or more words, so that the meaning of the expression
    # is more than the composition of the meanings of its words).
    # Use NLTK and Python (explain how).
    # List the most frequent 100 expressions extracted.
    token = filtered_sentence
    mwe = range_ngrams(token, ngramRange=(1, 6))
    fdist5 = FreqDist(mwe)
    print('The top 100 most common tokens with their frequencies: ' +
          str(fdist5.most_common(100)))

    # Close both output files.
    outa.close()
    outb.close()
class Analyzer(object):
    def __init__(self, text):
        self.text = text
        self.token_counts = FreqDist(text)

    def numberOfTokens(self):
        # returns number of tokens in the text
        return len(self.text)

    def vocabulary(self):
        # returns a list of the vocabulary of the text sorted alphabetically
        return sorted(set(self.text))

    def vocabularySize(self):
        # returns the size of the vocabulary
        return len(self.vocabulary())

    def lexicalRichness(self):
        # returns the lexical richness of the text
        return self.numberOfTokens() / self.vocabularySize()

    def hapaxes(self):
        # returns all hapaxes of the text
        return self.token_counts.hapaxes()

    def numberOfHapaxes(self):
        # returns the number of hapaxes in the text
        return len(self.hapaxes())

    def avWordLength(self):
        # returns the average word length of the text
        sum = 0
        for word in self.token_counts:
            sum = sum + len(word)
        return int(sum / self.vocabularySize())

    def topSuffixes(self):
        # returns the 10 most frequent 2-letter suffixes in words
        # (restrict to words of length 5 or more)
        freq = {}
        listsuf = []
        for word in self.vocabulary():
            if len(word) >= 5:
                if word[-2:] in freq:
                    freq[word[-2:]] = freq[word[-2:]] + 1
                else:
                    freq[word[-2:]] = 1
        for key, value in sorted(freq.items(), key=lambda x: x[1], reverse=True):
            listsuf.append(key)
        return listsuf[:10]

    def topPrefixes(self):
        # returns the 10 most frequent 2-letter prefixes in words
        # (restrict to words of length 5 or more)
        freq = {}
        listpre = []
        for word in self.vocabulary():
            if len(word) >= 5:
                if word[:2] in freq:
                    freq[word[:2]] = freq[word[:2]] + 1
                else:
                    freq[word[:2]] = 1
        for key, value in sorted(freq.items(), key=lambda x: x[1], reverse=True):
            listpre.append(key)
        return listpre[:10]

    def tokensTypical(self):
        # returns first 5 tokens of the (alphabetically sorted) vocabulary
        # that contain both often seen prefixes and suffixes in the text.
        # Hint: use topPrefixes() and topSuffixes() methods
        toppre = self.topPrefixes()
        topsuf = self.topSuffixes()
        listtoken = []
        for token in self.vocabulary():
            if token[:2] in toppre and token[-2:] in topsuf:
                listtoken.append(token)
        return listtoken[:5]
fd = FreqDist(brown.words())

# Find the most frequent words in a text:
# http://stackoverflow.com/questions/268272/getting-key-with-maximum-value-in-dictionary
import operator

max(fd.items(), key=operator.itemgetter(1))
sorted(fd.items(), key=operator.itemgetter(1), reverse=True)[:10]

# Or use the wrapper function
fd.most_common(10)

# plot the most frequent words
fd.plot(10)
fd.plot(10, cumulative=True)

# See the words with the lowest frequency (these words are called hapaxes)
fd.hapaxes()

# Count all the words
len(text1)

# count unique words
len(set(text1))

# count unique words, irrespective of word case
len(set(w.lower() for w in text1))

# Find the words that are more than 15 characters long
words = set(brown.words())
long_words = [w for w in words if len(w) > 15]

# Words that occur more than 7 times and are more than 7 characters long
rare_and_long = sorted(w for w in set(brown.words()) if len(w) > 7 and fd[w] > 7)
tokenized_text = [nltk.word_tokenize(each_case) for each_case in cleaned_text]
tokenized_text = [[
    stemmer.stem(word) for word in each_case if word not in stopwords
] for each_case in tokenized_text]
tot_text = list(chain.from_iterable(tokenized_text))

fdist = FreqDist(tot_text)
wordList = list(fdist.values())
wordArray = np.array(wordList)
print('50% quantile word count of', np.percentile(wordArray, 50))
print(fdist.most_common(30))

# plotting fdist on a cumulative chart
fdist.plot(30, cumulative=True)
# plotting fdist on a non-cumulative chart
fdist.plot(30)

print('seldom appearing words:', fdist.hapaxes())

tfidf_text = []
for each_case in tokenized_text:
    tfidf_text.append(' '.join(word for word in each_case))
# tfidf_text

# create a tfidf vectorizer to convert the text into tfidf
tfidf_vectorizer = TfidfVectorizer(min_df=10, max_df=1.0)
tfidf = tfidf_vectorizer.fit_transform(tfidf_text)
feature_names = tfidf_vectorizer.get_feature_names()

# examining each feature in the document and also their corresponding tfidf
for col in tfidf.nonzero()[1]:
    print(feature_names[col], ' - ', tfidf[0, col], ' - ', tfidf.indices[col])
import nltk
from nltk.corpus import gutenberg  # import the gutenberg corpus

##################################################################
## FreqDist tracks sample frequencies in a distribution
from nltk import FreqDist  # import the FreqDist class

fd = FreqDist(gutenberg.words('austen-persuasion.txt'))  # instantiate a frequency distribution over the tokens in the text
print(fd)  # <FreqDist with 51156 samples and 2621613 outcomes>; 51156 distinct values, 2621613 tokens
print(type(fd))  # <class 'nltk.probability.FreqDist'>
print(fd['the'])  # 3120; look up a word's count; a FreqDist behaves like a dict
print(fd.N())  # 98171; counts words (not letters), duplicates included
print(fd.B())  # 6132; number of bins or unique samples; identical words fall into the same bin
print(len(fd.keys()), type(fd.keys()))  # 6132 <class 'dict_keys'>
print(fd.keys())  # fd.B() only gives the count; this prints the whole vocabulary
print(fd.max())  # the single most frequent word
print(fd.freq('the'))  # 0.03178127960395636; relative frequency, 3120 / 98171
print(fd.hapaxes())  # ['[', 'Persuasion', 'Jane', ...] rare words that occur only once
# The most frequent words are mostly function words, while the rarest ones (hapaxes) can only be
# understood from context; neither the most nor the least frequent words tend to characterize a text.
for idx, word in enumerate(fd):  # enumerate iterates in order of first occurrence
    if idx == 5:
        break
    print(idx, word)  # 0 [; 1 Persuasion; 2 by; 3 Jane; 4 Austen

##################################################################
## Frequency of word lengths
fdist = FreqDist(len(w) for w in gutenberg.words('austen-persuasion.txt'))
print(fdist)  # <FreqDist with 16 samples and 98171 outcomes>
print(fdist.items())  # dict_items([(1, 16274), (10, 1615), (2, 16165), (4, 15613), (6, 6538), (7, 5714), (3, 20013), (8, 3348), (13, 230), (9, 2887), (5, 8422), (11, 768), (12, 486), (14, 69), (15, 25), (16, 4)])
print(fdist.most_common(3))  # [(3, 20013), (1, 16274), (2, 16165)]

##################################################################
## Frequency of English characters
fdist = nltk.FreqDist(ch.lower() for ch in gutenberg.raw('austen-persuasion.txt') if ch.isalpha())  # no need to wrap the generator in [] to make a list
print(fdist.most_common(5))  # [('e', 46949), ('t', 32192), ('a', 29371), ('o', 27617), ('n', 26718)]
print([char for (char, count) in fdist.most_common()])  # the 26 letters sorted by usage frequency
    w for w in word_tok.tokenize(text_y) if w.lower() not in stop_words
]

words_n_lemmatized = [ger.lemmatise(w) for w in words_n]
words_y_lemmatized = [ger.lemmatise(w) for w in words_y]

fdistn = FreqDist(words_n)
fdisty = FreqDist(words_y)

most_common_n = fdistn.most_common(50)
most_common_y = fdisty.most_common(50)

print(fdistn[i])
print(fdisty[i])

hapax_n = fdistn.hapaxes()
hapax_y = fdisty.hapaxes()

list_n = [
    n[0] for n in most_common_n if n[0][0].isupper() and len(n[0]) > 1
]
list_y = [
    n[0] for n in most_common_y if n[0][0].isupper() and len(n[0]) > 1
]
# print(list_n)
# print(list_y)

print(set(list_n).difference(list_y))
print(set(list_y).difference(list_n))
class Analyzer(object):
    def __init__(self, path):
        '''reads the file text, creates the list of words (use nltk.word_tokenize
        to tokenize the text), and calculates the frequency distribution'''
        with open(path, 'r') as file:
            self.text = word_tokenize(file.read())
            # self.text = word_tokenize(open(path, 'r').read())
            # TODO the list of words from the text file
        self.token_counts = FreqDist(self.text)  # TODO frequency distribution of words from the text file

    def numberOfTokens(self):
        '''returns the number of tokens in the text'''
        return len(self.text)

    def vocabularySize(self):
        '''returns the size of the vocabulary of the text'''
        return len(self.token_counts)

    def lexicalDiversity(self):
        '''returns the lexical diversity of the text'''
        # high diversity means more distinct words
        return self.numberOfTokens() / self.vocabularySize()

    def getKeywords(self):
        '''returns words as possible keywords, that are longer than seven
        characters and occur more than seven times (sorted alphabetically)'''
        keys = []
        for key, value in self.token_counts.items():
            if len(key) > 7 and value > 7:
                keys.append(key)
        return sorted(keys)
        # Model solution: iterate over the types and apply the filter
        # return sorted([w for w in self.token_counts.keys() if len(w) > 7 and self.token_counts[w] > 7])

    def numberOfHapaxes(self):
        '''returns the number of hapaxes in the text'''
        return len(self.token_counts.hapaxes())

    def avWordLength(self):
        '''returns the average word length of the text'''
        # Note: the exercise asks for the average over all distinct words, not over all tokens
        # Model solution:
        # return sum([len(word) for word in self.token_counts]) / len(self.token_counts)
        sumWordLen = 0
        for word in self.token_counts:
            sumWordLen = sumWordLen + len(word)
        return sumWordLen / len(self.token_counts)

    def topSuffixes(self):
        '''returns the 10 most frequent 2-letter suffixes in words
        (restrict to words of length 5 or more)'''
        # Model solution:
        # list_of_words = [word for word in self.token_counts if len(word) >= 5]
        # suf_dict = FreqDist(suf[-2:] for suf in list_of_words)
        # suf_most_freq = [elem[0] for elem in suf_dict.most_common(10)]
        # return suf_most_freq
        suffixes = []
        for langWord in self.token_counts.keys():
            if len(langWord) >= 5:
                suffixes.append(langWord[-2:])
        return [word for word, count in Counter(suffixes).most_common(10)]

    def topPrefixes(self):
        '''returns the 10 most frequent 2-letter prefixes in words
        (restrict to words of length 5 or more)'''
        prefixes = []
        for langWord in self.token_counts.keys():
            if len(langWord) >= 5:
                prefixes.append(langWord[:2])
        return [word for word, count in Counter(prefixes).most_common(10)]

    def tokensTypical(self):
        """TODO returns first 5 tokens of the (alphabetically sorted) vocabulary
        that contain both often seen prefixes and suffixes in the text.
        As in topPrefixes() and topSuffixes(), prefixes and suffixes are 2 characters long."""
        sufixes = self.topSuffixes()
        prefixes = self.topPrefixes()
        return sorted([
            word for word in self.token_counts.keys()
            if word[:2] in prefixes and word[-2:] in sufixes
        ])[:5]
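# A hypothetical driver for the Analyzer class above; "corpus.txt" is a
# placeholder path, and the imports mirror what the class itself relies on.
from collections import Counter
from nltk import FreqDist
from nltk.tokenize import word_tokenize

analyzer = Analyzer("corpus.txt")
print(analyzer.numberOfTokens())
print(analyzer.lexicalDiversity())
print(analyzer.numberOfHapaxes())
print(analyzer.topSuffixes())
print(analyzer.tokensTypical())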
from nltk.corpus import stopwords

# Define the language
stoplist = stopwords.words('spanish')

# A sentence with a lot of junk
test_text = "El a ante con contra desde en un a el la o y puede que no jamón"

# Tokenize the sentence and compare each word against the stopword list;
# keep the cleaned list
clean_text = [
    word for word in regexp_tokenize(test_text, r'\w+')
    if word.lower() not in stoplist
]
print(clean_text)

'''###########################
#    Remove rare words       #
###########################
Because keeping names or very short/long words does not help
'''
from nltk import FreqDist

# Build the frequency distribution of each word, so that infrequent
# words, i.e. rare words, can be removed.
frecuencia_distancia = FreqDist(tokens)
raras = frecuencia_distancia.hapaxes()
limpieza_raras = [word for word in tokens if word not in raras]
print(tokens)
print(limpieza_raras)
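# The tokens list above comes from earlier in the original script; this is a
# self-contained variant of the same rare-word removal with a toy token list.
from nltk import FreqDist

tokens = ["hola", "mundo", "hola", "adiós", "mundo", "hola", "jamón"]
fdist = FreqDist(tokens)
raras = fdist.hapaxes()  # words that occur exactly once: ['adiós', 'jamón']
sin_raras = [word for word in tokens if word not in raras]
print(sin_raras)  # ['hola', 'mundo', 'hola', 'mundo', 'hola']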
import nltk
from nltk import FreqDist
from nltk.corpus import brown
from nltk.corpus import inaugural
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import math

brown_freq = FreqDist(brown.words())
print(brown_freq.most_common(10))
print(brown_freq["mother"])

for word, count in brown_freq.most_common():
    print("{} ~ {}".format(word, round(brown_freq.freq(word), 2)))

print(sorted(brown_freq.hapaxes(), key=lambda w: len(w), reverse=True)[:20])

cats = ['mystery', 'adventure']
cfd = nltk.ConditionalFreqDist((genre, word.lower())
                               for genre in cats
                               for word in brown.words(categories=genre))
print(cfd)

for cond in cfd.conditions():
    print(cond)
    print(cfd[cond].most_common(20))
    print()

for cond in cfd.conditions():
    print("mother in {} - {} - {}".format(cond, cfd[cond]["mother"],
                                          round(cfd[cond].freq("mother"), 4)))
def percentage(count, total):
    return 100 * count / total


lexical_diversity(text4)
percentage(text4.count('a'), len(text4))

# Simple statistics
from nltk import FreqDist

# Counting Words Appearing in a Text (a frequency distribution)
fdist1 = FreqDist(text4)
fdist1
vocabulary1 = list(fdist1.keys())  # list of all the distinct types in the text
vocabulary1[:3]  # look at the first 3

# words that occur only once, called hapaxes
fdist1.hapaxes()[:20]

# Words that meet a condition, for example long words
V = set(text4)
long_words = [w for w in V if len(w) > 15]
sorted(long_words)

# finding words that characterize a text: relatively long, and occur frequently
fdist = FreqDist(text4)
sorted([w for w in set(text4) if len(w) > 7 and fdist[w] > 7])

# Collocations and Bigrams.
# A collocation is a sequence of words that occur together unusually often.
# Built-in collocations function
text4.collocations()
from nltk.book import text1
from nltk.book import text4
from nltk import FreqDist
import nltk

Freq_Dist = FreqDist(text1)
print(Freq_Dist)
print(Freq_Dist.most_common(10))
print(Freq_Dist['his'])

Freq_Dist.plot(50, cumulative=False)
Freq_Dist.plot(50, cumulative=True)

Freq_Dist.hapaxes()
Once_happend = Freq_Dist.hapaxes()
print(Once_happend)

print(text4.count('america') / float(len(text4) * 100))

Value_set = set(text1)
long_words = [words for words in Value_set if len(words) > 17]
print(sorted(long_words))

my_text = ["Here", "are", "some", "words", "that", "are", "in", "a", "list"]
vocab = sorted(set(my_text))
print(vocab)

word_freq = nltk.FreqDist(my_text)
print(word_freq.most_common(5))