def ch03_42_wordnet_semantic_index():
    import nltk
    from nltk.corpus import webtext
    from nltk.corpus import wordnet as wn

    postings = []
    docids = {}
    for (pos, fileid) in enumerate(webtext.fileids()):
        docids[pos] = fileid
        wpos = 0
        words = webtext.words(fileid)
        for word in words:
            try:
                # Index the surface form, its first synset's offset, and the
                # offset of that synset's first hypernym (NLTK 3 API: offset()).
                postings.append((word.lower(), (pos, wpos)))
                offset = wn.synsets(word)[0].offset()
                postings.append((offset, (pos, wpos)))
                poffset = wn.synsets(word)[0].hypernyms()[0].offset()
                postings.append((poffset, (pos, wpos)))
            except IndexError:
                continue
            wpos = wpos + 1
    index = nltk.Index(postings)

    query = "canine"
    qpostings = []
    qpostings.extend([(pos, wpos) for (pos, wpos) in index[query]])
    try:
        offset = wn.synsets(query)[0].offset()
        qpostings.extend([(pos, wpos) for (pos, wpos) in index[offset]])
    except IndexError:
        pass

    for (pos, wpos) in qpostings:
        left = webtext.words(docids[pos])[wpos - 4:wpos]
        right = webtext.words(docids[pos])[wpos:wpos + 4]
        print(left, right)
import nltk
from pickle import dump
from nltk.corpus import brown, gutenberg, webtext


def save_other_grams():
    # high_freq() is assumed to be defined elsewhere in the project; presumably
    # it keeps only the n-grams whose relative frequency exceeds the threshold.
    HIGH_FREQ_UNI = 0.01
    HIGH_FREQ_BI = 0.02
    HIGH_FREQ_TRI = 0.02

    other_corpus_unigrams = [
        w.lower() for w in (gutenberg.words() + brown.words() + webtext.words())
    ]
    other_corpus_freq_unigrams = high_freq(other_corpus_unigrams, HIGH_FREQ_UNI)
    output = open('unigrams_data.pkl', 'wb')
    dump(other_corpus_freq_unigrams, output, -1)
    output.close()

    other_corpus_bigrams = nltk.bigrams(other_corpus_unigrams)
    other_corpus_freq_bigrams = high_freq(other_corpus_bigrams, HIGH_FREQ_BI)
    output = open('bigrams_data.pkl', 'wb')
    dump(other_corpus_freq_bigrams, output, -1)
    output.close()

    other_corpus_trigrams = nltk.trigrams(other_corpus_unigrams)
    other_corpus_freq_trigrams = high_freq(other_corpus_trigrams, HIGH_FREQ_TRI)
    output = open('trigrams_data.pkl', 'wb')
    dump(other_corpus_freq_trigrams, output, -1)
    output.close()
def add_known_words(self):
    """Add known words to the spellchecker from external and internal files."""
    # Add the known-words files if given - these words will not count as misspelled.
    if self.known_words_file_paths:
        for known_words_file_path in self.known_words_file_paths:
            self.spellchecker.word_frequency.load_text_file(known_words_file_path)

    # Add the KNOWN_WORDS to the spellchecker's recognized words.
    self.spellchecker.word_frequency.load_words(KNOWN_WORDS)

    if self.expand_dictionary:
        # nltk (the Natural Language Toolkit) is a large package containing
        # several corpora. To use it we need to download one of its word lists;
        # we use the reasonably sized "brown" and "webtext" corpora.
        # To avoid an SSL download error we disable SSL verification.
        try:
            _create_unverified_https_context = ssl._create_unverified_context
        except AttributeError:
            pass
        else:
            ssl._create_default_https_context = _create_unverified_https_context

        # Download the "brown" and "webtext" corpora from nltk.
        click.secho("Downloading expanded dictionary, this may take a minute...",
                    fg='yellow')
        nltk.download('brown')
        nltk.download('webtext')

        # Add nltk's word sets to the spellchecker.
        self.spellchecker.word_frequency.load_words(brown.words())
        self.spellchecker.word_frequency.load_words(webtext.words())
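# The snippet above is a method of a larger class. Below is a minimal
# standalone sketch of the same idea (an addition for illustration, not part of
# the original project). It assumes the pyspellchecker package is installed and
# the NLTK corpora are already downloaded; all names here are illustrative.
from nltk.corpus import brown, webtext
from spellchecker import SpellChecker

expanded_checker = SpellChecker()
# Teach the checker the corpus vocabulary so ordinary English words are not flagged.
expanded_checker.word_frequency.load_words([w.lower() for w in brown.words()])
expanded_checker.word_frequency.load_words([w.lower() for w in webtext.words()])
# Only the actual misspelling should be reported as unknown.
print(expanded_checker.unknown(["paralel", "parallel"]))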
def freq(arquivo):
    # token() is assumed to be a tokenizer helper defined elsewhere in the project.
    palavras = webtext.words(arquivo)
    that = nltk.FreqDist([s for s in token(palavras) if s == 'that'])
    # that.plot(cumulative=True)
    the = nltk.FreqDist([s for s in token(palavras) if s == 'the'])
    # the.plot(cumulative=True)
    print(f'The file {arquivo} contains {that["that"]} occurrences of "that" '
          f'and {the["the"]} of "the".')
import operator

from nltk.corpus import webtext


def main():
    # word_length, letter, words_with_wordlength and words_with_letter are
    # assumed to be defined at module level.
    text = webtext.words("grail.txt")
    for words in text:
        words = words.lower()
        if len(words) == word_length:
            if words in words_with_wordlength:
                words_with_wordlength[words] += 1
            else:
                words_with_wordlength[words] = 1
        if words[0] == letter:
            if words in words_with_letter:
                words_with_letter[words] += 1
            else:
                words_with_letter[words] = 1

    sorted_words_wordlength = sorted(words_with_wordlength.items(),
                                     key=operator.itemgetter(1), reverse=True)
    print("Most frequent word with", word_length, "letters:")
    print(sorted_words_wordlength[0][0], ":", sorted_words_wordlength[0][1])

    sorted_words_letter = sorted(words_with_letter.items(),
                                 key=operator.itemgetter(1), reverse=True)
    print("Most frequent word starting with", letter, ":")
    print(sorted_words_letter[0][0], ":", sorted_words_letter[0][1])

    for words in sorted_words_wordlength:
        if words[0][0] == letter:
            print("Most frequent word with", word_length,
                  "letters and starting with", letter + ":")
            print(words[0], ":", words[1])
            break
import random

from nltk.corpus import brown, gutenberg, movie_reviews, webtext


def generateSentence():
    corpus = random.randint(0, 3)
    if corpus == 0:
        text = brown.words()
    elif corpus == 1:
        text = gutenberg.words()
    elif corpus == 2:
        text = webtext.words()
    elif corpus == 3:
        text = movie_reviews.words()

    tweetString = ''
    lengthOfTweet = random.randint(0, 20)
    blank = ' '
    startOfWord = ''
    startOfWordIndex = 0
    startingWord = random.randint(0, (len(text) - 40))
    punctuation = [".", ",", '"', ";", ":", "?", "!", ")", "(", "*",
                   "[", "]", "‘", "“", "#"]

    # Scan forward from the random starting point to the next sentence boundary.
    for x in range(startingWord, startingWord + len(text)):
        startOfWord = text[x]
        if startOfWord == ".":
            startOfWordIndex = x
            break

    # Build the tweet from the words that follow the boundary.
    for x in range(startOfWordIndex + 1, startOfWordIndex + lengthOfTweet):
        if text[x] in punctuation:
            tweetString = tweetString + text[x]
        else:
            tweetString = tweetString + blank + text[x]
    return tweetString
def demo(scorer=None, compare_scorer=None):
    """Finds bigram collocations in the files of the WebText corpus."""
    from nltk.metrics import BigramAssocMeasures, spearman_correlation, ranks_from_scores

    if scorer is None:
        scorer = BigramAssocMeasures.likelihood_ratio
    if compare_scorer is None:
        compare_scorer = BigramAssocMeasures.raw_freq

    from nltk.corpus import stopwords, webtext

    ignored_words = stopwords.words('english')
    word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words

    for file in webtext.fileids():
        words = [word.lower() for word in webtext.words(file)]
        cf = BigramCollocationFinder.from_words(words)
        cf.apply_freq_filter(3)
        cf.apply_word_filter(word_filter)
        print(file)
        print('\t', [' '.join(tup) for tup in cf.nbest(scorer, 15)])
        print('\t Correlation to %s: %0.4f' % (
            compare_scorer.__name__,
            spearman_correlation(ranks_from_scores(cf.score_ngrams(scorer)),
                                 ranks_from_scores(cf.score_ngrams(compare_scorer)))))
def frequency(filter, arq):
    # Print how often each of the words in `filter` occurs in the given webtext file.
    fdist = nltk.FreqDist(webtext.words(arq))
    for p in filter:
        print(f'File {arq}: the word {p} occurs {fdist[p]} times')
    return webtext.words(arq)
def demo(scorer=None, compare_scorer=None):
    """Finds trigram collocations in the files of the WebText corpus."""
    # Mirrors the bigram demo above, but uses the trigram finder and measures
    # so that the code matches the docstring.
    from nltk.collocations import TrigramCollocationFinder
    from nltk.metrics import TrigramAssocMeasures, spearman_correlation, ranks_from_scores

    if scorer is None:
        scorer = TrigramAssocMeasures.likelihood_ratio
    if compare_scorer is None:
        compare_scorer = TrigramAssocMeasures.raw_freq

    from nltk.corpus import stopwords, webtext

    ignored_words = stopwords.words('english')
    word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words

    for file in webtext.fileids():
        words = [word.lower() for word in webtext.words(file)]
        cf = TrigramCollocationFinder.from_words(words)
        cf.apply_freq_filter(3)
        cf.apply_word_filter(word_filter)
        print(file)
        print('\t', [' '.join(tup) for tup in cf.nbest(scorer, 15)])
        print('\t Correlation to %s: %0.4f' % (
            compare_scorer.__name__,
            spearman_correlation(ranks_from_scores(cf.score_ngrams(scorer)),
                                 ranks_from_scores(cf.score_ngrams(compare_scorer)))))
import itertools


def raw_word_generator():
    from nltk.corpus import webtext, reuters, brown, gutenberg
    return (w.lower() for w in itertools.chain(
        brown.words(),
        webtext.words(),
        reuters.words(),
        gutenberg.words(),
    ) if w.isalnum())
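# Usage sketch (added for illustration; not part of the original snippet): the
# generator streams straight into a frequency distribution without materialising
# the four corpora as one big list. Assumes the brown, webtext, reuters and
# gutenberg corpora have already been fetched with nltk.download().
import nltk

combined_freqs = nltk.FreqDist(raw_word_generator())
print(combined_freqs.most_common(10))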
from nltk.corpus import stopwords, webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures


def get_bigrams(filelocation, ratio):
    '''BigramCollocationFinder constructs two frequency distributions:
    one for each word, and another for bigrams.'''
    words = [w.lower() for w in webtext.words(filelocation)]
    stopset = set(stopwords.words('english'))
    filter_stops = lambda w: len(w) < 3 or w in stopset
    bcf = BigramCollocationFinder.from_words(words)
    bcf.apply_word_filter(filter_stops)
    return bcf.nbest(BigramAssocMeasures.likelihood_ratio, ratio)
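# Usage sketch (added; not from the original source). Note that despite its
# name, the `ratio` argument is the number of top-scoring bigrams returned.
top_bigrams = get_bigrams('grail.txt', 20)
print(top_bigrams[:5])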
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures


def get_trigrams(filelocation, ratio):
    '''In addition to BigramCollocationFinder, there's also
    TrigramCollocationFinder, which finds triplets instead of pairs.'''
    words = [w.lower() for w in webtext.words(filelocation)]
    stopset = set(stopwords.words('english'))
    filter_stops = lambda w: len(w) < 3 or w in stopset
    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_word_filter(filter_stops)
    tcf.apply_freq_filter(3)
    return tcf.nbest(TrigramAssocMeasures.likelihood_ratio, ratio)
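# Usage sketch (added; not from the original source): the personals ads in
# 'singles.txt' yield noticeably different collocations than the film script.
print(get_trigrams('singles.txt', 10))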
import glob

from nltk.corpus import webtext


def load_data(folder_name):
    # ROOT_DIR is assumed to be defined at module level. The filter list below
    # holds mojibake tokens, boilerplate words and single letters to discard.
    filter_list = ['âˇ', 'ŕş', 'đžń', 'é', 'ĺ', 'é', 'ę', 'ŕť', 'đžđťđžđłđ',
                   'ľŕš', 'şá', 'řşů', 'ç', 'żŕ', 'ŕ', 'î', 'ŕž', 'ď', 'ďż',
                   'ŕż', 'ă', 'ŕˇ', 'łŕ', 'ŕľ', 'ąŕ', 'l', 'ŕ', 'ŕś', '',
                   'ŕľ', 'á', 'ŕž', 'ŕ', 'ů', 'ř', 'ŕš', 'đˇđ', 'őťőľö',
                   'őľő', 'â', 'ôźőťőłőąőľőť', 'đľ', 'ä', 'đ', 'ő', 'ö', 'ń',
                   'đťđ', 'đž', 'post', 'date', 'nbsp', 'cc', 'âśăłă', 'âťăšr',
                   'âšâś', 'âťăšr', 'âšâś', 'âśâąâ', 'âśâąâ', 'âťasian',
                   'âťasian', 'âś', 'âśăłă', 'nfsâ', 'â', 'nov', 'com', 'oct',
                   'octn', 'theâ', 'aimăš', 'maniăšre', 'cm', 'http',
                   'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                   'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
                   'y', 'z']
    path_name_lists = glob.glob(ROOT_DIR + folder_name)
    results = []
    # Loading the data
    for path_name in path_name_lists:
        words = [w.lower() for w in webtext.words(path_name)]
        filter_words = [word for word in words
                        if word not in filter_list and word.isalpha()]
        results.extend(filter_words)
    print(len(results))
    return results
def TestSim_irrelevant_corpus():
    ''' Webtext: Wine '''
    # tokenize() and text2Vec() are assumed to be defined elsewhere in the project.
    wine = ' '.join(webtext.words('wine.txt'))
    tokens = tokenize(wine)
    # Split the token stream into chunks of 100 tokens.
    tokens = [
        tokens[i * 100:(i + 1) * 100]
        for i in range(int(len(tokens) / 100))
    ]
    vecs = [text2Vec(' '.join(token))[0] for token in tokens]
    print(len(vecs))
    print(vecs[0])
def webtext():
    # lexical_div() and render_template() are assumed to be defined/imported elsewhere.
    from nltk.corpus import webtext
    from nltk.corpus import nps_chat

    # list comprehension version
    file_ids = [fileid for fileid in webtext.fileids()]
    chat_file_ids = [fileid for fileid in nps_chat.fileids()]

    pirates = webtext.raw('pirates.txt')
    pirates_words = len(webtext.words('pirates.txt'))
    pirates_sents = len(webtext.sents('pirates.txt'))
    uniqs = len(set([w.lower() for w in webtext.words('pirates.txt')]))
    lexical_diversity = lexical_div(uniqs, pirates_words)

    # import nltk.book as book
    # text1 = book.text1
    # pirates = webtext.raw('pirates.txt')
    return render_template('webtext.html',
                           file_ids=file_ids,
                           chat_file_ids=chat_file_ids,
                           pirates=pirates)
def fun_2_1():
    from nltk.util import ngrams
    from nltk.corpus import alpino

    # A unigram is a single token. The following code generates unigrams for
    # the Alpino corpus.
    print(alpino.words())
    unigrams = ngrams(alpino.words(), 1)
    for i in unigrams:
        # print(i)
        pass

    # Another example: generating quadgrams (fourgrams) from the Alpino corpus.
    quadgrams = ngrams(alpino.words(), 4)
    for i in quadgrams:
        # print(i)
        pass

    # A bigram is a pair of tokens. To find bigrams in a text, first lowercase
    # the words, build a list from them, and then create a
    # BigramCollocationFinder instance. BigramAssocMeasures, found in the
    # nltk.metrics package, can be used to score bigrams in the text.
    from nltk.collocations import BigramCollocationFinder
    from nltk.corpus import webtext
    from nltk.metrics import BigramAssocMeasures

    tokens = [t.lower() for t in webtext.words('grail.txt')]
    words = BigramCollocationFinder.from_words(tokens)
    print(words.nbest(BigramAssocMeasures.likelihood_ratio, 10))

    # Here we can add a word filter to remove stopwords and punctuation.
    from nltk.corpus import stopwords
    set1 = set(stopwords.words('english'))
    stops_filter = lambda w: len(w) < 3 or w in set1
    words.apply_word_filter(stops_filter)
    print(words.nbest(BigramAssocMeasures.likelihood_ratio, 10))

    # The frequency threshold for bigrams could also be changed to another
    # number. Another way to generate bigrams from text is to build a
    # collocation finder directly from tokenized text, as below.
    import nltk
    text1 = "Hardwork is the key to success. Never give up!"
    word = nltk.tokenize.wordpunct_tokenize(text1)
    finder = BigramCollocationFinder.from_words(word)
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    value = finder.score_ngrams(bigram_measures.raw_freq)
    print(sorted(bigram for bigram, score in value))

    # To generate fourgrams and their frequencies, use the following code.
    text = "Hello how are you doing ? I hope you find the book interesting"
    tokens = nltk.wordpunct_tokenize(text)
    fourgrams = nltk.collocations.QuadgramCollocationFinder.from_words(tokens)
    for fourgram, freq in fourgrams.ngram_fd.items():
        print(fourgram, freq)
from nltk import FreqDist
from nltk.corpus import brown, gutenberg, reuters, webtext


def _english_word_frequencies():
    """
    Get frequencies of English words based on four corpora:
    Gutenberg Corpus, Web and Chat Text, Brown Corpus, Reuters Corpus.

    Returns:
        tuple: Frequencies of words based on the Gutenberg, Web and Chat Text,
        Brown and Reuters corpora, respectively
    """
    gutenberg_freqs = FreqDist(gutenberg.words())
    webtext_freqs = FreqDist(webtext.words())
    brown_freqs = FreqDist(brown.words())
    reuters_freqs = FreqDist(reuters.words())

    return gutenberg_freqs, webtext_freqs, brown_freqs, reuters_freqs
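# Usage sketch (added; not from the original source): compare how often a word
# appears in each of the four corpora returned above.
gutenberg_freqs, webtext_freqs, brown_freqs, reuters_freqs = _english_word_frequencies()
for name, freqs in [("gutenberg", gutenberg_freqs), ("webtext", webtext_freqs),
                    ("brown", brown_freqs), ("reuters", reuters_freqs)]:
    print(name, freqs["market"])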
from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures


def extract_bigrams(file_toanalyze, word_length, num_of_bigrams):
    # filter_word_noise() is assumed to be defined elsewhere; it returns a
    # predicate used by apply_word_filter() to drop noisy words.
    # Get the list of words from the file.
    words_list = [word.lower() for word in webtext.words(file_toanalyze)]
    # Construct a finder object to find the best bigrams.
    finder = BigramCollocationFinder.from_words(words_list)
    # Create a noise-filtering handler.
    noise_handler = filter_word_noise(word_length)
    # Apply the noise-filtering handler.
    finder.apply_word_filter(noise_handler)
    # Find the desired number of bigrams.
    list_of_bigrams = finder.nbest(BigramAssocMeasures.likelihood_ratio,
                                   num_of_bigrams)
    # Return the list of bigrams.
    return list_of_bigrams
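# Usage sketch (added): filter_word_noise() is not shown in the original, so a
# plausible stand-in is defined here purely for illustration; the real helper
# may behave differently.
def filter_word_noise(word_length):
    # Drop tokens shorter than word_length or containing non-letters.
    return lambda w: len(w) < word_length or not w.isalpha()


print(extract_bigrams('pirates.txt', 3, 10))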
import sys

from nltk.corpus import brown, genesis, gutenberg, inaugural, webtext


def main():
    # Store word lengths per corpus.
    brown_word_lens = []
    web_word_lens = []
    inaugural_word_lens = []
    gutenberg_word_lens = []
    genesis_word_lens = []

    for file in gutenberg.fileids():
        for word in gutenberg.words(file):
            gutenberg_word_lens.append(len(word))
    for file in brown.fileids():
        for word in brown.words(file):
            brown_word_lens.append(len(word))
    for file in webtext.fileids():
        for word in webtext.words(file):
            web_word_lens.append(len(word))
    for file in inaugural.fileids():
        for word in inaugural.words(file):
            inaugural_word_lens.append(len(word))
    for file in genesis.fileids():
        for word in genesis.words(file):
            genesis_word_lens.append(len(word))

    # Write the lengths column-wise, one corpus per column.
    with open("wordlens.txt", 'w') as f:
        sys.stdout = f
        f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
        for i in range(max(len(genesis_word_lens), len(inaugural_word_lens),
                           len(web_word_lens), len(brown_word_lens),
                           len(gutenberg_word_lens))):
            for corpus in [genesis_word_lens, inaugural_word_lens,
                           web_word_lens, brown_word_lens, gutenberg_word_lens]:
                if i >= len(corpus):
                    f.write(",")
                else:
                    f.write(str(corpus[i]) + ",")
            f.write("\n")
import re

from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.corpus import webtext
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures


def demo(scorer_bam=None, compare_scorer_bam=None,
         scorer_tam=None, compare_scorer_tam=None):
    if scorer_bam is None:
        scorer_bam = BigramAssocMeasures.likelihood_ratio
    if compare_scorer_bam is None:
        compare_scorer_bam = BigramAssocMeasures.raw_freq
    if scorer_tam is None:
        scorer_tam = TrigramAssocMeasures.likelihood_ratio
    if compare_scorer_tam is None:
        compare_scorer_tam = TrigramAssocMeasures.raw_freq

    regex = '^[A-Za-z]+$'  # regular expression matching purely alphabetic English words
    str_regex = re.compile(regex)

    for file in webtext.fileids():  # process the files one by one
        words_list = []
        for word in webtext.words(file):
            if not str_regex.match(word):  # skip tokens that are not purely alphabetic
                continue
            words_list.append(word)

        # Get bigram collocations for each window size in the range.
        for window_size in range(3, 4):
            bcf = BigramCollocationFinder.from_words(words_list, window_size)
            bcf.apply_freq_filter(window_size)
            for item in bcf.nbest(scorer_bam, 1000):
                get_collocation(item)  # get_collocation() is assumed to be defined elsewhere

        # Get trigram collocations.
        for window_size in range(3, 4):
            tcf = TrigramCollocationFinder.from_words(words_list, window_size)
            tcf.apply_freq_filter(window_size)
            # tcf.apply_word_filter(word_filter)
            # corr = spearman_correlation(ranks_from_scores(tcf.score_ngrams(scorer)),
            #                             ranks_from_scores(tcf.score_ngrams(compare_scorer)))
            for item in tcf.nbest(scorer_tam, 1000):
                get_collocation(item)
import nltk, matplotlib
from nltk.corpus import webtext

print(webtext.fileids())

fileid = 'singles.txt'
wbt_words = webtext.words(fileid)
fdist = nltk.FreqDist(wbt_words)

print('Count of the most frequent token "', fdist.max(), '" : ', fdist[fdist.max()])
print('Total number of tokens in the corpus : ', fdist.N())
print('The 10 most common words in the corpus are:')
print(fdist.most_common(10))
print('Frequency distribution of the personals ads')
fdist.tabulate()
fdist.plot(cumulative=True)
from nltk import pos_tag, sent_tokenize, word_tokenize
from nltk.corpus import stopwords, brown, words, webtext
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
import string

LEX_ASIAN = {"Chinese", "Japanese", "Korean", "China", "Japan", "Korea"}
LEX_EUROPEAN = {"French", "France", "Spain", "Spanish"}
WORD_SET = set(webtext.words()) | set(list(string.punctuation))


def load_dataset(filepath):
    """
    Returns a list of docs from the given filepath.

    Parameters:
        filepath -- (str)

    Returns:
    -------
    list of filenames: (str)
    """
    file_list = []
    with open(filepath) as f:
        for line in f:
            file_list.append(line.strip())
    return file_list


def get_feature_dict(text):
text1 = Text(gutenberg.words('melville-moby_dick.txt'))
print("text1:", text1.name)
text2 = Text(gutenberg.words('austen-sense.txt'))
print("text2:", text2.name)
text3 = Text([str(w) for w in genesis.words('english-kjv.txt')],
             name="The Book of Genesis")
print("text3:", text3.name)
text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print("text4:", text4.name)
text5 = Text(nps_chat.words(), name="Chat Corpus")
print("text5:", text5.name)
text6 = Text(webtext.words('grail.txt'), name="Monty Python and the Holy Grail")
print("text6:", text6.name)
text7 = Text(treebank.words(), name="Wall Street Journal")
print("text7:", text7.name)
text8 = Text(webtext.words('singles.txt'), name="Personals Corpus")
print("text8:", text8.name)
text9 = Text(gutenberg.words('chesterton-thursday.txt'))
print("text9:", text9.name)


def texts():
    print("text1:", text1.name)
    print("text2:", text2.name)
    print("text3:", text3.name)
# In[9]:

finder.nbest(bigram_measures.pmi, 10)

# In[10]:

from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

# In[11]:

words = [w.lower() for w in webtext.words('grail.txt')]

# In[12]:

bcf = BigramCollocationFinder.from_words(words)

# In[13]:

bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4)

# Eliminating Stopwords

# In[14]:
from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

words = [w.lower() for w in webtext.words('grail.txt')]
bcf = BigramCollocationFinder.from_words(words)
print(bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4))

# remove punctuation and stopwords
from nltk.corpus import stopwords

stopset = set(stopwords.words('english'))
filter_stops = lambda w: len(w) < 3 or w in stopset
bcf.apply_word_filter(filter_stops)
print(bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4))

# trigrams
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures

words = [w.lower() for w in webtext.words('singles.txt')]
tcf = TrigramCollocationFinder.from_words(words)
tcf.apply_word_filter(filter_stops)
tcf.apply_freq_filter(3)
print(tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 4))
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# PCL I, Exercise 5, HS15
# Task 3.1c
# Author: Bill Bosshard
# Matriculation no.: 12-933-255
# Author: Lukas Vollenweider
# Matriculation no.: 13-751-888

import nltk
from nltk.book import *
from nltk.corpus import webtext

text6 = webtext.words("grail.txt")


def findVerbs(text):
    verbs = []
    for word in set(text):
        if len(word) >= 4 and word[-3:] == "ing":
            verbs.append(word.lower())
    return sorted(verbs)


if __name__ == "__main__":
    print(findVerbs(text6))
text1 = Text(gutenberg.words("melville-moby_dick.txt"))
print("text1:", text1.name)
text2 = Text(gutenberg.words("austen-sense.txt"))
print("text2:", text2.name)
text3 = Text(genesis.words("english-kjv.txt"), name="The Book of Genesis")
print("text3:", text3.name)
text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print("text4:", text4.name)
text5 = Text(nps_chat.words(), name="Chat Corpus")
print("text5:", text5.name)
text6 = Text(webtext.words("grail.txt"), name="Monty Python and the Holy Grail")
print("text6:", text6.name)
text7 = Text(treebank.words(), name="Wall Street Journal")
print("text7:", text7.name)
text8 = Text(webtext.words("singles.txt"), name="Personals Corpus")
print("text8:", text8.name)
text9 = Text(gutenberg.words("chesterton-thursday.txt"))
print("text9:", text9.name)


def texts():
    print("text1:", text1.name)
    print("text2:", text2.name)
import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.corpus import webtext
from nltk.metrics import BigramAssocMeasures

tokens = [t.lower() for t in webtext.words('grail.txt')]
words = BigramCollocationFinder.from_words(tokens)
print(words.nbest(BigramAssocMeasures.likelihood_ratio, 10))
from nltk.corpus import wordnet
from nltk.corpus import webtext
from nltk.corpus import stopwords

# ### Corpora

# Each corpus has different files containing some text. To get a list of such
# files of e.g. the wordnet corpus imported above, run:

# In[3]:

print(wordnet.fileids())

# To get the list of words inside a corpus we use the .words() method:

# In[4]:

print(webtext.words())

# ### Wordnet (OMW) => Synset basics

# <p><b>Synset</b>s are wordnet instances grouping synonymous words that express the same concept.</p>

# In[5]:

syn = wordnet.synsets('fantasma', lang='ita')
print(syn)

# In[6]:

print("NAME: ", syn[0].name())
print("DEFINITION: ", syn[0].definition())
print("EXAMPLES: ", syn[0].examples())
def text8():
    text = Text(webtext.words('singles.txt'), name="Personals Corpus")
    print("text8:", text.name)
    return text
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# PCL I, Exercise 5, HS15
# Task 3.2
# Author: Bill Bosshard
# Matriculation no.: 12-933-255
# Author: Lukas Vollenweider
# Matriculation no.: 13-751-888

import nltk
from nltk.corpus import webtext

text6 = webtext.words('grail.txt')


def long_words(text):
    return sorted([word.lower() for word in set(text)
                   if len(word) >= 7 and word[-3:] == "ing"])


def tuples(text):
    return [(word, len(word)) for word in set(text)]


def trigrams(text):
    return [(text[i - 2], text[i - 1], text[i]) for i in range(2, len(text))]


print("long_words: ", long_words(text6)[:10])
print("tuples: ", tuples(text6)[:10])
print("trigrams: ", trigrams(text6)[:10])
import sys

from nltk.corpus import brown, genesis, gutenberg, inaugural, webtext


def main():
    # Store, for each file of each corpus, the fraction of tokens that are
    # among the 100 most common English words.
    brown_common_freq = []
    web_common_freq = []
    inaugural_common_freq = []
    gutenberg_common_freq = []
    genesis_common_freq = []
    common = ["the", "be", "to", "of", "and", "a", "in", "that", "have", "i",
              "it", "for", "not", "on", "with", "he", "as", "you", "do", "at",
              "this", "but", "his", "by", "from", "they", "we", "say", "her",
              "she", "or", "an", "will", "my", "one", "all", "would", "there",
              "their", "what", "so", "up", "out", "if", "about", "who", "get",
              "which", "go", "me", "when", "make", "can", "like", "time", "no",
              "just", "him", "know", "take", "people", "into", "year", "your",
              "good", "some", "could", "them", "see", "other", "than", "then",
              "now", "look", "only", "come", "its", "over", "think", "also",
              "back", "after", "use", "two", "how", "our", "work", "first",
              "well", "way", "even", "new", "want", "because", "any", "these",
              "give", "day", "most", "us"]
    common.sort()

    for file in gutenberg.fileids():
        total_words = len(gutenberg.words(file))
        total_common = 0
        for word in gutenberg.words(file):
            if word.lower() in common:
                total_common += 1
        gutenberg_common_freq.append(float(total_common) / total_words)

    for file in brown.fileids():
        total_words = len(brown.words(file))
        total_common = 0
        for word in brown.words(file):
            if word.lower() in common:
                total_common += 1
        brown_common_freq.append(float(total_common) / total_words)

    for file in webtext.fileids():
        total_words = len(webtext.words(file))
        total_common = 0
        for word in webtext.words(file):
            if word.lower() in common:
                total_common += 1
        web_common_freq.append(float(total_common) / total_words)

    for file in inaugural.fileids():
        total_words = len(inaugural.words(file))
        total_common = 0
        for word in inaugural.words(file):
            if word.lower() in common:
                total_common += 1
        inaugural_common_freq.append(float(total_common) / total_words)

    for file in genesis.fileids():
        total_words = len(genesis.words(file))
        total_common = 0
        for word in genesis.words(file):
            if word.lower() in common:
                total_common += 1
        genesis_common_freq.append(float(total_common) / total_words)

    with open("common-words.txt", 'w') as f:
        sys.stdout = f
        f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
        for i in range(max(len(genesis_common_freq), len(inaugural_common_freq),
                           len(web_common_freq), len(brown_common_freq),
                           len(gutenberg_common_freq))):
            for corpus in [genesis_common_freq, inaugural_common_freq,
                           web_common_freq, brown_common_freq,
                           gutenberg_common_freq]:
                if i >= len(corpus):
                    f.write(",")
                else:
                    f.write(str(round(corpus[i], 5)) + ",")
            f.write("\n")
# `e` is defined in earlier, truncated context.
print(e)

for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print(int(num_chars / num_words), int(num_words / num_sents),
          int(num_words / num_vocab), fileid)

from nltk.corpus import webtext
import nltk

for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65])
    print(type(webtext.words(fileid)))

from nltk.corpus import brown

brown.categories()
t = brown.words(categories='news')
print(t)
fdist = nltk.FreqDist([w.lower() for w in t])
fdist1 = nltk.FreqDist(t)
print(type(fdist))
print(fdist)
print(fdist1['May'])
for f in fdist.items():
from nltk.corpus import brown, webtext

# Brown corpus
print('Categories:', list(brown.categories()))
print('Brown sample text:\n\t',
      ' '.join(brown.words(categories='adventure')[:50]))

# Webtext corpus
print()
print('Categories:', webtext.fileids())
print('Webtext sample text:\n\t',
      ' '.join(webtext.words('firefox.txt')[:50]))
import string
import sys

from nltk import FreqDist
from nltk.corpus import brown, genesis, gutenberg, inaugural, webtext


def main():
    # Store a FreqDist of letter counts per corpus.
    samples = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    brown_letters = FreqDist()
    web_letters = FreqDist()
    inaugural_letters = FreqDist()
    gutenberg_letters = FreqDist()
    genesis_letters = FreqDist()

    for file in gutenberg.fileids():
        for word in gutenberg.words(file):
            for character in word:
                if character in string.ascii_letters:
                    gutenberg_letters[character.upper()] += 1
    for file in brown.fileids():
        for word in brown.words(file):
            for character in word:
                if character in string.ascii_letters:
                    brown_letters[character.upper()] += 1
    for file in webtext.fileids():
        for word in webtext.words(file):
            for character in word:
                if character in string.ascii_letters:
                    web_letters[character.upper()] += 1
    for file in inaugural.fileids():
        for word in inaugural.words(file):
            for character in word:
                if character in string.ascii_letters:
                    inaugural_letters[character.upper()] += 1
    for file in genesis.fileids():
        for word in genesis.words(file):
            for character in word:
                if character in string.ascii_letters:
                    genesis_letters[character.upper()] += 1

    with open("genesis-letter-freq.txt", 'w') as f:
        sys.stdout = f
        f.write("GENESIS\n")
        for let in samples:
            print(str(genesis_letters[let]))
    with open("gutenberg-letter-freq.txt", 'w') as f:
        sys.stdout = f
        f.write("GUTENBERG\n")
        for let in samples:
            print(str(gutenberg_letters[let]))
    with open("webtext-letter-freq.txt", 'w') as f:
        sys.stdout = f
        f.write("WEBTEXT\n")
        for let in samples:
            print(str(web_letters[let]))
    with open("inaugural-letter-freq.txt", 'w') as f:
        sys.stdout = f
        f.write("INAUGURAL\n")
        for let in samples:
            print(str(inaugural_letters[let]))
    with open("brown-letter-freq.txt", 'w') as f:
        sys.stdout = f
        f.write("BROWN\n")
        for let in samples:
            print(str(brown_letters[let]))

    with open("letter-freq.txt", 'w') as f:
        corpora = [gutenberg_letters, web_letters, inaugural_letters,
                   brown_letters, genesis_letters]
        f.write("GUTENBERG,WEBTEXT,INAUGURAL,BROWN,GENESIS\n")
        for let in samples:
            for corpus in corpora:
                f.write(str(corpus[let]) + ",")
            f.write("\n")
def main_process(api,text, token_key, token_key_secret): #print 'text',text; # Used when tokenizing words sentence_re = r'''(?x) # set flag to allow verbose regexps ([A-Z])(\.[A-Z])+\.? # abbreviations, e.g. U.S.A. | \w+(-\w+)* # words with optional internal hyphens | \$?\d+(\.\d+)?%? # currency and percentages, e.g. $12.40, 82% | \.\.\. # ellipsis | [][.,;"'?():-_`] # these are separate tokens ''' #stemmer = nltk.stem.porter.PorterStemmer() #Taken from Su Nam Kim Paper... grammar = r""" NBAR: {<NN.*|JJ>*<NN.*>} # Nouns and Adjectives, terminated with Nouns NP: {<NBAR>} {<NBAR><IN><NBAR>} # Above, connected with in/of/etc... """ chunker = nltk.RegexpParser(grammar) toks = nltk.regexp_tokenize(text, sentence_re) postoks = nltk.tag.pos_tag(toks) #print postoks #print 'sbl0'; #print text; #print toks; #print postoks; tree = chunker.parse(postoks) #print 'sbl0'; #from nltk.corpus import stopwords #stopwords = stopwords.words('english') terms = get_terms(tree) key_words = [] key_words_list = [] for term in terms: key_words = [] for word in term: key_words.append(word) print word, key_words_list.append(key_words) print #get user instance on Twitter #api = twitter.Api(consumer_key='TuLPEoqwSkiVreWEODQ6tA',consumer_secret='LnPHHrMOiPVX5PlObJKryROYtdC3475Xq0WJ2tHlJHM',access_token_key=access_token['oauth_token'],access_token_secret=access_token['oauth_token_secret']) #api = twitter.Api(consumer_key='TuLPEoqwSkiVreWEODQ6tA',consumer_secret='LnPHHrMOiPVX5PlObJKryROYtdC3475Xq0WJ2tHlJHM',access_token_key=token_key,access_token_secret=token_key_secret) keyword_recommenders = {} people_interests = {} followers = api.GetFollowers() keywords_count_all = [] keywords_count = [] person_count = {} if len(followers) == 0: print "No followers!"; sys.exit(1) #print 'sbl1'; classifier = train_classifier(make_full_dict) for person in followers: timeline = api.GetUserTimeline(person.id) keywords_count_all = [] for status in timeline: keywords_count = [] coin_count = 0 for term in key_words_list: tmp_count = 0 for word in term: tmp_count = tmp_count + status.text.lower().count(word) if tmp_count > 0: coin_count = coin_count + 1 keywords_count.append(tmp_count) for k_count in keywords_count: if k_count > 0: k_count = k_count + coin_count if sum(keywords_count) > 0: sentiment = guess_sentiment(status.text, classifier) print sentiment; if sentiment == 'neg' and sentiment_enable == 1: continue if keywords_count_all == []: keywords_count_all = keywords_count else: for i in range(0, len(keywords_count_all)): keywords_count_all[i] = keywords_count_all[i] + keywords_count[i] person_count[person.name] = keywords_count_all #print 'sbl2'; for i in range(0,len(key_words_list)): term = key_words_list[i] key_word = '' for word in term: if key_word == []: key_word = key_word + word else: key_word = key_word + ' ' + word recommenders_weight = {} for person in followers: #print i,len(person_count[person.name]); #print person.name; if person_count[person.name]: recommenders_weight[person.name] = person_count[person.name][i] recommenders_weight = sorted(recommenders_weight.iteritems(), key=lambda d:d[1], reverse = True) j = 0 recommenders_weight_sort = {} for pair in recommenders_weight: j = j + 1 if j > 3 or pair[1] == 0: break recommenders_weight_sort[pair[0]] = [pair[1]] if len(recommenders_weight_sort) > 0: keyword_recommenders[key_word] = recommenders_weight_sort followers = sorted(followers, key = lambda follower: follower.followers_count, reverse = True) connectors = [] connectors.append(followers[0]) if len(followers) > 1: 
connectors.append(followers[1]) print "connector:" for connector in connectors: print connector.name; replyNum = {} replys = api.GetReplies() for reply in replys: if reply.in_reply_to_screen_name != None: reply.in_reply_to_screen_name = api.GetUsersSearch(reply.in_reply_to_screen_name)[0].name if reply.in_reply_to_screen_name in replyNum: replyNum[reply.in_reply_to_screen_name] = replyNum[reply.in_reply_to_screen_name] + 1 else: replyNum[reply.in_reply_to_screen_name] = 1 connectors_info = {} if len(keyword_recommenders) > 0: for key in keyword_recommenders: closeness = {} for person in keyword_recommenders[key]: if person in replyNum: closeness[person] = replyNum[person] else: closeness[person] = 0 closeness = sorted(closeness.iteritems(), key=lambda d:d[1], reverse = True) j = 0 for pair in closeness: j = j + 1 keyword_recommenders[key][pair[0]].append(j) closeness = {} for connector in connectors: if connector.name in replyNum: closeness[connector.name] = replyNum[connector.name] else: closeness[connector.name] = 0 closeness = sorted(closeness.iteritems(), key=lambda d:d[1], reverse = True) j = 0 #print 'closeness:',closeness; for pair in closeness: j = j + 1 for connector in connectors: if connector.name == pair[0]: connectors_info[pair[0]] = [connector.followers_count, j] break #def extract_keyword(): corpus = webtext.words() corpus_length = len(corpus) people_interest_words = {} for person in followers: timeline = api.GetUserTimeline(person.id) status_all = '' for status in timeline: if status_all == '': status_all = status.text else: status_all = status_all + '. ' + status.text status_all = status_all.lower() timeline_length = len(status_all) toks = nltk.regexp_tokenize(status_all, sentence_re) postoks = nltk.tag.pos_tag(toks) tree = chunker.parse(postoks) terms = get_terms(tree) interest_words_prob = {} interest_words = [] for term in terms: interest_word = '' for word in term: if interest_word == '': interest_word = word else: interest_word = interest_word + ' ' + word if len(interest_word) > 1: interest_words.append(interest_word) for phrase in interest_words: status_count = status_all.count(phrase) if corpus_enable == 1: corpus_count = corpus.count(phrase) else: corpus_count = 1 if corpus_count == 0: corpus_count = 1; interest_words_prob[phrase] = status_count * corpus_length / corpus_count / timeline_length interest_words_prob = sorted(interest_words_prob.iteritems(), key=lambda d:d[1], reverse = True) j = 0 tmp = [] for pair in interest_words_prob: j = j + 1 if j > 4: break tmp.append(pair[0]) people_interest_words[person.name] = tmp #print people_interest_words keys = keyword_recommenders people = people_interest_words #visualisation #print 'mavens'; fig_num = 0 for k in keys.keys(): fig_num = fig_num + 1 if connectors_info.keys(): fig_num = fig_num + 1 max_volumn = 2 max_row = fig_num / max_volumn + fig_num % max_volumn #mavens i = 0 for k in keys.keys(): #if keys[k]: #print 'exist mavens'; i = i + 1 G = nx.Graph() #print G node_size_list = [] node_color_list = [] nodes_list = [] G.add_node(k) nodes_list.append(k) #print G.node node_size_list.append(5000) node_color_list.append('r') for name in keys[k].keys(): G.add_node(name) nodes_list.append(name) G.add_edge(k,name,weight = (1.0 / keys[k][name][1])*0.5) node_size_list.append(keys[k][name][0]*1500) node_color_list.append('y') if name in people.keys(): for itrst in people[name]: if itrst != k and itrst != name: if itrst in nodes_list: continue else: G.add_node(itrst) nodes_list.append(itrst) G.add_edge(name,itrst,weight=3) 
node_size_list.append(1500) node_color_list.append('c') plt.subplot(max_row,max_volumn,i) plt.title('maven') #for i in range(len(node_size_list)): # node_size_list[i] = node_size_list[i] / sum(node_size_list) * 100 #nx.draw_networkx(G,pos=nx.spring_layout(G),with_labels=True,nodelist=nodes_list,node_size=node_size_list,node_color=node_color_list,tick_labels=False) nx.draw(G,pos=nx.spring_layout(G),title='mavens', with_labels=True,nodelist=nodes_list,node_size=node_size_list,node_color=node_color_list,tick_labels=False) #plt.show() #print 'connectors'; #connectors user = api.VerifyCredentials() username = user.name if connectors_info.keys(): #print 'connectors_info_keys()'; G = nx.Graph() i = i + 1 node_size_list = [] node_color_list = [] nodes_list = [] G.add_node(username) nodes_list.append(username) node_size_list.append(5000) node_color_list.append('r') for c in connectors_info.keys(): #print c; G.add_node(c) nodes_list.append(c) G.add_edge(c,username,weight = (1.0 / connectors_info[c][1])*500) node_size_list.append(connectors_info[c][0] * 700) node_color_list.append('y') #plt.subplot(1,fig_num,i) plt.subplot(max_row,max_volumn,i) plt.title('Connectors') #for i in range(len(node_size_list)): # node_size_list[i] = node_size_list[i] / sum(node_size_list) * 100 nx.draw(G,pos=nx.spring_layout(G),with_labels=True,nodelist=nodes_list,node_size=node_size_list,node_color=node_color_list) plt.show()
from nltk.corpus import stopwords
from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

stopset = set(stopwords.words('english'))
stops_filter = lambda w: len(w) < 3 or w in stopset

tokens = [t.lower() for t in webtext.words('grail.txt')]
words = BigramCollocationFinder.from_words(tokens)
words.apply_word_filter(stops_filter)
print(words.nbest(BigramAssocMeasures.likelihood_ratio, 10))
from nltk.corpus import gutenberg
from nltk.corpus import webtext
from nltk.corpus import nps_chat
from nltk.corpus import brown
from nltk.corpus import reuters
from nltk.corpus import inaugural
from nltk import word_tokenize
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import pandas as pd

# Pick out the first of these texts, Emma by Jane Austen, and give it a short name, gutenberg_raw
gutenberg_raw = gutenberg.raw("austen-emma.txt")

# Pick out the words from the webtext corpus and give them a short name, webtext_words
webtext_words = webtext.words()
print(webtext_words)

# Pick out the text from the nps_chat corpus and name it nps_chat_raw
nps_chat_raw = nps_chat.raw()

# Pick out the text from the brown corpus and name it brown_raw
brown_raw = brown.raw()
print(brown_raw)

# Pick out the words from the reuters corpus and name them reuters_words
reuters_words = reuters.words()
print(reuters_words)

# Pick out the words from the inaugural corpus and name them inaugral_words
inaugral_words = inaugural.words()
from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.corpus import stopwords

textwords = [w.lower() for w in webtext.words('pirates.txt')]
finder = BigramCollocationFinder.from_words(textwords)
print(finder.nbest(BigramAssocMeasures.likelihood_ratio, 10))

ignored_words = set(stopwords.words('english'))
filterstops = lambda w: len(w) < 3 or w in ignored_words
finder.apply_word_filter(filterstops)
print(finder.nbest(BigramAssocMeasures.likelihood_ratio, 10))
print(finder.nbest(BigramAssocMeasures.likelihood_ratio, 15))
with open("D:/Python/Consumer Complaints/Consumer_Complaints_CreditCard.csv", 'r') as file: complaints = list(csv.reader(file)) file.close() compClean = [] for i in range(len(complaints)): tokens = re.sub("[^A-Za-z0-9()'.]+", " ", complaints[i][5]) tokens = re.sub('!', ".", tokens) compClean.append(tokens) from nltk.corpus import webtext from nltk.collocations import BigramCollocationFinder from nltk.metrics import BigramAssocMeasures words = [w.lower() for w in webtext.words('D:/Python/Consumer Complaints/complaintsDump.txt')] bcf = BigramCollocationFinder.from_words(words) #from nltk.collocations import TrigramCollocationFinder #from nltk.metrics import TrigramAssocMeasures #tcf = TrigramCollocationFinder.from_words(words) #tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 4) from nltk.corpus import stopwords stopset = set(stopwords.words('english')) filter_stops = lambda w: len(w) < 3 or w in stopset bcf.apply_word_filter(filter_stops) collocations = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 50) newText = 'a credit card is issued to me' tokens = re.sub(" ".join(collocations[1]), "-".join(collocations[1]), newText)
'''
from nltk.corpus import stopwords, webtext
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
from nltk.probability import FreqDist
from nltk.book import text1
from nltk.metrics import BigramAssocMeasures, spearman_correlation, ranks_from_scores

scorer = BigramAssocMeasures.likelihood_ratio
compare_scorer = BigramAssocMeasures.raw_freq

ignored_words = stopwords.words('english')
word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words

for file in webtext.fileids():
    words = [word.lower() for word in webtext.words(file)]
    cf = BigramCollocationFinder.from_words(words)
    cf.apply_freq_filter(3)
    cf.apply_word_filter(word_filter)
    print(file)
    print('\t', [' '.join(tup) for tup in cf.nbest(scorer, 15)])
    print('\t Correlation to %s: %0.4f' % (
        compare_scorer.__name__,
        spearman_correlation(ranks_from_scores(cf.score_ngrams(scorer)),
                             ranks_from_scores(cf.score_ngrams(compare_scorer)))))
'''

# from nltk.util import bigrams
from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures

bigram_measures = BigramAssocMeasures()
trigram_measure = TrigramAssocMeasures()
# Build the finder from the corpus words, not from the raw filename string.
finder = BigramCollocationFinder.from_words(webtext.words('grail.txt'))
import nltk
from nltk.corpus import gutenberg

macbethRaw = gutenberg.raw('shakespeare-macbeth.txt')
macbethWords = gutenberg.words('shakespeare-macbeth.txt')
macbethSents = gutenberg.sents('shakespeare-macbeth.txt')

longestLen = max([len(s) for s in macbethSents])
longestSents = [s for s in macbethSents if len(s) == longestLen]

from nltk.corpus import webtext

webtext.fileids()
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65], '...')

webtext.raw('pirates.txt').lower().count('jack')
pirates = nltk.Text(webtext.words('pirates.txt'))

from nltk.corpus import brown

brown.categories()
brown.words(categories='news')
brown.words(fileids=['cg22'])
brown.words(fileids=['cg22', 'ca16'])  # Concatenates the two corpora into one.

newsText = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in newsText])
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print(m + ':', fdist[m], end=' ')
        dict_tmp[ele] = dictLen[ele]
    return dict_tmp


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Start Program
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# dictLen(), dictChar(), dictLenChar() and printresult(), as well as the
# imports of sys and webtext, are defined in the truncated part above.

# Import the arguments.
length = sys.argv[1]
char = sys.argv[2]

print("")
print('START------------------------------------------------------')
print("Task 5 (length=" + length + ", character='" + char + "')")
print('-----------------------------------------------------------')

text = webtext.words('grail.txt')

print("Most frequent word with " + str(length) + " letters:")
printresult(dictLen(text, length))
print("Most frequent word starting with '" + char + "':")
printresult(dictChar(text, char))
print("Most frequent word with " + str(length) +
      " letters and starting with '" + char + "':")
printresult(dictLenChar(dictLen(text, length), char))

print('END------------------------------------ lvthiessen | rolben')
print("")
# Importing data
import nltk
from nltk.corpus import webtext
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import string

nltk.download('webtext')

wt_sentences = webtext.sents('firefox.txt')
wt_words = webtext.words('firefox.txt')

print(len(wt_sentences))
print(len(wt_words))
import re
from random import shuffle

from nltk.corpus import webtext
from nltk.corpus import nps_chat
from gensim.models.doc2vec import LabeledSentence, Doc2Vec

gendered_terms = [
    r'\bhe\b', r'\bhes', r'\bshe\b', r'\bshes\b', r'\bhis\b', r'\bher\b',
    r'\bbro\b', r'\bman\b', r'\bsir\b', r'\bdude\b', r'\bgirl\b', r'\bgirls\b',
    r'\blady\b', r'\bgurl\b', r'\bhims\b', r'\bhers\b', r'\bhisself\b',
    r'\bherself\b', r'\bman\b', r'\bwoman\b'
]

dictionary_words = {}
for x in nps_chat.words() + webtext.words():
    dictionary_words[x] = True
print(len(dictionary_words))


class LabeledLineSentence(object):
    def __init__(self, messages_dic, is_sample=True):
        self.documents = []
        self.messages_dic = messages_dic
        self.is_sample = is_sample

    def __iter__(self):
        for user in self.messages_dic:
            if self.is_sample:
                for i in range(200):
import nltk
from nltk.corpus import webtext
from nltk.corpus import nps_chat
from nltk.corpus import brown

# for fileid in webtext.fileids():
#     print(fileid, webtext.raw(fileid)[:65])
# for fileId in nps_chat.fileids():
#     print(fileId)

pirates = webtext.raw('pirates.txt')
pirates_char = len(webtext.raw('pirates.txt'))
pirates_words = len(webtext.words('pirates.txt'))
pirates_sents = len(webtext.sents('pirates.txt'))
print('pirates_char: ', pirates_char,
      'pirates_words: ', pirates_words,
      'pirates_sents: ', pirates_sents,
      'avg char per word: ', int(pirates_char / pirates_words),
      'avg words per sentence: ', int(pirates_words / pirates_sents))

uniqs = len(set([w.lower() for w in webtext.words('pirates.txt')]))


def lexical_div(un, total):
    return total / un


print('lexical diversity: ', lexical_div(uniqs, pirates_words))

# brown_categories = brown.categories()
# for genre in brown_categories:
#     print(genre)

news_text = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in news_text])
# modal verbs