def find_language_word(word):
    opts = []
    for fileid in udhr.fileids():
        # Only search the first quarter of each file's words.
        words = udhr.words(fileid)
        if word in words[:len(words) // 4]:
            opts.append(fileid)
    return opts
def ch03_43_translate():
    import nltk
    from nltk.corpus import udhr
    # bigram_freqdist is a helper defined elsewhere in this module.
    en_fd = bigram_freqdist(udhr.words("English-Latin1"))
    fr_fd = bigram_freqdist(udhr.words("French_Francais-Latin1"))
    de_fd = bigram_freqdist(udhr.words("German_Deutsch-Latin1"))
    es_fd = bigram_freqdist(udhr.words("Spanish-Latin1"))
    inputs = ["Nice day", "Guten Tag", "Buenas Dias", "Tres Bien"]
    for phrase in inputs:
        words = phrase.lower().split(" ")
        # TODO: remove keys present in reference set
        ranks = map(lambda x: nltk.spearman_correlation(x, bigram_freqdist(words)),
                    [en_fd, fr_fd, de_fd, es_fd])
        print phrase, ranks
def runLeaveOutWordTrialUnbiased(language):
    all_words = list(set(filterWords(udhr.words(language))))
    test_set = random.choice(all_words)
    train_set = [w for w in all_words if w != test_set]
    ngrams = [(language, train_set)]
    for lang in LANGUAGES:
        if lang == language:
            continue
        ngrams.append((lang, udhr.words(lang)))
    classifier = NGramClassifier(N, ngrams)
    return [test_set], classifier.classifyWord(test_set)
def guess_language(samples):
    final_languages = {}
    for lang, text in samples.items():
        tokens = word_tokenize(text)
        languages = [l for l in udhr.fileids() if 'Latin1' in l]
        languages_having_words = list()
        for token in tokens:
            for candidate in languages:
                if token in udhr.words(candidate) or token.lower() in udhr.words(candidate):
                    languages_having_words.append(candidate)
        final_language = language_frequency(languages_having_words)
        final_languages[final_language[0]] = text
    return final_languages
def runLeaveOutWordTrialUnbiased(language):
    all_words = list(set(filterWords(udhr.words(language))))
    test_set = random.choice(all_words)
    train_set = [w for w in all_words if w != test_set]
    bigrams = [(language, makeTrigrams(train_set))]
    for lang in LANGUAGES:
        if lang == language:
            continue
        bigrams.append((lang, makeTrigrams(udhr.words(lang))))
    grammars = makeTrigramGrammars(bigrams)
    return [test_set], predictLanguage(test_set, grammars)
def langToWord_ratio(text):
    # Tokenize the document text into individual tokens.
    tokens = wordpunct_tokenize(text)
    # Create an empty list called docWords.
    docWords = []
    for tokenToWord in tokens:
        # Lowercase every token and append it to docWords.
        docWords.append(tokenToWord.lower())
    # Create an empty dictionary mapping language -> count of common words.
    langRatios = {}
    if len(udhr.fileids()) > 0:
        # The udhr corpus contains the Universal Declaration of Human Rights
        # in over 300 languages; fileids() lists one file per language.
        for language in udhr.fileids():
            # words(fileid) returns the vocabulary of that language's file;
            # build a set from it (e.g. for Japanese, the set of Japanese words).
            udhr_set = set(udhr.words(language))
            # Build a set of the words from our document.
            docWords_set = set(docWords)
            # Intersect the two sets: words that appear in both.
            common_elements = docWords_set.intersection(udhr_set)
            if len(common_elements) > 0:
                # For each language with at least one common word,
                # record language -> number of common words.
                langRatios[language] = len(common_elements)
    return langRatios
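# Hypothetical usage of langToWord_ratio above, assuming wordpunct_tokenize
# and udhr are imported as the function expects. The sample sentence is
# Article 1 of the UDHR, so the English file should score highly.
sample = "All human beings are born free and equal in dignity and rights."
ratios = langToWord_ratio(sample)
print(max(ratios, key=ratios.get))  # language file with the most overlap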
def prep_language_corpus(fids):
    ### Preps the language corpus.
    # fids holds the languages, which udhr calls fileids, e.g. fids = udhr.fileids().
    # Make a list of all the available languages that use Latin1 encoding.
    languages = [fileid for fileid in fids if re.findall('Latin1', fileid)]
    # Pull in the UDHR for all the different languages, broken apart by characters.
    udhr_corpus = [[list(word.lower()) for word in udhr.words(language) if word.isalpha()]
                   for language in languages]
    # Flatten that list so that it is a clump of letters for each language.
    udhr_corpus = [[item for sublist in language for item in sublist]
                   for language in udhr_corpus]
    # Give the languages indices, so you can pull in the text of the UDHR by
    # knowing its index number, a la udhr_corpus[154] returning Spanish.
    languages = list(enumerate(languages))
    # Get frequency distributions for all the characters in a list,
    # then convert each to a ranked list.
    language_freq_dists = [FreqDist(language) for language in udhr_corpus]
    language_ranks = [list(ranks_from_sequence(dist)) for dist in language_freq_dists]
    return languages, language_ranks
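# Hypothetical usage of prep_language_corpus, mirroring the commented-out
# example inside the function; re, FreqDist, udhr, and the helper
# ranks_from_sequence are assumed to be defined in the original module.
languages, language_ranks = prep_language_corpus(udhr.fileids())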
def Cal_Pred_Acc(charmodel, chardataset):
    model = LangModel(charmodel)
    words = udhr.words(chardataset)[0:1000]
    word_count = len(words)  # total number of words in the set
    unigram_acc = 0
    bigram_acc = 0
    trigram_acc = 0
    for word in words:
        uni_pred = model.cal_unigram(word)
        if uni_pred > 0:
            unigram_acc = unigram_acc + 1
        print("%15s - %19.18f" % (word, uni_pred))
    print("\nAccuracy of unigram model: ", unigram_acc * 100 / word_count, '%', '\n')
    for word in words:
        bi_pred = model.cal_bigram(word)
        if bi_pred > 0:
            bigram_acc = bigram_acc + 1
        print("%15s - %19.18f" % (word, bi_pred))
    print("\nAccuracy of bigram model: ", bigram_acc * 100 / word_count, '%', '\n')
    for word in words:
        tri_pred = model.cal_trigram(word)
        if tri_pred > 0:
            trigram_acc = trigram_acc + 1
        print("%15s - %19.18f" % (word, tri_pred))
    print("\nAccuracy of trigram model: ", trigram_acc * 100 / word_count, '%', '\n')
def Accuracy(LangModel, Data):
    model = Models(LangModel)
    words = udhr.words(Data)[0:1000]
    WordCount = len(words)
    UniAcc = 0
    BiAcc = 0
    TriAcc = 0
    for word in words:
        UniP = model.CalUni(word)
        if UniP > 0:
            UniAcc += 1
        print("%15s - %19.18f" % (word, UniP))
    print("\t\t\t\t\t\tAccuracy of unigram model: ", UniAcc * 100 / WordCount)
    for word in words:
        BiP = model.CalBi(word)
        if BiP > 0:
            BiAcc += 1
        print("%15s - %19.18f" % (word, BiP))
    print("\t\t\t\t\t\tAccuracy of bigram model: ", BiAcc * 100 / WordCount)
    for word in words:
        TriP = model.CalTri(word)
        if TriP > 0:
            TriAcc += 1
        print("%15s - %19.18f" % (word, TriP))
    print("\t\t\t\t\t\tAccuracy of trigram model: ", TriAcc * 100 / WordCount)
def languages_freq(langlist, input_text):
    fdistinput = nltk.FreqDist(input_text)
    result = []
    for language in langlist:
        Lang_freqdist = nltk.FreqDist(udhr.words(language))
        result.append([language, nltk.spearman_correlation(Lang_freqdist, fdistinput)])
    return result
def fun14():
    """cfd plot"""
    languages = ['Chickasaw', 'English', 'German_Deutsch',
                 'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
    cfd = nltk.ConditionalFreqDist((lang, len(word))
                                   for lang in languages
                                   for word in udhr.words(lang + '-Latin1'))
    cfd.plot(cumulative=True)
def find_language(word):
    # `languages` is a list of udhr fileids defined elsewhere.
    for language in languages:
        lexicon = udhr.words(fileids=language)
        print(lexicon)
        for lexicon_word in lexicon:
            if word in lexicon_word:
                print(lexicon_word)
def test_words(self):
    for name in udhr.fileids():
        try:
            words = list(udhr.words(name))
        except AssertionError:
            print(name)
            raise
        self.assertTrue(words)
def runLeaveOutWordTrialUnbiased(language):
    """
    Chooses a single word to exclude from the types of the UDHR and
    then tests against that
    """
    all_words = list(set(filterWords(udhr.words(language))))
    test_set = random.choice(all_words)
    train_set = [w for w in all_words if w != test_set]
    bigrams = [(language, makeBigrams(train_set))]
    for lang in LANGUAGES:
        if lang == language:
            continue
        bigrams.append((lang, makeBigrams(udhr.words(lang))))
    grammars = makeBigramGrammars(bigrams)
    return [test_set], predictLanguage(test_set, grammars)
def get_udhr_word_length_cdf():
    languages = ['Chickasaw', 'English', 'German_Deutsch',
                 'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Malay_BahasaMelayu']
    cfd = nltk.ConditionalFreqDist((lang, len(word))
                                   for lang in languages
                                   for word in udhr.words(lang + '-Latin1'))
    cfd.plot(cumulative=True)
def setUp(self):
    languages = ['English', 'German_Deutsch', 'French_Francais']
    # The udhr corpus contains the Universal Declaration of Human Rights
    # in over 300 languages.
    language_base = dict((language, udhr.words(language + '-Latin1'))
                         for language in languages)
    # Build the language models.
    self.langModeler = LangModeler(languages, language_base)
def print_udhr():
    from nltk.corpus import udhr
    languages = ['Chickasaw', 'English', 'German_Deutsch']
    cfd = nltk.ConditionalFreqDist((lang, len(word))
                                   for lang in languages
                                   for word in udhr.words(lang + '-Latin1'))
    cfd.plot(cumulative=True)
def get_word_set(lang, talla):
    stop_words = set(stopwords.words(lang))
    print('\n Stopwords: ', len(stop_words))
    words = udhr.words(lang + '-Latin1')
    words = [w.lower() for w in words if w not in stop_words and w.isalpha()]
    fdist = nltk.FreqDist(words)
    # Return the `talla` most common words.
    words = fdist.most_common(talla)
    return words
def find_language(wordTested):
    latinLanguages = list()
    for language in udhr.fileids():
        if 'Latin1' in language:
            latinLanguages.append(language)
    languageContains = list()
    for latinlanguage in latinLanguages:
        if wordTested in udhr.words(latinlanguage):
            languageContains.append(latinlanguage)
    return languageContains
def perform_experiment(modelFile, modelLanguage, dataFile, dataLanguage):
    languageModel = LanguageModel(modelFile)
    try:
        # Read test words
        words = udhr.words(dataFile)[0:1000]
    except:
        print("UDHR language file " + dataFile + " does not exist", file=sys.stderr)
        sys.exit(1)
    # All words in the test set
    countWords = len(words)
    # Words successfully predicted by unigram model
    unigramPredicted = 0
    # Words successfully predicted by bigram model
    bigramPredicted = 0
    # Words successfully predicted by trigram model
    trigramPredicted = 0
    print("\n# Model: " + modelLanguage + ", Test Dataset: " + dataLanguage)
    print("+----------------------+---------------------+---------------------+---------------------+")
    print("| Word                 | Unigram Probability | Bigram Probability  | Trigram Probability |")
    print("|----------------------|---------------------|---------------------|---------------------|")
    for word in words:
        unigramProbability = languageModel.calculate_unigram_probability(word)
        if unigramProbability > 0:
            unigramPredicted = unigramPredicted + 1
        bigramProbability = languageModel.calculate_bigram_probability(word)
        if bigramProbability > 0:
            bigramPredicted = bigramPredicted + 1
        trigramProbability = languageModel.calculate_trigram_probability(word)
        if trigramProbability > 0:
            trigramPredicted = trigramPredicted + 1
        print("| %20s | %19.17f | %19.17f | %19.17f |"
              % (word, unigramProbability, bigramProbability, trigramProbability))
    print("|----------------------|---------------------|---------------------|---------------------|")
    print("| %20s | %18.5f%% | %18.5f%% | %18.5f%% |"
          % ("Accuracy",
             unigramPredicted * 100 / countWords,
             bigramPredicted * 100 / countWords,
             trigramPredicted * 100 / countWords))
    print("+----------------------+---------------------+---------------------+---------------------+")
def exercise_udhr():
    print(udhr.fileids())
    # Compare word-length distributions of the UDHR across languages.
    languages = ['Chickasaw', 'English', 'German_Deutsch',
                 'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
    cfd = nltk.ConditionalFreqDist((lang, len(word))
                                   for lang in languages
                                   for word in udhr.words(lang + '-Latin1'))
    cfd.plot()
def conditional_freq_dist():
    from nltk.corpus import udhr
    languages = ['Chickasaw', 'English', 'German_Deutsch',
                 'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
    cfd = nltk.ConditionalFreqDist((lang, len(word))
                                   for lang in languages
                                   for word in udhr.words(lang + '-Latin1'))
    cfd.tabulate(conditions=['English', 'German_Deutsch'],
                 samples=range(10), cumulative=True)
def find_language(word):
    '''Returns the list of languages in which the word is found.

    Limitations:
    - currently only checks nltk.corpus.udhr (Universal Declaration of
      Human Rights, i.e., whether the word is "universal", haha)
    - currently only checks Latin-1 languages in udhr
    '''
    # trivial, like i said. right??
    import string  # to strip off punctuation - my little finishing touch
    return [lang for lang in latin_languages
            if word.strip(string.punctuation) in set(udhr.words(lang))]
def guess_lang(text):
    '''Guess the language of the text. This version includes only Spanish,
    German and English, and the sample needs to be quite big, but it could
    be enhanced.'''
    Spanish = udhr.words('Spanish-Latin1')
    German = udhr.words('German_Deutsch-Latin1')
    English = udhr.words('English-Latin1')
    # fd and cor are helpers defined elsewhere (a frequency distribution
    # and a correlation measure, respectively).
    spanfd = fd(Spanish)
    small_spanfd = {}
    gerfd = fd(German)
    small_gerfd = {}
    enfd = fd(English)
    small_enfd = {}
    text_fd = fd(nltk.regexp_tokenize(text.lower(), r'\w+'))
    # Keep only the keys that also occur in the input text.
    for key in spanfd.keys():
        if key in text_fd:
            small_spanfd[key] = spanfd[key]
    for key in enfd.keys():
        if key in text_fd:
            small_enfd[key] = enfd[key]
    for key in gerfd.keys():
        if key in text_fd:
            small_gerfd[key] = gerfd[key]
    corwithspan = cor(small_spanfd, text_fd)
    corwithen = cor(small_enfd, text_fd)
    corwithger = cor(small_gerfd, text_fd)
    if abs(corwithspan) == abs(corwithen) == abs(corwithger):
        print "I don't know..."
    elif max(abs(corwithspan), abs(corwithen), abs(corwithger)) == abs(corwithspan):
        print "It's Spanish!"
    elif max(abs(corwithspan), abs(corwithen), abs(corwithger)) == abs(corwithen):
        print "It's English!"
    elif max(abs(corwithspan), abs(corwithen), abs(corwithger)) == abs(corwithger):
        print "It's German!"
def find_language(s):
    latin = []
    final_langs = []
    for id in udhr.fileids():
        if '-Latin1' in id:
            latin.append(id)
    for lang in latin:
        for word in udhr.words(lang):
            if word == s:
                final_langs.append(lang)
                print "Found word: " + word, "Search word: " + s
                break
    return final_langs
def find_language(search_word):
    languages = []
    for lang_id in udhr.fileids():
        if 'Latin1' in lang_id:
            for word in udhr.words(lang_id):
                if search_word.lower() == word.lower():
                    languages.append(lang_id.split("-")[0])
                    break
    languages = set(languages)
    if languages:
        print("The word '", search_word, "' is in the following ",
              len(languages), "languages:", languages)
    else:
        print("no results found")
def multiLanguages():
    nltk.corpus.cess_esp.words()
    nltk.corpus.floresta.words()
    nltk.corpus.indian.words('hindi.pos')
    nltk.corpus.udhr.fileids()
    nltk.corpus.udhr.words('Javanese-Latin1')[11:]
    languages = ['Chickasaw', 'English', 'German_Deutsch',
                 'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
    cfd = nltk.ConditionalFreqDist((lang, len(word))
                                   for lang in languages
                                   for word in nltk.corpus.udhr.words(lang + '-Latin1'))
    cfd.plot(cumulative=True)
def ex25_findlanguage():
    from nltk.corpus import udhr
    word_lang_map = {}
    for fileid in udhr.fileids():
        if fileid.endswith("-Latin1"):
            lang = fileid[:-7]  # strip the "-Latin1" suffix
            words = udhr.words(fileid)
            for word in words:
                # Map each word to the set of languages it appears in.
                word_lang_map.setdefault(word, set()).add(lang)
    print word_lang_map["arashobora"]
def get_TTRs(languages):
    TTRs = {}
    for lang in languages:
        words = udhr.words(lang)
        ### BEGIN SOLUTION
        TTRs[lang] = []
        for num in range(100, 1301, 100):
            seen = set()
            n_type = 0
            for i in range(num):
                word = words[i].lower()
                if word not in seen:
                    seen.add(word)
                    n_type += 1
            TTRs[lang].append(n_type)
        ### END SOLUTION
    return TTRs
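# Hypothetical usage of get_TTRs, with fileids that exist in udhr
# (both appear elsewhere in this collection):
ttrs = get_TTRs(['English-Latin1', 'German_Deutsch-Latin1'])
print(ttrs['English-Latin1'])  # 13 type counts, one per 100-token increment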
def lengthTrial():
    results = pd.DataFrame(columns=['Language', 'Length', 'Accuracy'])
    for lang in LANGUAGES:
        words_by_length = {}
        for word in set(filterWords(udhr.words(lang))):
            words_by_length[len(word)] = [word] + words_by_length.get(len(word), [])
        for l, words in words_by_length.items():
            correct = 0
            for w in words:
                result = predictLanguage(w, trigram_grammars)
                prediction = max(result)[1]
                correct += 1 if prediction == lang else 0
            accuracy = correct / len(words)
            results.loc[str(l) + "-" + lang] = [lang, l, accuracy]
    return results
def fun3():
    from nltk.corpus import udhr
    languages = ['Chickasaw', 'English', 'German_Deutsch',
                 'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
    cfd = nltk.ConditionalFreqDist((lang, len(word))
                                   for lang in languages
                                   for word in udhr.words(lang + '-Latin1'))
    # In plot() and tabulate(), the conditions= parameter specifies which
    # conditions to display; if we omit it, all conditions are shown.
    # Likewise, samples= limits which samples to display. This lets us load
    # a large amount of data into a conditional frequency distribution and
    # then explore it by plotting or tabulating selected conditions and
    # samples, with full control over their display order. For example,
    # below we tabulate cumulative frequencies for two languages and word
    # lengths under 10 characters; the value in the last cell of the top
    # row means the English text has 1638 words of 9 or fewer characters.
    cfd.tabulate(conditions=['English', 'German_Deutsch'],
                 samples=range(10), cumulative=True)
    cfd.plot(conditions=['English', 'German_Deutsch'],
             samples=range(10), cumulative=True)
def tabulate():
    cfd = nltk.ConditionalFreqDist((target, fileid[:4])
                                   for fileid in inaugural.fileids()
                                   for w in inaugural.words(fileid)
                                   for target in ['america', 'citizen']
                                   if w.lower().startswith(target))
    languages = ['Chickasaw', 'English', 'German_Deutsch',
                 'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
    cfd = nltk.ConditionalFreqDist((lang, len(word))
                                   for lang in languages
                                   for word in udhr.words(lang + '-Latin1'))
    cfd.tabulate(conditions=['English', 'German_Deutsch'],
                 samples=range(10), cumulative=True)
def lengthTrial():
    """
    A separate trial that sorts all the words by length and then
    examines the accuracy on a per-length basis
    """
    results = pd.DataFrame(columns=['Language', 'Length', 'Accuracy'])
    for lang in LANGUAGES:
        words_by_length = {}
        for word in set(filterWords(udhr.words(lang))):
            words_by_length[len(word)] = [word] + words_by_length.get(len(word), [])
        for l, words in words_by_length.items():
            correct = 0
            for w in words:
                result = predictLanguage(w, bigram_grammars)
                prediction = max(result)[1]
                correct += 1 if prediction == lang else 0
            accuracy = correct / len(words)
            results.loc[str(l) + "-" + lang] = [lang, l, accuracy]
    return results
def udhr_rankings(debug=False):
    """
    Get the conditional frequency distributions for each language in the
    udhr corpus.

    :returns: dictionary of language to conditional frequency distribution
    :rtype: dict
    """
    result = dict()
    if debug:
        stdout.write('Preparing training sets')
    for _id in [s for s in udhr.fileids() if '-' in s]:
        split_id = _id.split('-')
        language = split_id[0]
        # Only allow some encodings.
        if udhr.encoding(_id) not in ENCODINGS:
            continue
        try:
            words = udhr.words(_id)
            result[language] = FreqDist(words)
        except (AssertionError, UnicodeDecodeError):
            # Problems reading, so we skip.
            pass
        if debug:
            stdout.write('.')
            stdout.flush()
    if debug:
        stdout.write('\n')
    return result
def __init__(self, languages):
    self._langs = languages
    self._language_base = dict((language, udhr.words(language + '-Latin1'))
                               for language in languages)
    self._language_model_cfd = self.build_language_models()
import nltk
from nltk.corpus import udhr

# Map each word to the languages (fileids) it occurs in, with counts.
cfd = nltk.ConditionalFreqDist((word, lang)
                               for lang in udhr.fileids()
                               for word in udhr.words(lang))

def find_language(word):
    # Return the fileid in which the word occurs most often.
    return cfd[word].max()
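# Hypothetical usage of find_language above. 'Universal' is among the first
# words of udhr's 'English-Latin1' file, so an English fileid is a plausible
# result, though the max is taken over every language in the corpus, and
# reading all fileids may fail for some encodings (see the test elsewhere
# in this collection that catches AssertionError).
print(find_language('Universal'))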
import nltk
from nltk.corpus import udhr

languages = ['Korean_Hankuko', 'Japanese_Nihongo', 'Vietnamese-ALRN']
cfd = nltk.ConditionalFreqDist((lang, len(word))
                               for lang in languages
                               for word in udhr.words(lang + '-UTF8'))
cfd.plot(cumulative=True)
# ◑ Download some text from a language that has vowel harmony (e.g. Hungarian),
# extract the vowel sequences of words, and create a vowel bigram table.
import nltk
from nltk.corpus import udhr

# Pull in the Universal Declaration of Human Rights in Hungarian.
text = udhr.words('Hungarian_Magyar-Latin1')

def is_vowel(letter):
    """Checks to see if a letter is a vowel."""
    if letter in "aeiou":
        return True
    else:
        return False

def pull_out_vowels(word):
    """Takes in a word and pulls out all vowels for it."""
    vowels = []
    for letter in word:
        if is_vowel(letter):
            vowels.extend(letter)
    vowels = nltk.bigrams(vowels)
    return vowels

def vowels_for_all_words(text):
    """Pulls out all vowels for all words."""
    vowels = []
    for word in text:
        vowels.extend(pull_out_vowels(word))
    return vowels
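# A minimal sketch completing the exercise above: feed the vowel bigram
# pairs into a ConditionalFreqDist and tabulate them. Note that is_vowel
# only checks the ASCII vowels "aeiou", so accented Hungarian vowels
# (such as á, é, ö, ü) are not counted.
cfd = nltk.ConditionalFreqDist(vowels_for_all_words(text))
cfd.tabulate()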
#!/usr/bin/python
import nltk
from nltk.book import *
from nltk import FreqDist
from nltk.corpus import gutenberg
from nltk.corpus import udhr

for fid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fid))
    num_words = len(gutenberg.words(fid))
    num_sents = len(gutenberg.sents(fid))
    num_vocab = len(set(w.lower() for w in gutenberg.words(fid)))
    print('Average word length: %.1f Average sentence length: %.1f Lexical diversity: %.1f File: %s'
          % (float(num_chars) / num_words, float(num_words) / num_sents,
             float(num_words) / num_vocab, fid))

lang = 'English'
cfd = nltk.ConditionalFreqDist((lang, len(word))
                               for word in udhr.words(lang + '-Latin1'))
cfd.plot(cumulative=True)

names = nltk.corpus.names
cfd = nltk.ConditionalFreqDist((fid, name[-1])
                               for fid in names.fileids()
                               for name in names.words(fid))
cfd.plot()
import nltk
from nltk.corpus import udhr

languages = ['Chinanteco-Ajitlan-Latin1', 'Chinanteco-UTF8', 'Chinese_Mandarin-GB2312']
cfd = nltk.ConditionalFreqDist((lang, len(word))
                               for lang in languages
                               for word in udhr.words(lang))
cfd.plot(cumulative=True)
reuters.categories(['training/9865', 'training/9880'])
reuters.fileids(['barley', 'corn'])
reuters.words('training/9865')[:14]
reuters.words(['training/9865', 'training/9880'])
reuters.words(categories=['barley', 'corn'])

# Inaugural address corpus
from nltk.corpus import inaugural
inaugural.fileids()

# The Universal Declaration of Human Rights in multiple languages
from nltk.corpus import udhr
languages = ['Chickasaw', 'English', 'German_Deutsch',
             'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
cfd = nltk.ConditionalFreqDist((lang, len(word))
                               for lang in languages
                               for word in udhr.words(lang + '-Latin1'))
cfd.plot(cumulative=True)
cfd.tabulate(conditions=['English', 'German_Deutsch'],
             samples=range(10), cumulative=True)

# Conditional frequency distributions
genre_word = [(genre, word)
              for genre in ['news', 'romance']
              for word in brown.words(categories=genre)]
cfd = nltk.ConditionalFreqDist(genre_word)
cfd.conditions()
list(cfd['romance'])
cfd['romance']['could']

from nltk.corpus import inaugural
cfd = nltk.ConditionalFreqDist((target, fileid[:4])
                               for fileid in inaugural.fileids()
                               for w in inaugural.words(fileid)
                               for target in ['america', 'citizen']
                               if w.lower().startswith(target))
def exercise_udhr():
    print udhr.fileids()
    # Compare word-length distributions of the UDHR across languages.
    languages = ["Chickasaw", "English", "German_Deutsch",
                 "Greenlandic_Inuktikut", "Hungarian_Magyar", "Ibibio_Efik"]
    cfd = nltk.ConditionalFreqDist((lang, len(word))
                                   for lang in languages
                                   for word in udhr.words(lang + "-Latin1"))
    cfd.plot()
#!/usr/bin/python3
# coding: utf-8
from nltk.corpus import udhr  # contains the Universal Declaration of Human Rights in over 300 languages
##################################################################
## Quick sanity checks
print(type(udhr))  # <class 'nltk.corpus.reader.udhr.UdhrCorpusReader'>
print(len(udhr.fileids()))  # 310
print(udhr.fileids()[:2])  # ['Abkhaz-Cyrillic+Abkh', 'Abkhaz-UTF8']
print([lang for lang in udhr.fileids() if lang.startswith('English')])  # ['English-Latin1']
print(len(udhr.words('English-Latin1')))  # 1781
print(udhr.words('English-Latin1')[:5])  # ['Universal', 'Declaration', 'of', 'Human', 'Rights']
languages = ['Chickasaw', 'English', 'German_Deutsch',
             'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']  # commonly used languages
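# The language list above is presumably intended for the cumulative
# word-length plot that recurs throughout this collection; a minimal
# sketch (assumes nltk is importable):
import nltk
cfd = nltk.ConditionalFreqDist((lang, len(word))
                               for lang in languages
                               for word in udhr.words(lang + '-Latin1'))
cfd.plot(cumulative=True)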
rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')
cvs = [cv for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]', w)]
print(cvs[:10])  # ['ka', 'ka', 'ka', 'ka', 'ka', 'ro', 'ka', 'ka', 'vi', 'ko']
cfd = ConditionalFreqDist(cvs)
cfd.tabulate()
#     a   e   i   o   u
# k 418 148  94 420 173
# p  83  31 105  34  51
# r 187  63  84  89  79
# s   0   0 100   2   1
# t  47   8   0 148  37
# v  93  27 105  48  49
##################################################################
## Processing udhr: the distribution of word length across languages
languages = ['Chickasaw', 'English', 'German_Deutsch',
             'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
cfd = ConditionalFreqDist((lang, len(word))
                          for lang in languages
                          for word in udhr.words(lang + '-Latin1'))
cfd.plot(cumulative=True)
##################################################################
## Plot the final letters of male vs. female names; it is well known that
## names ending in the letter a are almost always female.
cfd = nltk.ConditionalFreqDist((fileid, name[-1])
                               for fileid in names.fileids()
                               for name in names.words(fileid))
cfd.plot()
##################################################################
## Predicting words with NLTK
# Task: train and build a word predictor, i.e., given a training corpus,
# write a program that predicts the next word for a given word, and use
# this predictor to randomly generate a 20-word sentence.
# To build the predictor, we first compute the distribution of word pairs
# in the training corpus, i.e., for each word we count how often each
# following word occurs. Once we have this distribution, we can take an
# input word, look up all the words that follow it in the corpus, and
# output one of them at random. To generate a random 20-word sentence, we
# just start from a seed word, ask the predictor for the next word, and
# repeat until the sentence is 20 words long. Listing 2 shows a simple
# implementation using the modules NLTK provides, with Jane Austen's
# Persuasion as the training corpus.
def generate_model(cfdist, word, num=20):
    # A minimal body following the description above: emit the current
    # word, then pick a random word among those observed to follow it.
    import random
    for i in range(num):
        print(word, end=' ')
        word = random.choice(list(cfdist[word].keys()))
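# Hypothetical usage of generate_model: build a conditional frequency
# distribution over word bigrams from Persuasion (the training corpus
# named above) and generate 20 words starting from a seed word.
text = nltk.corpus.gutenberg.words('austen-persuasion.txt')
cfd = nltk.ConditionalFreqDist(nltk.bigrams(text))
generate_model(cfd, 'the')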
def find_language(word):
    all_languages = [language for language in udhr.fileids()
                     if language[-6:] == 'Latin1']
    word_languages = [language for language in all_languages
                      if word in udhr.words(language)]
    return word_languages