def demo(**kwargs):
    import nltk
    from nltk.corpus.util import LazyCorpusLoader
    from nltk_contrib.coref import NLTK_COREF_DATA
    from nltk_contrib.coref.muc import muc6_documents, muc7_documents
    from nltk_contrib.coref.muc import MUCCorpusReader

    nltk.data.path.insert(0, NLTK_COREF_DATA)

    muc6 = LazyCorpusLoader('muc6/', MUCCorpusReader, muc6_documents)
    for sent in muc6.iob_sents()[:]:
        for word in sent:
            print word
        print
    print
    for sent in muc6.mentions(depth=None):
        for mention in sent:
            print mention
        if sent: print
    print

    muc7 = LazyCorpusLoader('muc7/', MUCCorpusReader, muc7_documents)
    for sent in muc7.iob_sents()[:]:
        for word in sent:
            print word
        print
    print
    for sent in muc7.mentions(depth=None):
        for mention in sent:
            print mention
        if sent: print
    print
def new_wordnet_instance():
    """
    Create a new wordnet instance. This is useful for parallel workflows:
    multiple processes cannot share the same wordnet instance (as happens
    when it is imported globally with `from nltk.corpus import wordnet`),
    because nltk is not thread-safe.
    """
    return LazyCorpusLoader(
        'wordnet', WordNetCorpusReader,
        LazyCorpusLoader('omw', CorpusReader, r'.*/wn-data-.*\.tab',
                         encoding='utf8'))
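# A minimal usage sketch for new_wordnet_instance() above, assuming it is
# importable by the worker processes; the pool size and word list are
# illustrative, not part of the original snippet. Each worker builds its own
# WordNet loader in an initializer instead of sharing a module-level one.
from multiprocessing import Pool

_wn = None

def _init_worker():
    global _wn
    _wn = new_wordnet_instance()          # private loader for this process

def synset_count(word):
    # count the synsets of a word using the per-process WordNet instance
    return word, len(_wn.synsets(word))

if __name__ == '__main__':
    with Pool(processes=4, initializer=_init_worker) as pool:
        print(pool.map(synset_count, ['dog', 'run', 'bank']))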
def loaddiff(self):
    corpus_root = str(os.path.join(GetFilePathRoot.get_root_dir(), "data"))
    # the Chinese directory name is garbled unless decoded from GB2312
    corpus_root = unicode(corpus_root, "GB2312")
    self.logger.info(corpus_root)

    pattern_1 = r".*/diff1/.*\.txt"
    pattern_2 = r".*/diff2/.*\.txt"
    pattern_3 = r".*/diff3/.*\.txt"

    self.logger.info("lazy-loading corpora")
    self.diff1 = LazyCorpusLoader(corpus_root, PlaintextCorpusReader, pattern_1)
    self.diff2 = LazyCorpusLoader(corpus_root, PlaintextCorpusReader, pattern_2)
    self.diff3 = LazyCorpusLoader(corpus_root, PlaintextCorpusReader, pattern_3)
    self.logger.info("corpora loaded")
def make_classifier():
    positive_file = 'positive_tweets.json'
    negative_file = 'negative_tweets.json'
    files = [positive_file, negative_file]
    twitter_samples = LazyCorpusLoader('twitter_samples', TwitterCorpusReader,
                                       files, word_tokenizer=CustomTokenizer())

    # tokenized() returns a list of lists
    twitter_tokens = twitter_samples.tokenized()

    # unpack the list of lists with a nested list comprehension
    frequency_dist = nltk.FreqDist(x for sub in twitter_tokens for x in sub)
    frequency_dist.pprint(100)
    master_list_of_words = tuple(frequency_dist.keys())
    extraction_function = make_extract_features_func(master_list_of_words)

    positive_tokens = twitter_samples.tokenized(positive_file)
    negative_tokens = twitter_samples.tokenized(negative_file)
    positive_tokens = [(token, 'positive') for token in positive_tokens]
    negative_tokens = [(token, 'negative') for token in negative_tokens]
    all_tokens = positive_tokens + negative_tokens
    random.shuffle(all_tokens)

    training_set = nltk.classify.apply_features(extraction_function, all_tokens)
    classifier = NaiveBayesClassifier.train(training_set)
    return classifier, master_list_of_words
def build_terms(self, terms):
    # save the original corpus
    corpus_temp = terms.kwargs["corpus"]
    groups = re.match(r'/home/aplm/nltk_data/corpora/c50/(.+)',
                      corpus_temp.root.path)
    terms.kwargs["corpus"] = LazyCorpusLoader(
        "c50_tags/" + groups.group(1), CategorizedPlaintextCorpusReader,
        r'.+/.+', cat_pattern=r'(.+)/.+')
    print "In ModeWeightClassCollocationPOS"

    cache_file = "%s.dat" % terms.name
    terms.tokens = []
    shelf = shelve.open(cache_file, protocol=2)

    for author in terms.kwargs["corpus"].categories():
        author_files = set(terms.kwargs["corpus"].fileids([author])) & \
            set(terms.kwargs["source"])
        author_files = list(author_files)
        if len(author_files) == 0:
            continue
        author_files.sort()
        #print "str(author_files): " + str(author_files)
        #print "str(terms.kwargs['corpus']): " + str(terms.kwargs["corpus"]) + \
        #    " fileids: " + str(terms.kwargs["corpus"].fileids([author])) + \
        #    " source: " + str(terms.kwargs["source"])

        f_srcs = "|".join(author_files)
        terms.kwargs["string"] = \
            terms.kwargs["corpus"].raw(fileids=author_files).lower()

        if f_srcs in shelf and terms.kwargs["lazy"]:
            terms.tokens += shelf[f_srcs]
            #print(str(f_src))
            #print("%s ... Found in \"%s\"" % (f_src, cache_file))
        else:
            terms.kwargs["string"] = \
                terms.kwargs["corpus"].raw(fileids=author_files).lower()
            temp_tokens = terms.calc_terms()
            # calc_terms() turns these options off, but we still need them,
            # so set them again here
            terms.kwargs["boolBuildSetGlobal"] = True
            terms.kwargs["mode"] = EnumModes.MODE_CORPUS_POS_GLOBAL_A
            ###############################################################
            terms.tokens += temp_tokens
            if terms.kwargs["lazy"]:
                shelf[f_srcs] = temp_tokens
            #print ("%s ... Recalculated in \"%s\"" % (f_src, cache_file))

    terms.kwargs["boolBuildSetGlobal"] = False
    terms.kwargs["mode"] = EnumModes.MODE_CORPUS
    shelf.close()

    # restore the original corpus
    terms.kwargs["corpus"] = corpus_temp
def test():
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen",
                             encoding="utf-8")
    assert isinstance(jeita.tagged_words()[0][1], str)
def load_treebank(sections):
    treebank_path = os.environ.get('NLTK_TREEBANK', 'treebank/combined')
    treebank = LazyCorpusLoader(
        treebank_path,
        BracketParseCorpusReader,
        r'(%s\/)?wsj_%s.*\.mrg' % (sections, sections))
    return treebank
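# A possible call site for load_treebank() above, assuming a combined Penn
# Treebank is installed under nltk_data (or NLTK_TREEBANK points to one);
# the argument is a regex fragment matched against the two-digit section
# number in the file names, so '0[0-9]' selects sections 00-09.
treebank = load_treebank('0[0-9]')
print(treebank.fileids()[:5])
print(treebank.parsed_sents()[0])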
class LangDetect(object):
    language_trigrams = {}
    langid = LazyCorpusLoader('langid', LangIdCorpusReader, r'(?!\.).*\.txt')

    def __init__(self, languages=['nl', 'en', 'fr', 'de', 'es']):
        for lang in languages:
            self.language_trigrams[lang] = FreqDist()
            for f in self.langid.freqs(fileids=lang + "-3grams.txt"):
                self.language_trigrams[lang].inc(f[0], f[1])

    def detect(self, text):
        ''' Detect the text's language '''
        words = nltk_word_tokenize(text.lower())
        trigrams = {}
        scores = dict([(lang, 0) for lang in self.language_trigrams.keys()])

        for match in words:
            for trigram in self.get_word_trigrams(match):
                if not trigram in trigrams.keys():
                    trigrams[trigram] = 0
                trigrams[trigram] += 1

        total = sum(trigrams.values())

        for trigram, count in trigrams.items():
            for lang, frequencies in self.language_trigrams.items():
                # normalize and add to the total score
                scores[lang] += (float(frequencies[trigram]) / float(frequencies.N())) * \
                                (float(count) / float(total))

        return sorted(scores.items(), key=lambda x: x[1], reverse=True)[0][0]

    def get_word_trigrams(self, match):
        return [''.join(trigram) for trigram in nltk_trigrams(match)
                if trigram != None]
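# A short usage sketch for the LangDetect class above, assuming the 'langid'
# corpus with per-language "<lang>-3grams.txt" files is installed; the sample
# sentences are illustrative.
detector = LangDetect()                                  # defaults to nl/en/fr/de/es
print(detector.detect("Dit is een Nederlandse zin."))    # expected: 'nl'
print(detector.detect("This is an English sentence."))   # expected: 'en'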
def demo():
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find('corpora/knbc/corpus1')
    fileids = [f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
               if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)]

    def _knbc_fileids_sort(x):
        cells = x.split('-')
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader,
                            sorted(fileids, key=_knbc_fileids_sort),
                            encoding='euc-jp')

    print(knbc.fileids()[:10])
    print(''.join(knbc.words()[:100]))
    print('\n\n'.join(str(tree) for tree in knbc.parsed_sents()[:2]))

    knbc.morphs2str = lambda morphs: '/'.join(
        "%s(%s)" % (m[0], m[1].split(' ')[2]) for m in morphs if m[0] != 'EOS'
    ).encode('utf-8')

    print('\n\n'.join('%s' % tree for tree in knbc.parsed_sents()[:2]))

    print('\n'.join(
        ' '.join("%s/%s" % (w[0], w[1].split(' ')[2]) for w in sent)
        for sent in knbc.tagged_sents()[0:2]
    ))
def ham_corpus_maker(outpath, word):
    corpus = LazyCorpusLoader('hamshahricorpus', XMLCorpusReader,
                              r'(?!\.).*\.xml')
    outfile = codecs.open(outpath, 'w', 'utf-8')
    count = 0
    instancenum = 0
    targetwordnum = 0

    for file in corpus.fileids():
        #print file
        for doc in corpus.xml(file).getchildren():
            #print doc.getchildren()
            cat = doc.getchildren()[3].text
            text = doc.getchildren()[5].text
            newtext = correctPersianString(text)
            newtext = newtext.replace('\n', ' ')
            textlines = newtext.split('.')

            if word in newtext.split():
                print newtext
                outfile.write(newtext)
                outfile.write('\n')
                print
                print

    print str(instancenum) + " seeds found "
    print str(targetwordnum) + " target word found "
    outfile.close()
def ClassifierModel():
    positive_file = 'positive_tweets.json'
    negative_file = 'negative_tweets.json'
    files = [positive_file, negative_file]
    twitter_samples = LazyCorpusLoader('twitter_samples', TwitterCorpusReader,
                                       files, word_tokenizer=CustomTokenizer())

    # tokenized() returns a list of lists
    twitter_tokens = twitter_samples.tokenized()

    # unpack the list of lists using a nested list comprehension
    frequency_dist = nltk.FreqDist(x for sub in twitter_tokens for x in sub)
    frequency_dist.pprint(200)
    master_list_of_words = tuple(frequency_dist.keys())
    extraction_function = feature_extraction(master_list_of_words)

    positive_tokens = twitter_samples.tokenized(positive_file)
    negative_tokens = twitter_samples.tokenized(negative_file)
    positive_tokens = [(token, 'positive') for token in positive_tokens]
    negative_tokens = [(token, 'negative') for token in negative_tokens]
    all_tokens = positive_tokens + negative_tokens
    random.shuffle(all_tokens)

    # create the training set
    training_set = nltk.classify.apply_features(extraction_function, all_tokens)

    # create a classifier by calling the train method
    classifier = NaiveBayesClassifier.train(training_set)
    return classifier, master_list_of_words
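# Hypothetical usage of ClassifierModel() above, assuming the twitter_samples
# corpus and the helpers it references (CustomTokenizer, feature_extraction)
# are available in scope.
classifier, vocabulary = ClassifierModel()
classifier.show_most_informative_features(10)   # inspect the strongest features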
def load_corpus_reader(corpus, reader=None, fileids=None, **kwargs):
    if corpus == 'timit':
        return LazyCorpusLoader('timit', NumberedTaggedSentCorpusReader,
                                r'.+\.tags',
                                tag_mapping_function=simplify_wsj_tag)

    real_corpus = getattr(nltk.corpus, corpus, None)

    if not real_corpus:
        if not reader:
            raise ValueError('you must specify a corpus reader')
        if not fileids:
            fileids = '.*'

        root = os.path.expanduser(corpus)

        if not os.path.isdir(root):
            if not corpus.startswith('corpora/'):
                path = 'corpora/%s' % corpus
            else:
                path = corpus

            try:
                root = nltk.data.find(path)
            except LookupError:
                raise ValueError('cannot find corpus path for %s' % corpus)

        reader_cls = import_attr(reader)
        real_corpus = reader_cls(root, fileids, **kwargs)

    return real_corpus
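# Hypothetical calls to load_corpus_reader() above: a built-in NLTK corpus by
# name, and a local plaintext corpus with an explicit reader. The dotted-path
# string is only a guess at what import_attr() expects, since that helper is
# not shown here.
brown = load_corpus_reader('brown')
local = load_corpus_reader(
    '~/my_corpus',
    reader='nltk.corpus.reader.plaintext.PlaintextCorpusReader',
    fileids=r'.*\.txt')
print(brown.words()[:10])
print(local.fileids())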
def test():
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader(
        'jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')
    assert isinstance(jeita.tagged_words()[0][1], compat.string_types)
def test():
    from nltk.corpus.util import LazyCorpusLoader

    knbc = LazyCorpusLoader(
        'knbc/corpus1', KNBCorpusReader, r'.*/KN.*', encoding='euc-jp')

    assert isinstance(knbc.words()[0], string_types)
    assert isinstance(knbc.sents()[0][0], string_types)
    assert isinstance(knbc.tagged_words()[0], tuple)
    assert isinstance(knbc.tagged_sents()[0][0], tuple)
def main():
    # matplotlib.use('Qt5Agg')
    # import matplotlib.pyplot as plt
    download('punkt')

    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english', EuroparlCorpusReader,
                               r'ep-.*\.en', encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent
    # words replaced by the token "<unknown>"
    sentences = [
        ['start0'] +
        [word.lower() if word_frequency_distribution[word.lower()] >= 10
         else '<unknown>' for word in sentence] +
        ['end0']
        for sentence in english.sents()]

    # create train and test dataset
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary = list(word_frequency_distribution)
    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))

    # Calculate the conditional frequency distribution for bigrams
    bigrams_fd = ConditionalFreqDist(((f,), s) for f, s in bigrams_train)

    # Calculate the conditional probability distribution for bigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist,
                                     vocabulary_length)

    lower_case_letters = string.ascii_lowercase
    error_test = copy.deepcopy(test)
    for sentence in error_test:
        word = random.randrange(1, len(sentence) - 1)
        sentence[word] = random.choice(vocabulary)
        word = random.choice(sentence[1:-2])
        word = random.randrange(1, len(sentence) - 1)
        letter = random.randrange(0, len(sentence[word]))
        sentence[word] = sentence[word][0:letter] + \
            random.choice(lower_case_letters) + sentence[word][letter + 1:]

    corrected = viterbi(error_test[25][:-1], vocabulary, cpd_bigram)
    print('Corrected:{}'.format(corrected))
    print('Original:{}'.format(test[25]))
def loadcorpus(self):
    corpus_root = str(os.path.join(GetFilePathRoot.get_root_dir(), "data"))
    # the Chinese directory name is garbled unless decoded from GB2312
    corpus_root = unicode(corpus_root, "GB2312")
    self.logger.info(corpus_root)

    pattern_1 = r".*/diff1/.*\.txt"
    pattern_2 = r".*/diff2/.*\.txt"
    pattern_3 = r".*/diff3/.*\.txt"

    from nltk.corpus.util import LazyCorpusLoader
    from nltk.corpus import PlaintextCorpusReader

    self.logger.info("loading corpora")
    self.diff1 = LazyCorpusLoader(corpus_root, PlaintextCorpusReader, pattern_1)
    self.diff2 = LazyCorpusLoader(corpus_root, PlaintextCorpusReader, pattern_2)
    self.diff3 = LazyCorpusLoader(corpus_root, PlaintextCorpusReader, pattern_3)
    self.logger.info("loading finished")
def calc_terms(self, kwargs, f_src):
    # save the original corpus
    corpus_temp = kwargs["corpus"]
    groups = re.match(r'/home/aplm/nltk_data/corpora/c50/(.+)',
                      corpus_temp.root.path)
    kwargs["corpus"] = LazyCorpusLoader("c50_term_SFM_23/" + groups.group(1),
                                        CategorizedPlaintextCorpusReader,
                                        r'.+/.+', cat_pattern=r'(.+)/.+')

    sfm_terms = Util.calc_SFM(kwargs["corpus"].raw(fileids=[f_src]))

    # restore the original corpus
    kwargs["corpus"] = corpus_temp
    return sfm_terms
def test():
    from nltk.corpus.util import LazyCorpusLoader

    knbc = LazyCorpusLoader("knbc/corpus1", KNBCorpusReader, r".*/KN.*",
                            encoding="euc-jp")
    assert isinstance(knbc.words()[0], str)
    assert isinstance(knbc.sents()[0][0], str)
    assert isinstance(knbc.tagged_words()[0], tuple)
    assert isinstance(knbc.tagged_sents()[0][0], tuple)
def demo():
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader(
        'jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')
    print('/'.join(jeita.words()[22100:22140]))

    print('\nEOS\n'.join(
        '\n'.join("%s/%s" % (w[0], w[1].split('\t')[2]) for w in sent)
        for sent in jeita.tagged_sents()[2170:2173]))
class LangDetectTwitter(ModifiedMRJob):
    DEFAULT_INPUT_PROTOCOL = 'raw_value'
    language_trigrams = {}
    langid = LazyCorpusLoader('langid', LangIdCorpusReader, r'(?!\.).*\.txt')

    def configure_options(self):
        super(LangDetectTwitter, self).configure_options()
        #self.add_file_option('--langs', default='languages.txt')

    #def __init__(self, languages=['nl', 'en', 'fr', 'de', 'es']):
    def __init__(self, *args, **kwargs):
        super(LangDetectTwitter, self).__init__(*args, **kwargs)
        #languages = [x.strip() for x in open(self.options.langs, 'r').readlines()]
        languages = ['fr', 'en', 'ar', 'es', 'de', 'it', 'id', 'pt', 'tr', 'ru',
                     'nl', 'hi', 'sv', 'fi', 'da', 'pl', 'hu', 'fa', 'he', 'ur',
                     'th']
        for lang in languages:
            self.language_trigrams[lang] = FreqDist()
            for f in self.langid.freqs(fileids=lang + "-3grams.txt"):
                self.language_trigrams[lang].inc(f[0], f[1])

    def mapper(self, key, tweet):
        ''' Detect the text's language '''
        obj = cjson.decode(tweet)
        text = obj['tx']
        words = nltk_word_tokenize(text.lower())
        trigrams = {}
        scores = dict([(lang, 0) for lang in self.language_trigrams.keys()])

        for match in words:
            for trigram in self.get_word_trigrams(match):
                if not trigram in trigrams.keys():
                    trigrams[trigram] = 0
                trigrams[trigram] += 1

        total = sum(trigrams.values())

        for trigram, count in trigrams.items():
            for lang, frequencies in self.language_trigrams.items():
                # normalize and add to the total score
                scores[lang] += (float(frequencies[trigram]) / float(frequencies.N())) * \
                                (float(count) / float(total))

        obj['lang'] = sorted(scores.items(), key=lambda x: x[1], reverse=True)[0][0]
        yield key, obj

    def get_word_trigrams(self, match):
        return [''.join(trigram) for trigram in nltk_trigrams(match)
                if trigram != None]
def __init__(self, languages=LangIDDict().keys()):
    self.language_trigrams = {}
    self.langid = LazyCorpusLoader('langid', LangIdReader, r'(?!\.).*\.txt')

    for lang in languages:
        self.language_trigrams[lang] = FreqDist()
        for f in self.langid.freqs(fileids=lang + "-3grams.txt"):
            self.language_trigrams[lang].inc(f[0], f[1])

    self.language_dicts = dict([
        (id, dict([(trigram, float(value) / float(fdist.N()))
                   for trigram, value in fdist.items()]))
        for id, fdist in self.language_trigrams.items()
    ])
def corpus(self):
    """
    Initialize the corpus object on first access and return it.
    """
    if self._corpus is None:
        # the fileid pattern r'(?!\.).*\.txt' together with
        # cat_pattern=r'(neg|pos)/.*' picks up the files labelled neg and pos
        self._corpus = LazyCorpusLoader(self._corpusName,
                                        CategorizedPlaintextCorpusReader,
                                        r'(?!\.).*\.txt',
                                        cat_pattern=r'(neg|pos)/.*',
                                        encoding='ascii')
    return self._corpus
def demo():
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen",
                             encoding="utf-8")
    print("/".join(jeita.words()[22100:22140]))

    print("\nEOS\n".join(
        "\n".join("{}/{}".format(w[0], w[1].split("\t")[2]) for w in sent)
        for sent in jeita.tagged_sents()[2170:2173]))
def load_data():
    abc = LazyCorpusLoader(
        "abc",
        PlaintextCorpusReader,
        r"(?!\.).*\.txt",
        encoding=[("science", "latin_1"), ("rural", "utf8")],
    )
    raw = abc.sents()

    sentences = []
    stopwords_ = list(stopwords.words('english'))
    final_stopwords = {w: 1 for w in stopwords_}
    for s in raw:
        words = []
        for w in s:
            if w.isalpha() and w not in final_stopwords:
                words.append(w.lower())
        sentences.append(words)

    word_counts = defaultdict(int)
    for sentence in sentences:
        for word in sentence:
            word_counts[word] += 1

    vocabulary = list(word_counts.keys())
    vocabulary.extend(["<START>", "<END>"])
    vocab_size = len(vocabulary)
    word_to_num = {word: n for n, word in enumerate(vocabulary)}
    num_to_word = {n: word for n, word in enumerate(vocabulary)}

    sums = [-2, -1, 1, 2]
    training_data = []
    for sentence in tqdm(sentences):
        length = len(sentence)
        for cur_index in range(length):
            cur_word = sentence[cur_index]
            context_vector = []
            for diff in sums:
                index = cur_index + diff
                if index >= 0 and index < length:
                    context_word = sentence[index]
                    context_vector.append(context_word)
            if len(context_vector) == 4:
                training_data.append([context_vector, cur_word])

    return vocab_size, vocabulary, word_to_num, num_to_word, training_data
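# A small sketch of how the output of load_data() above can be consumed:
# each training example is a [context_words, target_word] pair built from a
# two-word window on each side of the target (CBOW-style).
vocab_size, vocabulary, word_to_num, num_to_word, training_data = load_data()
print(vocab_size)
context, target = training_data[0]
print(context, '->', target)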
def read_knbc(train_file, test_file, reference_file):
    root = nltk.data.find('corpora/knbc/corpus1')
    fileids = [f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
               if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)]

    knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader,
                            sorted(fileids, key=_knbc_fileids_sort),
                            encoding='euc-jp')

    sentences = knbc.sents()

    write_train(sentences[0:4000], train_file)
    write_test(sentences[4000:-1], test_file)
    write_reference(sentences[4000:-1], reference_file)
def treebank_tagger_demo():
    from nltk.corpus.util import LazyCorpusLoader
    from nltk.corpus.reader import PlaintextCorpusReader
    from nltk_contrib.coref.util import TreebankTaggerCorpusReader

    state_union = LazyCorpusLoader(
        'state_union', PlaintextCorpusReader, r'(?!\.svn).*\.txt')
    state_union = TreebankTaggerCorpusReader(state_union)

    print 'Treebank tagger demo...'
    print 'Tagged sentences:'
    for sent in state_union.tagged_sents()[500:505]:
        print sent
        print
    print
    print 'Tagged words:'
    for word in state_union.tagged_words()[500:505]:
        print word
    print
def treebank_chunk_tagger_demo():
    from nltk.corpus.util import LazyCorpusLoader
    from nltk.corpus.reader import PlaintextCorpusReader
    from nltk_contrib.coref.util import TreebankChunkTaggerCorpusReader

    state_union = LazyCorpusLoader(
        'state_union', PlaintextCorpusReader, r'(?!\.svn).*\.txt')
    state_union = TreebankChunkTaggerCorpusReader(state_union)

    print 'Treebank chunker demo...'
    print 'Chunked sentences:'
    for sent in state_union.chunked_sents()[500:505]:
        print sent
        print
    print
    print 'Parsed sentences:'
    for tree in state_union.parsed_sents()[500:505]:
        print tree
        print
    print
def hamshahri_targetword_corpus_maker(match, outpath):
    print 'loading hamshahri corpus'
    print
    corpus = LazyCorpusLoader('hamshahricorpus', XMLCorpusReader,
                              r'(?!\.).*\.xml')
    outfile = codecs.open(outpath, 'w', 'utf-8')
    punclist = [u'،', u'؛', u':', u'؟', u'#']
    matchnum = 0
    count = 0

    print 'creating target corpus'
    for file in corpus.fileids():
        #print file
        for doc in corpus.xml(file).getchildren():
            #print doc.getchildren()
            #cat = doc.getchildren()[3].text
            text = doc.getchildren()[5].text
            newtext = correctPersianString(text)
            newtext = newtext.replace('\n', ' ')
            for item in punclist:
                if item in newtext:
                    newtext = newtext.replace(item, '')
            #print newtext

            if match in newtext.split():
                matchnum += 1
                print newtext
                print '#'
                count += 1
                #outfile.write(newtext)
                outfile.write('ALI')

    outfile.close()
    print count
def build_terms(self, terms):
    # save the original corpus
    corpus_temp = terms.kwargs["corpus"]
    groups = re.match(r'/home/aplm/nltk_data/corpora/c50/(.+)',
                      corpus_temp.root.path)
    terms.kwargs["corpus"] = LazyCorpusLoader(
        "c50_tagged/" + groups.group(1), CategorizedPlaintextCorpusReader,
        r'.+/.+', cat_pattern=r'(.+)/.+')
    #print "In ModeCorpusPOS"

    cache_file = "%s.dat" % terms.name
    terms.tokens = []
    shelf = shelve.open(cache_file, protocol=2)

    for f_src in terms.kwargs["source"]:
        if f_src in shelf and terms.kwargs["lazy"]:
            terms.tokens += shelf[f_src]
            #print(str(f_src))
            #print("%s ... Found in \"%s\"" % (f_src, cache_file))
        else:
            terms.kwargs["string"] = \
                terms.kwargs["corpus"].raw(fileids=[f_src]).lower()
            temp_tokens = terms.calc_terms()
            terms.tokens += temp_tokens
            if terms.kwargs["lazy"]:
                shelf[f_src] = temp_tokens
            #print ("%s ... Recalculated in \"%s\"" % (f_src, cache_file))

    shelf.close()

    # restore the original corpus
    terms.kwargs["corpus"] = corpus_temp
class LangDetect(object):
    language_trigrams = {}
    langid = LazyCorpusLoader('langid', LangIdCorpusReader, r'(?!\.).*\.txt',
                              encoding='utf-8')
    tk = CharGramTokenizer()

    def __init__(self, languages):
        for lang in languages:
            self.language_trigrams[lang] = FreqDist()
            for f in self.langid.freqs(fileids=lang + "-3grams.txt"):
                self.language_trigrams[lang].inc(f[0], f[1])

    def detect(self, text):
        ''' Detect the text's language '''
        if not isinstance(text, unicode):
            raise Exception('not unicode')

        trigrams = self.tk.tokenize(text)
        scores = dict([(lang, 0) for lang in self.language_trigrams.keys()])
        total = sum(trigrams.values())

        for trigram, count in trigrams.items():
            for lang, frequencies in self.language_trigrams.items():
                # normalize and add to the total score
                scores[lang] += (float(frequencies[trigram]) / float(frequencies.N())) * \
                                (float(count) / float(total))

        best_match = sorted(scores.items(), key=lambda x: x[1], reverse=True)[0]
        if best_match[1] == 0:
            return ('other', 0)
        else:
            return best_match