def test():
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")
    assert isinstance(jeita.tagged_words()[0][1], str)
def make_classifier():
    positive_file = 'positive_tweets.json'
    negative_file = 'negative_tweets.json'
    files = [positive_file, negative_file]
    twitter_samples = LazyCorpusLoader('twitter_samples', TwitterCorpusReader, files,
                                       word_tokenizer=CustomTokenizer())
    # this returns a list of lists
    twitter_tokens = twitter_samples.tokenized()
    # need to unpack our list of lists, using a nested list comprehension
    frequency_dist = nltk.FreqDist(x for sub in twitter_tokens for x in sub)
    frequency_dist.pprint(100)
    master_list_of_words = tuple(frequency_dist.keys())
    extraction_function = make_extract_features_func(master_list_of_words)

    positive_tokens = twitter_samples.tokenized(positive_file)
    negative_tokens = twitter_samples.tokenized(negative_file)
    positive_tokens = [(token, 'positive') for token in positive_tokens]
    negative_tokens = [(token, 'negative') for token in negative_tokens]
    all_tokens = positive_tokens + negative_tokens
    random.shuffle(all_tokens)

    training_set = nltk.classify.apply_features(extraction_function, all_tokens)
    classifier = NaiveBayesClassifier.train(training_set)
    return classifier, master_list_of_words
def ClassifierModel():
    positive_file = 'positive_tweets.json'
    negative_file = 'negative_tweets.json'
    files = [positive_file, negative_file]
    twitter_samples = LazyCorpusLoader('twitter_samples', TwitterCorpusReader, files,
                                       word_tokenizer=CustomTokenizer())
    # this returns a list of lists
    twitter_tokens = twitter_samples.tokenized()
    # need to unpack the list of lists using a nested list comprehension
    frequency_dist = nltk.FreqDist(x for sub in twitter_tokens for x in sub)
    frequency_dist.pprint(200)
    master_list_of_words = tuple(frequency_dist.keys())
    extraction_function = feature_extraction(master_list_of_words)

    positive_tokens = twitter_samples.tokenized(positive_file)
    negative_tokens = twitter_samples.tokenized(negative_file)
    positive_tokens = [(token, 'positive') for token in positive_tokens]
    negative_tokens = [(token, 'negative') for token in negative_tokens]
    all_tokens = positive_tokens + negative_tokens
    random.shuffle(all_tokens)

    # creating training set
    training_set = nltk.classify.apply_features(extraction_function, all_tokens)
    # creating a classifier by calling the train method
    classifier = NaiveBayesClassifier.train(training_set)
    return classifier, master_list_of_words
def test():
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")
    assert isinstance(jeita.tagged_words()[0][1], basestring)
def test():
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader(
        'jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')
    assert isinstance(jeita.tagged_words()[0][1], basestring)
def test():
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader(
        'jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')
    assert isinstance(jeita.tagged_words()[0][1], compat.string_types)
def test():
    from nltk.corpus.util import LazyCorpusLoader

    knbc = LazyCorpusLoader(
        'knbc/corpus1', KNBCorpusReader, r'.*/KN.*', encoding='euc-jp')
    assert isinstance(knbc.words()[0], string_types)
    assert isinstance(knbc.sents()[0][0], string_types)
    assert isinstance(knbc.tagged_words()[0], tuple)
    assert isinstance(knbc.tagged_sents()[0][0], tuple)
def test():
    from nltk.corpus.util import LazyCorpusLoader

    knbc = LazyCorpusLoader("knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp")
    assert isinstance(knbc.words()[0], basestring)
    assert isinstance(knbc.sents()[0][0], basestring)
    assert type(knbc.tagged_words()[0]) == tuple
    assert type(knbc.tagged_sents()[0][0]) == tuple
def main():
    # matplotlib.use('Qt5Agg')
    # import matplotlib.pyplot as plt
    download('punkt')

    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english', EuroparlCorpusReader, r'ep-.*\.en', encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words replaced by the token "<unknown>"
    sentences = [['start0'] +
                 [word.lower() if word_frequency_distribution[word.lower()] >= 10 else '<unknown>'
                  for word in sentence] +
                 ['end0']
                 for sentence in english.sents()]

    # create train and test dataset
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary = list(word_frequency_distribution)
    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))

    # Calculate the conditional frequency distribution for bigrams
    bigrams_fd = ConditionalFreqDist(((f, ), s) for f, s in bigrams_train)

    # Calculate the conditional probability distribution for bigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist, vocabulary_length)

    lower_case_letters = string.ascii_lowercase
    error_test = copy.deepcopy(test)
    for sentence in error_test:
        word = random.randrange(1, len(sentence) - 1)
        sentence[word] = random.choice(vocabulary)
        word = random.choice(sentence[1:-2])
        word = random.randrange(1, len(sentence) - 1)
        letter = random.randrange(0, len(sentence[word]))
        sentence[word] = sentence[word][0:letter] + random.choice(lower_case_letters) + sentence[word][letter + 1:]

    corrected = viterbi(error_test[25][:-1], vocabulary, cpd_bigram)
    print('Corrected:{}'.format(corrected))
    print('Original:{}'.format(test[25]))
def loaddiff(self):
    corpus_root = str(os.path.join(GetFilePathRoot.get_root_dir(), "data"))
    # the Chinese directory name is mojibake unless decoded from GB2312
    corpus_root = unicode(corpus_root, "GB2312")
    self.logger.info(corpus_root)
    pattern_1 = r".*/diff1/.*\.txt"
    self.logger.info("lazy-loading corpus")
    self.diff1 = LazyCorpusLoader(corpus_root, PlaintextCorpusReader, pattern_1)
    self.logger.info("corpus loading finished")
def demo():
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find('corpora/knbc/corpus1')
    fileids = [f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
               if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)]

    def _knbc_fileids_sort(x):
        cells = x.split('-')
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader,
                            sorted(fileids, key=_knbc_fileids_sort), encoding='euc-jp')

    print knbc.fileids()[:10]
    print ''.join(knbc.words()[:100])
    print '\n\n'.join('%s' % tree for tree in knbc.parsed_sents()[:2])

    knbc.morphs2str = lambda morphs: '/'.join(
        "%s(%s)" % (m[0], m[1].split(' ')[2]) for m in morphs if m[0] != 'EOS').encode('utf-8')

    print '\n\n'.join('%s' % tree for tree in knbc.parsed_sents()[:2])
    print '\n'.join(' '.join("%s/%s" % (w[0], w[1].split(' ')[2]) for w in sent)
                    for sent in knbc.tagged_sents()[0:2])
def demo():
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")
    print "/".join(jeita.words()[22100:22140])
    print "\nEOS\n".join(
        ["\n".join("%s/%s" % (w[0], w[1].split("\t")[2]) for w in sent)
         for sent in jeita.tagged_sents()[2170:2173]]
    )
def print_corpus_information(corpus: LazyCorpusLoader, corpus_name: str) -> None:
    """
    Prints information about an NLTK corpus, e.g. the Brown corpus.
    :param corpus: the NLTK corpus in use.
    :param corpus_name: the name of the corpus, used in the printed messages.
    :return: None.
    """
    print("Number of words in {} corpus = {}".format(corpus_name, len(corpus.words())))
    print("Number of sentences in {} corpus = {}".format(
        corpus_name, len(corpus.tagged_sents(tagset='universal'))))
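# A minimal usage sketch for print_corpus_information above (the demo function and
# corpus choice are illustrative assumptions, not part of the original code): any
# corpus object exposing words() and tagged_sents(tagset='universal') works, e.g.
# the Brown corpus after nltk.download('brown').
def demo_print_corpus_information():
    from nltk.corpus import brown
    print_corpus_information(brown, "Brown")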
def demo():
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader(
        'jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')
    print('/'.join(jeita.words()[22100:22140]))
    print('\nEOS\n'.join('\n'.join("%s/%s" % (w[0], w[1].split('\t')[2]) for w in sent)
                         for sent in jeita.tagged_sents()[2170:2173]))
def demo():
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader(
        'jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')
    print '/'.join(jeita.words()[22100:22140])
    print '\nEOS\n'.join(['\n'.join("%s/%s" % (w[0], w[1].split('\t')[2]) for w in sent)
                          for sent in jeita.tagged_sents()[2170:2173]])
def new_wordnet_instance():
    """
    Create a new wordnet instance.

    This is useful for parallel workflows. Multiple processes cannot access the
    same wordnet instance (as when it is imported globally with
    `from nltk.corpus import wordnet`), because NLTK is not thread-safe.
    """
    return LazyCorpusLoader(
        'wordnet', WordNetCorpusReader,
        LazyCorpusLoader('omw', CorpusReader, r'.*/wn-data-.*\.tab', encoding='utf8')
    )
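# Hedged usage sketch for new_wordnet_instance: each worker process builds its own
# loader, so no WordNet reader state is shared across processes. count_synsets and
# the word list below are illustrative assumptions, not part of the original code.
def count_synsets(word):
    wn = new_wordnet_instance()  # fresh, process-local WordNet reader
    return word, len(wn.synsets(word))

# Example (run under `if __name__ == '__main__':` when using multiprocessing):
#     from multiprocessing import Pool
#     with Pool(2) as pool:
#         print(pool.map(count_synsets, ['dog', 'run']))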
def __init__(self, languages=LangIDDict().keys()):
    self.language_trigrams = {}
    self.langid = LazyCorpusLoader('langid', LangIdReader, r'(?!\.).*\.txt')

    for lang in languages:
        self.language_trigrams[lang] = FreqDist()
        for f in self.langid.freqs(fileids=lang + "-3grams.txt"):
            self.language_trigrams[lang].inc(f[0], f[1])

    self.language_dicts = dict([
        (id, dict([(trigram, float(value) / float(fdist.N()))
                   for trigram, value in fdist.items()]))
        for id, fdist in self.language_trigrams.items()
    ])
def read_knbc(train_file, test_file, reference_file):
    root = nltk.data.find('corpora/knbc/corpus1')
    fileids = [f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
               if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)]
    knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader,
                            sorted(fileids, key=_knbc_fileids_sort), encoding='euc-jp')

    sentences = knbc.sents()
    write_train(sentences[0:4000], train_file)
    write_test(sentences[4000:-1], test_file)
    write_reference(sentences[4000:-1], reference_file)
def main():
    # matplotlib.use('Qt5Agg')
    # import matplotlib.pyplot as plt
    download('punkt')

    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english', EuroparlCorpusReader, r'ep-.*\.en', encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words replaced by the token "<unknown>"
    sentences = [
        ['start0'] +
        [word.lower() if word_frequency_distribution[word.lower()] >= 10 else '<unknown>'
         for word in sentence] +
        ['end0']
        for sentence in english.sents()]

    # create train and test dataset
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary = list(word_frequency_distribution)
    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))

    # Calculate the conditional frequency distribution for bigrams
    bigrams_fd = ConditionalFreqDist(((f,), s) for f, s in bigrams_train)

    # Calculate the conditional probability distribution for bigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist, vocabulary_length)

    lower_case_letters = string.ascii_lowercase
    error_test = copy.deepcopy(test)
    for sentence in error_test:
        word = random.randrange(1, len(sentence) - 1)
        sentence[word] = random.choice(vocabulary)
        word = random.choice(sentence[1:-2])
        word = random.randrange(1, len(sentence) - 1)
        letter = random.randrange(0, len(sentence[word]))
        sentence[word] = sentence[word][0:letter] + random.choice(lower_case_letters) + sentence[word][letter + 1:]

    corrected = viterbi(error_test[25][:-1], vocabulary, cpd_bigram)
    print('Corrected:{}'.format(corrected))
    print('Original:{}'.format(test[25]))
def parse_wsj(processes=8):
    ptb = LazyCorpusLoader(  # Penn Treebank v3: WSJ portions
        'ptb', CategorizedBracketParseCorpusReader, r'wsj/\d\d/wsj_\d\d\d\d.mrg',
        cat_file='allcats.txt', tagset='wsj')

    fileids = ptb.fileids()
    params = []
    for f in fileids:
        corpus = zip(ptb.parsed_sents(f), ptb.tagged_sents(f))
        for i, (parsed, tagged) in enumerate(corpus):
            params.append((f, i, parsed, tagged))

    p = Pool(processes)
    p.starmap(get_best_parse, sorted(params, key=lambda x: (x[0], x[1])))
def demo():
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")
    print("/".join(jeita.words()[22100:22140]))
    print("\nEOS\n".join("\n".join("{}/{}".format(w[0], w[1].split("\t")[2]) for w in sent)
                         for sent in jeita.tagged_sents()[2170:2173]))
def demo(**kwargs):
    import nltk
    from nltk_contrib.coref import NLTK_COREF_DATA
    from nltk_contrib.coref.muc import muc6_documents, muc7_documents
    from nltk_contrib.coref.muc import MUCCorpusReader

    nltk.data.path.insert(0, NLTK_COREF_DATA)

    muc6 = LazyCorpusLoader('muc6/', MUCCorpusReader, muc6_documents)
    for sent in muc6.iob_sents()[:]:
        for word in sent:
            print word
        print
    print
    for sent in muc6.mentions(depth=None):
        for mention in sent:
            print mention
        if sent:
            print
    print

    muc7 = LazyCorpusLoader('muc7/', MUCCorpusReader, muc7_documents)
    for sent in muc7.iob_sents()[:]:
        for word in sent:
            print word
        print
    print
    for sent in muc7.mentions(depth=None):
        for mention in sent:
            print mention
        if sent:
            print
    print
def load_data():
    abc = LazyCorpusLoader(
        "abc",
        PlaintextCorpusReader,
        r"(?!\.).*\.txt",
        encoding=[("science", "latin_1"), ("rural", "utf8")],
    )
    raw = abc.sents()

    sentences = []
    stopwords_ = list(stopwords.words('english'))
    final_stopwords = {w: 1 for w in stopwords_}
    for s in raw:
        words = []
        for w in s:
            if w.isalpha() and w not in final_stopwords:
                words.append(w.lower())
        sentences.append(words)

    word_counts = defaultdict(int)
    for sentence in sentences:
        for word in sentence:
            word_counts[word] += 1

    vocabulary = list(word_counts.keys())
    vocabulary.extend(["<START>", "<END>"])
    vocab_size = len(vocabulary)
    word_to_num = {word: n for n, word in enumerate(vocabulary)}
    num_to_word = {n: word for n, word in enumerate(vocabulary)}

    sums = [-2, -1, 1, 2]
    training_data = []
    for sentence in tqdm(sentences):
        length = len(sentence)
        for cur_index in range(length):
            cur_word = sentence[cur_index]
            context_vector = []
            for diff in sums:
                index = cur_index + diff
                if index >= 0 and index < length:
                    context_word = sentence[index]
                    context_vector.append(context_word)
            if len(context_vector) == 4:
                training_data.append([context_vector, cur_word])

    return vocab_size, vocabulary, word_to_num, num_to_word, training_data
def ham_corpus_maker(outpath, word):
    corpus = LazyCorpusLoader('hamshahricorpus', XMLCorpusReader, r'(?!\.).*\.xml')
    outfile = codecs.open(outpath, 'w', 'utf-8')
    count = 0
    instancenum = 0
    targetwordnum = 0

    for file in corpus.fileids():
        # print file
        for doc in corpus.xml(file).getchildren():
            # print doc.getchildren()
            cat = doc.getchildren()[3].text
            text = doc.getchildren()[5].text
            newtext = correctPersianString(text)
            newtext = newtext.replace('\n', ' ')
            textlines = newtext.split('.')

            if word in newtext.split():
                print newtext
                outfile.write(newtext)
                outfile.write('\n')
                print
                print

    print str(instancenum) + " seeds found "
    print str(targetwordnum) + " target word found "
    outfile.close()
class LangDetect(object):
    language_trigrams = {}
    langid = LazyCorpusLoader('langid', LangIdCorpusReader, r'(?!\.).*\.txt')

    def __init__(self, languages=['nl', 'en', 'fr', 'de', 'es']):
        for lang in languages:
            self.language_trigrams[lang] = FreqDist()
            for f in self.langid.freqs(fileids=lang + "-3grams.txt"):
                self.language_trigrams[lang].inc(f[0], f[1])

    def detect(self, text):
        ''' Detect the text's language '''
        words = nltk_word_tokenize(text.lower())
        trigrams = {}
        scores = dict([(lang, 0) for lang in self.language_trigrams.keys()])

        for match in words:
            for trigram in self.get_word_trigrams(match):
                if not trigram in trigrams.keys():
                    trigrams[trigram] = 0
                trigrams[trigram] += 1

        total = sum(trigrams.values())

        for trigram, count in trigrams.items():
            for lang, frequencies in self.language_trigrams.items():
                # normalize and add to the total score
                scores[lang] += (float(frequencies[trigram]) / float(frequencies.N())) * \
                                (float(count) / float(total))

        return sorted(scores.items(), key=lambda x: x[1], reverse=True)[0][0]

    def get_word_trigrams(self, match):
        return [''.join(trigram) for trigram in nltk_trigrams(match) if trigram != None]
def load_treebank(sections):
    treebank_path = os.environ.get('NLTK_TREEBANK', 'treebank/combined')
    treebank = LazyCorpusLoader(
        treebank_path, BracketParseCorpusReader,
        r'(%s\/)?wsj_%s.*\.mrg' % (sections, sections))
    return treebank
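# Usage sketch for load_treebank, assuming either a full Penn Treebank install
# pointed to by the NLTK_TREEBANK environment variable or the free
# 'treebank/combined' sample shipped with nltk_data (where section '00' matches
# the bundled wsj_00xx.mrg files):
#     treebank = load_treebank('00')
#     print(len(treebank.tagged_sents()))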
def build_terms(self, terms):
    # save the original corpus
    corpus_temp = terms.kwargs["corpus"]
    groups = re.match(r'/home/aplm/nltk_data/corpora/c50/(.+)', corpus_temp.root.path)
    terms.kwargs["corpus"] = LazyCorpusLoader("c50_tags/" + groups.group(1),
                                              CategorizedPlaintextCorpusReader,
                                              r'.+/.+', cat_pattern=r'(.+)/.+')
    print "In ModeWeightClassCollocationPOS"

    cache_file = "%s.dat" % terms.name
    terms.tokens = []
    shelf = shelve.open(cache_file, protocol=2)

    for author in terms.kwargs["corpus"].categories():
        author_files = set(terms.kwargs["corpus"].fileids([author])) & set(terms.kwargs["source"])
        author_files = list(author_files)
        if len(author_files) == 0:
            continue
        author_files.sort()
        # print "str(author_files): " + str(author_files)

        f_srcs = "|".join(author_files)
        terms.kwargs["string"] = \
            terms.kwargs["corpus"].raw(fileids=author_files).lower()

        if f_srcs in shelf and terms.kwargs["lazy"]:
            terms.tokens += shelf[f_srcs]
            # print("%s ... Found in \"%s\"" % (f_src, cache_file))
        else:
            terms.kwargs["string"] = \
                terms.kwargs["corpus"].raw(fileids=author_files).lower()
            temp_tokens = terms.calc_terms()
            # calc_terms switches this option off, but we still need it afterwards
            terms.kwargs["boolBuildSetGlobal"] = True
            terms.kwargs["mode"] = EnumModes.MODE_CORPUS_POS_GLOBAL_A
            ###############################################################
            terms.tokens += temp_tokens
            if terms.kwargs["lazy"]:
                shelf[f_srcs] = temp_tokens
                # print("%s ... Recalculated in \"%s\"" % (f_src, cache_file))

    terms.kwargs["boolBuildSetGlobal"] = False
    terms.kwargs["mode"] = EnumModes.MODE_CORPUS
    shelf.close()
    # restore the original corpus
    terms.kwargs["corpus"] = corpus_temp
def load_corpus_reader(corpus, reader=None, fileids=None, **kwargs):
    if corpus == 'timit':
        return LazyCorpusLoader('timit', NumberedTaggedSentCorpusReader,
                                '.+\.tags', tag_mapping_function=simplify_wsj_tag)

    real_corpus = getattr(nltk.corpus, corpus, None)

    if not real_corpus:
        if not reader:
            raise ValueError('you must specify a corpus reader')

        if not fileids:
            fileids = '.*'

        root = os.path.expanduser(corpus)

        if not os.path.isdir(root):
            if not corpus.startswith('corpora/'):
                path = 'corpora/%s' % corpus
            else:
                path = corpus

            try:
                root = nltk.data.find(path)
            except LookupError:
                raise ValueError('cannot find corpus path for %s' % corpus)

        reader_cls = import_attr(reader)
        real_corpus = reader_cls(root, fileids, **kwargs)

    return real_corpus
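# Usage sketch for load_corpus_reader (the paths and the dotted reader name are
# illustrative assumptions): a corpus bundled with NLTK resolves by attribute name,
# anything else needs an explicit reader class and a local directory or nltk_data path.
#     brown = load_corpus_reader('brown')
#     custom = load_corpus_reader('~/my_corpus',
#                                 reader='nltk.corpus.reader.plaintext.PlaintextCorpusReader',
#                                 fileids=r'.*\.txt')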
def demo():
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find("corpora/knbc/corpus1")
    fileids = [
        f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)
    ]

    def _knbc_fileids_sort(x):
        cells = x.split("-")
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader("knbc/corpus1", KNBCorpusReader,
                            sorted(fileids, key=_knbc_fileids_sort), encoding="euc-jp")

    print knbc.fileids()[:10]
    print "".join(knbc.words()[:100])
    print "\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2])

    knbc.morphs2str = lambda morphs: "/".join(
        "%s(%s)" % (m[0], m[1].split(" ")[2]) for m in morphs if m[0] != "EOS"
    ).encode("utf-8")

    print "\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2])
    print "\n".join(" ".join("%s/%s" % (w[0], w[1].split(" ")[2]) for w in sent)
                    for sent in knbc.tagged_sents()[0:2])
def read_knbc(train_file, test_file, reference_file):
    root = nltk.data.find('corpora/knbc/corpus1')
    fileids = [
        f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)
    ]
    knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader,
                            sorted(fileids, key=_knbc_fileids_sort), encoding='euc-jp')

    sentences = knbc.sents()
    write_train(sentences[0:4000], train_file)
    write_test(sentences[4000:-1], test_file)
    write_reference(sentences[4000:-1], reference_file)
def __init__(self, languages=['nl', 'en', 'fr', 'de', 'es', 'th', 'pt', 'pl',
                              "id", "ru", "it", "ru", "tr"]):
    logger.info("Build " + self.__class__.__name__ + " ... ")
    self.language_trigrams = {}
    self.langid = LazyCorpusLoader('langid', LangIdCorpusReader, r'(?!\.).*\.txt')
    self.__mutex = threading.Semaphore()

    for lang in languages:
        self.language_trigrams[lang] = FreqDist()
        for f in self.langid.freqs(fileids=lang + "-3grams.txt"):
            self.language_trigrams[lang].inc(f[0], f[1])

    logger.info("Build " + self.__class__.__name__ + ": done!")
def treebank_tagger_demo():
    from nltk.corpus.util import LazyCorpusLoader
    from nltk.corpus.reader import PlaintextCorpusReader
    from nltk_contrib.coref.util import TreebankTaggerCorpusReader

    state_union = LazyCorpusLoader(
        'state_union', PlaintextCorpusReader, r'(?!\.svn).*\.txt')
    state_union = TreebankTaggerCorpusReader(state_union)

    print 'Treebank tagger demo...'
    print 'Tagged sentences:'
    for sent in state_union.tagged_sents()[500:505]:
        print sent
        print
    print
    print 'Tagged words:'
    for word in state_union.tagged_words()[500:505]:
        print word
    print
def from_nltk(cls):
    """Returns a fully populated Propbank with the help of NLTK's interface"""
    ptb = LazyCorpusLoader(
        'ptb',
        CategorizedBracketParseCorpusReader,
        r'wsj/\d\d/wsj_\d\d\d\d.mrg',
        cat_file='allcats.txt'
    )
    propbank_ptb = LazyCorpusLoader(
        'propbank', PropbankCorpusReader,
        'prop.txt', 'frames/.*\.xml', 'verbs.txt',
        lambda filename: filename.upper(),
        ptb
    )  # Must be defined *after* ptb corpus.

    role_dict = {}
    for roleset_xml in propbank_ptb.rolesets():
        role = Role.fromxml(roleset_xml)
        role_dict[role.roleset_id] = role

    instance_dict = defaultdict(dict)
    pb_instances = propbank_ptb.instances()
    for instance in pb_instances:
        instance.fileid = instance.fileid.lower()
        file_num = instance.fileid.split("/")[-1].split(".")[0].replace("wsj_", "")
        sentnum = str(instance.sentnum)
        predicate = instance.predicate
        tree = instance.tree

        if isinstance(predicate, nltk.corpus.reader.propbank.PropbankTreePointer):
            key = Propbank.pointer_to_word(predicate, tree)
        elif isinstance(predicate, nltk.corpus.reader.propbank.PropbankSplitTreePointer):
            key = tuple([Propbank.pointer_to_word(p, tree) for p in predicate.pieces])
        else:
            ### TODO: Investigate when this is the case ###
            # assert False
            continue

        pb_instance = PropbankInstance(instance.fileid, file_num, sentnum, key,
                                       instance.roleset, instance.arguments)
        instance_dict[(file_num, sentnum)][key] = pb_instance

    return Propbank(role_dict, instance_dict)
def loadcorpus(self):
    corpus_root = str(os.path.join(GetFilePathRoot.get_root_dir(), "data"))
    # the Chinese directory name is mojibake unless decoded from GB2312
    corpus_root = unicode(corpus_root, "GB2312")
    self.logger.info(corpus_root)

    pattern_1 = r".*/diff1/.*\.txt"
    pattern_2 = r".*/diff2/.*\.txt"
    pattern_3 = r".*/diff3/.*\.txt"

    from nltk.corpus.util import LazyCorpusLoader
    from nltk.corpus import PlaintextCorpusReader

    self.logger.info("loading corpus")
    self.diff1 = LazyCorpusLoader(corpus_root, PlaintextCorpusReader, pattern_1)
    self.diff2 = LazyCorpusLoader(corpus_root, PlaintextCorpusReader, pattern_2)
    self.diff3 = LazyCorpusLoader(corpus_root, PlaintextCorpusReader, pattern_3)
    self.logger.info("loading finished")
def treebank_chunk_tagger_demo():
    from nltk.corpus.util import LazyCorpusLoader
    from nltk.corpus.reader import PlaintextCorpusReader
    from nltk_contrib.coref.util import TreebankChunkTaggerCorpusReader

    state_union = LazyCorpusLoader(
        'state_union', PlaintextCorpusReader, r'(?!\.svn).*\.txt')
    state_union = TreebankChunkTaggerCorpusReader(state_union)

    print 'Treebank chunker demo...'
    print 'Chunked sentences:'
    for sent in state_union.chunked_sents()[500:505]:
        print sent
        print
    print
    print 'Parsed sentences:'
    for tree in state_union.parsed_sents()[500:505]:
        print tree
        print
    print
def calc_terms(self, kwargs, f_src):
    # save the original corpus
    corpus_temp = kwargs["corpus"]
    groups = re.match(r'/home/aplm/nltk_data/corpora/c50/(.+)', corpus_temp.root.path)
    kwargs["corpus"] = LazyCorpusLoader("c50_term_SFM_23/" + groups.group(1),
                                        CategorizedPlaintextCorpusReader,
                                        r'.+/.+', cat_pattern=r'(.+)/.+')

    sfm_terms = Util.calc_SFM(kwargs["corpus"].raw(fileids=[f_src]))

    # restore the original corpus
    kwargs["corpus"] = corpus_temp
    return sfm_terms
class LangDetectTwitter(ModifiedMRJob):
    DEFAULT_INPUT_PROTOCOL = 'raw_value'
    language_trigrams = {}
    langid = LazyCorpusLoader('langid', LangIdCorpusReader, r'(?!\.).*\.txt')

    def configure_options(self):
        super(LangDetectTwitter, self).configure_options()
        # self.add_file_option('--langs', default='languages.txt')

    # def __init__(self, languages=['nl', 'en', 'fr', 'de', 'es']):
    def __init__(self, *args, **kwargs):
        super(LangDetectTwitter, self).__init__(*args, **kwargs)
        # languages = [x.strip() for x in open(self.options.langs, 'r').readlines()]
        languages = ['fr', 'en', 'ar', 'es', 'de', 'it', 'id', 'pt', 'tr', 'ru', 'nl',
                     'hi', 'sv', 'fi', 'da', 'pl', 'hu', 'fa', 'he', 'ur', 'th']
        for lang in languages:
            self.language_trigrams[lang] = FreqDist()
            for f in self.langid.freqs(fileids=lang + "-3grams.txt"):
                self.language_trigrams[lang].inc(f[0], f[1])

    def mapper(self, key, tweet):
        ''' Detect the text's language '''
        obj = cjson.decode(tweet)
        text = obj['tx']
        words = nltk_word_tokenize(text.lower())
        trigrams = {}
        scores = dict([(lang, 0) for lang in self.language_trigrams.keys()])

        for match in words:
            for trigram in self.get_word_trigrams(match):
                if not trigram in trigrams.keys():
                    trigrams[trigram] = 0
                trigrams[trigram] += 1

        total = sum(trigrams.values())

        for trigram, count in trigrams.items():
            for lang, frequencies in self.language_trigrams.items():
                # normalize and add to the total score
                scores[lang] += (float(frequencies[trigram]) / float(frequencies.N())) * \
                                (float(count) / float(total))

        obj['lang'] = sorted(scores.items(), key=lambda x: x[1], reverse=True)[0][0]
        yield key, obj

    def get_word_trigrams(self, match):
        return [''.join(trigram) for trigram in nltk_trigrams(match) if trigram != None]
def corpus(self):
    """
    This method is used to initialize the corpus object if it wasn't initialized before.
    """
    if self._corpus is None:
        # The use of r'(?!\.).*\.txt' and cat_pattern=r'(neg|pos)/.*' makes it possible
        # to find the files labeled with neg and pos
        self._corpus = LazyCorpusLoader(self._corpusName, CategorizedPlaintextCorpusReader,
                                        r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*',
                                        encoding='ascii')
    return self._corpus
def test():
    from nltk.corpus.util import LazyCorpusLoader

    knbc = LazyCorpusLoader("knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp")
    assert isinstance(knbc.words()[0], str)
    assert isinstance(knbc.sents()[0][0], str)
    assert isinstance(knbc.tagged_words()[0], tuple)
    assert isinstance(knbc.tagged_sents()[0][0], tuple)
def dictionary_backoff(option_tone, backoff):
    '''Creates a dictionary according to the option: tonal/nontonal'''
    if option_tone == "tonal":
        bambara_dict_toolbox = BambaraTagging("cookbook/bambara", ["bamadaba.txt"], option_tone, "POS")
        bambara_dict_toolbox.copy_files()
        reader = LazyCorpusLoader("cookbook/bambara/", ToolboxCorpusReader, ["bamadaba.txt"])
        entries = reader.entries("bamadaba.txt")  # tonal
        words = reader.words("bamadaba.txt")  # tonal
        pos = reader.words("bamadaba.txt", key="ps")  # tonal
    else:
        bambara_dict_toolbox = BambaraTagging("cookbook/bambara", ["bamadaba_non_tonal.txt"], option_tone, "POS")
        bambara_dict_toolbox.copy_files()
        reader = LazyCorpusLoader("cookbook/bambara/", ToolboxCorpusReader, ["bamadaba_non_tonal.txt"])
        entries = reader.entries("bamadaba_non_tonal.txt")  # non-tonal
        words = reader.words("bamadaba_non_tonal.txt")  # non-tonal
        pos = reader.words("bamadaba_non_tonal.txt", key="ps")  # non-tonal

    own_model = get_alt_pos(entries, pos, reader, option_tone)
    print("Dictionary created")
    dic = UnigramTagger(model=own_model, backoff=backoff)
    return dic
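# Hedged usage sketch for dictionary_backoff (the DefaultTagger fallback tag 'n' and
# the sample word are assumptions for illustration): the returned UnigramTagger looks
# words up in the Bamadaba dictionary model and falls back to `backoff` for anything
# missing from it.
#     from nltk.tag import DefaultTagger
#     dic_tagger = dictionary_backoff("tonal", backoff=DefaultTagger('n'))
#     dic_tagger.tag(["muso"])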
def demo():
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find("corpora/knbc/corpus1")
    fileids = [
        f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)
    ]

    def _knbc_fileids_sort(x):
        cells = x.split("-")
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader(
        "knbc/corpus1",
        KNBCorpusReader,
        sorted(fileids, key=_knbc_fileids_sort),
        encoding="euc-jp",
    )

    print(knbc.fileids()[:10])
    print("".join(knbc.words()[:100]))
    print("\n\n".join(str(tree) for tree in knbc.parsed_sents()[:2]))

    knbc.morphs2str = lambda morphs: "/".join(
        "%s(%s)" % (m[0], m[1].split(" ")[2]) for m in morphs if m[0] != "EOS"
    ).encode("utf-8")

    print("\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2]))
    print(
        "\n".join(
            " ".join("%s/%s" % (w[0], w[1].split(" ")[2]) for w in sent)
            for sent in knbc.tagged_sents()[0:2]
        )
    )
def hamshahri_targetword_corpus_maker(match, outpath):
    print 'loading hamshahri corpus'
    print
    corpus = LazyCorpusLoader('hamshahricorpus', XMLCorpusReader, r'(?!\.).*\.xml')
    outfile = codecs.open(outpath, 'w', 'utf-8')
    punclist = [u'،', u'؛', u':', u'؟', u'#']
    matchnum = 0
    count = 0

    print 'creating target corpus'
    for file in corpus.fileids():
        # print file
        for doc in corpus.xml(file).getchildren():
            # print doc.getchildren()
            # cat = doc.getchildren()[3].text
            text = doc.getchildren()[5].text
            newtext = correctPersianString(text)
            newtext = newtext.replace('\n', ' ')

            for item in punclist:
                if item in newtext:
                    newtext = newtext.replace(item, '')

            # print newtext

            if match in newtext.split():
                # matchnum += 1
                print newtext
                print '#'
                count += 1
                # outfile.write(newtext)
                outfile.write('ALI')

    outfile.close()
    print count
class LangDetector(object):
    def __init__(self, languages=LangIDDict().keys()):
        self.language_trigrams = {}
        self.langid = LazyCorpusLoader('langid', LangIdReader, r'(?!\.).*\.txt')

        for lang in languages:
            self.language_trigrams[lang] = FreqDist()
            for f in self.langid.freqs(fileids=lang + "-3grams.txt"):
                self.language_trigrams[lang].inc(f[0], f[1])

        self.language_dicts = dict([
            (id, dict([(trigram, float(value) / float(fdist.N()))
                       for trigram, value in fdist.items()]))
            for id, fdist in self.language_trigrams.items()
        ])

    def detect(self, text):
        words = nltk_word_tokenize(text.lower())
        trigrams = {}
        scores = dict([(lang, 0) for lang in self.language_trigrams.keys()])

        trigcount = [(trigram, 1.0) for match in words
                     for trigram in self.get_word_trigrams(match)]
        if len(trigcount) > 0:
            trigdf = pandas.DataFrame(trigcount, columns=["key", "value"])
            trigrams = trigdf.groupby("key")["value"].sum().to_dict()
        else:
            trigrams = {}

        total = sum(trigrams.values())
        maxscore, maxid = 0, ""
        for trigram, count in trigrams.items():
            trishare = (float(count) / float(total))
            for lang, frequencies in filter(lambda (l, f): trigram in f,
                                            self.language_dicts.iteritems()):
                scores[lang] += frequencies[trigram] * trishare
                if scores[lang] > maxscore:
                    maxid, maxscore = lang, scores[lang]

        return sorted(scores.items(), key=lambda x: x[1], reverse=True)
    treebank_train_sequence = treebank_train.tagged_sents()
    treebank_test = load_treebank('24')
    treebank_test_sequence = treebank_test.tagged_sents()
    treebank_estimator = LidstoneProbDistFactory
    model = train_model(HiddenMarkovModelTagger, treebank_train_sequence,
                        treebank_test_sequence, options.model_file,
                        options.num_train_sents, options.num_test_sents,
                        estimator=treebank_estimator,
                        verbose=options.verbose)
elif options.train_chunker:
    conll2k_train = LazyCorpusLoader(
        'conll2000', ConllChunkCorpusReader,
        ['train.txt'], ('NP', 'VP', 'PP'))
    conll2k_train_sequence = conll2k_train.iob_sents()
    conll2k_test = LazyCorpusLoader(
        'conll2000', ConllChunkCorpusReader,
        ['test.txt'], ('NP', 'VP', 'PP'))
    conll2k_test_sequence = conll2k_test.iob_sents()
    conll2k_estimator = LidstoneProbDistFactory
    conll2k_transform = ClosedCategoryChunkTransform(TREEBANK_CLOSED_CATS)
    model = train_model(HiddenMarkovModelChunkTagger, conll2k_train_sequence,
                        conll2k_test_sequence, options.model_file,
                        options.num_train_sents, options.num_test_sents,
                        estimator=conll2k_estimator,
def main():
    matplotlib.use('Qt5Agg')
    import matplotlib.pyplot as plt
    download('punkt')

    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english', EuroparlCorpusReader, r'ep-.*\.en', encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words replaced by the token "<unknown>"
    sentences = [[word.lower() if word_frequency_distribution[word.lower()] >= 10 else '<unknown>'
                  for word in sentence]
                 for sentence in english.sents()]

    # create train and test dataset
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams and trigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))
    trigrams_train = list(chain.from_iterable(ngrams_sentences(train, 3)))

    # Calculate the conditional frequency distributions for bigrams and trigrams
    bigrams_fd = ConditionalFreqDist(((f,), s) for f, s in bigrams_train)
    trigrams_fd = ConditionalFreqDist([((f, s), t) for f, s, t in trigrams_train])

    # Calculate the conditional probability distributions for bigrams and trigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist, vocabulary_length)
    cpd_trigram = ConditionalProbDist(trigrams_fd, LaplaceProbDist, vocabulary_length)

    bigrams_test = ngrams_sentences(test, 2)
    bigram_length_probabilities = defaultdict(list)
    for sentence in bigrams_test:
        logprob = [cpd_bigram[(w1,)].logprob(w2) for w1, w2 in sentence]
        logprob = sum(logprob)
        bigram_length_probabilities[len(sentence)].append(logprob)

    x = 0
    s = None
    for sentence in bigrams_test:
        if (len(sentence) > x):
            x = len(sentence)
            s = sentence

    trigrams_test = ngrams_sentences(test, 3)
    trigram_length_probabilities = defaultdict(list)
    for sentence in trigrams_test:
        logprob = [cpd_trigram[(w1, w2)].logprob(w3) for w1, w2, w3 in sentence]
        logprob = sum(logprob)
        trigram_length_probabilities[len(sentence)].append(logprob)

    average_bigram_length_probabilities = {
        length: sum(bigram_length_probabilities[length]) / float(len(bigram_length_probabilities[length]))
        for length in bigram_length_probabilities.keys()}
    average_trigram_length_probabilities = {
        length: sum(trigram_length_probabilities[length]) / float(len(trigram_length_probabilities[length]))
        for length in trigram_length_probabilities.keys()}

    random_sentences = [[words[random.randint(0, len(words) - 1)].lower() for i in range(key)]
                        for key in bigram_length_probabilities.keys()]

    bigrams_random = ngrams_sentences(random_sentences, 2)
    random_bigram_length_probabilities = defaultdict(list)
    for sentence in bigrams_random:
        logprob = [cpd_trigram[(w1,)].logprob(w2) for w1, w2 in sentence]
        logprob = sum(logprob)
        random_bigram_length_probabilities[len(sentence)].append(logprob)

    trigrams_random = ngrams_sentences(random_sentences, 3)
    random_trigram_length_probabilities = defaultdict(list)
    for sentence in trigrams_random:
        logprob = [cpd_trigram[(w1, w2)].logprob(w3) for w1, w2, w3 in sentence]
        logprob = sum(logprob)
        random_trigram_length_probabilities[len(sentence)].append(logprob)

    bigram = plt.scatter(list(average_bigram_length_probabilities.values()),
                         list(average_bigram_length_probabilities.keys()), color='red')
    trigram = plt.scatter(list(average_trigram_length_probabilities.values()),
                          list(average_trigram_length_probabilities.keys()), color='blue')
    random_bigram = plt.scatter(list(random_bigram_length_probabilities.values()),
                                list(random_bigram_length_probabilities.keys()), color='green')
    random_trigram = plt.scatter(list(random_trigram_length_probabilities.values()),
                                 list(random_trigram_length_probabilities.keys()), color='black')
    plt.xlabel('$log_2(P(W_1^k))$')
    plt.ylabel('$k$')
    plt.legend((bigram, trigram, random_bigram, random_trigram),
               ('Bigram', 'Trigram', 'Random bigram', 'Random trigram'))
    plt.ylim(ymin=0)
    # plt.show()
    plt.savefig('logprob')

    seed = 'this'
    for i in range(30):
        newword = predict_word(cpd_bigram, seed, 'bigram')
        if newword != None:
            seed += ' ' + newword
        else:
            break
    print('Given the seed word "this", the bigram model produced this text of length 30: {}'.format(seed))

    seed = 'this'
    for i in range(30):
        newword = predict_word(cpd_trigram, seed, 'trigram')
        if newword != None:
            seed += ' ' + newword
        else:
            break
    print('Given the seed word "this", the trigram model produced this text of length 30: {}'.format(seed))

    test_bigrams = []
    for sentence in bigrams_test:
        test_bigrams += sentence
    bigram_entropy, bigram_perplexity = centropy_perplexity(cpd_bigram, test_bigrams)
    print('Cross-entropy of the bigram model is {}. The corresponding perplexity is {}'.format(
        bigram_entropy, bigram_perplexity))

    test_trigrams = []
    for sentence in trigrams_test:
        test_trigrams += sentence
    trigram_entropy, trigram_perplexity = centropy_perplexity(cpd_trigram, test_trigrams)
    print('Cross-entropy of the trigram model is {}. The corresponding perplexity is {}'.format(
        trigram_entropy, trigram_perplexity))
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import WordListCorpusReader

reader = LazyCorpusLoader('cookbook', WordListCorpusReader, ['wordlist.txt'])
print(isinstance(reader, LazyCorpusLoader))
print(reader.fileids())
print(isinstance(reader, LazyCorpusLoader))
print(isinstance(reader, WordListCorpusReader))
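# What the four print() calls above are expected to show, assuming a 'cookbook'
# corpus directory containing wordlist.txt exists under nltk_data. LazyCorpusLoader
# swaps its own __class__ for the real reader's class on first access, so:
#     True                # not yet loaded: still a LazyCorpusLoader
#     ['wordlist.txt']    # fileids() forces the corpus to load
#     False               # the object is no longer a LazyCorpusLoader...
#     True                # ...it now is a WordListCorpusReader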
#!/usr/bin/env python
# encoding: utf-8
# Sample code for reading the KNBC corpus with NLTK
from nltk_jp import *
from nltk.corpus.reader import *
from nltk.corpus.util import LazyCorpusLoader


def _knbc_fileids_sort(x):
    cells = x.split('-')
    return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))


# load the corpus
root = nltk.data.find('corpora/knbc/corpus1')
fileids = [f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
           if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)]
knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader,
                        sorted(fileids, key=_knbc_fileids_sort), encoding='euc-jp')

# print "fileids :", knbc.fileids()
print "words :", pp(knbc.words()[:10])
print "parsed_sents :", str(knbc.parsed_sents()[0])
print "tagged_words :", pp(knbc.tagged_words()[:5])
from nltk.tokenize import RegexpTokenizer
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import *
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
import sys, os
import cPickle
from feats import words_in_sentence

pathname = os.path.dirname(sys.argv[0])
nltk.data.path.append(os.path.abspath(pathname) + '/data')

movie_reviews = LazyCorpusLoader(
    sys.argv[1], CategorizedPlaintextCorpusReader,
    r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*',
    encoding='utf-8')

train_test_ratio = 2.0 / 3


def pickleObject():
    obj = classifier
    savefile = open('classifier.pickle', 'w')
    cPickle.dump(obj, savefile, cPickle.HIGHEST_PROTOCOL)


def pickleFeats():
    obj = words_in_sentence
    savefile = open('feats.pickle', 'w')
    cPickle.dump(obj, savefile, cPickle.HIGHEST_PROTOCOL)
if args.cat_file:
    reader_kwargs['cat_file'] = args.cat_file

    if args.delimiter and args.delimiter != ' ':
        reader_kwargs['delimiter'] = args.delimiter

    if args.cat_pattern:
        reader_args.append(args.cat_pattern)
    else:
        reader_args.append('.+/.+')
elif args.cat_pattern:
    reader_args.append(args.cat_pattern)
    reader_kwargs['cat_pattern'] = re.compile(args.cat_pattern)

categorized_corpus = LazyCorpusLoader(args.corpus, reader_class[args.reader],
                                      *reader_args, **reader_kwargs)
labels = categorized_corpus.categories()
nlabels = len(labels)

if args.trace:
    print '%d labels: %s' % (nlabels, labels)

if not nlabels:
    raise ValueError('corpus does not have any categories')
elif nlabels == 1:
    raise ValueError('corpus must have more than 1 category')
elif nlabels == 2 and args.multi:
    raise ValueError('corpus must have more than 2 categories if --multi is specified')

########################
## text normalization ##
    For all words in top_words (most used in all corpus), set True/False
    if the word is in the document's words set.
    """
    # transform the list of words into a set to optimize search
    doc_words_set = set(doc_words)

    # build features dictionary
    features = {}
    for word in top_words:
        features['contains(%s)' % word] = (word in doc_words_set)

    return features


interrogazioni = LazyCorpusLoader(
    'opp_interrogazioni_macro',
    CategorizedPlaintextCorpusReader,
    r'\d*', cat_file='cats.txt', cat_delimiter=','
)

print "computing FreqDist over all words"
all_words = nltk.FreqDist(w.lower() for w in interrogazioni.words())
top_words = all_words.keys()[:2000]

print "generating list of documents for each category"
documents = [
    (list(interrogazioni.words(fileid)), category)
    for category in interrogazioni.categories()
    for fileid in interrogazioni.fileids(category)
]
random.shuffle(documents)
from __future__ import division
import sys
import os.path
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
import poetry
import re
from nltk.corpus import cmudict
d = cmudict.dict()
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import *

suffdict = LazyCorpusLoader(
    'cmusuffdict', CMUDictCorpusReader, ['cmusuffdict'])
suffdict = suffdict.dict()


def suffdict_phonemes(word):
    # Use my cmu-based last syllable dictionary
    if re.search("((?i)[bcdfghjklmnpqrstvwxz]{1,2}[aeiouy]+[bcdfghjklmnpqrstvwxz]*(e|ed)?('[a-z]{1,2})?)(?![a-zA-Z]+)", word.lower()):
        last_syl = re.search("((?i)[bcdfghjklmnpqrstvwxz]{1,2}[aeiouy]+[bcdfghjklmnpqrstvwxz]*(e|ed)?('[a-z]{1,2})?)(?![a-zA-Z]+)", word.lower()).group()
        if last_syl in suffdict:
            return suffdict[last_syl][0]
        # else try without the first letter
        elif last_syl[1 - len(last_syl):] in suffdict:
            return suffdict[last_syl[1 - len(last_syl):]][0]
        # else try without the first 2 letters
        elif last_syl[2 - len(last_syl):] in suffdict:
            return suffdict[last_syl[2 - len(last_syl):]][0]
        # else try without the last 2 letters, if it ends in 's
        elif last_syl[-2:] == "'s":
            if last_syl[:-2] in suffdict:
#! /usr/bin/python
# -*- coding: utf-8 -*-
import nltk
import util
from knbc import *
from nltk.corpus.util import LazyCorpusLoader

root = nltk.data.find('corpora/KNBC_v1.0_090925/corpus1')
fileids = [f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
           if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)]


def _knbc_fileids_sort(x):
    cells = x.split('-')
    return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))


knbc = LazyCorpusLoader('KNBC_v1.0_090925/corpus1', KNBCorpusReader,
                        sorted(fileids, key=_knbc_fileids_sort), encoding='euc-jp')

# print knbc.fileids()
# print '\n'.join( ''.join(sent) for sent in knbc.words() )
print '\n\n'.join('%s' % tree for tree in knbc.parsed_sents()[0:2])
print type(knbc.parsed_sents()[0])
# print '\n'.join( ' '.join("%s/%s"%(w[0], w[1][2]) for w in sent) for sent in knbc.tagged_words()[0:20] )
def loadClassifier(outputdir):
    classifier_filename = os.path.join("pickled_algos", "voted_classifier.pickle")
    word_features_filename = os.path.join("pickled_algos", "word_features.pickle")
    if os.path.exists(classifier_filename) and os.path.exists(word_features_filename):
        word_features = pickleLoad("word_features.pickle")
#        classifier = pickleLoad("originalnaivebayes.pickle")
#        MNB_classifier = pickleLoad("MNB_classifier.pickle")
#        BernoulliNB_classifier = pickleLoad("BernoulliNB_classifier.pickle")
#        LogisticRegression_classifier = pickleLoad("LogisticRegression_classifier.pickle")
#        SGDClassifier_classifier = pickleLoad("SGDClassifier_classifier.pickle")
#        LinearSVC_classifier = pickleLoad("LinearSVC_classifier.pickle")
#
#        voted_classifier = VoteClassifier(classifier,
##                                          NuSVC_classifier,
#                                          LinearSVC_classifier,
#                                          SGDClassifier_classifier,
#                                          MNB_classifier,
#                                          BernoulliNB_classifier,
#                                          LogisticRegression_classifier)
        voted_classifier = pickleLoad("voted_classifier.pickle")
        return voted_classifier, word_features
    else:
        criticas_cine = LazyCorpusLoader(
            'criticas_cine', CategorizedPlaintextCorpusReader,
            r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*',
            encoding='utf-8')
#        criticas_cine = LazyCorpusLoader(
#            'criticas_cine_neu', CategorizedPlaintextCorpusReader,
#            r'(?!\.).*\.txt', cat_pattern=r'(neg|neu|pos)/.*',
#            encoding='utf-8')

        documents = [(list(criticas_cine.words(fileid)), category)
                     for category in criticas_cine.categories()
                     for fileid in criticas_cine.fileids(category)]
#
#        document_pos = [(list(criticas_cine.words(fileid)), "pos")
#                        for fileid in criticas_cine.fileids("pos")]
#        document_neg = [(list(criticas_cine.words(fileid)), "neg")
#                        for fileid in criticas_cine.fileids("neg")]
#        document_neu = [(list(criticas_cine.words(fileid)), "neu")
#                        for fileid in criticas_cine.fileids("neu")]

        random.shuffle(documents)
#        random.shuffle(document_pos)
#        random.shuffle(document_neg)
#        random.shuffle(document_neu)

        all_words = []
        for w in criticas_cine.words():
            all_words.append(w.lower())
#        for w in criticas_cine.words():
#            if not is_filtered(w.lower()):
#                all_words.append(w.lower())
#
        all_words = nltk.FreqDist(all_words)
        # print (all_words.most_common(50))

        # Filtering by type of word
#        for sample in all_words:

        word_features = list(all_words.keys())[:3000]
        pickleDump(word_features, "word_features.pickle")

        featuresets = [(find_features(rev, word_features), category) for (rev, category) in documents]
#        featuresetpos = [(find_features(rev, word_features), category) for (rev, category) in document_pos]
#        featuresetneg = [(find_features(rev, word_features), category) for (rev, category) in document_neg]
#        featuresetneu = [(find_features(rev, word_features), category) for (rev, category) in document_neu]

#        training_set = featuresetpos[:1000]
#        training_set.extend(featuresetneg[:1000])
#        training_set.extend(featuresetneu[:1000])
#        testing_set = featuresetpos[1000:1273]
#        testing_set.extend(featuresetneg[1000:])
#        testing_set.extend(featuresetneu[1000:])

#        pos_feat = [(featuresSet, category) for (featuresSet, category) in featuresets if category == "pos"]
#        neu_feat = [(featuresSet, category) for (featuresSet, category) in featuresets if category == "neu"]
#        neg_feat = [(featuresSet, category) for (featuresSet, category) in featuresets if category == "neg"]

        training_set = featuresets[:2000]
        testing_set = featuresets[2000:]

        classifier = nltk.NaiveBayesClassifier.train(training_set)
#        pickleDump(classifier, "originalnaivebayes.pickle")
        NaiveBayesClassifierAccuracy = nltk.classify.accuracy(classifier, testing_set)
        print("Original Naive Bayes Algo accuracy percent:", (NaiveBayesClassifierAccuracy) * 100)

        accuracy = Accuracy(classifier, testing_set)
        print(accuracy)
        # order: neu, neg, pos
#        print("Accuracy: ", (accuracy["neg"][0]+accuracy["pos"][2])/3)
#        print("Discarded: ", (accuracy["neu"][0]+accuracy["neg"][1]+accuracy["pos"][0])/3)
#        print("Failed: ", (accuracy["neu"][1]+accuracy["neu"][2]+accuracy["neg"][2]+accuracy["pos"][1])/3)
#        print ("Pos:", nltk.classify.accuracy(classifier, pos_feat)*100)
#        print ("Neu:", nltk.classify.accuracy(classifier, neu_feat)*100)
#        print ("Neg:", nltk.classify.accuracy(classifier, neg_feat)*100)

        classifier.show_most_informative_features(15)

        MNB_classifier = SklearnClassifier(MultinomialNB())
        MNB_classifier.train(training_set)
        MNB_classifierAccuracy = nltk.classify.accuracy(MNB_classifier, testing_set)
        print("MNB_classifier accuracy percent:", (MNB_classifierAccuracy) * 100)
#        pickleDump(MNB_classifier, "MNB_classifier.pickle")

        BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
        BernoulliNB_classifier.train(training_set)
        BernoulliNB_classifierAccuracy = nltk.classify.accuracy(BernoulliNB_classifier, testing_set)
        print("BernoulliNB_classifier accuracy percent:", (BernoulliNB_classifierAccuracy) * 100)
#        pickleDump(BernoulliNB_classifier, "BernoulliNB_classifier.pickle")

        LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
        LogisticRegression_classifier.train(training_set)
        LogisticRegression_classifierAccuracy = nltk.classify.accuracy(LogisticRegression_classifier, testing_set)
        print("LogisticRegression_classifier accuracy percent:", (LogisticRegression_classifierAccuracy) * 100)
#        pickleDump(LogisticRegression_classifier, "LogisticRegression_classifier.pickle")

        SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
        SGDClassifier_classifier.train(training_set)
        SGDClassifier_classifierAccuracy = nltk.classify.accuracy(SGDClassifier_classifier, testing_set)
        print("SGDClassifier_classifier accuracy percent:", (SGDClassifier_classifierAccuracy) * 100)
#        pickleDump(SGDClassifier_classifier, "SGDClassifier_classifier.pickle")

        LinearSVC_classifier = SklearnClassifier(LinearSVC())
        LinearSVC_classifier.train(training_set)
        LinearSVC_classifierAccuracy = nltk.classify.accuracy(LinearSVC_classifier, testing_set)
        print("LinearSVC_classifier accuracy percent:", (LinearSVC_classifierAccuracy) * 100)
#        pickleDump(LinearSVC_classifier, "LinearSVC_classifier.pickle")

#        SVC_classifier = SklearnClassifier(SVC())
#        SVC_classifier.train(training_set)
#        print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)

        NuSVC_classifier = SklearnClassifier(NuSVC())
        NuSVC_classifier.train(training_set)
        NuSVC_classifierAccuracy = nltk.classify.accuracy(NuSVC_classifier, testing_set)
        print("NuSVC_classifier accuracy percent:", (NuSVC_classifierAccuracy) * 100)
#        pickleDump(LinearSVC_classifier, "LinearSVC_classifier.pickle")

#        pickleDump([NaiveBayesClassifierAccuracy,
#                    LinearSVC_classifierAccuracy,
#                    SGDClassifier_classifierAccuracy,
#                    MNB_classifierAccuracy,
#                    BernoulliNB_classifierAccuracy,
#                    LogisticRegression_classifierAccuracy], "accuracies.pickle")

        voted_classifier = VoteClassifier([classifier, NaiveBayesClassifierAccuracy],
                                          [NuSVC_classifier, NuSVC_classifierAccuracy],
                                          [LinearSVC_classifier, LinearSVC_classifierAccuracy],
                                          [SGDClassifier_classifier, SGDClassifier_classifierAccuracy],
                                          [MNB_classifier, MNB_classifierAccuracy],
                                          [BernoulliNB_classifier, BernoulliNB_classifierAccuracy],
                                          [LogisticRegression_classifier, LogisticRegression_classifierAccuracy])
        accuracy = Accuracy(voted_classifier, testing_set)
        print(accuracy)

        VoteClassifierAccuracy = nltk.classify.accuracy(voted_classifier, testing_set)
        print("VoteClassifier accuracy percent:", (VoteClassifierAccuracy) * 100)
#        print ("Pos:", nltk.classify.accuracy(voted_classifier, pos_feat)*100)
#        print ("Neu:", nltk.classify.accuracy(voted_classifier, neu_feat)*100)
#        print ("Neg:", nltk.classify.accuracy(voted_classifier, neg_feat)*100)

        print("Accuracy: ", (accuracy["neg"][0] + accuracy["pos"][2]) / 2)
        print("Discarded: ", (accuracy["neu"][1] + accuracy["neg"][1] + accuracy["pos"][1]) / 2)
        print("Failed: ", (accuracy["neu"][0] + accuracy["neu"][2] + accuracy["neg"][2] + accuracy["pos"][0]) / 2)
        print("------------------------------------------")

        pickleDump(voted_classifier, "voted_classifier.pickle")

        return voted_classifier, word_features
from nltk.tokenize import RegexpTokenizer, BlanklineTokenizer
from xml.etree.ElementTree import ElementTree, Element
from orthograph import detone

orthographic_word = RegexpTokenizer(r"(\w+([-]\w+)*[']?|[.:;!?,])")

test = LazyCorpusLoader(
    'bamana/test', PlaintextCorpusReader, r'source.txt',
    word_tokenizer=orthographic_word, encoding='utf-8')

wordlist = LazyCorpusLoader(
    'bamana/wordlist', PlaintextCorpusReader, r'bailleul.clean.wordlist',
    word_tokenizer=orthographic_word, encoding='utf-8')

properlist = LazyCorpusLoader(
    'bamana/propernames', PlaintextCorpusReader, r'.*\.clean\.wordlist',
    word_tokenizer=orthographic_word, encoding='utf-8')

propernames = LazyCorpusLoader(
    'bamana/propernames', ToolboxCorpusReader, '.*\.txt', encoding='utf-8')

bailleul = LazyCorpusLoader(
    'bamana/bailleul', ToolboxCorpusReader, r'bailleul.txt', encoding='utf-8')

lexicon = ElementTree(bailleul.xml('bailleul.txt'))

for file in propernames.fileids():
    for e in ElementTree(propernames.xml(file)).findall('record'):
        ge = Element('ge')
        ge.text = e.find('lx').text
        e.append(ge)
        ps = Element('ps')
        ps.text = 'n.prop'
        e.append(ps)
        lexicon.getroot().append(e)