def get_train_test_data():
    '''
    Load training and test set from nltk corpora.

    The first ``train_num`` parsed sentences of the treebank form the
    training set; the test set is made of the sentences found at
    ``train_num + i`` for every offset ``i`` in ``test_index``.

    Returns:
        A pair ``(cnf_train, cnf_test)`` of lists holding the result of
        ``convert2cnf`` for each selected tree.
    '''
    train_num = 3800
    # Offsets, counted from train_num, of the held-out test sentences.
    test_index = [0, 1, 4, 12, 16, 19, 21, 35, 37, 42, 43, 44, 45, 47, 54, 56,
                  62, 63, 65, 68, 71, 76, 79, 83]
    treebank = LazyCorpusLoader('treebank/combined', BracketParseCorpusReader,
                                r'wsj_.*\.mrg')

    # Ask the reader for the sentence sequence once and index into it, rather
    # than requesting a fresh sequence for every test offset.
    parsed_sents = treebank.parsed_sents()

    # Split the data into training and test set
    train_trees = parsed_sents[:train_num]
    test_trees = [parsed_sents[train_num + i] for i in test_index]

    # Convert to Chomsky normal form, remove auxiliary labels
    # (both done by convert2cnf, which is defined outside this block).
    cnf_train = [convert2cnf(t) for t in train_trees]
    cnf_test = [convert2cnf(t) for t in test_trees]
    return cnf_train, cnf_test
def __init__(self,
             tokenizer: Optional[TokenizerI] = SimpleTokenizer(),
             detokenizer: Optional[TokenizerI] = TreebankWordDetokenizer(),
             stopwords: LazyCorpusLoader = stopw):
    """Keep the tokenizer pair and load the stopwords for the tokenizer's language.

    Args:
        tokenizer: stored as ``self.tokenizer``; its ``language`` attribute
            selects which stopword list is loaded.
        detokenizer: stored as ``self.detokenizer``.
        stopwords: corpus object whose ``words(language)`` call supplies the
            value stored in ``self.stopwords``.

    NOTE(review): the default tokenizer/detokenizer instances are created
    once, when the function is defined, and are shared by every instance
    that relies on the defaults. Although the annotation allows ``None``,
    ``tokenizer.language`` is read unconditionally.
    """
    self.tokenizer, self.detokenizer = tokenizer, detokenizer
    language = tokenizer.language
    self.stopwords = stopwords.words(language)
if ctb_in_nltk is None:
    eprint(
        'You should run nltk.download(\'ptb\') to fetch some data first!')
    # The bare `exit` helper is installed by the `site` module for interactive
    # sessions and is not guaranteed to exist; sys.exit is the supported call.
    import sys
    sys.exit(1)

# Target directory handed to convert(): <ctb_in_nltk>/corpora/ctb
ctb_in_nltk = join(ctb_in_nltk, 'corpora')
ctb_in_nltk = join(ctb_in_nltk, 'ctb')

print('Converting CTB: removing xml tags...')
convert(args.ctb, ctb_in_nltk)
print('Importing to nltk...\n')
from nltk.corpus import BracketParseCorpusReader, LazyCorpusLoader

ctb = LazyCorpusLoader('ctb', BracketParseCorpusReader, r'chtb_.*\.fid',
                       tagset='unknown')

# commented out splits are typically for dependency parsing, e.g. Zhang and Clark 2008
# training = list(range(1, 815 + 1)) + list(range(1001, 1136 + 1))
# development = list(range(886, 931 + 1)) + list(range(1148, 1151 + 1))
# test = list(range(816, 885 + 1)) + list(range(1137, 1147 + 1))

# splits for constituency parsing, see Petrov and Klein 2007, Liu and Zhang 2017
# (each range end is written as "last id + 1" so the last id is included)
training = list(range(1, 270 + 1)) + list(range(440, 1151 + 1))
development = list(range(301, 325 + 1))
test = list(range(271, 300 + 1))

# make sure there's no overlap between any two of the three splits
assert not (set(training) & set(development))
assert not (set(training) & set(test))
# development vs. test was previously left unchecked
assert not (set(development) & set(test))
return output # def spell_word(word): # dic = enchant.Dict('pt_BR') # output = word # if len(word) > 0 and (not dic.check(word)): # sugestoes = dic.suggest(word) # if len(sugestoes) > 0: # output = sugestoes[0] # return output ## Inicio do Treinamento catho_treinamento = LazyCorpusLoader( 'catho_treinamento', CategorizedPlaintextCorpusReader, r'(?!\.).*\.txt', cat_pattern=r'(negativo|positivo|neutro)/.*') print "Preparando documentos para treinamento..." sys.stdout.flush() documents_treinamento = [(list(catho_treinamento.words(fileid)), category) for category in catho_treinamento.categories() for fileid in catho_treinamento.fileids(category)] print "fim da preparacao dos documentos de treinamento." sys.stdout.flush() ## Pre-processamento corpus_words = [w.lower() for w in catho_treinamento.words() if w not in string.punctuation] #if w not in string.punctuation and
return tree def tree_to_production(tree): return ProbabilisticProduction(get_tag(tree), [get_tag(child) for child in tree], **{'prob': 0}) def tree_to_productions(tree): yield tree_to_production(tree) for child in tree: if isinstance(child, Tree): for prod in tree_to_productions(child): yield prod treebank = LazyCorpusLoader('treebank/combined', BracketParseCorpusReader, r'wsj_.*\.mrg') def get_productions(productions): probabilities = dict() productions_to_return = list(set(productions)) for prod in productions: if str(prod) in probabilities: probabilities[str(prod)] += 1 else: probabilities[str(prod)] = 1 amount_of_interior_nodes = len([prod.lhs() for prod in productions if prod.lhs() != Nonterminal('S')])
import nltk
from nltk.corpus import LazyCorpusLoader, CategorizedPlaintextCorpusReader
from DataCleaning.POSTagger import tagger
from nltk.corpus.reader import sentiwordnet

# Categorized plain-text corpus registered under the name 'movie_reviews123'.
# The file pattern accepts .txt files whose id does not start with a dot; the
# category pattern captures a leading neg/pos/neutral path component.
movie_reviews123 = LazyCorpusLoader('movie_reviews123', CategorizedPlaintextCorpusReader, r'(?!\.).*\.txt', cat_pattern=r'(neg|pos|neutral)/.*', encoding='ascii')

# Print the categories found in the corpus.
print(movie_reviews123.categories())

# NOTE(review): _pos_tag is a private NLTK helper and is called here with no
# arguments; it presumably needs at least a token list, in which case this
# line raises TypeError - confirm the intended call (nltk.pos_tag(tokens)?).
print(nltk.tag._pos_tag())
import features
import random
import nltk
from nltk.corpus import LazyCorpusLoader, stopwords
from nltk.corpus.reader.plaintext import (
    CategorizedPlaintextCorpusReader,
)

# Categorized corpus of .txt files; the category is the first run of word
# characters in the file id.
decisions = LazyCorpusLoader(
    'tweets_publish_choice',
    CategorizedPlaintextCorpusReader,
    r'.*\.txt',
    cat_pattern=r'(\w+)/*',
    encoding='utf8',
)


def get_top_words():
    """Return up to 500 of the most frequent non-stop words of the corpus.

    Words are lower-cased before counting and English stopwords are
    dropped. The 500 most common entries of the frequency distribution
    are also pretty-printed.

    Returns:
        list of words, most frequent first.
    """
    # Load the stopword list once and use a set: the original re-read the
    # list and scanned it linearly for every token of the corpus.
    english_stopwords = set(stopwords.words('english'))
    lowered = (w.lower() for w in decisions.words())
    all_words = nltk.FreqDist(w for w in lowered if w not in english_stopwords)
    # Iterating a FreqDist does not yield words by frequency in NLTK 3
    # (it is a Counter), so ask for the most common entries explicitly.
    word_features = [word for word, _count in all_words.most_common(500)]
    all_words.pprint(500)
    return word_features


print(decisions.categories())

# One (token list, category) pair per file, grouped by category.
documents = [(list(decisions.words(fileid)), category)
             for category in decisions.categories()
             for fileid in decisions.fileids(category)]
x_axis.sort() y_axis = [ACCURACY_PER_DISTANCE[x]['matches']/float(ACCURACY_PER_DISTANCE[x]['total']) for x in x_axis] x_axis_labeled = ACCURACY_PER_DISTANCE_LABELED.keys() x_axis_labeled.sort() y_axis_labeled = [ACCURACY_PER_DISTANCE_LABELED[x]['matches']/float(ACCURACY_PER_DISTANCE_LABELED[x]['total']) for x in x_axis_labeled] print x_axis print y_axis import matplotlib.pyplot as plt plt.title("Accuracy per distance") plt.scatter(x_axis, y_axis, c="blue", marker='*', label="accuracy index") plt.scatter(x_axis_labeled, y_axis_labeled, c="red", marker='o', label="accuracy label", alpha=0.5) plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=2, mode="expand", borderaxespad=0.) plt.show() if __name__ == '__main__': treebank = LazyCorpusLoader('treebank/combined', BracketParseCorpusReader, r'wsj_.*\.mrg') trees = treebank.parsed_sents() trees = trees[:5] cleaned_trees = [filter_tree(tree) for tree in trees] for t in cleaned_trees: chomsky_normal_form(tree, factor='right', horzMarkov=1, vertMarkov=1, childChar='|', parentChar='^') parser, pcfg = get_parser(cleaned_trees) eval_trees(cleaned_trees, parser, pcfg) print "----------- Reporting Per Label -----------" print ACCURACY_PER_LABEL print len(ACCURACY_PER_LABEL) for item in ACCURACY_PER_LABEL: print item, "--- total -------> ", ACCURACY_PER_LABEL[item]['total']
import nltk.classify.util, nltk.metrics
from nltk.classify import DecisionTreeClassifier
from nltk.corpus import LazyCorpusLoader, CategorizedPlaintextCorpusReader
from nltk.corpus import stopwords
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# English stopwords as a set.
stopset = set(stopwords.words('english'))
# Categorized corpus of .txt files whose id does not start with a dot; the
# category pattern captures a leading neg/pos path component.
app_reviews = LazyCorpusLoader('app_reviews', CategorizedPlaintextCorpusReader, r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*', encoding='ascii')

# Only the beginning of this function is visible in this chunk. The visible
# part builds one (featureX(words of the file), label) pair per file, for the
# 'neg' and the 'pos' category respectively.
def evaluate_classifier(featureX):
    negIds = app_reviews.fileids('neg')
    posIds = app_reviews.fileids('pos')
    negFeatures = [(featureX(app_reviews.words(fileids=[f])), 'neg') for f in negIds]
    posFeatures = [(featureX(app_reviews.words(fileids=[f])), 'pos') for f in posIds]
    #selects 3/4 of the features to be used for training and 1/4 to be used for testing
import nltk
# NOTE(review): download() is called with no arguments on every run;
# presumably this opens NLTK's interactive downloader - confirm it is wanted.
nltk.download()
from nltk.book import *
# text1 is not defined in this script; presumably it comes from the star
# import of nltk.book above.
print text1
from nltk.text import Text
from nltk.corpus import LazyCorpusLoader, PlaintextCorpusReader

# Plain-text corpus named 'mytest': .txt files whose id does not start with a dot.
mytest = LazyCorpusLoader('mytest', PlaintextCorpusReader, r'(?!\.).*\.txt')
# Wrap the words of tresh.txt in a Text, then run its collocation and
# concordance (for the word 'esto') reports.
tresh = Text(mytest.words('tresh.txt'))
tresh.collocations()
tresh.concordance('esto')
from nltk.classify.scikitlearn import SklearnClassifier
# NOTE(review): sklearn.externals.joblib was deprecated in scikit-learn 0.21
# and removed in 0.23; newer environments need `import joblib` - confirm the
# pinned scikit-learn version.
from sklearn.externals import joblib
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC

# Command line: <account_id> <version> <limit>
account_id = sys.argv[1]
version = sys.argv[2]
limit = int(sys.argv[3])

# Categorized corpus named 'tweets_publish_choice_<account><sep><version>'.
# NOTE(review): the name is built with os.sep; verify that the corpus loader
# accepts a backslash-separated name on Windows.
decisions = LazyCorpusLoader(
    'tweets_publish_choice_{account}{s}{version}'.format(
        account=account_id,
        version=version,
        s=os.sep,
    ),
    CategorizedPlaintextCorpusReader,
    r'.*\.txt',
    cat_pattern=r'(\w+)/*',
    encoding='utf8',
)

# Path of the same corpus under ~/nltk_data/corpora.
home = os.path.expanduser("~")
path = os.path.join(
    home,
    'nltk_data{s}corpora{s}tweets_publish_choice_{account}{s}{version}'.format(
        account=account_id,
        version=version,
        s=os.sep,
    ))
from nltk.corpus import LazyCorpusLoader, BracketParseCorpusReader
from nltk.grammar import Production
from nltk import Tree, Nonterminal

treebank = LazyCorpusLoader('treebank/combined', BracketParseCorpusReader, r'wsj_.*\.mrg')


def simplify_functional_tag(tag):
    """Return *tag* cut at its first '-' (e.g. 'NP-SBJ' -> 'NP').

    Labels that start with '-' (such as '-LRB-', '-RRB-' or '-NONE-') are
    returned unchanged: cutting them at the first '-' would leave an empty
    string.
    """
    if tag.startswith('-'):
        return tag
    return tag.split('-')[0]


def get_tag(tree):
    """Map a tree node to the symbol used in productions.

    A ``Tree`` becomes a ``Nonterminal`` built from its simplified label;
    a '-NONE-' tree becomes a ``Nonterminal`` built from its first child,
    upper-cased. Anything that is not a ``Tree`` is returned as is.
    """
    if not isinstance(tree, Tree):
        return tree
    if tree.label() == '-NONE-':
        # get rid of NONE tags
        return Nonterminal(tree[0].upper())
    return Nonterminal(simplify_functional_tag(tree.label()))


def tree_to_production(tree):
    """Build the production "tag of tree -> tags of its direct children"."""
    return Production(get_tag(tree), [get_tag(child) for child in tree])


def tree_to_productions(tree):
    """Yield the production of *tree*, then those of every subtree, depth-first."""
    yield tree_to_production(tree)
    for child in tree:
        if isinstance(child, Tree):
            for prod in tree_to_productions(child):
                yield prod


# Only the first statement of this function is visible in this chunk.
def trees_to_productions(trees):
    productions = []
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
from nltk import FreqDist, BigramCollocationFinder, BigramAssocMeasures, TrigramCollocationFinder, TrigramAssocMeasures
from nltk.classify.naivebayes import NaiveBayesClassifier
from logger import logger
from nltk.corpus import LazyCorpusLoader
from nltk.text import TextCollection
import nltk, random, operator, itertools

# Categorized corpus whose file ids match '(data).*'; categories are read
# from cats.txt.
my_corpus = LazyCorpusLoader(
    'my_corpus', CategorizedPlaintextCorpusReader, '(data).*', cat_file='cats.txt')

stopwords = nltk.corpus.stopwords.words()
# `stopwords` stays a list for any other user of the name; the set below
# gives constant-time membership tests for the per-token filter instead of
# a linear scan of the list for every corpus token.
_stopword_set = frozenset(stopwords)

# Frequency of the lower-cased words that are longer than two characters and
# are not stopwords.
# NOTE(review): the stopword test is applied to the token before lower-casing,
# so capitalised stopwords are counted - confirm whether that is intended.
all_words = FreqDist(
    w.lower() for w in my_corpus.words()
    if w not in _stopword_set and len(w) > 2)

#all_words_inf = {}
#textCollection = TextCollection(my_corpus)
#for word in all_words.keys()[:1000]:
#    score = 0
#    for fileid in my_corpus.fileids():
#        text = my_corpus.raw(fileid)
#        score += textCollection.tf_idf(word, text)
#    all_words_inf[word] = score
#all_words = sorted(all_words_inf.items(), key=operator.itemgetter(1), reverse=False)

# At most 2000 keys of the distribution that are longer than two characters.
word_features = [word for word in all_words.keys() if len(word) > 2][:2000]


# Only the first two statements of this function are visible in this chunk:
# they take the n best trigrams of the document according to score_fn.
def document_features(words_in_document, score_fn=TrigramAssocMeasures.chi_sq, n=300):
    trigram_finder = TrigramCollocationFinder.from_words(words_in_document)
    trigrams = trigram_finder.nbest(score_fn, n)
# -*- coding: utf-8 -*- """ Created on Mon May 8 17:25:27 2017 This one is to use nltk default corpus loader to load our annotations @author: Janaka """ from nltk.corpus import LazyCorpusLoader, ConllChunkCorpusReader SOURCE_DIR = '../boi_pos_data/' boi_pos_data = LazyCorpusLoader(SOURCE_DIR, ConllChunkCorpusReader, [SOURCE_DIR + 'feedback_cs2012_2-result.txt'], ('T'), tagset='wsj', encoding='ascii') test_sents = boi_pos_data.chunked_sents('test.txt', chunk_types=['T']) print(test_sents)
for child in elt: cc = self.handle_word(child) if child.tag == 'instance': # id, lemma, pos, tkn inst.append((cc[:3])) # id, lemma, pos sent.append(cc[1:]) # lemma, pos, tkn else: sent.append(cc) # lemma, pos, tkn return inst, sent return [self.handle_word(child) for child in elt] def handle_word(self, elt): tkn = elt.text if not tkn: tkn = "" lemma = elt.get('lemma', tkn) pos = elt.get('pos') if elt.tag == 'instance': id = elt.get('id') if self._unit in ['instance', 'both']: return id, lemma, pos, tkn return lemma, pos, tkn new_semcor = LazyCorpusLoader('new_semcor', SemCorReader, r'(.*\.xml)|(.*\.gold\.key\.txt)', wordnet)
# Remember an existing directory from nltk.data.path. There is no break, so
# the last existing entry wins.
# NOTE(review): ctb_in_nltk is expected to be initialised before this loop
# (presumably to None); that assignment is not visible here.
for root in nltk.data.path:
    if isdir(root):
        ctb_in_nltk = root
if ctb_in_nltk is None:
    eprint(
        'You should run nltk.download(\'ptb\') to fetch some data first!')
    # The bare `exit` helper is installed by the `site` module for interactive
    # sessions and is not guaranteed to exist; sys.exit is the supported call.
    import sys
    sys.exit(1)

# Target directory handed to convert(): <ctb_in_nltk>/corpora/ctb
ctb_in_nltk = join(ctb_in_nltk, 'corpora')
ctb_in_nltk = join(ctb_in_nltk, 'ctb')

print('Converting CTB: removing xml tags...')
convert(args.ctb, ctb_in_nltk)
print('Importing to nltk...\n')
from nltk.corpus import BracketParseCorpusReader, LazyCorpusLoader

ctb = LazyCorpusLoader('ctb', BracketParseCorpusReader, r'chtb_\d{4}\.\w{2}',
                       tagset='unknown')

# File-id splits (each range end is written as "last id + 1" so the last id
# is included).
training = list(range(1, 815 + 1)) + list(range(1001, 1136 + 1))
development = list(range(886, 931 + 1)) + list(range(1148, 1151 + 1))
test = list(range(816, 885 + 1)) + list(range(1137, 1147 + 1))

# make sure no file id ends up in two splits
assert not (set(training) & set(development))
assert not (set(training) & set(test))
assert not (set(development) & set(test))

# Write one combined file per split into the output directory.
root_path = args.output
combine_fids(training, join(root_path, 'train.txt'))
combine_fids(development, join(root_path, 'dev.txt'))
combine_fids(test, join(root_path, 'test.txt'))