def _make_qtype_data(verbose=False):
    """Build nested token n-gram counts per question type from the qc corpus.

    Returns a dict mapping each coarse question type (the label before ':')
    to a nested count structure:
      data[type][0]            -- total token count for the type
      data[type][t][0]         -- count of token t
      data[type][t][u][0]      -- count of bigram (t, u)
      data[type][t][u][v]      -- count of trigram (t, u, v)

    :param verbose: if True, print each fileid as it is processed.
    :returns: dict as described above.
    """
    data = {}
    for fileid in qc.fileids():
        if verbose:
            # BUG FIX: original used a Python 2 print statement
            # (`print fileid`), a SyntaxError under Python 3.
            print(fileid)
        for type_, sent in qc.tuples(fileid):
            type_ = type_.split(':')[0]  # keep only the coarse category
            if type_ not in data:
                data[type_] = {0: 0}
            counts = data[type_]
            tokens = _qtype_tokens(sent.split(' '))
            for i, t in enumerate(tokens):
                counts[0] += 1
                if t not in counts:
                    counts[t] = {0: 0}
                counts[t][0] += 1
                if i + 1 < len(tokens):
                    # Hoist the repeated tokens[i + 1] lookups.
                    nxt = tokens[i + 1]
                    if nxt not in counts[t]:
                        counts[t][nxt] = {0: 0}
                    counts[t][nxt][0] += 1
                    if i + 2 < len(tokens):
                        nxt2 = tokens[i + 2]
                        if nxt2 not in counts[t][nxt]:
                            counts[t][nxt][nxt2] = 0
                        counts[t][nxt][nxt2] += 1
    return data
import nltk
from nltk.corpus import qc
import random
import string

# Vocabulary setup: collect every token from the qc training questions,
# drop 'the' and punctuation, and keep the alphabetic words.
s = qc.tuples()
temp = []
bad = ['the']
for x in string.punctuation:
    bad.append(x)
for x, y in s:
    temp += nltk.word_tokenize(y)
all_words = nltk.FreqDist(w.lower() for w in temp if w.isalpha() and w not in bad)
# BUG FIX: dict/FreqDist views are not sliceable in Python 3 -- materialize
# into a list before taking the first 800 entries.
word_features = list(all_words.keys())[:800]


def qc_features(question):
    """Build a feature dict for a question string.

    Features are '(Words are)' -- the second word followed by the first
    (present only when the question has at least two usable words) -- plus
    one boolean 'contains(w)' per vocabulary word in ``word_features``.

    :param question: question text (whitespace-tokenized here).
    :returns: dict of feature name -> value.
    """
    # BUG FIX: the original list.remove() dropped only the FIRST occurrence
    # of each stopword/punctuation token; filter them all out instead.
    words = [w for w in question.split() if w not in bad]
    features = {}
    # BUG FIX: the original indexed words[1]/words[0] unconditionally and
    # raised IndexError on questions shorter than two words.
    if len(words) >= 2:
        features['(Words are)'] = words[1] + ' ' + words[0]
    words = set(words)
    # BUG FIX: the original iterated `words` and tested `word in words`,
    # which is always True (and left word_features unused). The intended
    # pattern tests each vocabulary word against the question's words.
    for word in word_features:
        features['contains(%s)' % word] = (word in words)
    return features
from nltk import ngrams
from nltk.corpus import qc
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

# List of Categories :- Int -> 1, Dec -> 0
categoryInterrogative = "Int"
categoryDeclarative = "Dec"

# print(qc.raw("test.txt"))

# NOTE(review): `nltk` itself is never imported in this chunk (only names
# from it), so the `nltk.sent_tokenize` / `nltk.word_tokenize` /
# `nltk.pos_tag` calls below raise NameError unless `import nltk` appears
# elsewhere in the file -- confirm.
# Declarative training sentences from a raw text file, sentence-split.
trainData = open("newTrainData30k.txt").read()
trainData = nltk.sent_tokenize(trainData)

# Interrogative training sentences from the qc corpus (keep text, drop labels).
qc_train = qc.tuples("train.txt")
traindocuments = [x[1] for x in qc_train]
trainData = trainData[:10000]  # cap declarative training data at 10k sentences

# Test sets: interrogative from qc, declarative from a raw file.
qc_testInt = qc.tuples("test.txt")
testdocuments = [x[1] for x in qc_testInt]
testDec = open("RawTestingDataDeclarative.txt").read()
testDec = nltk.sent_tokenize(testDec)


def findFeatures(documents, isInterrogative):
    # NOTE(review): this definition is truncated in the visible source --
    # the body continues past this chunk; documented as-is, not rewritten.
    features = {}
    for sentence in documents:
        words = nltk.word_tokenize(sentence)
        tagged = nltk.pos_tag(words)
# Removed: a commented-out chunking example (tutorials 1-10 scratch) that
# called an undefined process_content(); dead code deleted.

##### Tutorial 11(text classification) ######
print(qc.tuples()[0][0])

# Build (word-list, category) pairs for every movie review, then shuffle so
# downstream train/test splits are not ordered by category.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

# Frequency distribution over all (lower-cased) review tokens.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# BUG FIX: in Python 3, FreqDist.keys() is insertion-ordered (Counter), NOT
# frequency-ordered, so slicing it did not yield the 3000 most common words.
# most_common() returns the intended frequency-ranked vocabulary.
word_features = [w for w, _ in all_words.most_common(3000)]
# Demo script: seek/tell round-tripping on a corpus reader stream, then
# read_sexpr_block edge cases, then a few corpus lookups.
# NOTE(review): `reader` and `pos` are defined earlier in the file (outside
# this chunk); this block does not run standalone.
print(pos)
print(reader.readline())
print(reader.seek(pos)) # rewind to the position from tell.
print(reader.readline())

# squashed bugs
# NOTE(review): the triple-quoted literals below appear to have had their
# internal newlines collapsed to spaces by whatever mangled this file;
# read_sexpr_block's comment_char stripping only applies to lines starting
# with '#', so the embedded comment will NOT be stripped as written --
# confirm against the original nltk doctest and restore the line breaks.
f = StringIO(b""" (a b c) # This line is a comment. (d e f\ng h)""".decode('ascii'))
print(read_sexpr_block(f, block_size=38, comment_char='#'))
print(read_sexpr_block(f, block_size=38, comment_char='#'))

# Input that ends in the middle of an s-expression.
f = StringIO(b""" This file ends mid-sexpr (hello (world""".decode('ascii'))
for i in range(3):
    print(read_sexpr_block(f))

# Input with no trailing whitespace.
f = StringIO(b"This file has no trailing whitespace.".decode('ascii'))
for i in range(3):
    print(read_sexpr_block(f))

# Bug fixed in 5279:
f = StringIO(b"a b c)".decode('ascii'))
for i in range(3):
    print(read_sexpr_block(f))

sents = nltk.corpus.brown.sents()
print(sents[6000])
print(sents[6000])  # printed twice -- presumably a view/caching check; confirm

print(reuters.words('training/13085'))
print(reuters.words('training/5082'))

nltk.download('qc')
print(qc.tuples('test.txt'))