def describe(corpus):
    """Print per-file summary statistics for an NLTK-style corpus.

    For each file id, emits one tab-separated row with columns:
    c/w (chars per word), w/s (words per sentence),
    w/v (words per vocabulary item), and the file id.

    Uses floor division (``//``) to keep the integer ratios the original
    Python 2 ``/`` produced on ints; converted from the Python 2 ``print``
    statement to ``print()`` for consistency with the rest of the file.
    """
    print("\t".join(["c/w", "w/s", "w/v", "id"]))
    for fileid in corpus.fileids():
        words = corpus.words(fileid)  # fetch once; original called it twice
        nchars = len(corpus.raw(fileid))
        nwords = len(words)
        nsents = len(corpus.sents(fileid))
        nvocab = len({w.lower() for w in words})
        print("\t".join([str(nchars // nwords),
                         str(nwords // nsents),
                         str(nwords // nvocab),
                         fileid]))
def describe(corpus):
    """Print a tab-separated statistics table, one row per corpus file.

    Columns: average chars/word, words/sentence, words/vocab-item, file id.
    Floor division preserves the integer ratios of the original Python 2
    code; the Python 2 ``print`` statement is replaced with ``print()`` to
    match the rest of the file.
    """
    print("\t".join(["c/w", "w/s", "w/v", "id"]))
    for fileid in corpus.fileids():
        wordlist = corpus.words(fileid)  # reuse instead of re-reading
        nchars = len(corpus.raw(fileid))
        nwords = len(wordlist)
        nsents = len(corpus.sents(fileid))
        nvocab = len(set(w.lower() for w in wordlist))
        row = [nchars // nwords, nwords // nsents, nwords // nvocab]
        print("\t".join([str(v) for v in row] + [fileid]))
def create_dfs(corpus):
    """Build a metadata/text DataFrame from *corpus* plus its TF-IDF frame.

    Each file id is expected to look like ``YEAR-LASTNAME.ext`` (year and
    last name are parsed out of the id by normalizing ``-`` to ``.`` and
    splitting) — TODO confirm against the corpus naming scheme.  The raw
    text of each file is run through ``pre_process``.

    Returns:
        (tf_idf_df, df): the TF-IDF DataFrame produced by ``get_tfidf``
        and the underlying metadata DataFrame.
    """
    print("Gathering data..")
    rows = []
    for fileid in corpus.fileids():
        # e.g. "1861-Name.txt" -> ("1861", "Name", "txt"); extension dropped
        year, last_name, _ = fileid.replace('-', '.').split('.')
        rows.append({
            'Year': year,
            'Last_name': last_name,
            'Text': pre_process(corpus.raw(fileid)),  # Preprocessed text file
        })
    print("Creating dataframe..")
    df = pd.DataFrame(rows)
    df['Year'] = df['Year'].astype(int)
    return get_tfidf(df), df
def _read_corpus(self, corpus, path):
    """Collect raw document texts and integer category labels for sklearn.

    First tries the NLTK reader API (``corpus.raw``); if any document fails,
    falls back to reading the files directly from *path* (``path`` is
    prefix-concatenated with the file id, so it should end with a separator
    — presumably "/"; verify against callers).

    Returns:
        (documents, targets): parallel lists — raw text of each document and
        the index of that document's category in ``corpus.categories()``.
    """
    documents = []
    targets = []
    try:
        for j, cat in enumerate(corpus.categories()):
            for doc in corpus.fileids(cat):
                documents.append(corpus.raw(doc))
                targets.append(j)
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit.  Also reset the lists: the original kept whatever had
        # been appended before the failure, duplicating those documents.
        documents = []
        targets = []
        for j, cat in enumerate(corpus.categories()):
            for doc in corpus.fileids(cat):
                # Context manager replaces the manual open()/close() pair.
                with open(path + doc, errors='ignore') as raw_document:
                    documents.append(raw_document.read())
                targets.append(j)
    return documents, targets
# Export the first `filelim` files of an NLTK corpus into ./<corpusname>/,
# with newlines stripped so each file is a single MapReduce input record.
# Usage: script.py [corpusname] [filelim]
corpusname = "inaugural"
if len(sys.argv) >= 2:
    corpusname = sys.argv[1]
filelim = 4
if len(sys.argv) >= 3:
    filelim = int(sys.argv[2])
corpus = getattr(nltk.corpus, corpusname)


def mkdir_p(path):
    """Create *path* like ``mkdir -p``: no error if it already exists."""
    try:
        os.makedirs(path)
    except OSError as exc:
        # Ignore "already exists" (race-safe); re-raise anything else.
        if not (exc.errno == errno.EEXIST and os.path.isdir(path)):
            raise


path = "./%s" % corpusname
mkdir_p(path)

# Slice instead of indexing: the original evaluated corpus.fileids() on every
# iteration and raised IndexError when filelim exceeded the number of files.
for fid in corpus.fileids()[:filelim]:
    with open("%s/%s" % (path, fid), 'w') as out:
        # need to remove new lines here so MR interprets each file
        # as a single input
        out.write(corpus.raw(fid).replace('\n', ' '))
root = make_testcorpus(ext='.txt', a=""" This is the first sentence. Here is another sentence! And here's a third sentence. This is the second paragraph. Tokenization is currently fairly simple, so the period in Mr. gets tokenized. """, b="""This is the second file.""") corpus = PlaintextCorpusReader(root, ['a.txt', 'b.txt']) print(corpus.fileids()) corpus = PlaintextCorpusReader(root, '.*\.txt') print(corpus.fileids()) print(str(corpus.root) == str(root)) print(corpus.words()) print(corpus.raw()[:40]) print(len(corpus.words()), [len(corpus.words(d)) for d in corpus.fileids()]) print(corpus.words('a.txt')) print(corpus.words('b.txt')) print(corpus.words()[:4], corpus.words()[-4:]) # del_testcorpus(root) for corpus in (abc, genesis, inaugural, state_union, webtext): print(str(corpus).replace('\\\\', '/')) print(' ', repr(corpus.fileids())[:60]) print(' ', repr(corpus.words()[:10])[:60]) root = make_testcorpus(a=""" This/det is/verb the/det first/adj sentence/noun ./punc Here/det is/verb another/adj sentence/noun ./punc Note/verb that/comp you/pron can/verb use/verb any/noun tag/noun set/noun