def load_documents(self, path):
    # Build a categorized corpus where each subdirectory names a category.
    docs = CategorizedPlaintextCorpusReader(path, r'.*/.*', cat_pattern=r'(.*)/.*')
    for cat in docs.categories():
        self.cat_gram_freq[cat] = {}
        self.cat_word_freq[cat] = {}
    # Lazily yield one (category, tokens) pair per file.
    return ((category, list(docs.words(fileid)))
            for category in docs.categories()
            for fileid in docs.fileids(category))
def load_documents(self, path):
    docs = CategorizedPlaintextCorpusReader(path, r'.*/.*', cat_pattern=r'(.*)/.*')
    print(docs.categories())
    documents = [(list(docs.words(fileid)), category)
                 for category in docs.categories()
                 for fileid in docs.fileids(category)]
    random.shuffle(documents)
    return documents
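# Typical downstream use of the shuffled (tokens, label) pairs returned above
# (a sketch; `loader` is a hypothetical instance of the enclosing class):
documents = loader.load_documents('corpus/')
cutoff = int(len(documents) * 0.8)
train_docs, test_docs = documents[:cutoff], documents[cutoff:]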
import nltk
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

DOC_PATTERN = r'[\w_\s]+/[\w\s\d\-]+\.TXT'
CAT_PATTERN = r'([\w_\s]+)/.*'

corpus = CategorizedPlaintextCorpusReader('ENGLISH', DOC_PATTERN, cat_pattern=CAT_PATTERN)
print(corpus.categories())
print(corpus.fileids()[100:110])
print(corpus.words())
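# A small follow-on sketch (assuming the ENGLISH/ root is laid out as
# <category>/<file>.TXT, which is what DOC_PATTERN and CAT_PATTERN encode):
# count the files in each category.
for cat in corpus.categories():
    print(cat, len(corpus.fileids(categories=[cat])))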
# Fragment: `parser`, `bigram_feats`, `word_feats`, and
# `most_informative_words` are defined earlier in the original script.
                    default=False, help='use bigrams')
args = parser.parse_args()

if args.bigrams:
    featurizer = bigram_feats
else:
    featurizer = word_feats

corpus = CategorizedPlaintextCorpusReader(root=args.directory,
                                          fileids=r'.*/.*\.txt',
                                          cat_pattern=r'(dem|rep)/')
best_words = most_informative_words(corpus)

dem_ids = corpus.fileids(categories=['dem'])
rep_ids = corpus.fileids(categories=['rep'])
dem_feats = [(featurizer(corpus.words(fileids=[f])), 'dem') for f in dem_ids]
rep_feats = [(featurizer(corpus.words(fileids=[f])), 'rep') for f in rep_ids]

# Integer division so the 5/6 train cutoffs are valid slice indices.
dem_cutoff = len(dem_feats) * 5 // 6
rep_cutoff = len(rep_feats) * 5 // 6
train_feats = dem_feats[:dem_cutoff] + rep_feats[:rep_cutoff]
test_feats = dem_feats[dem_cutoff:] + rep_feats[rep_cutoff:]
print('training on %d instances, testing on %d instances' % (
    len(train_feats), len(test_feats)))
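# `word_feats` and `bigram_feats` are not defined in this fragment. A minimal
# sketch of what they typically look like in scripts of this shape (an
# assumption, not the original author's code):
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

def word_feats(words):
    # Bag-of-words: every token becomes a True-valued feature.
    return dict((word, True) for word in words)

def bigram_feats(words, n=200):
    # Word features plus the n highest-scoring bigrams (chi-squared).
    words = list(words)
    finder = BigramCollocationFinder.from_words(words)
    bigrams = finder.nbest(BigramAssocMeasures.chi_sq, n)
    return dict((feat, True) for feat in words + bigrams)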
# Load libraries
import os
import random
import nltk
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

# Read the dataset into the categorized corpus.
# Directory of the corpus; the category (pos/neg) is encoded in each filename.
corpusdir = 'corpus/'
review_corpus = CategorizedPlaintextCorpusReader(corpusdir, r'.*\.txt',
                                                 cat_pattern=r'\d+_(\w+)\.txt')

# List of (document tokens, category) pairs.
documents = [(list(review_corpus.words(fileid)), category)
             for category in review_corpus.categories()
             for fileid in review_corpus.fileids(category)]
random.shuffle(documents)

for category in review_corpus.categories():
    print(category)
print(type(review_corpus))
print(len(documents))

# Compute word frequency; use most_common() because iterating a FreqDist
# does not return words sorted by frequency.
all_words = nltk.FreqDist(w.lower() for w in review_corpus.words())
word_features = [w for w, _ in all_words.most_common(200)]
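# A plausible next step (an assumption, following the classic NLTK
# movie-review classification recipe): featurize each document by word
# presence and train a Naive Bayes classifier.
def document_features(document):
    document_words = set(document)
    return {'contains(%s)' % word: (word in document_words)
            for word in word_features}

featuresets = [(document_features(d), c) for (d, c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))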
                    label=target_name)
    plt.legend(loc='best', shadow=False, scatterpoints=1)
    plt.title('PCA of BULATS dataset')
    plt.show()
    return model


if __name__ == "__main__":
    PATH = "model.pickle"
    # Loading speech features
    speech = pd.read_csv("/ExamplePath.csv")
    if not os.path.exists(PATH):
        nli = CategorizedPlaintextCorpusReader(CORPUS, DOC_PATTERN,
                                               cat_pattern=CAT_PATTERN)
        # Since `nli` already has all the information (text and ids),
        # you don't need to iterate over it multiple times, so
        # construct `X` and `y` in one go.
        X = []
        y = []
        for fileid in nli.fileids():
            X.append({
                'text': nli.raw(fileid),
                'id': fileid.split('/')[-1].split('.')[0]
            })
            y.append(nli.categories(fileid)[0])
        clf = PCA(n_components=2)
        model = build_and_evaluate(X, y, clf, speech)
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

DOC_PATTERN = r'(?!\.)[\w_\s]+/[\w\s\d\-]+\.txt'
CAT_PATTERN = r'([\w_\s]+)/.*'

corpus = CategorizedPlaintextCorpusReader('corpus/text', DOC_PATTERN,
                                          cat_pattern=CAT_PATTERN)
print(corpus.categories())
print(corpus.fileids('2019'))
# Fragment: `corpus`, `lemma`, `stop`, `wordchars`, `exclude`, `min_topics`,
# and `max_topics` are defined elsewhere in the original script.

def contains_any(str, set):
    # Assumed signature: the function is called as `contains_any(w, wordchars)` below.
    return 1 in [c in str for c in set]

def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

def clean(doc):
    # Assumed signature: the function is called as `clean(corpus.words(fid))` below.
    doc_lowercase = [w.lower() for w in doc]
    # lemmatize() expects a single word, so lemmatize token by token
    # rather than passing the whole list.
    return [lemma.lemmatize(w) for w in doc_lowercase
            if not is_number(w)
            and len(w) > 1
            and contains_any(w, wordchars)
            and not contains_any(w, exclude)
            and w not in stop]

doc_dict = {fid: clean(corpus.words(fid))
            for cat in corpus.categories()
            for fid in corpus.fileids(cat)}  # XXX
docs = list(doc_dict.values())
dictionary = gensim.corpora.Dictionary(docs)
doc_ids = [k for k in doc_dict.keys()]
doc_term_matrix = [dictionary.doc2bow(doc) for doc in docs]
# Rows have different lengths, so build an object array.
bow_array = np.array(doc_term_matrix, dtype=object)

def find_best_lda_model(texts, bow, id2word, min_n=min_topics, max_n=max_topics):
    best_model = None
    max_coherence = -1
    for n in range(min_n, max_n + 1):
        ctm = CtmModel(bow, id2word=id2word, num_topics=n)
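        # The loop is truncated in the original. A hedged sketch of how it
        # presumably continues, given `best_model`/`max_coherence` above:
        # score each model with gensim's CoherenceModel and keep the best.
        # (Assumes `from gensim.models import CoherenceModel`.)
        coherence = CoherenceModel(model=ctm, texts=texts, dictionary=id2word,
                                   coherence='c_v').get_coherence()
        if coherence > max_coherence:
            max_coherence = coherence
            best_model = ctm
    return best_model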
# Create a corpus from the txt files given, with a file of categories
# to apply to the texts.
corpus = CategorizedPlaintextCorpusReader('corpus/', r'.*\.txt',
                                          cat_file="../textcats.prn")

"""
fileid = "nytimes-2017.txt"
raw = corpus.raw(fileid)
raw = raw.replace("N.H.S.", "NHS")
words = word_tokenize(raw)
words = corpus.words(fileid)
clean0 = [word for word in words if word not in stoplist]
"""

bloblist = corpus.fileids()
# bloblist = corpus.fileids(categories='2016')
M = len(bloblist)

# Look at the categories
print(corpus.categories())

# For each file in the corpus, normalise a few spellings before tokenizing.
for fileid in bloblist:
    raw = corpus.raw(fileid)
    raw = raw.replace("N.H.S.", "NHS")
    raw = raw.replace("per cent", "%")
    raw = raw.replace("votes", "vote")
    raw = raw.replace("voted", "vote")
    words = word_tokenize(raw)
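    # The loop is truncated in the original; a plausible continuation
    # (assumption, mirroring the commented-out cleaning step above) drops
    # stopwords from the tokenized text.
    clean0 = [word for word in words if word not in stoplist]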