def __init__(self, *args, **kwargs):
    if 'element_class' in kwargs:
        self.element_class = kwargs['element_class']
        del kwargs['element_class']
    else:
        self.element_class = Essay
    CategorizedPlaintextCorpusReader.__init__(self, *args, **kwargs)
def main():
    articles = CategorizedPlaintextCorpusReader(corpusdir, '.*', cat_pattern=r'(.*)[/]')
    feats = {}
    trainfeats = []
    testfeats = []
    for cat in articles.categories():
        wow = len([f for f in articles.fileids(cat)])  # number of files in this category
        print "for category", cat, ":", wow
        feats[cat] = [(word_feats(articles.words(fileids=[f])), cat)
                      for f in articles.fileids(cat)]
        cutoff = wow - hold_back(wow)
        trainfeats.append(feats[cat][:cutoff])
        testfeats.append(feats[cat][cutoff:])
    train = [item for sublist in trainfeats for item in sublist]
    test = [item for sublist in testfeats for item in sublist]
    print 'train on %d instances, test on %d instances' % (len(train), len(test))
    classifier = NaiveBayesClassifier.train(train)
    print 'accuracy:', nltk.classify.util.accuracy(classifier, test)
    classifier.show_most_informative_features()
    # Note: the output is harder to interpret with more than two categories.
    # Load the pickled classifier later with:
    #   import pickle
    #   f = open('my_classifier.pickle')
    #   classifier = pickle.load(f)
    #   f.close()
    with open('../data/classifier.pickle', 'wb') as f:
        pickle.dump(classifier, f)
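# main() above relies on word_feats and hold_back, which are not part of the excerpt.
# A minimal sketch, assuming hold_back reserves a fixed fraction of each category for testing:
def word_feats(words):
    # bag-of-words feature dict: every token maps to True
    return dict((word, True) for word in words)

def hold_back(n, fraction=0.25):
    # number of documents to reserve for testing (the fraction is an assumption)
    return int(n * fraction)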
def create_categorized_corpus(self, categories_directory):
    boolean_list = []
    boolean_for_categories_test = ''
    reader = CategorizedPlaintextCorpusReader(categories_directory,
                                              r'\.txt.*wordtype_(\w+)',
                                              cat_pattern=r'\.txt.*wordtype_(\w+)')
    for category in reader.categories():
        boolean_list.append(category != '')
    if False in boolean_list:
        boolean_for_categories_test = False
    else:
        boolean_for_categories_test = True
    return reader, boolean_for_categories_test
def __init__(self, rootLocation=config.POLARITY_DATASET, reader=None):
    super(PolarityDataReader, self).__init__()
    if reader is None:
        self.reader = Reader(rootLocation, r'.*/.*', cat_pattern=r'(.*)/.*')
    else:
        self.reader = reader
    self.setStopWords()
    self.documents = None
    self.terms = None
import nltk, random, string, os
from nltk.collocations import *
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.corpus import stopwords

bigram_measures = nltk.collocations.BigramAssocMeasures()

#print reader.categories()
for name in os.listdir("."):
    if os.path.isdir(name):
        reader = CategorizedPlaintextCorpusReader(name, r'.*\.txt', cat_pattern=r'(\w+)/*')
        # reader = CategorizedPlaintextCorpusReader(name, r'./raw_reviews/\.txt', cat_pattern=r'(\w+)/*')
        print reader.fileids()
        table = string.maketrans("", "")
        stopwords = nltk.corpus.stopwords.words('english')
        filtered_words = [w for w in reader.words() if not w in stopwords]
        filtered_words_nopunc = [w for w in filtered_words if not w in string.punctuation]
        #all_words = nltk.FreqDist(w.lower() for w in filtered_words_nopunc)
        finder = BigramCollocationFinder.from_words(filtered_words_nopunc)
        #scored = finder.score_ngrams(bigram_measures.raw_freq)
        #a = sorted(bigram for bigram, score in scored)
        finder.apply_freq_filter(3)
        a = finder.nbest(bigram_measures.pmi, 5)
        #b = finder.score_ngrams(bigram_measures.pmi)
        print a
categ_dict = {'test_1.txt': 'Regulatory Update',
              'test_2.txt': 'Press Release',
              'test_3.txt': 'Regulatory Update',
              'test_4.txt': 'Regulatory Update',
              'test_5.txt': 'Stock Update',
              'test_6.txt': 'Press Release',
              'test_7.txt': 'Market Opinion'}

art_i = []
class_i = []

# Conversion of train data into a single input file
corpus_root = 'Train_set'
newcorpus = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt', cat_pattern=r'(\w+)/*')
myfile = open('Input_Article_Data.csv', 'wb')
wr = csv.writer(myfile, quoting=csv.QUOTE_ALL, lineterminator="\n")
for category in newcorpus.categories():
    for fileid in newcorpus.fileids(category):
        #print fileid, category
        data1 = (newcorpus.raw(fileid).encode('utf-8')).replace(",", " ")
        data_list = [data1, category]
        wr.writerow(data_list)
myfile.close()

# Reading of train data as lists
#print(bigrams)
cfd = nltk.ConditionalFreqDist(bigrams)
print(cfd[word])

def generate_model(cfdist, word, num=15):
    # repeatedly emit the most likely next word under the conditional distribution
    for i in range(num):
        print(word, end=' ')
        word = cfdist[word].max()

print(generate_model(cfd, word))

# 1. Build a categorized text corpus
locPT = 'ch02/ES'
corpusPT = CategorizedPlaintextCorpusReader(locPT, '.*\.txt', cat_file="cat.txt")
print(corpusPT.fileids())
print(corpusPT.categories())
print(corpusPT.words(categories='ciencia'))
#print(corpusPT.raw())

vocab = set(w.lower() for w in corpusPT.words())
print('Tamanho Vocabulario:', len(vocab))                 # vocabulary size
corpusCom = corpusPT.raw()
corpusComList = corpusCom.split()
print('Tamanho Total de palabras:', len(corpusComList))   # total word count

# 2. Compute simple statistical measures
'''
data = get_data()
print(len(data))
evrth, maindict = tags_assignment(data)

# Save the final dictionary as well as the category-number mapping
listingssss = json.dumps(evrth)
with open("FinalCleanJuly1.json", "w") as f:
    f.write(listingssss)
dictionaries = json.dumps(maindict)
with open("CorpusCatMapJuly1.json", "w") as f:
    f.write(dictionaries)

#### This is IMPORTANT - CHOOSE! ##### default is key2
#### Choose the label you want to use for naming the output files. Two options:
###  1) key1 with format docID + _(i), where i numbers the category, e.g. -doc-_cr14021.txt
###  2) key2 with format country name + year + _(i), e.g. Albania2015_1.txt
###  To change it: line 90:  "key2: taglist" -> key1
###                line 121: filename=evrth[i]['key2'] -> key1
create_corpus(evrth)

# Check that the corpus loads
reader = CategorizedPlaintextCorpusReader('corpusCategory/', r'\w+\d+_.*\.txt', cat_map=maindict)
print(reader.categories())                    # print all categories as a list
print(reader.fileids(categories=['Fiscal']))  # check the docIDs in the Fiscal category

# Good reference: https://www.packtpub.com/books/content/python-text-processing-nltk-20-creating-custom-corpora
# It covers creating chunked corpora (by words, sentences, paragraphs, even customized paragraphs),
# tagged corpora, and more.
from nltk.tokenize import LineTokenizer, RegexpTokenizer
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
from nltk.stem import SnowballStemmer
from processor import Processor as Proc

data_folder = './data'
encoding = 'UTF8'
language = 'italian'

wordTok = RegexpTokenizer(
    r'(\w+|@\w+|<3|(\:\-?\))|(\:\-?\()|(\;\-?\))|((\:|(X|x))\-?(D|d)))')
sentTok = LineTokenizer()

reader = CategorizedPlaintextCorpusReader(data_folder, r'SENTIPOLC-.*\.txt',
                                          cat_pattern=r'SENTIPOLC-(\w+)\.txt',
                                          encoding=encoding,
                                          word_tokenizer=wordTok,
                                          sent_tokenizer=sentTok)

pos_tweets = reader.sents(reader.fileids('pos'))
neg_tweets = reader.sents(reader.fileids('neg'))

# Inspection
rndP = random.randrange(len(pos_tweets))
rndN = random.randrange(len(neg_tweets))
print 'Pos:\n', pos_tweets[rndP:rndP + 3], '\nNeg:\n', neg_tweets[rndN:rndN + 3], '\n'

# All lowercase
pos_tweets = Proc.lowerize(pos_tweets)
neg_tweets = Proc.lowerize(neg_tweets)
        output = open('airline_review\\pos\\' + filename + '.txt', 'w')
        output.write(row['reviewcontent'])
        output.close()
        count_pos += 1
    elif row['rating_overall'] in ("1", "2", "3", "4", "5") and count_neg < 15000:
        output = open('airline_review\\neg\\' + filename + '.txt', 'w')
        output.write(row['reviewcontent'])
        output.close()
        count_neg += 1

os.chdir("E:/Documents/GSU/Python Development/Unstructured Data/Team Project/machine_learning_text_analysis")

# File name format: <category>/<file>.txt
reader = CategorizedPlaintextCorpusReader('./airline_review', r'.*\.txt', cat_pattern=r'(\w+)/*')

# Positive reviews file ids
pos_ids = reader.fileids('pos')
# Negative reviews file ids
neg_ids = reader.fileids('neg')

'''Generating word feature list'''
def word_feats(words):
    return dict([(word, True) for word in words])

'''Building positive and negative feature lists.
Each item is the positive/negative word features for a review file'''
pos_feat = [(word_feats(reader.words(fileids=f)), 'pos')
from nltk.corpus import stopwords
from nltk_trainer.classification.featx import bag_of_words
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
import collections
import pickle
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

reader = CategorizedPlaintextCorpusReader(
    '/home/arjun/nltk_data/health/diabetes',
    r'health.*?[0-9]+.txt',
    cat_pattern=r'health(.*?)[0-9]+.txt')
#print reader.categories()

# Takes a corpus and creates labelled feature sets
def label_feats_from_corpus(corp, feature_detector=bag_of_words):
    label_feats = collections.defaultdict(list)
    for label in corp.categories():
        for fileid in corp.fileids(categories=[label]):
            feats = feature_detector(corp.words(fileids=[fileid]))
            label_feats[label].append(feats)
    return label_feats

# Creates train and test features
def split_label_feats(lfeats, split=0.75):
    train_feats = []
    test_feats = []
    for label, feats in lfeats.items():
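        # The loop body is cut off in the excerpt; the usual cookbook completion (an assumption here):
        cutoff = int(len(feats) * split)
        train_feats.extend([(feat, label) for feat in feats[:cutoff]])
        test_feats.extend([(feat, label) for feat in feats[cutoff:]])
    return train_feats, test_feats

# With the helpers above, the standard next step is to train and score a Naive Bayes classifier;
# a minimal sketch using NLTK's classifier API:
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy

lfeats = label_feats_from_corpus(reader)              # {label: [featdict, ...]}
train_feats, test_feats = split_label_feats(lfeats)
classifier = NaiveBayesClassifier.train(train_feats)
print(accuracy(classifier, test_feats))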
def process_plaintext(dir_path):
    reader = CategorizedPlaintextCorpusReader(dir_path, r'.*\.txt',
                                              cat_pattern=r'.+_.+_(.*)\.txt')
    facilitator_files = reader.fileids(categories='facilitator')
    participant_files = reader.fileids(categories='participant')
    print facilitator_files, participant_files
    #print reader.categories()
    #print len(reader.words())
    #print len(reader.sents())

    fac_words = [word for word in reader.words(facilitator_files)]
    par_words = [word for word in reader.words(participant_files)]
    fac_words = edit_tokens(fac_words)
    par_words = edit_tokens(par_words)

    speakers = ([(word, 'facilitator') for word in reader.words(facilitator_files)] +
                [(word, 'participant') for word in reader.words(participant_files)])
    features = get_features(speakers)

    size = int(len(features) * 0.3)
    nb_train = features[size:]
    nb_test = features[:size]
    classifier = nltk.NaiveBayesClassifier.train(nb_train)
    print "Classifier labels:", classifier.labels()
    print classifier.show_most_informative_features()
    print "Classify test:", nltk.classify.accuracy(classifier, nb_test)
    #print classifier.classify(get_features(["Yolo", "bag", "sp"], False))

    #random.shuffle(speakers)
    three_quarters = int(len(speakers) * 0.75)
    train = speakers[:three_quarters]
    test = speakers[three_quarters:]
    est = lambda fdist, bins: nltk.probability.LaplaceProbDist(fdist)
    un_lm = NgramModel(1, train, estimator=est)
    bi_lm = NgramModel(2, train, estimator=est)
    tr_lm = NgramModel(3, train, estimator=est)
    qu_lm = NgramModel(4, train, estimator=est)
    pe_lm = NgramModel(5, train, estimator=est)
    print un_lm
    print bi_lm
    print tr_lm
    print qu_lm
    print pe_lm
    print "1 gram Perplexity:", un_lm.perplexity(test)
    print "2 gram Perplexity:", bi_lm.perplexity(test)
    print "3 gram Perplexity:", tr_lm.perplexity(test)
    print "4 gram Perplexity:", qu_lm.perplexity(test)
    print "5 gram Perplexity:", pe_lm.perplexity(test)
    print bi_lm.generate(10, ["uh", "sp"])

    fd_fac = nltk.FreqDist(fac_words)
    vocab_fac = fd_fac.keys()
    fd_par = nltk.FreqDist(par_words)
    vocab_par = fd_par.keys()
    print "Fac Vocab: ", len(vocab_fac)
    print "Fac Tokens: ", len(fac_words)
    print vocab_fac[:20]
    print "Par Vocab: ", len(vocab_par)
    print "Par Tokens: ", len(par_words)
    print vocab_par[:20]
    fd_par.plot(50)
        features['contains({})'.format(bigram)] = (bigram in article_bigrams)
    article_words = set(article_words)
    for word in word_features:
        features['contains({})'.format(word)] = (word in article_words)
    return features

if __name__ == '__main__':
    # Set up the path to the data
    data_folder_name = sys.argv[1]
    data_path = os.path.join(os.getcwd(), '', data_folder_name)

    # Make a reader object to load the article files
    article = CategorizedPlaintextCorpusReader(data_path, r'.*\.*\.txt',
                                               cat_pattern=r'(\w+).*\.txt')

    # Make a list of all articles, labelled by the folder each file is in
    all_articles = []
    for category in article.categories():
        for fileid in article.fileids(category):
            # Lowercase the words and drop stopwords
            process = list(w.lower() for w in list(article.words(fileid))
                           if w.isalpha() and w not in stopwords.words('english'))
            entry = [process, category]
            all_articles.append(entry)
    random.shuffle(all_articles)
                 i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')

if __name__ == '__main__':
    print("\nStarting the Classifier. First, let's set everything up.")
    traincorpus_root = raw_input(
        "Please specify the location of the training data: ")
    # traincorpus_root = '/Users/taniamaldonado/PycharmProjects/corpora/humanin/train4'
    traincorpus = CategorizedPlaintextCorpusReader(
        traincorpus_root, r".*_.*\.txt", cat_pattern=r'(\w+)_.*\.txt')

    testcorpus_root = raw_input(
        "Please specify the location of the test data: ")
    # testcorpus_root = '/Users/taniamaldonado/PycharmProjects/corpora/humanin/test'
    testcorpus = CategorizedPlaintextCorpusReader(testcorpus_root,
                                                  r".*_.*\.txt",
                                                  cat_pattern=r'(\w+)_.*\.txt')

    try:
        traindata, testdata = datainput(traincorpus, testcorpus)
    except NameError:
        print "The training/test corpus is not defined, please check if the location is correct."

    print("\nPlease choose a classification algorithm:")
    print("1. Multinomial Naive Bayes")
########## CATEGORIZED CORPUS READER ###############
from nltk.corpus import brown
print brown.categories()
print brown.tagged_sents(categories=['news'])

from nltk.corpus.reader import CategorizedPlaintextCorpusReader as CPCR
root = "C:\\Users\\Matrix\\AppData\\Roaming\\nltk_data\\corpora\\cookbook\\"

# Using cat_pattern
reader = CPCR(root, r'movie_.*\.txt', cat_pattern=r'movie_(\w+)\.txt')
print reader.categories()
print reader.fileids(categories=['neg'])
print reader.fileids(categories=['pos'])

# Using cat_map: a dictionary mapping a fileid to a list of category labels
reader = CPCR(root, r'movie_.*\.txt',
              cat_map={'movie_pos.txt': ['pos'], 'movie_neg.txt': ['neg']})
print reader.categories()

# Using cat_file: a file containing the mapping of fileid to category, e.g. cats.txt
# (for more details, see the brown corpus folder)
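# For the cat_file option, each line of the mapping file lists a fileid followed by its
# categories, separated by whitespace by default. A minimal sketch; the cats.txt contents
# shown here are illustrative, not from the original:
#   cats.txt (placed in the corpus root):
#     movie_pos.txt pos
#     movie_neg.txt neg
reader = CPCR(root, r'movie_.*\.txt', cat_file='cats.txt')
print reader.categories()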
from textblob.classifiers import NaiveBayesClassifier
from nltk.corpus.reader import PlaintextCorpusReader, CategorizedPlaintextCorpusReader
from nltk.corpus import movie_reviews
import nltk
import random
from BeautifulSoup import BeautifulSoup

p = nltk.data.find('corpora/SecurityThreat-MaxEnt')
reader = CategorizedPlaintextCorpusReader(p, r'.*\.txt', cat_pattern=r'(\w+)/*')

from nltk import WordNetLemmatizer

# Using the WordNet lemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

all_words = nltk.FreqDist(word for word in reader.words())
top_words = list(all_words)[:100]
print(top_words)

def word_feats(words):
    return {word: True for word in words if word in top_words}

#def word_feats(words):
#    return dict([(wordnet_lemmatizer.lemmatize(word), True) for word in words])

# Gather the fileids for each ThreatType category.
IdentityThreat = reader.fileids('IdentityThreat')
InsiderThreat = reader.fileids('InsiderThreat')
Malware = reader.fileids('Malware')
def fetch_news(dir):
    base = 'http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/{}/rss.xml'
    for category in ['world', 'technology']:
        rss = fp.parse(base.format(category))
        for i, entry in enumerate(rss.entries):
            fname = '{0}_bbc_{1}.txt'.format(i, category)
            fname = os.path.join(dir, fname)
            if not dl.conf.file_exists(fname):
                store_txt(entry.link, fname, entry.title)

if __name__ == "__main__":
    dir = os.path.join(dl.data.get_data_dir(), 'bbc_news_corpus')
    if not os.path.exists(dir):
        os.mkdir(dir)
    fetch_news(dir)

    reader = CategorizedPlaintextCorpusReader(dir, r'.*bbc.*\.txt',
                                              cat_pattern=r'.*bbc_(\w+)\.txt')
    printer = dl.log_api.Printer(nelems=3)
    printer.print('Categories', reader.categories())
    printer.print('World fileids', reader.fileids(categories=['world']))
    printer.print('Technology fileids', reader.fileids(categories=['technology']))
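# store_txt is not shown in the excerpt; a minimal standalone sketch (it would sit above
# fetch_news in the original script). The requests/BeautifulSoup approach is an assumption,
# not the original implementation:
import requests
from bs4 import BeautifulSoup

def store_txt(link, fname, title):
    # Hypothetical helper: fetch the linked page and save its title plus visible text.
    html = requests.get(link, timeout=30).text
    text = BeautifulSoup(html, 'html.parser').get_text()
    with open(fname, 'w', encoding='utf-8') as f:
        f.write(title + '\n\n' + text)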
def __init__(self, *args, **kwargs):
    CategorizedPlaintextCorpusReader.__init__(self, *args, **kwargs)
import sys

if len(sys.argv) != 5:
    print('Usage: pass arguments for the input PDF path, the category-file mapping path, '
          'the input links file and the output sub-category update file')
    sys.exit(1)

print("Input PDFs file path: " + sys.argv[1])
print("Category file name and path: " + sys.argv[2])
print("Input links file name and path: " + sys.argv[3])
print("Sub-category update file: " + sys.argv[4])

reader = CategorizedPlaintextCorpusReader(sys.argv[1], r'.*\.txt',
                                          cat_file=sys.argv[2],
                                          cat_delimiter='|')

# Access each file in the corpus.
#for infile in sorted(reader.fileids()):
#    print(infile)                    # the fileid of each file
#    #file = reader.open(infile)
#    #print(file.read().strip())      # prints the content of the file

#print(reader.fileids())
#print(reader.fileids(categories=['General']))
#print(reader.categories())
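# With cat_delimiter='|', each line of the mapping file separates the fileid from its
# category with a pipe, which allows category names containing spaces. An illustrative
# sketch of what the file passed as sys.argv[2] might contain (entries are hypothetical):
#   test_1.txt|Regulatory Update
#   test_2.txt|Press Release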
    return BOW

"""
# CANADA emotion lexicon
emotions_dict = pd.read_csv("emolex.csv")
emotions_dict = emotions_dict.set_index('Spanish (es)')

# SEL emotion lexicon
sel_emotions_dict = pd.read_csv("SEL_full.txt", sep='\t', encoding="ISO-8859-1")
sel_emotions_dict = sel_emotions_dict.set_index('Palabra')
"""

# Read the tweet corpus
reader = CategorizedPlaintextCorpusReader('./', r'mex.*\.txt', cat_pattern=r'(\w+)/*')

tweets_train = reader.raw('mex_train.txt').split('\n')[:-1]
labels_train = reader.raw('mex_train_labels.txt').split('\n')[:-1]
labels_train = list(map(int, labels_train))

tweets_val = reader.raw('mex_val.txt').split('\n')[:-1]
labels_val = reader.raw('mex_val_labels.txt').split('\n')[:-1]
labels_val = list(map(int, labels_val))

tweets_test = reader.raw('mex_test.txt').split('\n')[:-1]

"""
corpus_palabras = []
for doc in tweets_train:
    corpus_palabras += doc.split()
###############################################
def getDirnames(path):
    dirList = []
    for f in os.listdir(path):
        if not os.path.isfile(path):
            if not f == ".DS_Store":
                dirList.append(f)
    return dirList
###############################################

###############################################
#################
# TRAINING DATA #
#################
train_reader = CategorizedPlaintextCorpusReader('./training_data', r'.*\_.*\.txt',
                                                cat_pattern=r'.*\_(\w+)\.txt')

train_documents = [(list(train_reader.words(fileid)), category)
                   for category in train_reader.categories()
                   for fileid in train_reader.fileids(category)]
random.shuffle(train_documents)
#print train_documents

train_documents_clean = []
for i in train_documents:
    cat = i[1]
    #print cat
    newList = []
    for word in i[0]:
        #print j
        clean_word = word.encode('ascii', 'ignore').decode('ascii').encode('ascii', 'ignore')
        newList.append(clean_word)
__author__ = 'Piotr'
from random import shuffle
from pickle import dump
import os
from nltk import word_tokenize
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from text_processing.replacers import RegexpReplacer

training = CategorizedPlaintextCorpusReader("Articles", r'.*\.txt',
                                            cat_pattern=r'(\w+)', encoding="utf-8")

def print_corpus_info():
    print("Training Corpus INFO")
    for category in training.categories():
        print("Number of documents in {0:8} category: {1}".format(
            category, len(training.fileids(category))))
    print("\n")

def save_documents(documents, name):
    with open(os.path.join("Classifiers", name + ".pickle"), 'wb') as file_handler:
        dump(documents, file_handler)

def get_training_documents(cut_off=0.75, save=False):
    train_set = []
    test_set = []
        corpusfile = open(corpusfolder + '/' + fname, 'a')
        corpusfile.write(str(body))
        corpusfile.close()
    except Exception as e:
        print('Error on :' + id_)
        corpusfile.close()
        os.remove(mydir + '\\' + fname)
        pass
    else:
        print('Empty File:' + id)

CreateCorpusFromDataFrame(mydir, data_sample)

my_corpus = CategorizedPlaintextCorpusReader(mydir, r'.*', cat_pattern=r'.*_(.*).txt')

def preprocess(words, to_lowercase=True, remove_punctuation=True, remove_digits=True,
               remove_odd_chars=True, remove_stopwords=True, stem=True):
    if to_lowercase:
        words = [w.lower() for w in words]
    if remove_punctuation:
        words = [w for w in words if re.match(r'^\W+$', w) is None]
    if remove_digits:
        words = [w for w in words if not w.replace('.', '', 1).isdigit()]
    if remove_odd_chars:
        words = [re.sub(r'[^a-zA-Z0-9_]', '_', w) for w in words]
    if remove_stopwords:
import os
import glob
from nltk import NaiveBayesClassifier
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
import nltk
from nltk.corpus import wordnet as wn
import sys

Feature_Set = {}

training_directory = "reviews"
Training_Corpus = CategorizedPlaintextCorpusReader(training_directory,
                                                   r'pos|neg.*\.txt$',
                                                   cat_pattern='(\w+)/*')
testing_directory = "reviews"
Testing_Corpus = CategorizedPlaintextCorpusReader(testing_directory,
                                                  r'pos|neg.*\.txt$',
                                                  cat_pattern='(\w+)/*')

Training_Corpus_Text = nltk.RegexpTokenizer('\w+').tokenize(Training_Corpus.raw())
Positive_Corpus_Text = nltk.RegexpTokenizer('\w+').tokenize(Training_Corpus.raw(categories="pos"))
Negative_Corpus_Text = nltk.RegexpTokenizer('\w+').tokenize(Training_Corpus.raw(categories="neg"))

Training_Vocabulary = nltk.FreqDist(w.lower() for w in Training_Corpus_Text)
Positive_Vocabulary = nltk.FreqDist(w.lower() for w in Positive_Corpus_Text)
Negative_Vocabulary = nltk.FreqDist(w.lower() for w in Negative_Corpus_Text)

# Denominators for add-one (Laplace) smoothing: token count + vocabulary size per class
pos_den = float(len(Positive_Corpus_Text)) + float(len(Positive_Vocabulary.keys()))
neg_den = float(len(Negative_Corpus_Text)) + float(len(Negative_Vocabulary.keys()))
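# The pos_den/neg_den denominators match the usual add-one smoothing setup; a sketch of how a
# per-word class likelihood could then be computed. This continuation is an assumption, not
# part of the original excerpt:
def smoothed_likelihood(word, vocabulary, denominator):
    # P(word | class) with add-one smoothing: (count + 1) / (tokens + |V|)
    return (vocabulary[word.lower()] + 1.0) / denominator

p_good_given_pos = smoothed_likelihood("good", Positive_Vocabulary, pos_den)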
import time
import nltk
import pickle
import re
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.corpus import stopwords

reader = CategorizedPlaintextCorpusReader('/media/storage/dpla-data/words/colls/',
                                          r'.*\.txt', cat_pattern=r'(\w+)\.txt')

# Removing oversized collections (hathi, nypl) and chunking the rest out.
# The first batch represents what was completed on 4/10-4/11.
colls = ["searches"]
#colls = ["artstor","biodiv","rumsey","commonwealth","georgia","harvard",
#         "ia","getty","kentucky","minnesota","missouri","mwdl","nara","nocar",
#         "smiths","socar","texas","gpo","illinois","usc","virginia","nocoll"]
#colls = ["ia","getty","kentucky","minnesota","missouri","mwdl"]
#colls = ["nara","nocar","smiths","socar","texas","gpo","illinois","usc","virginia","nocoll"]

#data = {}
stats = {}
common = {}
for coll in colls:
    print(reader.categories(coll + ".txt"))
    stats[coll] = {}
    # The words can't be pickled directly: reader.words() is a stream reader.
    # Tokenizing with a regex works, and also strips out punctuation.
    print("prep & pickle words")
    words = re.split(r'\W+', reader.raw(coll + '.txt'))
    pickle.dump(words, open("/media/storage/dpla-data/pickles/new/" + coll + "_words.p", "wb"))
                             r'brown.pos', word_tokenizer=SpaceTokenizer())
print(reader.words())
print(reader.sents())
print(reader.tagged_words())
print(reader.tagged_sents())
# Mapping tags to the universal tagset; if the tagset is not correct, every tag becomes UNK
print(reader.tagged_words(tagset='universal'))

## Reading chunked corpora #######
reader = ChunkedCorpusReader('/Users/atul/nltk_data', r'treebank.chunk', tagset='en-brown')
print(reader.chunked_words())   # word-level structure
print(reader.chunked_sents())   # sentence-level structure
print(reader.chunked_paras())   # paragraph-level structure

## Reading categorized corpora ##################
## Categories are extracted with cat_pattern (from the file name), cat_map or cat_file
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
reader = CategorizedPlaintextCorpusReader('/Users/atul/nltk_data', r'movie_.*\.txt',
                                          cat_pattern=r'movie_(\w+)\.txt')
# The easiest way in is to list the fileids for each category
reader.categories()
reader.fileids(categories=['neg'])
reader.fileids(categories=['pos'])
reader.fileids()
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.tokenize.casual import TweetTokenizer
from normalization import normalizeTwitterWordsWithExtraFeatures, normalizeTwitterWordsWithNegationHandle
import pickle, nltk

tweetTokenizer = TweetTokenizer(reduce_len=True, preserve_case=True, strip_handles=False)
corpus = CategorizedPlaintextCorpusReader('corpus/2-step/polar',
                                          r'(\w+)-tweet[0-9]+\.txt',
                                          cat_pattern=r'(\w+)-tweet[0-9]+\.txt',
                                          word_tokenizer=tweetTokenizer)
normalizationFunction = normalizeTwitterWordsWithNegationHandle

wordsTaggedToCategory = []
i = 1
for category in corpus.categories():
    for fileid in corpus.fileids(category):
        words = corpus.words(fileids=[fileid])
        normalizedWords = normalizationFunction(words)
        extraNormalizedWords = normalizeTwitterWordsWithExtraFeatures(words)
        wordsTagged = nltk.pos_tag(normalizedWords)
        wordsTaggedToCategory += [(wordsTagged, category)]
        print(i)
        i += 1

with open("wordsTaggedToCategory-polar", 'wb') as fileout:
    pickle.dump(wordsTaggedToCategory, fileout)
import nltk, random, string
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.corpus import stopwords

reader = CategorizedPlaintextCorpusReader('./', r'.*\.txt', cat_pattern=r'(\w+)/*')
print reader.categories()
print reader.fileids()

documents = [(list(reader.words(fileid)), category)
             for category in reader.categories()
             for fileid in reader.fileids(category)]
random.shuffle(documents)

# Remove stopwords & punctuation from the content
table = string.maketrans("", "")
stopwords = nltk.corpus.stopwords.words('english')
filtered_words = [w for w in reader.words() if not w in stopwords]
filtered_words_nopunc = [w for w in filtered_words if not w in string.punctuation]

all_words = nltk.FreqDist(w.lower() for w in filtered_words_nopunc)
print all_words
word_features = all_words.keys()[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
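    # The rest of document_features is cut off; the usual NLTK-book completion (an assumption here):
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

# Train and evaluate on the shuffled documents; the 100-document test split is an assumption:
featuresets = [(document_features(d), c) for (d, c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, test_set)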
def __init__(self, *args, **kwargs):
    self.annotation_word_tokenizer = RegexpTokenizer(
        r'(Agree|Disagree) Strongly|(Agree|Disagree) Somewhat|Never Addressed|No Opinion|[AC]-\w+|\d+-\d+|\w+|[^\w\s]+')
    CategorizedPlaintextCorpusReader.__init__(self, *args, **kwargs)
import nltk
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
import random
from BeautifulSoup import BeautifulSoup

# Reading from a custom categorized corpus.
# Categorized corpora can be categorized by topic, genre, polarity, etc.
# In addition to the standard corpus interface, they provide access to the list of categories
# and the mapping between documents and categories (in both directions).
# Access the categories using the categories() method.
d = nltk.data.find('corpora/SecurityThreat')
reader = CategorizedPlaintextCorpusReader(d, r'.*\.txt', cat_pattern=r'(\w+)/*')

from textblob.classifiers import NaiveBayesClassifier

random.seed(1)
train = [
    ('Identity', 'IdentityThreat'),
    ('identity', 'IdentityThreat'),
    ('identities', 'IdentityThreat'),
    ('identity loss', 'IdentityThreat'),
    ('insider', 'InsiderThreat'),
    ('Malware', 'Malware'),
]

# The categorized corpus reader collects the words for each ThreatType
ThreatTypes = [(list(reader.words(fileid)), category)
               for category in reader.categories()
               for fileid in reader.fileids(category)]
random.shuffle(ThreatTypes)
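# The train list above is already in the (text, label) format that textblob's
# NaiveBayesClassifier expects; a minimal sketch of training and classifying with it
# (the sample sentence is illustrative, not from the original):
cl = NaiveBayesClassifier(train)
print(cl.classify("suspicious insider activity on the network"))  # e.g. 'InsiderThreat'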
{ "category": cat, "doc": doc, "tot_words": tot_words, "avg_char": avg_char, "sentences": sentences, "avg_word": avg_word, "most_common": common[0][0], "most_common_freq": common[0][1] } ) def clean_string(text): clean_text = text.lower() clean_text = re.sub('[^0-9a-zA-Z //]+', '', clean_text) return clean_text.strip() #create document properties corpus = CategorizedPlaintextCorpusReader( 'C:/Users/gavin_000/Python/texts', r'.*\.txt', cat_pattern=r'(\w+)/*' ) stop = stopwords.words('english') results = pd.DataFrame() for category in corpus.categories(): for document in corpus.fileids(category): doc_properties = create_document_properties(category, document) results = results.append(doc_properties, ignore_index=True) print results
# From location 1491 of the cookbook:
# this is how we would load in the customized corpus
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
#reader = CategorizedPlaintextCorpusReader('.', r'movie_.*\.txt', cat_pattern=r'movie_(\w+)\.txt')
#reader = CategorizedPlaintextCorpusReader('.', r'movie_.*\.txt', cat_map={'movie_pos.txt': ['pos'], 'movie_next.txt': ['neg']})
reader = CategorizedPlaintextCorpusReader('./nltk_data/custom_corpora/', r'content_.*\.txt',
                                          cat_map={'content_good.txt': ['good'], 'content_bad.txt': ['bad']})

reader.categories()                   # ['bad', 'good']
reader.fileids(categories=['good'])   # ['content_good.txt']
reader.fileids(categories=['bad'])    # ['content_bad.txt']

# Location 3442:
# extract features from the corpus
def bag_of_words(words):
    return dict([(word, True) for word in words])

def bag_of_words_not_in_set(words, badwords):
    return bag_of_words(set(words) - set(badwords))

from nltk.corpus import stopwords
def bag_of_non_stopwords(words, stopfile='english'):
    badwords = stopwords.words(stopfile)
    return bag_of_words_not_in_set(words, badwords)

from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
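# A quick sketch of applying the feature helpers above to the reader defined earlier;
# the labels follow the cat_map ('good'/'bad'):
good_feats = [(bag_of_non_stopwords(reader.words(fileids=[f])), 'good')
              for f in reader.fileids(categories=['good'])]
bad_feats = [(bag_of_non_stopwords(reader.words(fileids=[f])), 'bad')
             for f in reader.fileids(categories=['bad'])]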
]

for topic in topics:
    statuses = Cursor(api.search, q=f"{topic} -filter:retweets",
                      tweet_mode="extended").items(200)
    for status in statuses:
        if status.lang == "en":
            file = open(
                f"C:/Users/olgur/natural_language_toolkit_data/twitter_corpus/tweets_{topic}.txt",
                "a", encoding="utf-8")
            file.write(status.full_text)
            file.close()

reader = CategorizedPlaintextCorpusReader(
    "C:/Users/olgur/natural_language_toolkit_data/twitter_corpus",
    r'tweets_.*\.txt', cat_pattern=r'tweets_(\w+)\.txt')

# setting up stopwords
stopword_reader = PlaintextCorpusReader(
    "C:/Users/olgur/natural_language_toolkit_data/twitter_corpus/twitterstopwords/",
    r'.*\.txt', encoding='latin-1')

stop_words = set(['“', '”', '’', ",", "#", "—", "__", "_", "___"])
for file in stopword_reader.fileids():
    stops = stopword_reader.raw(file).replace("\n", ",").split(",")
    for word in stops:
        stop_words.add(word)

# text wrangling functions:
#!/usr/bin/env python
# coding: utf-8
import nltk
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

corpus_root = '/Users/athessen/nltk_data/corpora/eco'
reader = CategorizedPlaintextCorpusReader(corpus_root, r'lion|shark\d*\.txt', cat_file='cats.txt')
print reader.fileids()
print reader.categories()

"""
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = all_words.keys()[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features
"""
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = API(auth, wait_on_rate_limit=True)  # setting a limit to avoid upsetting Twitter

'''accounts = [("NASA", 11348282), ("BarackObama", 813286)]

for account in accounts:
    statuses = Cursor(api.user_timeline, user_id=account[1], include_rts=False,
                      exclude_replies=True, count=10000, tweet_mode="extended").items()
    for status in statuses:
        if status.lang == "en":
            file = open(f"C:/Users/olgur/nltk_data/twitter_corpus/tweets_{account[0]}.txt",
                        "a", encoding="utf-8")
            file.write(status.full_text.replace("\n", " ") + "\n")
            file.close()'''

reader = CategorizedPlaintextCorpusReader(
    "C:/Users/olgur/nltk_data/twitter_corpus",
    r'tweets_.*\.txt', cat_pattern=r'tweets_(\w+)\.txt')

# setting up stopwords
stop_words = set([
    '“', '”', '’', ",", "#", "—", "__", "_", "___", ".", ":", '"', "?", "!",
    "-", ")", "(", "...", "$"
]).union(set(stopwords.words("english")))

def remove_links(text):
    http_regex = re.compile(r"(https|http)://.*")
    return http_regex.sub(r"", text)

def remove_users(text):
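    # remove_users is cut off in the excerpt; a minimal completion (an assumption),
    # mirroring how remove_links strips URLs:
    user_regex = re.compile(r"@\w+")
    return user_regex.sub(r"", text)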
import time
import nltk
import pickle
import re
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.corpus import stopwords

reader = CategorizedPlaintextCorpusReader('/media/storage/dpla-data/words/colls.oct/',
                                          r'.*\.txt', cat_pattern=r'(\w+)\.txt')

# Removing oversized collections (hathi, nypl) and chunking the rest out.
# The first batch represents what was completed on 4/10-4/11.
#colls = ["searches"]
colls = ["artstor","biodiv","rumsey","commonwealth","georgia","harvard",
         "ia","getty","kentucky","minnesota","missouri","mwdl","nara","nocar",
         "smiths","socar","texas","gpo","illinois","usc","virginia","nocoll",
         "hathi","nypl"]
#colls = ["ia","getty","kentucky","minnesota","missouri","mwdl"]
#colls = ["nara","nocar","smiths","socar","texas","gpo","illinois","usc","virginia","nocoll"]

#data = {}
stats = {}
common = {}
for coll in colls:
    print(reader.categories(coll + ".txt"))
    stats[coll] = {}
    # The words can't be pickled directly: reader.words() is a stream reader.
    # Tokenizing with a regex works, and also strips out punctuation.
    print("prep & pickle words")
    words = re.split(r'\W+', reader.raw(coll + '.txt'))
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
import nltk

d = nltk.data.find('corpora/cookbook')
reader = CategorizedPlaintextCorpusReader(d, r'movie_.*\.txt', cat_pattern=r'movie_(\w+)\.txt')
print(reader.categories())
print(reader.fileids(categories='neg'))
print(reader.fileids(categories='pos'))

# from nltk.corpus import brown
# print(brown.categories())
loc = '/Users/rmoura/nltk_data/corpora/rai/textoSimples/'
corpus1 = PlaintextCorpusReader(loc, '.*\.txt')
print(corpus1.fileids())
print(corpus1.sents())
print(corpus1.words())

# Tagged text corpus
from nltk.corpus.reader.tagged import TaggedCorpusReader
loc = '/Users/rmoura/nltk_data/corpora/rai/textoEtiquetas/'
corpus2 = TaggedCorpusReader(loc, '.*\.txt')
print(corpus2.fileids())
print(corpus2.words())
print("Palavras etiquetadas: ", corpus2.tagged_words())  # tagged words
print(corpus2.tagged_words('003.txt'))
print("Sentencas diretas:")                              # plain sentences
for s in corpus2.sents():
    print(' '.join(s))

# Categorized text corpus
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
loc = '/Users/rmoura/nltk_data/corpora/rai/textoCategorias/'
corpus3 = CategorizedPlaintextCorpusReader(loc, '.*\.txt', cat_file="categorias.txt")
print(corpus3.fileids())
print(corpus3.categories())
print(corpus3.words(categories='brasnam'))

# Stopword definition
stopwords = nltk.corpus.stopwords.words('portuguese')
fd = nltk.FreqDist(w.lower() for w in corpus3.words())
fd1 = nltk.FreqDist(w.lower() for w in corpus3.words()
                    if w.isalpha() and w not in stopwords)
#!pip install wordcloud

# In[2]:

from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.corpus import stopwords
stopwordlist = stopwords.words('german')
from wordcloud import WordCloud

rootDir = "../01access/GERMAN"
filepattern = r"(?!\.)[\w_]+(/RSS/FeedText/)[\w-]+/[\w-]+\.txt"
#filepattern = r"(?!\.)[\w_]+(/RSS/FullText/)[\w-]+/[\w-]+\.txt"
catpattern = r"([\w_]+)/.*"
rssreader = CategorizedPlaintextCorpusReader(rootDir, filepattern, cat_pattern=catpattern)

# In[3]:

singleDoc = rssreader.paras(categories="TECH")[0]
print("The first paragraph:\n", singleDoc)
print("Number of paragraphs in the corpus: ", len(rssreader.paras(categories="TECH")))

# In[4]:

techdocs = [[w.lower() for sent in singleDoc for w in sent
             if (len(w) > 1 and w.lower() not in stopwordlist)]
            for singleDoc in rssreader.paras(categories="TECH")]
print("Number of documents in category Tech: ", len(techdocs))
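# WordCloud is imported above but not used in the excerpt; a minimal sketch of generating a
# word cloud from the cleaned TECH documents (the matplotlib plotting step is an assumption):
import matplotlib.pyplot as plt

# Join the cleaned tokens back into one string and build the cloud from it.
tech_text = " ".join(w for doc in techdocs for w in doc)
cloud = WordCloud(background_color="white").generate(tech_text)
plt.imshow(cloud, interpolation="bilinear")
plt.axis("off")
plt.show()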
@author: jagpr
"""

import collections, itertools
import nltk.classify.util, nltk.metrics
from nltk.metrics import *
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

# Creating the corpus with CategorizedPlaintextCorpusReader
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
imdb_reviews = CategorizedPlaintextCorpusReader(
    'D://USF//Independent Research Project//Dataset//Movie Review Dataset Pos Neg//aclImdb//train//negpos',
    r'.*\.txt', cat_pattern=r'(\w+)/*')
len(imdb_reviews.fileids())

def evaluate_classifier(featx):
    negids = imdb_reviews.fileids('neg')
    posids = imdb_reviews.fileids('pos')
    negfeats = [(featx(imdb_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(featx(imdb_reviews.words(fileids=[f])), 'pos') for f in posids]
    negcutoff = len(negfeats) * 3 / 4
class PolarityDataReader(object):
    """
    PolarityDataReader: Reader for POS/NEG Categorized Sentiword data

    uses: nltk.corpus.reader.CategorizedPlaintextCorpusReader
    usage:
        dataReader = PolarityDataReader([rootLocation],[readerObject])
        dataReader.getDocuments()
        dataReader.setTerms([No:ofTerms])
        featuresets = dataReader.getTermDocMatrix()
    """

    def __init__(self, rootLocation=config.POLARITY_DATASET, reader=None):
        super(PolarityDataReader, self).__init__()
        if reader is None:
            self.reader = Reader(rootLocation, r'.*/.*', cat_pattern=r'(.*)/.*')
        else:
            self.reader = reader
        self.setStopWords()
        self.documents = None
        self.terms = None

    def getDocuments(self):
        if not self.documents:
            self.documents = [(list(self.reader.words(fileid)), category)
                              for category in self.reader.categories()
                              for fileid in self.reader.fileids(category)]
        return self.documents

    def setStopWords(self, fileLocation=config.STOP_WORDS_FILE):
        stopfile = open(fileLocation, 'r')
        self.stopwords = stopfile.read().split()

    def removeStopWords(self, wordList):
        """Remove common words which have no search value."""
        return [word for word in wordList if word not in self.stopwords]

    def setTerms(self, size=2000, featureSelection='PD', removeStopWords=True):
        if featureSelection == 'PD':
            self.__setTermsPD__(size)
            print "Feature Selection : PD : done"
        elif featureSelection == 'CHI_SQUARE':
            self.__setTermsCHISQUARE__(size)
            print "Feature Selection : CHI_SQUARE : done"
        elif featureSelection == 'SWNSS':
            self.__setTermsSWNSS__(size)
            print "Feature Selection : SWNSS : done"
        else:
            # fall back to the most frequent words
            all_words = [w.lower() for w in self.reader.words()]
            if removeStopWords:
                all_words = self.removeStopWords(all_words)
            all_words = FreqDist(w for w in all_words)
            self.terms = all_words.keys()[:size]
            print "Feature Selection : frequent Words : done"

    def documentFeatures(self, document, sentiwordnet=False):
        document_words = set(document)
        features = {}
        if sentiwordnet:
            pass  # TODO
        else:
            for word in self.terms:
                features[word] = (word in document_words)
        return features

    def getTermDocMatrix(self):
        return [(self.documentFeatures(document), category)
                for (document, category) in self.documents]

    def __setTermsPD__(self, size):
        """
        Proportional difference: score = |posDF - negDF| / (posDF + negDF)
        """
        posWord = {}
        negWord = {}
        for word in self.reader.words(categories=['pos']):
            inc(posWord, word.lower())
        for word in self.reader.words(categories=['neg']):
            inc(negWord, word.lower())
        wordScores = {}
        for word in self.reader.words():
            try:
                posScore = posWord[word]
            except KeyError, e:
                posScore = 0
            try:
                negScore = negWord[word]
            except KeyError, e:
                negScore = 0
            totalScore = posScore + negScore
            if totalScore <= 10:  # minimum total count
                wordScores[word] = 0.1
            else:
                wordScores[word] = abs(posScore - negScore) / totalScore
        # removeStopWords does not affect accuracy here
        termScore = sorted(wordScores.items(), key=lambda (w, s): s, reverse=True)[:size]
        self.terms = [w for (w, s) in termScore]
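# __setTermsPD__ relies on an inc helper that is not part of the excerpt; a minimal sketch,
# assuming it simply counts occurrences in a plain dict:
def inc(counter, key):
    # Hypothetical helper: increment a per-word counter stored in a dict.
    counter[key] = counter.get(key, 0) + 1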
# NLTK - train nb_classifier
import random
import nltk as nltk
#nltk.download()
from nltk.corpus import stopwords

import os, os.path
path = os.path.expanduser('~/nltk_data')
if not os.path.exists(path):
    os.mkdir(path)
os.path.exists(path)

import nltk.data
path in nltk.data.path

from nltk.corpus.reader import CategorizedPlaintextCorpusReader
reader = CategorizedPlaintextCorpusReader('.', r'.*_news_.*\.csv',
                                          cat_pattern=r'.*_news_(\w+)\.csv')
reader.categories()

def bag_of_words(words):
    return dict([(word, True) for word in words if word[0].isalpha()])

import collections

def bag_of_words_not_in_set(words, badwords):
    return bag_of_words(set(words) - set(badwords))

def bag_of_non_stopwords(words, stopfile='english'):
    badwords = stopwords.words(stopfile)
    return bag_of_words_not_in_set(words, badwords)

from nltk.metrics import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
import nltk as nltk
import os, os.path
path = os.path.expanduser('~/nltk_data')
if not os.path.exists(path):
    os.mkdir(path)
os.path.exists(path)

import nltk.data
path in nltk.data.path

from nltk.corpus.reader import CategorizedPlaintextCorpusReader
reader2 = CategorizedPlaintextCorpusReader('.', r'news_.*\.csv',
                                           cat_pattern=r'news_(\w+)\.csv')
reader2.categories()
reader2.fileids(categories=['UP'])

def bag_of_words(words):
    return dict([(word, True) for word in words])

import collections

# Build {label: [featdict, ...]} from the categorized corpus
def label_feats_from_corpus(corp, feature_detector=bag_of_words):
    label_feats = collections.defaultdict(list)
    for label in corp.categories():
        for fileid in corp.fileids(categories=[label]):
            feats = feature_detector(corp.words(fileids=[fileid]))
            label_feats[label].append(feats)
    return label_feats

# Split each label's feature list into train/test portions
def split_label_feats(lfeats, split=0.75):
    train_feats = []
    test_feats = []
    for label, feats in lfeats.iteritems():
        cutoff = int(len(feats) * split)
        train_feats.extend([(feat, label) for feat in feats[:cutoff]])
        test_feats.extend([(feat, label) for feat in feats[cutoff:]])