def main():
    articles = CategorizedPlaintextCorpusReader(corpusdir, '.*', cat_pattern=r'(.*)[/]')
    feats = {}
    trainfeats = []
    testfeats = []
    for cat in articles.categories():
        n_files = len(articles.fileids(cat))
        print "for category", cat, ":", n_files
        feats[cat] = [(word_feats(articles.words(fileids=[f])), cat)
                      for f in articles.fileids(cat)]
        cutoff = n_files - hold_back(n_files)
        trainfeats.append(feats[cat][:cutoff])
        testfeats.append(feats[cat][cutoff:])
    train = [item for sublist in trainfeats for item in sublist]
    test = [item for sublist in testfeats for item in sublist]
    print 'train on %d instances, test on %d instances' % (len(train), len(test))
    classifier = NaiveBayesClassifier.train(train)
    print 'accuracy:', nltk.classify.util.accuracy(classifier, test)
    # Note: with more than two categories, each output line still compares only two
    # labels (the most and least likely for that feature value).
    classifier.show_most_informative_features()
    # Load the pickled classifier later with:
    #   import pickle
    #   with open('../data/classifier.pickle', 'rb') as f:
    #       classifier = pickle.load(f)
    with open('../data/classifier.pickle', 'wb') as f:
        pickle.dump(classifier, f)
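
# The function above assumes a few names defined elsewhere: `corpusdir`, plus the
# helpers `word_feats` and `hold_back`. A minimal sketch of what those helpers might
# look like (names kept, bodies are assumptions for illustration only):
def word_feats(words):
    # Bag-of-words feature dict in the form NaiveBayesClassifier expects.
    return dict((word, True) for word in words)

def hold_back(n_files):
    # How many files per category to reserve for testing (here: one quarter).
    return n_files // 4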
    'test_7.txt': 'Market Opinion'
}

art_i = []
class_i = []

# Conversion of train data into a single input file
corpus_root = 'Train_set'
newcorpus = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt', cat_pattern=r'(\w+)/*')
myfile = open('Input_Article_Data.csv', 'wb')
wr = csv.writer(myfile, quoting=csv.QUOTE_ALL, lineterminator="\n")
for category in newcorpus.categories():
    for fileid in newcorpus.fileids(category):
        #print fileid, category
        data1 = newcorpus.raw(fileid).encode('utf-8').replace(",", " ")
        data_list = [data1, category]
        wr.writerow(data_list)
myfile.close()

# Reading of train data as lists
with open('Input_Article_Data.csv', 'r') as f:
    for line in f.readlines():
        l, name = line.strip().split(',')
        l = re.sub('[^A-Za-z0-9.]+', ' ', l).lower()
        # l = porter_stemmer.stem(l)  # reduces accuracy from 50% to 37%
        if name != "Category":
            art_i.append([l])
loc = '/Users/rmoura/nltk_data/corpora/rai/textoSimples/'
corpus1 = PlaintextCorpusReader(loc, r'.*\.txt')
print(corpus1.fileids())
print(corpus1.sents())
print(corpus1.words())

# Tagged text corpus
from nltk.corpus.reader.tagged import TaggedCorpusReader
loc = '/Users/rmoura/nltk_data/corpora/rai/textoEtiquetas/'
corpus2 = TaggedCorpusReader(loc, r'.*\.txt')
print(corpus2.fileids())
print(corpus2.words())
print("Tagged words: ", corpus2.tagged_words())
print(corpus2.tagged_words('003.txt'))
print("Plain sentences:")
for s in corpus2.sents():
    print(' '.join(s))

# Categorized text corpus
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
loc = '/Users/rmoura/nltk_data/corpora/rai/textoCategorias/'
corpus3 = CategorizedPlaintextCorpusReader(loc, r'.*\.txt', cat_file="categorias.txt")
print(corpus3.fileids())
print(corpus3.categories())
print(corpus3.words(categories='brasnam'))

# Stopword definition
stopwords = nltk.corpus.stopwords.words('portuguese')
fd = nltk.FreqDist(w.lower() for w in corpus3.words())
fd1 = nltk.FreqDist(w.lower() for w in corpus3.words()
                    if w.isalpha() and w not in stopwords)
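
# Quick inspection of the filtered frequency distribution (an assumed follow-up step,
# not part of the original script):
print(fd1.most_common(20))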
import nltk
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

d = nltk.data.find('corpora/cookbook')
reader = CategorizedPlaintextCorpusReader(d, r'movie_.*\.txt',
                                          cat_pattern=r'movie_(\w+)\.txt')
print(reader.categories())
print(reader.fileids(categories='neg'))
print(reader.fileids(categories='pos'))

# from nltk.corpus import brown
# print(brown.categories())
        tweet = remove_links(tweet)
        tweet = remove_users(tweet)
        tweet = remove_numbers(tweet)
        tweet = remove_hashtags(tweet)
        tweet = [term for term in tweet.split(" ") if term not in stop_words]
        if tweet:
            if counter < 5:
                prob_test_tweets[label].append((" ".join(tweet), original))
                counter += 1
            else:
                normalized_tweets.append((" ".join(tweet), label))
    # Barack Obama has more tweets, so cap both categories at the same size.
    return normalized_tweets[:1600]


tweets_with_labels = (normalize_tweets(reader.fileids(categories="BarackObama"), 0)
                      + normalize_tweets(reader.fileids(categories="NASA"), 1))
tweets = [tweet[0] for tweet in tweets_with_labels]
labels = [tweet[1] for tweet in tweets_with_labels]

tweets_train, tweets_test, labels_train, labels_test = train_test_split(
    tweets, labels, test_size=0.1, random_state=12)

vectorizer = TfidfVectorizer()
tweets_train = vectorizer.fit_transform(tweets_train)

nb = MultinomialNB()
nb.fit(tweets_train, labels_train)
tweets_test = vectorizer.transform(tweets_test)
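
# The fragment stops after vectorizing the test tweets; a likely next step (an
# assumption, not part of the original) is to score the classifier on the held-out set:
from sklearn.metrics import accuracy_score

predictions = nb.predict(tweets_test)
print("Accuracy:", accuracy_score(labels_test, predictions))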
    hashtag_free = remove_hashtags(number_free)
    twitter_words = [
        term.lower() for term in tweet_tokenizer.tokenize(hashtag_free)
        if term.lower() not in stop_words
    ]
    twitter_words_with_hashtags = [
        term.lower() for term in tweet_tokenizer.tokenize(number_free)
        if term.lower() not in stop_words
    ]
    return twitter_words, twitter_words_with_hashtags


corpus_tokens = []
for category in reader.categories():
    for file in reader.fileids(categories=category):
        without_hashtags, with_hashtags = tokenize_tweets(file)
        # c
        fdist_category = nltk.FreqDist(without_hashtags)
        print("Most common words in", category, ":", fdist_category.most_common(10))
        # d
        hashtags = [word for word in with_hashtags if word.startswith("#")]
        fdist_category_hashtag = nltk.FreqDist(hashtags)
        print("Most common hashtags in", category, ":", fdist_category_hashtag.most_common(10))
        corpus_tokens += without_hashtags
    if remove_stopwords:
        sw = set(nltk.corpus.stopwords.words("english"))
        words = [w for w in words if w not in sw]
    if stem:
        porter = nltk.PorterStemmer()
        words = [porter.stem(w) for w in words]
    return words


documents = [((fileid, category),
              preprocess(my_corpus.words(fileid),
                         to_lowercase=True,
                         remove_punctuation=True,
                         remove_digits=True,
                         remove_odd_chars=True,
                         remove_stopwords=True,
                         stem=False))
             for category in my_corpus.categories()
             for fileid in my_corpus.fileids(category)]


def dummy_fun(doc):
    return doc


bow_gen = sklearn.feature_extraction.text.CountVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
    ngram_range=(1, 2),
    min_df=150,   # changed from 100
    max_df=0.85)
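
# A possible next step (assumed, not shown above): fit the vectorizer on the already
# tokenized documents; this works because tokenizer and preprocessor are identity functions.
token_lists = [tokens for (_, tokens) in documents]
bow_matrix = bow_gen.fit_transform(token_lists)
print(bow_matrix.shape)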
    for f in os.listdir(path):
        # Keep only subdirectories, skipping .DS_Store
        if not os.path.isfile(os.path.join(path, f)):
            if not f == ".DS_Store":
                dirList.append(f)
    return dirList

###############################################
###############################################

#################
# TRAINING DATA #
#################

train_reader = CategorizedPlaintextCorpusReader('./training_data', r'.*\_.*\.txt',
                                                cat_pattern=r'.*\_(\w+)\.txt')
train_documents = [(list(train_reader.words(fileid)), category)
                   for category in train_reader.categories()
                   for fileid in train_reader.fileids(category)]
random.shuffle(train_documents)
#print train_documents

train_documents_clean = []
for words, cat in train_documents:
    newList = []
    for word in words:
        # Drop non-ASCII characters
        clean_word = word.encode('ascii', 'ignore').decode('ascii')
        newList.append(clean_word)
    train_documents_clean.append((newList, cat))
#print train_documents_clean
#     words = map(lambda word: word, word_tokenize(text))
#     words = [word for word in words
#              if word not in cachedStopWords]
#     tokens = list(map(lambda token: PorterStemmer().stem(token), words))
#     p = re.compile('[a-zA-Z]+')
#     filtered_tokens = list(filter(lambda token:
#                                   p.match(token) and len(token) >= min_length, tokens))
#
#     return filtered_tokens

# Preparing a tuple list of the corpus data based on the words
# in each corpus file and the corresponding category
data = [(list(tokenize(' '.join(reader.words(fileid)))), category)
        for category in reader.categories()
        for fileid in reader.fileids(category)]

# First preparing a training data set based on pre-identified features
featureListTrain = [
    ('Natural Language Processing', 'General'),
    ('Text Retrieval', 'General'),
    ('Text Access', 'General'),
    ('Information Retrieval', 'General'),
    ('NLP', 'General'),
    ('Content Analysis', 'General'),
    ('Vector', 'IR Models & Implementations'),
    ('Length', 'IR Models & Implementations'),
    ('Indexing', 'IR Models & Implementations'),
    ('Statistical', 'IR Models & Implementations'),
    ('Evaluation', 'IR Models- Evaluation,Ranking & Feedback'),
    ('Feedback', 'IR Models- Evaluation,Ranking & Feedback'),
    ('Ranking', 'IR Models- Evaluation,Ranking & Feedback'),
    ('Recommender', 'IR Models- Evaluation,Ranking & Feedback'),
def process_plaintext(dir_path):
    reader = CategorizedPlaintextCorpusReader(dir_path, r'.*\.txt',
                                              cat_pattern=r'.+_.+_(.*)\.txt')
    facilitator_files = reader.fileids(categories='facilitator')
    participant_files = reader.fileids(categories='participant')
    print facilitator_files, participant_files
    #print reader.categories()
    #print len(reader.words())
    #print len(reader.sents())

    fac_words = [word for word in reader.words(facilitator_files)]
    par_words = [word for word in reader.words(participant_files)]
    fac_words = edit_tokens(fac_words)
    par_words = edit_tokens(par_words)

    speakers = ([(word, 'facilitator') for word in reader.words(facilitator_files)] +
                [(word, 'participant') for word in reader.words(participant_files)])
    features = get_features(speakers)

    size = int(len(features) * 0.3)
    nb_train = features[size:]
    nb_test = features[:size]
    classifier = nltk.NaiveBayesClassifier.train(nb_train)
    print "Classifier labels:", classifier.labels()
    print classifier.show_most_informative_features()
    print "Classify test:", nltk.classify.accuracy(classifier, nb_test)
    #print classifier.classify(get_features(["Yolo", "bag", "sp"], False))

    #random.shuffle(speakers)
    three_quarters = int(len(speakers) * 0.75)
    train = speakers[:three_quarters]
    test = speakers[three_quarters:]
    est = lambda fdist, bins: nltk.probability.LaplaceProbDist(fdist)
    un_lm = NgramModel(1, train, estimator=est)
    bi_lm = NgramModel(2, train, estimator=est)
    tr_lm = NgramModel(3, train, estimator=est)
    qu_lm = NgramModel(4, train, estimator=est)
    pe_lm = NgramModel(5, train, estimator=est)
    print un_lm
    print bi_lm
    print tr_lm
    print qu_lm
    print pe_lm
    print "1 gram Perplexity:", un_lm.perplexity(test)
    print "2 gram Perplexity:", bi_lm.perplexity(test)
    print "3 gram Perplexity:", tr_lm.perplexity(test)
    print "4 gram Perplexity:", qu_lm.perplexity(test)
    print "5 gram Perplexity:", pe_lm.perplexity(test)
    print bi_lm.generate(10, ["uh", "sp"])

    fd_fac = nltk.FreqDist(fac_words)
    vocab_fac = fd_fac.keys()
    fd_par = nltk.FreqDist(par_words)
    vocab_par = fd_par.keys()
    print "Fac Vocab: ", len(vocab_fac)
    print "Fac Tokens: ", len(fac_words)
    print vocab_fac[:20]
    print "Par Vocab: ", len(vocab_par)
    print "Par Tokens: ", len(par_words)
    print vocab_par[:20]
    fd_par.plot(50)
if __name__ == '__main__':
    # Set up the path to the data
    data_folder_name = sys.argv[1]
    data_path = os.path.join(os.getcwd(), data_folder_name)

    # Make a reader object to read in the files
    article = CategorizedPlaintextCorpusReader(data_path, r'.*\.txt',
                                               cat_pattern=r'(\w+).*\.txt')

    # Make a list of all articles, labelled by the folder each file is in
    english_stopwords = set(stopwords.words('english'))
    all_articles = []
    for category in article.categories():
        for fileid in article.fileids(category):
            # Lowercase the words and remove stopwords
            process = [w.lower() for w in article.words(fileid)
                       if w.isalpha() and w.lower() not in english_stopwords]
            entry = [process, category]
            all_articles.append(entry)
    random.shuffle(all_articles)

    # Make bigrams for every article
    word_bigrams = [list(nltk.bigrams(all_articles[i][0])) for i in range(len(all_articles))]

    # Create a frequency distribution for all words and select the top 2000 as features
    all_words = nltk.FreqDist(article.words())
########## CATEGORIZED CORPUS READER ###############
from nltk.corpus import brown
print brown.categories()
print brown.tagged_sents(categories=['news'])

from nltk.corpus.reader import CategorizedPlaintextCorpusReader as CPCR
root = "C:\\Users\\Matrix\\AppData\\Roaming\\nltk_data\\corpora\\cookbook\\"

# Using cat_pattern
reader = CPCR(root, r'movie_.*\.txt', cat_pattern=r'movie_(\w+)\.txt')
print reader.categories()
print reader.fileids(categories=['neg'])
print reader.fileids(categories=['pos'])

# Using cat_map: a dictionary mapping a fileid to a list of category labels
reader = CPCR(root, r'movie_.*\.txt',
              cat_map={'movie_pos.txt': ['pos'], 'movie_neg.txt': ['neg']})
print reader.categories()

# Using cat_file: a file containing the fileid-to-category mapping, e.g. cats.txt
# (see the brown corpus folder for an example); a sketch follows below.
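
# A minimal sketch of the cat_file option mentioned above (the file name and its
# contents here are illustrative assumptions). Each line of cats.txt maps a fileid
# to one or more space-separated category labels, e.g.:
#
#   movie_pos.txt pos
#   movie_neg.txt neg
#
reader = CPCR(root, r'movie_.*\.txt', cat_file='cats.txt')
print reader.categories()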
# Using the WordNet lemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

all_words = nltk.FreqDist(word for word in reader.words())
# Keep the 100 most frequent words as features
top_words = [word for word, _ in all_words.most_common(100)]
print(top_words)

def word_feats(words):
    return {word: True for word in words if word in top_words}

#def word_feats(words):
#    return dict([(wordnet_lemmatizer.lemmatize(word), True) for word in words])

# Collect the files for each ThreatType.
IdentityThreat = reader.fileids('IdentityThreat')
InsiderThreat = reader.fileids('InsiderThreat')
Malware = reader.fileids('Malware')

# Identify the words in the datasets based on ThreatType.
IdentityThreatfeats = [(word_feats(reader.words(fileids=[f])), 'IdentityThreat')
                       for f in IdentityThreat]
InsiderThreatfeats = [(word_feats(reader.words(fileids=[f])), 'InsiderThreat')
                      for f in InsiderThreat]
Malwarefeats = [(word_feats(reader.words(fileids=[f])), 'Malware')
                for f in Malware]
#print(IdentityThreatfeats)

IDcutoff = len(IdentityThreatfeats)
ITcutoff = len(InsiderThreatfeats)
Malcutoff = len(Malwarefeats)
print(IDcutoff, ITcutoff, Malcutoff)
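
# A plausible continuation (an assumption, not the original code): hold out a quarter
# of each category for testing and train an NLTK Naive Bayes classifier.
train_feats = (IdentityThreatfeats[:IDcutoff * 3 // 4]
               + InsiderThreatfeats[:ITcutoff * 3 // 4]
               + Malwarefeats[:Malcutoff * 3 // 4])
test_feats = (IdentityThreatfeats[IDcutoff * 3 // 4:]
              + InsiderThreatfeats[ITcutoff * 3 // 4:]
              + Malwarefeats[Malcutoff * 3 // 4:])
classifier = nltk.NaiveBayesClassifier.train(train_feats)
print('accuracy:', nltk.classify.accuracy(classifier, test_feats))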
def fetch_news(dir):
    base = 'http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/{}/rss.xml'

    for category in ['world', 'technology']:
        rss = fp.parse(base.format(category))

        for i, entry in enumerate(rss.entries):
            fname = '{0}_bbc_{1}.txt'.format(i, category)
            fname = os.path.join(dir, fname)

            if not dl.conf.file_exists(fname):
                store_txt(entry.link, fname, entry.title)


if __name__ == "__main__":
    dir = os.path.join(dl.data.get_data_dir(), 'bbc_news_corpus')

    if not os.path.exists(dir):
        os.mkdir(dir)

    fetch_news(dir)
    reader = CategorizedPlaintextCorpusReader(dir, r'.*bbc.*\.txt',
                                              cat_pattern=r'.*bbc_(\w+)\.txt')
    printer = dl.log_api.Printer(nelems=3)
    printer.print('Categories', reader.categories())
    printer.print('World fileids', reader.fileids(categories=['world']))
    printer.print('Technology fileids', reader.fileids(categories=['technology']))
import nltk, random, string
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.corpus import stopwords

reader = CategorizedPlaintextCorpusReader('./', r'.*\.txt', cat_pattern=r'(\w+)/*')
print reader.categories()
print reader.fileids()

documents = [(list(reader.words(fileid)), category)
             for category in reader.categories()
             for fileid in reader.fileids(category)]
random.shuffle(documents)

# Remove stopwords & punctuation from the content
table = string.maketrans("", "")
stopwords = nltk.corpus.stopwords.words('english')
filtered_words = [w for w in reader.words() if w not in stopwords]
filtered_words_nopunc = [w for w in filtered_words if w not in string.punctuation]

all_words = nltk.FreqDist(w.lower() for w in filtered_words_nopunc)
print all_words
word_features = all_words.keys()[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
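    # Assumed completion (standard NLTK-book style); the original fragment is cut off here.
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features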
                                          cat_pattern=r'(\w+)/*')

from textblob.classifiers import NaiveBayesClassifier

random.seed(1)
train = [
    ('Identity', 'IdentityThreat'),
    ('identity', 'IdentityThreat'),
    ('identities', 'IdentityThreat'),
    ('identity loss', 'IdentityThreat'),
    ('insider', 'InsiderThreat'),
    ('Malware', 'Malware'),
]

# The categorized corpus reader collects the respective words for each ThreatType.
ThreatTypes = [(list(reader.words(fileid)), category)
               for category in reader.categories()
               for fileid in reader.fileids(category)]
random.shuffle(ThreatTypes)
print(reader.categories())
new_train = ThreatTypes
print(new_train)

# Naive Bayes classifiers assume that the value of a particular feature is independent
# of the value of any other feature, given the class variable.
cl = NaiveBayesClassifier(train)
# Update the classifier with training keywords from the categorized corpus.
cl.update(new_train)

inputpath = nltk.data.find('corpora/abc/threatdescp.txt')
f = open(inputpath, encoding='latin2')
outputpath = nltk.data.find('corpora/abc/ResultNB.txt')
ResultFile = open(outputpath, 'w', encoding='latin2')
for line in f:
    line = BeautifulSoup(line.strip()).text
# From location 1491 of the cookbook
# This is how we would load in the customized corpus
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

#reader = CategorizedPlaintextCorpusReader('.', r'movie_.*\.txt', cat_pattern=r'movie_(\w+)\.txt')
#reader = CategorizedPlaintextCorpusReader('.', r'movie_.*\.txt',
#                                          cat_map={'movie_pos.txt': ['pos'], 'movie_neg.txt': ['neg']})
reader = CategorizedPlaintextCorpusReader('./nltk_data/custom_corpora/', r'content_.*\.txt',
                                          cat_map={'content_good.txt': ['good'], 'content_bad.txt': ['bad']})
reader.categories()                    # ['bad', 'good']
reader.fileids(categories=['good'])    # ['content_good.txt']
reader.fileids(categories=['bad'])     # ['content_bad.txt']

# Location 3442: extract features from the corpus
def bag_of_words(words):
    return dict([(word, True) for word in words])

def bag_of_words_not_in_set(words, badwords):
    return bag_of_words(set(words) - set(badwords))

from nltk.corpus import stopwords

def bag_of_non_stopwords(words, stopfile='english'):
    badwords = stopwords.words(stopfile)
    return bag_of_words_not_in_set(words, badwords)

from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
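
# Example use of the feature helpers above on a single document from the reader
# (an illustrative assumption, not code from the original source):
sample_fileid = reader.fileids(categories=['good'])[0]
sample_feats = bag_of_non_stopwords(reader.words(fileids=[sample_fileid]))
print(list(sample_feats.items())[:10])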
        {
            "category": cat,
            "doc": doc,
            "tot_words": tot_words,
            "avg_char": avg_char,
            "sentences": sentences,
            "avg_word": avg_word,
            "most_common": common[0][0],
            "most_common_freq": common[0][1]
        }
    )


def clean_string(text):
    clean_text = text.lower()
    clean_text = re.sub('[^0-9a-zA-Z //]+', '', clean_text)
    return clean_text.strip()


# Create document properties
corpus = CategorizedPlaintextCorpusReader(
    'C:/Users/gavin_000/Python/texts',
    r'.*\.txt',
    cat_pattern=r'(\w+)/*'
)

stop = stopwords.words('english')
results = pd.DataFrame()
for category in corpus.categories():
    for document in corpus.fileids(category):
        doc_properties = create_document_properties(category, document)
        results = results.append(doc_properties, ignore_index=True)
print results
from nltk.stem import SnowballStemmer
from processor import Processor as Proc

data_folder = './data'
encoding = 'UTF8'
language = 'italian'

wordTok = RegexpTokenizer(r'(\w+|@\w+|<3|(\:\-?\))|(\:\-?\()|(\;\-?\))|((\:|(X|x))\-?(D|d)))')
sentTok = LineTokenizer()
reader = CategorizedPlaintextCorpusReader(data_folder, r'SENTIPOLC-.*\.txt',
                                          cat_pattern=r'SENTIPOLC-(\w+)\.txt',
                                          encoding=encoding,
                                          word_tokenizer=wordTok,
                                          sent_tokenizer=sentTok)

pos_tweets = reader.sents(reader.fileids('pos'))
neg_tweets = reader.sents(reader.fileids('neg'))

# Inspection
rndP = random.randrange(len(pos_tweets))
rndN = random.randrange(len(neg_tweets))
print 'Pos:\n', pos_tweets[rndP:rndP+3], '\nNeg:\n', neg_tweets[rndN:rndN+3], '\n'

# All lowercase
pos_tweets = Proc.lowerize(pos_tweets)
neg_tweets = Proc.lowerize(neg_tweets)

# Removing digits
pos_tweets = Proc.remove_digits(pos_tweets)
neg_tweets = Proc.remove_digits(neg_tweets)
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.tokenize.casual import TweetTokenizer
from normalization import normalizeTwitterWordsWithExtraFeatures, normalizeTwitterWordsWithNegationHandle
import pickle, nltk

tweetTokenizer = TweetTokenizer(reduce_len=True, preserve_case=True, strip_handles=False)
corpus = CategorizedPlaintextCorpusReader('corpus/2-step/polar', r'(\w+)-tweet[0-9]+\.txt',
                                          cat_pattern=r'(\w+)-tweet[0-9]+\.txt',
                                          word_tokenizer=tweetTokenizer)
normalizationFunction = normalizeTwitterWordsWithNegationHandle

wordsTaggedToCategory = []
i = 1
for category in corpus.categories():
    for fileid in corpus.fileids(category):
        words = corpus.words(fileids=[fileid])
        normalizedWords = normalizationFunction(words)
        extraNormalizedWords = normalizeTwitterWordsWithExtraFeatures(words)
        wordsTagged = nltk.pos_tag(normalizedWords)
        wordsTaggedToCategory += [(wordsTagged, category)]
        print(i)
        i += 1

with open("wordsTaggedToCategory-polar", 'wb') as fileout:
    pickle.dump(wordsTaggedToCategory, fileout)
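
# To reload the pickled tagged data later (assumed usage, mirroring the dump above):
with open("wordsTaggedToCategory-polar", 'rb') as filein:
    wordsTaggedToCategory = pickle.load(filein)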
class PolarityDataReader(object):
    """
    PolarityDataReader: Reader for POS/NEG Categorized Sentiword data

    uses: nltk.corpus.reader.CategorizedPlaintextCorpusReader

    usage:
        dataReader = PolarityDataReader([rootLocation], [readerObject])
        dataReader.getDocuments()
        dataReader.setTerms([No:ofTerms])
        featuresets = dataReader.getTermDocMatrix()
    """

    def __init__(self, rootLocation=config.POLARITY_DATASET, reader=None):
        super(PolarityDataReader, self).__init__()
        if reader is None:
            self.reader = Reader(rootLocation, r'.*/.*', cat_pattern=r'(.*)/.*')
        else:
            self.reader = reader
        self.setStopWords()
        self.documents = None
        self.terms = None

    def getDocuments(self):
        if not self.documents:
            self.documents = [(list(self.reader.words(fileid)), category)
                              for category in self.reader.categories()
                              for fileid in self.reader.fileids(category)]
        return self.documents

    def setStopWords(self, fileLocation=config.STOP_WORDS_FILE):
        stopfile = open(fileLocation, 'r')
        self.stopwords = stopfile.read().split()

    def removeStopWords(self, wordList):
        """ Remove common words which have no search value """
        return [word for word in wordList if word not in self.stopwords]

    def setTerms(self, size=2000, featureSelection='PD', removeStopWords=True):
        if featureSelection == 'PD':
            self.__setTermsPD__(size)
            print "Feature Selection : PD :done "
        elif featureSelection == 'CHI_SQUARE':
            self.__setTermsCHISQUARE__(size)
            print "Feature Selection : CHI_SQUARE :done "
        elif featureSelection == 'SWNSS':
            self.__setTermsSWNSS__(size)
            print "Feature Selection : SWNPD :done "
        else:
            # Getting the most frequent words
            all_words = [w.lower() for w in self.reader.words()]
            if removeStopWords:
                all_words = self.removeStopWords(all_words)
            all_words = FreqDist(w for w in all_words)
            self.terms = all_words.keys()[:size]
            print "Feature Selection: frequent Words :done "

    def documentFeatures(self, document, sentiwordnet=False):
        document_words = set(document)
        features = {}
        if sentiwordnet:
            pass  # TODO
        else:
            for word in self.terms:
                features[word] = (word in document_words)
        return features

    def getTermDocMatrix(self):
        return [(self.documentFeatures(document), category)
                for (document, category) in self.documents]

    def __setTermsPD__(self, size):
        """ score = |posDF - negDF| / (posDF + negDF) """
        posWord = {}
        negWord = {}
        for word in self.reader.words(categories=['pos']):
            inc(posWord, word.lower())
        for word in self.reader.words(categories=['neg']):
            inc(negWord, word.lower())
        wordScores = {}
        for word in self.reader.words():
            try:
                posScore = posWord[word]
            except KeyError:
                posScore = 0
            try:
                negScore = negWord[word]
            except KeyError:
                negScore = 0
            totalScore = posScore + negScore
            if totalScore <= 10:  # minimum total count
                wordScores[word] = 0.1
            else:
                wordScores[word] = abs(posScore - negScore) / float(totalScore)
        # removeStopWords does not affect accuracy
        termScore = sorted(wordScores.items(), key=lambda (w, s): s, reverse=True)[:size]
        self.terms = [w for (w, s) in termScore]
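
# The class above relies on a small `inc` helper that is not shown in this fragment;
# a minimal sketch of what it presumably does (an assumption, not the original code):
def inc(counter, key):
    # Increment a plain-dict word counter.
    counter[key] = counter.get(key, 0) + 1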
print(generate_model(cfd, word))

def generate_model(cfdist, word, num=15):
    for i in range(num):
        print(word, end=' ')
        word = cfdist[word].max()

# 1. Build a categorized text corpus
locPT = 'ch02/ES'
corpusPT = CategorizedPlaintextCorpusReader(locPT, r'.*\.txt', cat_file="cat.txt")
print(corpusPT.fileids())
print(corpusPT.categories())
print(corpusPT.words(categories='ciencia'))
#print(corpusPT.raw())

vocab = set(w.lower() for w in corpusPT.words())
print('Vocabulary size:', len(vocab))
corpusCom = corpusPT.raw()
corpusComList = corpusCom.split()
print('Total number of words:', len(corpusComList))

# 2. Compute simple statistical measures
'''
Measures: average word length, average sentence length, and the average number of
times each vocabulary item appears in the text (lexical diversity score)
'''
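
# A sketch of how those measures could be computed with the reader above
# (an assumed continuation, not the original code):
num_chars = len(corpusPT.raw())
num_words = len(corpusPT.words())
num_sents = len(corpusPT.sents())
print('Average word length:', num_chars / num_words)
print('Average sentence length:', num_words / num_sents)
print('Lexical diversity:', num_words / len(vocab))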
file = open('model_file.txt', 'w')
file.write("Feature words\tCategory\tProb(given_word|category)\n")
for word, cat in Feature_Set.keys():
    file.write(str(word))
    file.write("\t\t")
    file.write(str(cat))
    file.write("\t\t")
    file.write(str(Feature_Set[word, cat]))
    file.write("\n")
file.close()

Classification_Accuracy = 0
for file in Testing_Corpus.fileids():
    pos_prob = 1
    neg_prob = 1
    real_category = Testing_Corpus.categories([file])
    for word, cat in Feature_Set:
        if word in Testing_Corpus.words([file]):
            if cat == "pos":
                pos_prob = Feature_Set[word, cat] * float(pos_prob) * 10000
            else:
                neg_prob = Feature_Set[word, cat] * float(neg_prob) * 10000
    if float(pos_prob) >= float(neg_prob):
        derived_category = "['pos']"
    else:
        derived_category = "['neg']"
import nltk, random, string, os
from nltk.collocations import *
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.corpus import stopwords

bigram_measures = nltk.collocations.BigramAssocMeasures()
#print reader.categories()

for name in os.listdir("."):
    if os.path.isdir(name):
        reader = CategorizedPlaintextCorpusReader(name, r'.*\.txt', cat_pattern=r'(\w+)/*')
        # reader = CategorizedPlaintextCorpusReader(name, r'./raw_reviews/\.txt', cat_pattern=r'(\w+)/*')
        print reader.fileids()

        table = string.maketrans("", "")
        stopwords = nltk.corpus.stopwords.words('english')
        filtered_words = [w for w in reader.words() if w not in stopwords]
        filtered_words_nopunc = [w for w in filtered_words if w not in string.punctuation]
        #all_words = nltk.FreqDist(w.lower() for w in filtered_words_nopunc)

        finder = BigramCollocationFinder.from_words(filtered_words_nopunc)
        #scored = finder.score_ngrams(bigram_measures.raw_freq)
        #a = sorted(bigram for bigram, score in scored)
        finder.apply_freq_filter(3)
        a = finder.nbest(bigram_measures.pmi, 5)
        #b = finder.score_ngrams(bigram_measures.pmi)
        print a
data = get_data()
print(len(data))
evrth, maindict = tags_assignment(data)

# Save the new final dictionary as well as the mapping of categories to numbers
listingssss = json.dumps(evrth)
with open("FinalCleanJuly1.json", "w") as f:
    f.write(listingssss)
dictionaries = json.dumps(maindict)
with open("CorpusCatMapJuly1.json", "w") as f:
    f.write(dictionaries)

#### This is IMPORTANT - CHOOSE! (default is key2) ####
#### Choose the label you want to use for file naming. Two options:
### 1) key1 with format: docID + _(i), where i numbers the category, e.g. -doc-_cr14021.txt
### 2) key2 with format: country name + year + _(i), e.g. Albania2015_1.txt
### To change it, edit line 90 ("key2: taglist" to key1)
### and line 121 (filename=evrth[i]['key2'] to key1).
create_corpus(evrth)

#### Check that it works
reader = CategorizedPlaintextCorpusReader('corpusCategory/', r'\w+\d+_.*\.txt', cat_map=maindict)
print(reader.categories())                      # print all categories in a list
print(reader.fileids(categories=['Fiscal']))    # check docIDs in the Fiscal category

# Good reference: https://www.packtpub.com/books/content/python-text-processing-nltk-20-creating-custom-corpora
# They cover creating chunked corpora (by words, sentences, paragraphs, and even
# customized paragraphs), tagged corpora, etc.
import nltk.classify.util, nltk.metrics
from nltk.metrics import *
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

# Creating the corpus using CategorizedPlaintextCorpusReader
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
imdb_reviews = CategorizedPlaintextCorpusReader(
    'D://USF//Independent Research Project//Dataset//Movie Review Dataset Pos Neg//aclImdb//train//negpos',
    r'.*\.txt',
    cat_pattern=r'(\w+)/*')
len(imdb_reviews.fileids())

def evaluate_classifier(featx):
    negids = imdb_reviews.fileids('neg')
    posids = imdb_reviews.fileids('pos')

    negfeats = [(featx(imdb_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(featx(imdb_reviews.words(fileids=[f])), 'pos') for f in posids]

    negcutoff = len(negfeats) * 3 // 4   # integer cutoff so the slices work
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
        count_pos += 1
    elif row['rating_overall'] in ("1", "2", "3", "4", "5") and count_neg < 15000:
        output = open('airline_review\\neg\\' + filename + '.txt', 'w')
        output.write(row['reviewcontent'])
        output.close()
        count_neg += 1

os.chdir("E:/Documents/GSU/Python Development/Unstructured Data/Team Project/machine_learning_text_analysis")

reader = CategorizedPlaintextCorpusReader('./airline_review', r'.*\.txt',
                                          cat_pattern=r'(\w+)/*')  # file name format

# Positive review file ids
pos_ids = reader.fileids('pos')
# Negative review file ids
neg_ids = reader.fileids('neg')

'''Generating the word feature list'''
def word_feats(words):
    return dict([(word, True) for word in words])

'''Building positive and negative feature lists.
Each item is the positive/negative word features for a review file'''
pos_feat = [(word_feats(reader.words(fileids=f)), 'pos') for f in pos_ids]
neg_feat = [(word_feats(reader.words(fileids=f)), 'neg') for f in neg_ids]
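
# A likely next step (an assumption, not shown in the fragment): split the feature
# lists into train/test portions and train an NLTK Naive Bayes classifier on them.
import nltk

pos_cutoff = len(pos_feat) * 3 // 4
neg_cutoff = len(neg_feat) * 3 // 4
train_feats = pos_feat[:pos_cutoff] + neg_feat[:neg_cutoff]
test_feats = pos_feat[pos_cutoff:] + neg_feat[neg_cutoff:]

classifier = nltk.NaiveBayesClassifier.train(train_feats)
print(nltk.classify.accuracy(classifier, test_feats))
classifier.show_most_informative_features()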