def create_vocabularies():
    poem_corpus = CategorizedPlaintextCorpusReader('./data', 'poems.*', cat_file='cats.txt')
    # Build the stopword set once instead of on every loop iteration
    stop_words = set(stopwords.words('english'))
    for emotion in base_emotions:
        words = poem_corpus.words(categories=[emotion])
        words = [w.lower() for w in words if w.isalpha() and w not in stop_words]
        fdist = nltk.FreqDist(words)
        # most_common() replaces fdist.keys()[:200], which is not sliceable in Python 3
        vocabulary = [word for word, _ in fdist.most_common(200)]
        with open('./opinion-lexicon-English/%s-words.txt' % emotion, 'w') as vocab_file:
            vocab_file.write('\n'.join(vocabulary))
def __init__(self, dir, doc):
    self.doc = doc
    self.dir = dir
    self.eng_stopw = stopwords.words('english')
    text_corpus = CategorizedPlaintextCorpusReader(
        './%s/' % self.dir,
        r'.*\.csv',               # read only the files ending in .csv
        cat_pattern=r'(\w+)/*',   # take everything after the directory as the category
        encoding='latin-1'
    )
    self.text = nltk.Text(text_corpus.words(self.doc))
def __create_corpus(self, language, chars):
    """Create a categorized nltk corpus from data/*, where the subfolders
    are the different categories.

    :chars: list of chars to remove, in addition to stopwords, before the statistical analysis
    :language: the newspaper language as a string
    :returns: (nltk corpus, dict mapping each category to its normalized words)
    """
    # Create corpus from the data directory
    news_corpus = CategorizedPlaintextCorpusReader('data/', r'.*\.txt', cat_pattern=r'(\w+)/*')
    # Get all stopwords for the language, plus additional chars for removal
    g_stop = stopwords.words(language)
    g_stop.extend(chars)
    # Stemmer
    snow = nlp.stem.SnowballStemmer(language, ignore_stopwords=True)
    # Dict of all words per category
    total_words = {}
    for news in news_corpus.categories():
        # Get the words
        words = news_corpus.words(categories=news)
        # Lowercase and remove stopwords
        words = [w.lower() for w in words if w not in g_stop]
        # Stem all tokens
        words = [snow.stem(w) for w in words]
        total_words[news] = words
    return news_corpus, total_words
def init_documents(f_re, cat_re):
    logging.debug("Reading corpus")
    reports = CategorizedPlaintextCorpusReader(corpus_dir, f_re, cat_pattern=cat_re, encoding='utf8')
    logging.debug("Found {} fileids".format(len(reports.fileids())))
    logging.debug("Found categories: {}".format(reports.categories()))
    logging.debug("Building docs")
    documents = [(tokenize(reports.words(i)), reports.categories(i)[0])
                 for i in reports.fileids()]
    return documents
def nltk():
    #### FOR TRAINING DATA ####
    stop = stopwords.words('spanish')

    # Reads the training data.
    traindir = '/Users/ruben/Desktop/Formularios_clasificados/training'
    mr = CategorizedPlaintextCorpusReader(traindir, r'(?!\.).*\.txt',
                                          cat_pattern=r'(neg|pos)/.*', encoding='utf-8')

    # Converts training data into tuples of [(words, label), ...]
    documents = [([w for w in mr.words(i) if w.lower() not in stop and w not in string.punctuation],
                  i.split('/')[0]) for i in mr.fileids()]

    # Extract training features: the 100 most frequent words.
    word_features = FreqDist(chain(*[i for i, j in documents]))
    word_features = [w for w, _ in word_features.most_common(100)]  # .keys() is not sliceable in Python 3

    # Assuming that you're using the full data set, since your test set is different.
    train_set = [({i: (i in tokens) for i in word_features}, tag) for tokens, tag in documents]

    #### TRAINS THE TAGGER ####
    classifier = NaiveBayesClassifier.train(train_set)

    #### FOR TESTING DATA ####
    # Now do the same reading and processing for the testing data.
    testdir = '/Users/ruben/Desktop/Formularios_clasificados/testing'
    mr_test = CategorizedPlaintextCorpusReader(testdir, r'(?!\.).*\.txt',
                                               cat_pattern=r'(neg|pos)/.*', encoding='utf-8')

    # Converts testing data into tuples of [(words, label), ...]
    test_documents = [([w for w in mr_test.words(i) if w.lower() not in stop and w not in string.punctuation],
                       i.split('/')[0]) for i in mr_test.fileids()]

    # Reads test data into features:
    test_set = [({i: (i in tokens) for i in word_features}, tag) for tokens, tag in test_documents]

    #### Evaluate the classifier ####
    correct = 0
    wrong = 0
    for doc, gold_label in test_set:
        tagged_label = classifier.classify(doc)
        if tagged_label == gold_label:
            correct += 1
        else:
            wrong += 1
    # Parenthesize the denominator; the original divided before adding
    print(correct, wrong, float(correct) / (wrong + correct))
def construct_model(copusPath, modelPath):
    # (\w+)/.* captures the category directory; the original pattern r'*/.*' is not a valid regex
    mr = CategorizedPlaintextCorpusReader(copusPath, r'(?!\.).*\.txt',
                                          cat_pattern=r'(\w+)/.*', encoding='iso-8859-1')
    stop = stopwords.words('french')
    documents = [([w for w in mr.words(i) if w.lower() not in stop and w.lower() not in string.punctuation],
                  i.split('/')[0]) for i in mr.fileids()]
    word_features = FreqDist(chain(*[i for i, j in documents]))
    word_features = list(word_features.keys())
    # Train on the full document set (100%)
    numtrain = int(len(documents) * 100 / 100)
    train_set = [({i: (i in tokens) for i in word_features}, tag)
                 for tokens, tag in documents[:numtrain]]
    classifier = nbc.train(train_set)

    mrtest = CategorizedPlaintextCorpusReader(os.path.abspath("corpus_test"), r'(?!\.).*\.txt',
                                              cat_pattern=r'(\w+)/.*', encoding='iso-8859-1')
    documentsTest = [([w for w in mrtest.words(i) if w.lower() not in stop and w.lower() not in string.punctuation],
                      i.split('/')[0]) for i in mrtest.fileids()]
    word_features_test = FreqDist(chain(*[i for i, j in documentsTest]))
    word_features_test = list(word_features_test.keys())
    numtrain_test = int(len(documentsTest) * 100 / 100)
    test_set = [({i: (i in tokens) for i in word_features_test}, tag)
                for tokens, tag in documentsTest[:numtrain_test]]
    save_classifier(classifier, modelPath)
def display_features(num_features=1000, show_features=200,
                     filepath='classifiers/nltk_nb.pkl', verbose=True):
    '''Displays informative features from NHLCorpus'''
    stop_words = set(stopwords.words('english'))
    nhl = CategorizedPlaintextCorpusReader(root='data/NHLcorpus/', fileids=r'.*\.txt',
                                           cat_pattern=r'(\w+)/*')
    documents = []
    for category in nhl.categories():
        for fileid in nhl.fileids(category):
            documents.append(([re.sub(r'\W+', '', w.lower())
                               for w in nhl.words(fileid)
                               if w.lower() not in stop_words], category))
    all_words = nltk.FreqDist(re.sub(r'\W+', '', w.lower())
                              for w in nhl.words() if w.lower() not in stop_words)
    word_features = [w[0] for w in all_words.most_common(num_features)]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains({})'.format(word)] = word in document_words
        return features

    featuresets = [(document_features(d), c) for (d, c) in documents]
    nb_clf = nltk.NaiveBayesClassifier.train(featuresets)
    if verbose:
        nb_clf.show_most_informative_features(show_features)
        print('Accuracy on training data: {}'.format(
            nltk.classify.accuracy(nb_clf, featuresets)))
    with open(filepath, 'wb') as save_classifier:
        pickle.dump(nb_clf, save_classifier)
print(positive_greg)
print(negative_greg)

positive_consolidated_list = list(pos_list) + positive_greg
negative_consolidated_list = list(neg_list) + negative_greg
print(positive_consolidated_list)
print(negative_consolidated_list)

init_notebook_mode(connected=True)
cf.set_config_file(offline=True, world_readable=True, theme='ggplot')

#%%
corpus_root = "/Users/LENOVO USER/Desktop/FedTranscript1"
data_m = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt',
                                          cat_pattern=r'(\w+)/*', encoding='latin1')
data_fileids = data_m.fileids()

#%%
def corpus_Stats(crp):
    print('Total number of files: ' + str(len(crp.fileids())))
    print('Number of paragraphs: ' + str(len(crp.paras())))
    print('Number of sentences: ' + str(len(crp.sents())))
    print('Number of words: ' + str(len(crp.words())))

#corpus_Stats(data_m)
#print('\n' + 'First file: ' + data_fileids[0])
#print('Last file: ' + data_fileids[-1])
'''
import string
from itertools import chain
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.corpus import CategorizedPlaintextCorpusReader
import nltk

# working dir: UN/
mydir = 'corpus/unscrs_renamed_categorized'
mr = CategorizedPlaintextCorpusReader(
    mydir,
    r'(?!\.).*\.txt',
    cat_pattern=r'(intervention|soft_action)/.*',
    encoding='utf-8')
stop = stopwords.words('english')
documents = [([w for w in mr.words(i)
               if w.lower() not in stop and w.lower() not in string.punctuation],
              i.split('/')[0]) for i in mr.fileids()]
word_features = FreqDist(chain(*[i for i, j in documents]))
word_features = [w for w, _ in word_features.most_common(100)]  # .keys() is not sliceable in Python 3
numtrain = int(len(documents) * 90 / 100)
train_set = [({i: (i in tokens) for i in word_features}, tag)
             for tokens, tag in documents[:numtrain]]
import string
from itertools import chain
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.corpus import CategorizedPlaintextCorpusReader
import nltk

mydir = 'Documents/Plab/Project4/subset/test/neg'
mr = CategorizedPlaintextCorpusReader(mydir, r'(?!\.).*\.txt',
                                      cat_pattern=r'(neg|pos)/.*', encoding='ascii')
stop = stopwords.words('english')
documents = [([w for w in mr.words(i)
               if w.lower() not in stop and w.lower() not in string.punctuation],
              i.split('/')[0]) for i in mr.fileids()]
word_features = FreqDist(chain(*[i for i, j in documents]))
word_features = [w for w, _ in word_features.most_common(100)]  # .keys() is not sliceable in Python 3
numtrain = int(len(documents) * 90 / 100)
train_set = [({i: (i in tokens) for i in word_features}, tag)
             for tokens, tag in documents[:numtrain]]
test_set = [({i: (i in tokens) for i in word_features}, tag)
            for tokens, tag in documents[numtrain:]]
classifier = nbc.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(5)
import nltk
from nltk.corpus import CategorizedPlaintextCorpusReader
from nltk.corpus import brown

# Open the documents under the given path.
# Arguments:
# 1. Absolute path to the documents
# 2. Type/extension of the documents (*.txt)
# 3. Pattern for the folders that form the categories
# All arguments are regular expressions.
leitor = CategorizedPlaintextCorpusReader(
    '../Dados/mix20_rand700_tokens_cleaned/tokens/',
    r'.*\.txt',
    cat_pattern=r'(\w+)/*')

# Check what was loaded
print(leitor.categories())
print(leitor.fileids())

# Split the corpus by category
posFiles = leitor.fileids(categories='pos')
negFiles = leitor.fileids(categories='neg')
print('pos files:', posFiles)
print('neg files:', negFiles)

# Load the first files of each category
arqP = posFiles[0]
arqN = negFiles[1]
print("ArqP: ", arqP)
class CorpusUtil(object):
    """TODO: document this class."""

    def __init__(self, raiz_corpus):
        """Creates a 'CategorizedPlaintextCorpusReader' from the corpus root
        directory, where the documents are laid out in subdirectories named
        after their categories, whatever those may be --> raiz_corpus/{pos,neg,neu,...}.
        """
        # Encoding is handled via the reader's encoding argument; no
        # sys.setdefaultencoding hack is needed.
        self._raiz_corpus = raiz_corpus
        self._corpus = CategorizedPlaintextCorpusReader(raiz_corpus,
                                                        r'.+\.txt',
                                                        cat_pattern=r'(\w+)/*',
                                                        encoding='utf-8')
        self._documentos = None
        self._palavras_frequentes = None
        self._todas_palavras = None
        self._featuresets = None
        self._train_set = None
        self._test_set = None

    def get_documentos(self):
        """Builds a list of documents labeled with the appropriate categories.
        Each document is represented by a tuple of the form:
            (document_content, category)
        Returns that list with every document in the corpus.
        """
        print("-- Retrieving documents from the corpus.")
        if self._documentos is None:
            self._documentos = [Documento(" ".join(self._corpus.words(fileid)), categoria, self, fileid)
                                for categoria in self._corpus.categories()
                                for fileid in self._corpus.fileids(categoria)]
            # Shuffle the documents
            for i in range(0, 10):
                shuffle(self._documentos)
        return self._documentos

    def get_palavras_frequentes(self):
        """Returns the 2000 most frequent words in the corpus."""
        if self._palavras_frequentes is None:
            print("-- Finding the most frequent words in the corpus.")
            todas_palavras = [word.lower() for word in self._corpus.words()]
            freq_dist_palavras = FreqDist(todas_palavras)
            frequencia_palavras = freq_dist_palavras.most_common(2000)  # 2000 most frequent words
            self._palavras_frequentes = [palavra for palavra, frequencia in frequencia_palavras]
        return self._palavras_frequentes

    def get_todas_palavras(self):
        if self._todas_palavras is None:
            print("-- Retrieving every word in the corpus.")
            self._todas_palavras = set(word.lower() for word in self._corpus.words())
        return self._todas_palavras

    def get_featuresets(self):
        """Builds the featuresets, each with the structure:
            (document_features, category)
        Returns a list of featuresets.
        """
        if self._featuresets is None:
            if self._documentos is None:
                self.get_documentos()
            print("-- Retrieving featuresets.")
            self._featuresets = apply_features(Documento.get_features, self._documentos)
        return self._featuresets

    def get_train_set(self):
        """Returns the training set."""
        if self._featuresets is None:
            self.get_featuresets()
        print("-- Retrieving train_set.")
        # apply_features avoids holding every document's features in RAM at once.
        # self._train_set = apply_features(Documento.get_features, self._documentos[100:])
        self._train_set = apply_features(Documento.get_features, self._documentos)
        return self._train_set

    def get_test_set(self):
        if self._featuresets is None:
            self.get_featuresets()
        print("-- Retrieving test_set.")
        # self._test_set = apply_features(Documento.get_features, self._documentos[:100])
        return self._test_set

    def gravar_palavras_frequentes(self):
        diretorio_destino = "/home/lucas/Documents/mineracao_opiniao/palavras_frequentes_corpus"
        molde_nome_arquivo = "palavras_frequentes_%s.pickle"
        tempo_agora = str(datetime.now())
        # Replace ':' and whitespace with '.'
        tempo_agora = re.sub(r':|\s', '.', tempo_agora)
        nome_arquivo = molde_nome_arquivo % tempo_agora
        if self._palavras_frequentes is None:
            self.get_palavras_frequentes()
        with open(diretorio_destino + "/" + nome_arquivo, 'wb') as f:
            pickle.dump(self._palavras_frequentes, f)
        return True

    @staticmethod
    def abrir_arquivo_palavras_frequentes(arquivo_path):
        with open(arquivo_path, 'rb') as f:
            palavras_frequentes = pickle.load(f)
        return palavras_frequentes
negative_words = [word.strip() for word in neg_file.readlines()
                  if not word.startswith(';')]
pos_file.close()
neg_file.close()

# Words for all emotions
lexicon = {}
for emotion in base_emotions:
    f = open('./opinion-lexicon-English/%s-words.txt' % emotion, 'r')  # 'rU' mode is deprecated
    words = [word.strip() for word in f.readlines()]
    lexicon[emotion] = words
    f.close()

# Make a classifier based on the feature sets of the poems
poem_corpus = CategorizedPlaintextCorpusReader('./data', 'poems.*', cat_file='cats.txt')
poem_set = [(fileid, category) for fileid in poem_corpus.fileids()
            for category in poem_corpus.categories(fileid)]
random.shuffle(poem_set)
feature_set = [(poem_features(poem_corpus.words(fileids=[fileid])), category)
               for (fileid, category) in poem_set]
train_set, test_set = feature_set[2000:], feature_set[:2000]

# Initialize the classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

# For improving the algorithm
classifier.show_most_informative_features(20)
This code uses the meeting records (inputs) corpus.
'''
import string
from itertools import chain
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.corpus import CategorizedPlaintextCorpusReader
import nltk

# working dir: UN/
mydir = 'corpus/meeting_records_final_categorized'
mr = CategorizedPlaintextCorpusReader(mydir, r'(?!\.).*\.txt',
                                      cat_pattern=r'(intervention|soft_action)/.*',
                                      encoding='utf-8')
stop = stopwords.words('english')
documents = [([w for w in mr.words(i)
               if w.lower() not in stop and w.lower() not in string.punctuation],
              i.split('/')[0]) for i in mr.fileids()]
word_features = FreqDist(chain(*[i for i, j in documents]))
word_features = [w for w, _ in word_features.most_common(100)]  # .keys() is not sliceable in Python 3
numtrain = int(len(documents) * 90 / 100)
train_set = [({i: (i in tokens) for i in word_features}, tag)
             for tokens, tag in documents[:numtrain]]
test_set = [({i: (i in tokens) for i in word_features}, tag)
            for tokens, tag in documents[numtrain:]]
classifier = nbc.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))  # .87 - ?!?!?!
classifier.show_most_informative_features(20)  # for the 100 most frequent words
def loadCorpus(category=None):
    corpus_root = "../corpus/lyric_corpus/files/"
    cat_root = "../categories/"
    if not os.name == 'posix':
        corpus_root = "..\\corpus\\lyric_corpus\\files\\"
    # load the corpus
    corpus = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt',
                                              cat_file=cat_root + 'cat.txt', cat_delimiter='+')
    # access the corpus
    raw = corpus.raw()
    words = corpus.words()
    if category is None:
        sents = corpus.sents()
    else:
        sents = corpus.sents(categories=category)
    # shuffle the sentences, then split 80/20 into train and test
    shuffledSents = shuffleSent(sents)
    numberSents = len(shuffledSents)
    trainSize = math.floor(numberSents * 0.8)
    testSize = numberSents - trainSize
    trainCorpus = []
    testCorpus = []
    wholeCorpus = []
    testSents = []
    for i in range(numberSents):
        if i < trainSize:
            for word in shuffledSents[i]:
                trainCorpus.append(word)
                wholeCorpus.append(word)
        else:
            testSents.append(shuffledSents[i])
            for word in shuffledSents[i]:
                testCorpus.append(word)
                wholeCorpus.append(word)
    return wholeCorpus, trainCorpus, testCorpus, testSents
    os.chdir(directory)
    with open(fname, 'w') as f:
        f.write(text)

doc_start = {}
doc_start[0] = "Staff Review of the Economic Situation"
doc_start[1] = re.compile('The information (reviewed|received|provided)')
doc_start[2] = "The Committee then turned to a discussion of the economic outlook"
doc_start[3] = re.compile('The information (reviewed|received|provided)')

doc_end = {}
doc_end[0] = re.compile('(At the conclusion of) (this|the) (discussion|meeting)')
doc_end[1] = re.compile('(?i)The Committee voted to authorize')
doc_end[2] = re.compile('(?i)The vote encompassed approval of')

if __name__ == '__main__':
    corpus_root = '/Users/aaroncgw/Google Drive/fednlp/data/minutes/'
    data_m = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt', cat_pattern=r'(\w+)/*')
    data_fileids = data_m.fileids()
    for f in data_fileids:
        year, fname = f.split('/')
        cropped_text = crop_text(data_m.raw(f), doc_start, doc_end)
        saveFile(fname, year, cropped_text)
def create_corpus():
    poem_corpus = CategorizedPlaintextCorpusReader('../poems/', 'poems_.*', cat_file='cats.txt')
    return poem_corpus  # return the reader so callers can use it
import time
start = time.time()

import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import CategorizedPlaintextCorpusReader
from sklearn.cluster import KMeans
import numpy as np
import copy
import math
import re

corpus_root = 'C:\\MyData\\PythonPractice\\IMDB\\test'  # path of the IMDB test data
reader = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt', cat_pattern=r'(\w+)/*')
r_neg = reader.fileids(categories=['neg'])
r_pos = reader.fileids(categories=['pos'])
global_shortlisted = []
TEST_GS_POS = []

for i in range(0, 12500):
    doc = reader.raw(r_pos[i:i + 1])  # doc contains the movie review
    sentences = nltk.sent_tokenize(doc)
    senlen = len(sentences)

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
import nltk
from nltk.corpus import CategorizedPlaintextCorpusReader

reader = CategorizedPlaintextCorpusReader(
    r'/home/smadyastha/Projects/PythonCheck/Dataset/Reviews/tokens',
    r'.*\.txt', cat_pattern=r'(\w+)/*')
posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')

from random import randint
fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]  # index into negFiles, not posFiles
print(fileP)
print(fileN)

for w in reader.words(fileP):
    print(w + ' ', end='')  # print words separated by spaces, one sentence per line
    if w == '.':  # 'is' compares identity, not equality
        print()

# /home/smadyastha/Projects/PythonCheck/Dataset/Reviews
from nltk.corpus import CategorizedPlaintextCorpusReader
from nltk import bigrams
from nltk import trigrams
from nltk.collocations import *
import nltk

corpus_root = "../corpus/lyric_corpus/files/"
cat_root = "../categories/"
# Hacky way to specify the path for cat.txt. A better way would be to rewrite the regex r'.*\.txt'...
corpus = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt',
                                          cat_file=cat_root + 'cat.txt', cat_delimiter='+')

# word lists
word_list_pop = list(corpus.words(categories="POP"))
word_list_rock = list(corpus.words(categories="ROCK"))

# bigram lists
bigram_list_pop = list(bigrams(word_list_pop))
bigram_list_rock = list(bigrams(word_list_rock))

# trigram lists
trigram_list_pop = list(trigrams(word_list_pop))
trigram_list_rock = list(trigrams(word_list_rock))

# measures
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

# finders; the default window size is 2
bi_finder_pop = BigramCollocationFinder.from_words(word_list_pop)
bi_finder_rock = BigramCollocationFinder.from_words(word_list_rock)
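# The finders above are created but never scored in this snippet. A typical next
# step (a sketch, not part of the original) ranks collocations by PMI:
print(bi_finder_pop.nbest(bigram_measures.pmi, 10))   # top 10 POP bigram collocations
print(bi_finder_rock.nbest(bigram_measures.pmi, 10))  # top 10 ROCK bigram collocations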
        v = c.classify(features)
        votes.append(v)
    choice_votes = votes.count(mode(votes))
    conf = choice_votes / len(votes)
    return conf

# Provide the path to the custom corpus
mydir = '/Users/vasilis/Desktop/Lennon/lyrics_custom_corpus'

# Read data from our custom corpus
mr = CategorizedPlaintextCorpusReader(mydir, r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*')

# Clean the lyrics of English stop words.
stop = stopwords.words('english')
documents = [(list(mr.words(fileid)), category)
             for category in mr.categories()
             for fileid in mr.fileids(category)]

classifiers_dir = '/Users/vasilis/vxm773/Lennon/pickled_classifiers'
if os.path.exists(classifiers_dir):
    shutil.rmtree(classifiers_dir)
os.makedirs(classifiers_dir)
from nltk.corpus import CategorizedPlaintextCorpusReader

reader = CategorizedPlaintextCorpusReader(
    r'D:\LEARNING\MISC\DataSet\movieCorpus\review_polarity\txt_sentoken',
    r'.*\.txt', cat_pattern=r'(\w+)/*')
print(reader.categories())
print(reader.fileids())

posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')

from random import randint
fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]  # index into negFiles, not posFiles
print(fileP)
print(fileN)

for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':  # 'is' compares identity, not equality
        print()

for w in reader.words(fileN):
    print(w + ' ', end='')
    if w == '.':
        print()
def transform(corpus: CategorizedPlaintextCorpusReader, target_root_dir):
    if not os.path.exists(target_root_dir):
        os.makedirs(target_root_dir)
    # Use a context manager so the file handle is not leaked
    with open(os.path.join(target_root_dir, "meta.info"), 'w') as meta:
        meta.write("tagged\nmarks.txt")
    for fileid in corpus.fileids():
        yield process(corpus, target_root_dir, fileid)
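# transform() is a generator, so nothing is written until it is consumed.
# A hypothetical driver (process() and the corpus path are assumptions, not shown above):
#
#   corpus = CategorizedPlaintextCorpusReader('src_corpus', r'.*\.txt', cat_pattern=r'(\w+)/.*')
#   for result in transform(corpus, 'target_corpus'):
#       print(result)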
from nltk.corpus import CategorizedPlaintextCorpusReader
import ProcessText

d1 = "judge people by what they say"
d1_processed = ProcessText.ProcessText.process(d1)
documents = [d1]

# Read documents
reader = CategorizedPlaintextCorpusReader(
    r'\Users\JoeDi\Desktop\MSC\Idioms Corpera',
    r'.*\.txt', cat_pattern=r'(\w+)/*')
for w in reader.fileids():
    wd = reader.raw(w)
    documents.append(w + " " + wd)

print("Documents in the collection are: ")
print(documents)
print("\n")

from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF matrix for each description
tfidf = TfidfVectorizer().fit_transform(documents)
print("Tf-idf weightings are: ")
print(tfidf)
print("\n")
def read_corpus(root_dir):
    return CategorizedPlaintextCorpusReader(root_dir, FILE_PATTERN, cat_pattern=CAT_PATTERN)
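# A minimal sketch of how read_corpus might be wired up; FILE_PATTERN and
# CAT_PATTERN are not shown above, so these values are assumptions:
FILE_PATTERN = r'(?!\.).*\.txt'  # every .txt file that is not hidden
CAT_PATTERN = r'(\w+)/.*'        # the first path component is the category

corpus = read_corpus('path/to/corpus')
print(corpus.categories())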
sjar = '/Users/nischikata/PycharmProjects/JabRef-2.11.1.jar'

from nltk.corpus import stopwords
from nltk.corpus import CategorizedPlaintextCorpusReader
from nltk import word_tokenize
from nltk import TreebankWordTokenizer
import nltk.data

# PLAINTEXT CORPUS READER
# http://www.nltk.org/_modules/nltk/corpus/reader/plaintext.html#CategorizedPlaintextCorpusReader
# Important: the TreebankWordTokenizer separates words like "don't" into "do" and "n't",
# so the main verb is correctly identified. For Naive Bayes it may be better to use
# WordPunctTokenizer - it is the default, so just omit the word_tokenizer param.
corpus = CategorizedPlaintextCorpusReader(
    '.',
    r'(?!\.).*\.txt',
    word_tokenizer=TreebankWordTokenizer(),
    cat_pattern=r'(aggressive|not_aggressive)/.*',
    encoding='utf8')

# Getting raw sentences from a raw comment, see: http://stackoverflow.com/a/4576110/4866678
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# returns a list of raw sentences
def get_raw_sentences(fileid):  # works
    data = corpus.raw(fileid)
    return tokenizer.tokenize(data)

def get_raw_paragraph( fileid
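# A quick check of the tokenizer difference noted above (a sketch, not part of
# the original script): TreebankWordTokenizer splits contractions into a stem
# and "n't", while WordPunctTokenizer splits on punctuation runs.
from nltk.tokenize import TreebankWordTokenizer, WordPunctTokenizer

sample = "don't stop"
print(TreebankWordTokenizer().tokenize(sample))  # ['do', "n't", 'stop']
print(WordPunctTokenizer().tokenize(sample))     # ['don', "'", 't', 'stop']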
from nltk.corpus import CategorizedPlaintextCorpusReader

reader = CategorizedPlaintextCorpusReader(
    r'/Users/dechamoungsri/NLP_Learning/NLP_tutotial/mix20_rand700_tokens_cleaned/tokens/',
    r'.*\.txt', cat_pattern=r'(\w+)/*')
print(reader.categories())
print(reader.fileids())

posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')

from random import randint
fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]  # index into negFiles, not posFiles
print(fileP)
print(fileN)

for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':  # 'is' compares identity, not equality
        print()

for w in reader.words(fileN):
    print(w + ' ', end='')
    if w == '.':
        print()
def classify_emails():
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    mydir = '/home/ubuntu/nltk_data/corpora/gmail'
    all_words = []
    filtered_words = []
    removedPuncuations_words = []
    lematized_words = []
    test_filter = []
    mr = CategorizedPlaintextCorpusReader(mydir, r'(?!\.).*\.txt',
                                          cat_pattern=r'(hotel|flight|other)/.*',
                                          encoding='latin-1')
    stop = stopwords.words('english')
    documents = [([w for w in mr.words(i)
                   if w.lower() not in stop and w.lower() not in string.punctuation],
                  i.split('/')[0]) for i in mr.fileids()]
    word_features = FreqDist(chain(*[i for i, j in documents]))
    word_features = [w for w, _ in word_features.most_common(100)]  # .keys() is not sliceable in Python 3

    def word_feats(document):
        words = set(document)
        features = {}
        for w in word_features:
            features[w] = (w in words)
        return dict(features)

    negids = mr.fileids('hotel')
    posids = mr.fileids('flight')
    neutralids = mr.fileids('other')
    negfeats = [(word_feats(mr.words(fileids=[f])), 'hotel') for f in negids]
    posfeats = [(word_feats(mr.words(fileids=[f])), 'flight') for f in posids]
    neutralfeats = [(word_feats(mr.words(fileids=[f])), 'other') for f in neutralids]
    # Use integer division; 3/4 is a float in Python 3 and cannot slice a list
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
    neutralcutoff = len(neutralfeats) * 3 // 4
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff] + neutralfeats[:neutralcutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:] + neutralfeats[neutralcutoff:]
    classifier = nltk.NaiveBayesClassifier.train(trainfeats)
    print("Classifier accuracy percent:", nltk.classify.accuracy(classifier, testfeats) * 100)
    print('accuracy:', nltk.classify.util.accuracy(classifier, testfeats) * 100)

    # Read the sample e-mail once (the original read the same file twice)
    file_content = open("/home/ubuntu/nltk_data/corpora/gmail/hotel/h12.txt").read()
    tokens = nltk.word_tokenize(file_content)
    test_sent_features = {word.lower(): (word in tokens) for word in mr.words()}
    tri_tokens = trigrams(tokens)
    cities = []
    matchedIndex = []
    tokenized = []
    addresses = []
    district = ['Akarawita', 'Angamuwa', 'Avissawella', 'Batawala', 'Battaramulla',
                'Batugampola', 'Bope', 'Boralesgamuwa', 'Borella', 'Dedigamuwa',
                'Dehiwala', 'Deltara', 'Habarakada', 'Handapangoda', 'Hanwella',
                'Hewainna', 'Hiripitya', 'Hokandara', 'Homagama', 'Horagala',
                'Kaduwela', 'Kahawala', 'Kalatuwawa', 'Madapatha', 'Maharagama',
                'Malabe', 'Meegoda', 'Padukka', 'Pannipitiya', 'Piliyandala',
                'Pitipana', 'Homagama', 'Polgasowita', 'Puwakpitiya', 'Ranala',
                'Siddamulla', 'Slave Island', 'Sri Jayawardenapura', 'Talawatugoda',
                'Tummodara', 'Waga', 'Watareka', 'Dickwella']
    for i in tokens:
        tokenized.append(i)
    pattern = re.compile(r"\d+")
    for i in tokenized:
        if pattern.match(i):
            matchedIndex.append(tokenized.index(i))
            print("match" + i)
            print(tokenized.index(i))
        else:
            print("not match")
    for t in tokenized:
        for i in district:
            if t.lower() == i.lower():
                cities.append(tokenized.index(t))
    distance = 200
    start = 0
    end = 0
    for t in cities:
        for i in matchedIndex:
            dis = t - i
            if dis <= distance and dis > 0:
                distance = dis
                start = t
                end = i
            else:
                print("higher")
    address = ""
    for token in range(end, start + 1):
        address += tokenized[token]
    print(address)
    addresses.append(address)
    for address in addresses:
        try:
            search = geocoder.get(address)
        except ValueError:
            continue
        first_result = search[0]
        output = [first_result.geometry.location.lat, first_result.geometry.location.lng]
        stri = ','.join(map(str, output))
        return stri
from nltk.corpus import CategorizedPlaintextCorpusReader

corpus_root = "./files/"
cat_root = "../categories/"
# Hacky way to specify the path for cat.txt. A better way would be to rewrite the regex r'.*\.txt'...
corpus = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt',
                                          cat_file=cat_root + 'cat.txt', cat_delimiter='+')

# get all categories
cats = corpus.categories()
print(cats)

# access the corpus
raw = corpus.raw()

# access words, overall and per category
words = corpus.words()
words_pop = corpus.words(categories="POP")
words_rock = corpus.words(categories="ROCK")

# access sents, overall and per category
sents = corpus.sents()
sents_pop = corpus.sents(categories="POP")
sents_rock = corpus.sents(categories="ROCK")

# make lists
word_list = list(words)
sents_list = list(sents)
pop_word_list = list(words_pop)
pop_sents_list = list(sents_pop)
import collections
import nltk.classify.util, nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.classify import DecisionTreeClassifier
from nltk.corpus import CategorizedPlaintextCorpusReader
from sklearn import svm
from sklearn.svm import LinearSVC
import string
from tabulate import tabulate

corpus_root1 = '/Users/tianhan/Dropbox/Advanced_big_data_Project/aclImdb/train'
train = CategorizedPlaintextCorpusReader(corpus_root1, r'(pos|neg)/.*\.txt',
                                         cat_pattern=r'(pos|neg)/.*\.txt')
corpus_root2 = '/Users/tianhan/Dropbox/Advanced_big_data_Project/aclImdb/test'
test = CategorizedPlaintextCorpusReader(corpus_root2, r'(pos|neg)/.*\.txt',
                                        cat_pattern=r'(pos|neg)/.*\.txt')

def evaluate_classifier_Naive(featx):
    train_negids = train.fileids('neg')
    train_posids = train.fileids('pos')
    test_negids = test.fileids('neg')
    test_posids = test.fileids('pos')
    train_negfeats = [(featx(train.words(fileids=[f])), 'neg') for f in train_negids]
    train_posfeats = [(featx(train.words(fileids=[f])), 'pos') for f in train_posids]
    test_negfeats = [(featx(test.words(fileids=[f])), 'neg') for f in test_negids]
    test_posfeats = [(featx(test.words(fileids=[f])), 'pos') for f in test_posids]
    trainfeats = train_negfeats + train_posfeats
    testfeats = test_negfeats + test_posfeats
    Naive_classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets_Naive = collections.defaultdict(set)
# Downloading an external corpus, loading it, and accessing it
from nltk.corpus import CategorizedPlaintextCorpusReader
from random import randint  # random

# The first line is where you read the corpus by calling
# the CategorizedPlaintextCorpusReader constructor.
# The three arguments, from left to right, are the absolute path
# to the folder containing the corpus on your computer, all sample
# document names from the txt_sentoken folder, and the categories
# in the given corpus (in our case, 'pos' and 'neg').
reader = CategorizedPlaintextCorpusReader(r'\Users\JoeDi\Desktop\python projs\tokens',
                                          r'.*\.txt', cat_pattern=r'(\w+)/*')
print(reader.categories())
print(reader.fileids())

# Now that we've made sure that the corpus is loaded correctly, let's
# get on with accessing any one of the sample documents from both categories.
# For that, let's first create a list of samples for each of the two
# categories, 'pos' and 'neg', respectively.
# Add the following two lines of code:
posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')

# The next two lines select a random file from each of the positive
# and negative category reviews. The last two lines just print the filenames.
fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]  # index into negFiles, not posFiles
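# The same constructor call written with explicit keyword arguments; this is
# equivalent to the positional form above (the path comes from the tutorial
# and is not verified):
reader_kw = CategorizedPlaintextCorpusReader(
    root=r'\Users\JoeDi\Desktop\python projs\tokens',
    fileids=r'.*\.txt',
    cat_pattern=r'(\w+)/*')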
import bokeh

corpus_root = "../corpus/lyric_corpus/files/"
cat_root = "../categories/"

# basic measures taken from the NLTK book
def lexical_diversity(text):
    return len(set(text)) / len(text)

def percentage(count, total):
    return 100 * count / total

# Hacky way to specify the path for cat.txt. A better way would be to rewrite the regex r'.*\.txt'...
corpus = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt',
                                          cat_file=cat_root + 'cat.txt', cat_delimiter='+')

# NLTK Brown selection
word_list_brown = brown.words()
sents_list_brown = brown.sents()
vocabulary_brown = set(word_list_brown)
brown_len_words = len(word_list_brown)
brown_len_sents = len(sents_list_brown)
brown_len_vocab = len(vocabulary_brown)
brown_richness = lexical_diversity(word_list_brown)

# Lyric corpus
cats = corpus.categories()
print(len(cats))
        if k in emotion_of_poems and v == max_value:
            emotion = k
            print(emotion)
            emotion_correct = {"emotion": emotion}
            return emotion_correct  # the break that followed this return was unreachable
    return emotion_correct

def classify(poem_text):
    return classifier.classify(features_of_poem(poem_text))

corpus_of_poems = CategorizedPlaintextCorpusReader('poems/', 'poems.*', cat_file='cats.txt')

# Code for generating errors
# Return errors in order to improve the algorithm
def errors_em(poem_set):
    errors = []
    for (fileid, category) in poem_set:
        poem = corpus_of_poems.words(fileids=[fileid])
        emotion_correct = features_of_poem(poem)
        guess = classifier.classify(features_of_poem(poem))
        if guess != category:
            errors.append((category, guess, poem, emotion_correct['emotions']))
    return errors
# Chapter 1: Corpora and WordNet - downloading, loading, and accessing an external corpus
from nltk.corpus import CategorizedPlaintextCorpusReader
from random import randint

# Read the corpus
reader = CategorizedPlaintextCorpusReader(r'/workspace/NLP_python/tokens',
                                          r'.*\.txt', cat_pattern=r'(\w+)/*')
print(reader.categories())
print(reader.fileids())

# Print sample documents
# Lists of samples for the pos and neg categories
posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')

# Pick a random file from each of the pos and neg categories
fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]
print(fileP)
print(fileN)

# Print the selected random files sentence by sentence
for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':  # 'is' compares identity, not equality
        print()

for w in reader.words(fileN):
    print(w + ' ', end='')
    if w == '.':
        print()
# Build a corpus for a specific problem set
problem = 'problemA'
problem_root = nltk.data.find('corpora/AAAC/%s' % problem)
problem_files = PlaintextCorpusReader(problem_root, r'.*\.txt')

# Categorize the corpus by author
auth_map = {}
for filename in problem_files.fileids():
    a_n = filename[:3]
    auth_map[filename] = [a_n]

# Over the entire corpus
problem_cat = CategorizedPlaintextCorpusReader(problem_root, r'.*\.txt', cat_map=auth_map)
documents = [(list(problem_cat.words(fileid)), category)
             for category in problem_cat.categories()
             for fileid in problem_cat.fileids(category)]
random.shuffle(documents)

# Word-frequency featureset
# Word frequency across the corpus
all_words = nltk.FreqDist(words.lower() for words in problem_cat.words())
key_words = [w for w, _ in all_words.most_common(2000)]  # .keys() is not sliceable in Python 3

# Checks whether a word from the keywords is in a document
def doc_features(doc):
    doc_words = set(doc)
import nltk
from nltk.corpus import CategorizedPlaintextCorpusReader
from pylab import *
import plotly.plotly as py
import plotly.graph_objs as go

corpus_root = "../corpus/lyric_corpus/files/"
cat_root = "../categories/"
corpus = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt',
                                          cat_file=cat_root + 'cat.txt', cat_delimiter='+')
words = corpus.words()

# frequency distribution
popWords = corpus.words(categories="POP")
rockWords = corpus.words(categories="ROCK")

#print("-----All words-----")
fd = nltk.FreqDist(words)
ALL_FrequentWords = fd.most_common(104)
ALL_FrequentWords_50_100 = []
for i in range(54, 104):
    ALL_FrequentWords_50_100.append(ALL_FrequentWords[i])
#print(ALL_FrequentWords)

#print("-----All POP words-----")
fd_POP = nltk.FreqDist(popWords)
POP_FrequentWords = fd_POP.most_common(60)
#print(fd_POP.most_common(60))
def __init__(self, root, fileids=FILE_PATTERN, **kwargs):
    if not any(key.startswith('cat_') for key in kwargs.keys()):
        kwargs['cat_pattern'] = CAT_PATTERN
    # Forward all keyword arguments; passing only cat_pattern would raise a
    # KeyError whenever the caller supplied cat_file or cat_map instead.
    CategorizedPlaintextCorpusReader.__init__(self, root, fileids, **kwargs)
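# A minimal sketch of the class this __init__ typically belongs to; the class
# name and the two module-level constants are assumptions, not from the source:
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

FILE_PATTERN = r'(?!\.).*\.txt'
CAT_PATTERN = r'(\w+)/.*'

class MyCorpusReader(CategorizedPlaintextCorpusReader):
    def __init__(self, root, fileids=FILE_PATTERN, **kwargs):
        # Default to CAT_PATTERN unless the caller passed any cat_* argument
        if not any(key.startswith('cat_') for key in kwargs):
            kwargs['cat_pattern'] = CAT_PATTERN
        super().__init__(root, fileids, **kwargs)

# reader = MyCorpusReader('path/to/corpus')                       # uses CAT_PATTERN
# reader = MyCorpusReader('path/to/corpus', cat_file='cats.txt')  # explicit category file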
print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

# Uni
machinename = 'maj27'
j = 0
for i in range(10):
    dataset = str(i + 1)
    #mydir = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Classified News/Training'
    train_dir = ('C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/'
                 'Well done 5 and 10 inverted/Classified Story/Criteria ' + dataset + '/Train+val')
    test_dir = ('C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/'
                'Well done 5 and 10 inverted/Classified Story/Criteria ' + dataset + '/Testing')
    #test_dir = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/NA is negative old/Classified News/Criteria ' + dataset + ''
    preprocessed = ('C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/'
                    'Well done 5 and 10 inverted/Classified Story/Criteria ' + dataset + '/data_2.p')
    train_Corpus = CategorizedPlaintextCorpusReader(train_dir, r'(?!\.).*\.txt', cat_pattern=r'(\w+)/*')
    train_documents = [(list(train_Corpus.words(fileid)), category)
                       for category in train_Corpus.categories()
                       for fileid in train_Corpus.fileids(category)]
    only_docs = [' '.join(doc[:1000]) for (doc, category) in train_documents]
    only_docs = [' '.join(normalize_text(document, lemmatize=True, remove_stop=None))
                 for document in only_docs]
    #######################################################################################
    train_labels = [category for (doc, category) in train_documents]
    train_binary_labels = [1 if i == 'pos' else 0 for i in train_labels]
#http://www.cs.cornell.edu/people/pabo/movie%2Dreview%2Ddata/
from nltk.corpus import CategorizedPlaintextCorpusReader
from random import randint

reader = CategorizedPlaintextCorpusReader(
    r'mix20_rand700_tokens_cleaned/tokens',
    r'.*\.txt', cat_pattern=r'(\w+)/*')
print(reader.categories())
print(reader.fileids())

posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')
fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]
print(fileN)
print(fileP)

for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':  # 'is' compares identity, not equality
        print()

for w in reader.words(fileN):
    print(w + ' ', end='')
    if w == '.':
        print()
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.corpus import CategorizedPlaintextCorpusReader
import nltk
import sys
import os

mydir_train = '.\\Docs-txt\\train'
mydir_test = '.\\Docs-txt\\test'
featureVector_train = []
featureVector_test = []
mr_train = CategorizedPlaintextCorpusReader(
    mydir_train, r'(?!\.).*\.txt',
    cat_pattern=r'(Analyst Report|Case Study|Datasheets|Technical Brief|Whitepaper)/.*')
mr_test = CategorizedPlaintextCorpusReader(
    mydir_test, r'(?!\.).*\.txt',
    cat_pattern=r'(Analyst Report|Case Study|Datasheets|Technical Brief|Whitepaper)/.*')
stop = stopwords.words('english')  # overwritten below by the custom stopword file
with open('.\\stopwords.txt') as f:
    stop = f.read().splitlines()
documents_train = [([w for w in mr_train.words(i)