# 1. Build a categorized plaintext corpus
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

locPT = 'ch02/ES'
corpusPT = CategorizedPlaintextCorpusReader(locPT, r'.*\.txt', cat_file="cat.txt")
print(corpusPT.fileids())
print(corpusPT.categories())
print(corpusPT.words(categories='ciencia'))
#print(corpusPT.raw())

vocab = set(w.lower() for w in corpusPT.words())
print('Vocabulary size:', len(vocab))

corpusCom = corpusPT.raw()
corpusComList = corpusCom.split()
print('Total number of words:', len(corpusComList))

# 2. Compute simple statistical measures
'''
Measures: average word length, average sentence length, and the average
number of times each vocabulary item appears in the text
(lexical diversity score)
'''
print('Average word length / Average sentence length / Lexical diversity score')
print('Measures with stopwords, whole corpus')
estadisticasSimples(corpusPT, 0, 0)
print('Measures with stopwords, per category')
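# `estadisticasSimples` is not defined in this excerpt. Below is a minimal
# sketch of what such a helper might compute, going by the docstring above;
# the signature (corpus plus two unused flags) is an assumption.
def estadisticasSimples(corpus, flag1=0, flag2=0):
    words = corpus.words()
    sents = corpus.sents()
    avg_word_len = sum(len(w) for w in words) / len(words)
    avg_sent_len = sum(len(s) for s in sents) / len(sents)
    # Lexical diversity: average number of occurrences per vocabulary item.
    diversity = len(words) / len(set(w.lower() for w in words))
    print(avg_word_len, avg_sent_len, diversity)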
# "smiths","socar","texas","gpo","illinois","usc","virginia","nocoll"]
#colls = ["ia","getty","kentucky","minnesota","missouri","mwdl"]
#colls = ["nara","nocar","smiths","socar","texas","gpo","illinois","usc","virginia","nocoll"]
#data = {}
stats = {}
common = {}

# Build the English stopword set once: membership tests against a set are
# far faster than re-reading the stopword list for every token.
stopset = set(nltk.corpus.stopwords.words('english'))

for coll in colls:
    print(reader.categories(coll + ".txt"))
    stats[coll] = {}
    # 'kay. Can't pickle reader.words() -- it's a stream reader.
    # But you can if you tokenize with a regex first,
    # which also pulls out punctuation.
    print("prep & pickle words")
    words = re.split(r'\W+', reader.raw(coll + '.txt'))
    pickle.dump(words, open("/media/storage/dpla-data/pickles/new/" + coll + "_words.p", "wb"))
    #words = reader.words(coll+".txt")
    #data[coll]["words"] = reader.words(coll+".txt")
    print("getting count " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime()))
    stats[coll]["wc"] = len(words)
    print(stats[coll]["wc"])
    print("getting uniq " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime()))
    stats[coll]["uniq"] = len(set(w.lower() for w in words))
    print(stats[coll]["uniq"])
    print("filtering & pickling " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime()))
    filtered = [w for w in words if w.lower() not in stopset]
    pickle.dump(filtered, open("/media/storage/dpla-data/pickles/new/" + coll + "_filtered.p", "wb"))
    print("getting filter count " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime()))
    stats[coll]["fwc"] = len(filtered)
    print(stats[coll]["fwc"])
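# `colls`, `reader`, and the imports (nltk, re, pickle, time) are defined
# above this excerpt. A plausible reader setup, assuming one .txt file per
# collection under a single root (the path and cat_map here are assumptions,
# not the original configuration):
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
reader = CategorizedPlaintextCorpusReader(
    '/media/storage/dpla-data/colls',                    # hypothetical root
    r'.*\.txt',
    cat_map={coll + '.txt': [coll] for coll in colls})   # one category per file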
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
import sys

Feature_Set = {}

training_directory = "reviews"
Training_Corpus = CategorizedPlaintextCorpusReader(training_directory, r'(pos|neg).*\.txt$', cat_pattern=r'(\w+)/*')
testing_directory = "reviews"
Testing_Corpus = CategorizedPlaintextCorpusReader(testing_directory, r'(pos|neg).*\.txt$', cat_pattern=r'(\w+)/*')

Training_Corpus_Text = nltk.RegexpTokenizer(r'\w+').tokenize(Training_Corpus.raw())
Positive_Corpus_Text = nltk.RegexpTokenizer(r'\w+').tokenize(Training_Corpus.raw(categories="pos"))
Negative_Corpus_Text = nltk.RegexpTokenizer(r'\w+').tokenize(Training_Corpus.raw(categories="neg"))

Training_Vocabulary = nltk.FreqDist(w.lower() for w in Training_Corpus_Text)
Positive_Vocabulary = nltk.FreqDist(w.lower() for w in Positive_Corpus_Text)
Negative_Vocabulary = nltk.FreqDist(w.lower() for w in Negative_Corpus_Text)

# Laplace-smoothing denominators: corpus size plus vocabulary size.
pos_den = float(len(Positive_Corpus_Text)) + float(len(Positive_Vocabulary))
neg_den = float(len(Negative_Corpus_Text)) + float(len(Negative_Vocabulary))

# Score the 500 most frequent vocabulary items that are adjectives or adverbs.
for word, tag in nltk.pos_tag([w for w, _ in Training_Vocabulary.most_common(500)]):
    if tag == "JJ" or tag == "RB":
        if word in Positive_Vocabulary:
            # Smoothed likelihood: (count(word) + 1) / (corpus size + |V|)
            Feature_Set[word, "pos"] = (Positive_Vocabulary[word] + 1) / pos_den
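# The excerpt stops after the "pos" likelihoods. A minimal sketch of how a
# completed Feature_Set could drive a Naive-Bayes-style decision (the "neg"
# entries, the unseen-word fallback, and equal priors are all assumptions):
import math

def score_review(tokens, label, den):
    # Sum log-likelihoods; unseen words fall back to the smoothed 1/den.
    return sum(math.log(Feature_Set.get((w.lower(), label), 1.0 / den))
               for w in tokens)

def classify(tokens):
    pos_score = score_review(tokens, "pos", pos_den)
    neg_score = score_review(tokens, "neg", neg_den)
    return "pos" if pos_score >= neg_score else "neg"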
"hathi","nypl"] #colls = ["ia","getty","kentucky","minnesota","missouri","mwdl"] #colls = ["nara","nocar","smiths","socar","texas","gpo","illinois","usc","virginia","nocoll"] #data = {} stats = {} common = {} for coll in colls: print(reader.categories(coll+".txt")) stats[coll] = {} # 'kay. Can't pickle words. It's a stream reader. # But maybe you can if you tokenize we regex # Which also pulls out punctuation print("prep & pickle words") words = re.split(r'\W+', reader.raw(coll+'.txt')) pickle.dump( words, open( "/media/storage/dpla-data/words/colls.oct/pickles/"+coll+"_words.p", "wb")) #words = reader.words(coll+".txt") #data[coll]["words"] = reader.words(coll+".txt") print("getting count " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime())) stats[coll]["wc"] = len(words) print(stats[coll]["wc"]) print("getting uniq " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime())) stats[coll]["uniq"] = len(set([w.lower() for w in words])) print(stats[coll]["uniq"]) print("filtering & pickling " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime())) filtered = [w for w in words if w.lower() not in nltk.corpus.stopwords.words('english')] pickle.dump( filtered, open( "/media/storage/dpla-data/words/colls.oct/pickles/"+coll+"_filtered.p", "wb")) print("getting filter count " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime())) stats[coll]["fwc"] = len(filtered) print(stats[coll]["fwc"])
import csv
import re
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

art_i = []
class_i = []

# Conversion of train data into a single CSV input file
corpus_root = 'Train_set'
newcorpus = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt', cat_pattern=r'(\w+)/*')

with open('Input_Article_Data.csv', 'w', newline='') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL, lineterminator="\n")
    for category in newcorpus.categories():
        for fileid in newcorpus.fileids(category):
            #print(fileid, category)
            data1 = newcorpus.raw(fileid).replace(",", " ")
            wr.writerow([data1, category])

# Reading of train data as lists; csv.reader handles the quoting and any
# newlines embedded in the article text.
with open('Input_Article_Data.csv', 'r', newline='') as f:
    for l, name in csv.reader(f):
        l = re.sub('[^A-Za-z0-9.]+', ' ', l).lower()
        # l = porter_stemmer.stem(l)  # reduces accuracy from 50% to 37%
        if name != "Category":
            art_i.append([l])
            class_i.append(name)
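# The downstream classifier behind the accuracy figures above is not shown
# in this excerpt. A minimal bag-of-words baseline with scikit-learn (this
# pipeline is an assumption, not the original model):
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(text for [text] in art_i)  # unwrap 1-element lists
clf = MultinomialNB().fit(X, class_i)
print(clf.score(X, class_i))  # accuracy on the training data itself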
""" # Recurso de emociones CANADA emotions_dict = pd.read_csv("emolex.csv") emotions_dict = emotions_dict.set_index('Spanish (es)') # Recurso de emociones SEL sel_emotions_dict = pd.read_csv("SEL_full.txt", sep='\t', encoding = "ISO-8859-1") sel_emotions_dict = sel_emotions_dict.set_index('Palabra') """ # Lee corpus de tweets reader = CategorizedPlaintextCorpusReader('./', r'mex.*\.txt', cat_pattern=r'(\w+)/*') tweets_train = reader.raw('mex_train.txt').split('\n')[:-1] labels_train = reader.raw('mex_train_labels.txt').split('\n')[:-1] labels_train = list(map(int, labels_train)) tweets_val = reader.raw('mex_val.txt').split('\n')[:-1] labels_val = reader.raw('mex_val_labels.txt').split('\n')[:-1] labels_val = list(map(int, labels_val)) tweets_test = reader.raw('mex_test.txt').split('\n')[:-1] """ corpus_palabras = [] for doc in tweets_train: corpus_palabras += doc.split() fdist = nltk.FreqDist(corpus_palabras) V = sortFreqDict(fdist)