Example #1

# 1. Build the categorized plaintext corpus
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

locPT = 'ch02/ES'
corpusPT = CategorizedPlaintextCorpusReader(locPT,
                                            r'.*\.txt',
                                            cat_file="cat.txt")

print(corpusPT.fileids())
print(corpusPT.categories())
print(corpusPT.words(categories='ciencia'))
#print(corpusPT.raw())

vocab = set(w.lower() for w in corpusPT.words())
print('Vocabulary size:', len(vocab))
corpusCom = corpusPT.raw()
corpusComList = corpusCom.split()
print('Total number of words:', len(corpusComList))

# 2. Compute simple statistical measures
'''
Measures: average word length, average sentence length, and the average number of
times each vocabulary item appears in the text (lexical diversity score)
'''
print(
    'Average word length / Average sentence length / Lexical diversity score'
)
print('Measures with stopwords, whole corpus')
estadisticasSimples(corpusPT, 0, 0)

print('Measures with stopwords, per category')
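
estadisticasSimples is called above but its body is not part of this snippet. A minimal sketch of what such a function might compute, following the measures named in the docstring; the name, signature, and rounding below are assumptions, not the original code:

# Sketch only: assumed re-implementation of the statistics described above.
def estadisticas_simples_sketch(corpus, categories=None):
    words = corpus.words(categories=categories)
    sents = corpus.sents(categories=categories)
    avg_word_len = sum(len(w) for w in words) / len(words)        # average word length
    avg_sent_len = sum(len(s) for s in sents) / len(sents)        # average sentence length, in words
    lexical_diversity = len(words) / len(set(w.lower() for w in words))  # tokens per vocabulary item
    print(round(avg_word_len, 2), round(avg_sent_len, 2), round(lexical_diversity, 2))

estadisticas_simples_sketch(corpusPT)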
Example #2
#        "smiths","socar","texas","gpo","illinois","usc","virginia","nocoll"]
#colls = ["ia","getty","kentucky","minnesota","missouri","mwdl"]
#colls = ["nara","nocar","smiths","socar","texas","gpo","illinois","usc","virginia","nocoll"]

#data = {}
stats = {}
common = {}

for coll in colls:
    print(reader.categories(coll+".txt"))
    stats[coll] = {}
    # 'kay. Can't pickle words directly: reader.words() returns a lazy stream view.
    # But you can if you first tokenize with a regex,
    # which also strips out the punctuation.
    print("prep & pickle words")
    words = re.split(r'\W+', reader.raw(coll+'.txt'))
    with open("/media/storage/dpla-data/pickles/new/"+coll+"_words.p", "wb") as fh:
        pickle.dump(words, fh)
    #words = reader.words(coll+".txt")
    #data[coll]["words"] = reader.words(coll+".txt")
    print("getting count " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime()))
    stats[coll]["wc"] = len(words)
    print(stats[coll]["wc"])
    print("getting uniq " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime()))
    stats[coll]["uniq"] = len(set([w.lower() for w in words]))
    print(stats[coll]["uniq"])
    print("filtering & pickling " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime()))
    stop_words = set(nltk.corpus.stopwords.words('english'))
    filtered = [w for w in words if w.lower() not in stop_words]
    with open("/media/storage/dpla-data/pickles/new/"+coll+"_filtered.p", "wb") as fh:
        pickle.dump(filtered, fh)
    print("getting filter count " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime()))
    stats[coll]["fwc"] = len(filtered)
    print(stats[coll]["fwc"])
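
The word lists pickled inside the loop can be reloaded later for further analysis. A small sketch using the same path pattern as above; the collection name "gpo" is only illustrative:

import pickle

# Reload one of the pickled word lists written by the loop above.
with open("/media/storage/dpla-data/pickles/new/gpo_words.p", "rb") as fh:
    reloaded_words = pickle.load(fh)
print(len(reloaded_words), "tokens reloaded")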
Example #3

import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
import sys

Feature_Set = {}


training_directory = "reviews"
Training_Corpus = CategorizedPlaintextCorpusReader(training_directory, r'pos|neg.*\.txt$', cat_pattern=r'(\w+)/*')

testing_directory = "reviews"
Testing_Corpus = CategorizedPlaintextCorpusReader(testing_directory, r'pos|neg.*\.txt$', cat_pattern=r'(\w+)/*')



Training_Corpus_Text = nltk.RegexpTokenizer(r'\w+').tokenize(Training_Corpus.raw())
Positive_Corpus_Text = nltk.RegexpTokenizer(r'\w+').tokenize(Training_Corpus.raw(categories="pos"))
Negative_Corpus_Text = nltk.RegexpTokenizer(r'\w+').tokenize(Training_Corpus.raw(categories="neg"))

Training_Vocabulary = nltk.FreqDist(w.lower() for w in Training_Corpus_Text)
Positive_Vocabulary = nltk.FreqDist(w.lower() for w in Positive_Corpus_Text)
Negative_Vocabulary = nltk.FreqDist(w.lower() for w in Negative_Corpus_Text)


pos_den = float(len(Positive_Corpus_Text)) + float(len(Positive_Vocabulary.keys()))
neg_den = float(len(Negative_Corpus_Text)) + float(len(Negative_Vocabulary.keys()))

# Take the 500 most frequent words (the original Python 2 FreqDist.keys() returned
# keys sorted by decreasing frequency) and keep adjectives (JJ) and adverbs (RB).
for word, tag in nltk.pos_tag([w for w, _ in Training_Vocabulary.most_common(500)]):
    if tag == "JJ" or tag == "RB":
        if word in Positive_Vocabulary:
            Feature_Set[word, "pos"] = float((Positive_Vocabulary.freq(word) * len(Positive_Corpus_Text)) + 1) / float(pos_den)
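
The last line is an add-one (Laplace) smoothed estimate of P(word | pos): the word's raw count in the positive texts plus one, divided by the positive token count plus the positive vocabulary size (pos_den). The snippet is cut off before the matching negative-class branch. A self-contained sketch of the same estimate, with illustrative names:

def laplace_estimate(word, vocabulary, corpus_size):
    # vocabulary: nltk.FreqDist over the class corpus; corpus_size: its token count.
    # freq(word) * corpus_size recovers the raw count of the word.
    count = vocabulary.freq(word) * corpus_size
    return (count + 1.0) / (corpus_size + len(vocabulary))

# e.g. Feature_Set[word, "pos"] = laplace_estimate(word, Positive_Vocabulary, len(Positive_Corpus_Text))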
Example #4
        "hathi","nypl"]
#colls = ["ia","getty","kentucky","minnesota","missouri","mwdl"]
#colls = ["nara","nocar","smiths","socar","texas","gpo","illinois","usc","virginia","nocoll"]

#data = {}
stats = {}
common = {}

for coll in colls:
    print(reader.categories(coll+".txt"))
    stats[coll] = {}
    # 'kay. Can't pickle words directly: reader.words() returns a lazy stream view.
    # But you can if you first tokenize with a regex,
    # which also strips out the punctuation.
    print("prep & pickle words")
    words = re.split(r'\W+', reader.raw(coll+'.txt'))
    with open("/media/storage/dpla-data/words/colls.oct/pickles/"+coll+"_words.p", "wb") as fh:
        pickle.dump(words, fh)
    #words = reader.words(coll+".txt")
    #data[coll]["words"] = reader.words(coll+".txt")
    print("getting count " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime()))
    stats[coll]["wc"] = len(words)
    print(stats[coll]["wc"])
    print("getting uniq " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime()))
    stats[coll]["uniq"] = len(set([w.lower() for w in words]))
    print(stats[coll]["uniq"])
    print("filtering & pickling " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime()))
    stop_words = set(nltk.corpus.stopwords.words('english'))
    filtered = [w for w in words if w.lower() not in stop_words]
    with open("/media/storage/dpla-data/words/colls.oct/pickles/"+coll+"_filtered.p", "wb") as fh:
        pickle.dump(filtered, fh)
    print("getting filter count " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime()))
    stats[coll]["fwc"] = len(filtered)
    print(stats[coll]["fwc"])
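
Once the loop finishes, stats holds raw counts per collection. A short sketch of turning them into comparable ratios; the field names are as built above, the derived metrics are illustrative:

# Derive simple per-collection ratios from the counts gathered above.
for coll, s in stats.items():
    type_token_ratio = s["uniq"] / float(s["wc"]) if s["wc"] else 0.0
    stopword_share = 1.0 - s["fwc"] / float(s["wc"]) if s["wc"] else 0.0
    print(coll, "type/token: %.3f" % type_token_ratio, "stopword share: %.3f" % stopword_share)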
Example #5
import csv
import re
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

art_i = []
class_i = []
# Conversion of the train data into a single input file
corpus_root = 'Train_set'

newcorpus = CategorizedPlaintextCorpusReader(corpus_root,
                                             r'.*\.txt',
                                             cat_pattern=r'(\w+)/*')

myfile = open('Input_Article_Data.csv', 'w', newline='', encoding='utf-8')
wr = csv.writer(myfile, quoting=csv.QUOTE_ALL, lineterminator="\n")

for category in newcorpus.categories():
    for fileid in newcorpus.fileids(category):
        # print(fileid, category)
        data1 = newcorpus.raw(fileid).replace(",", " ")
        data_list = [data1, category]
        wr.writerow(data_list)

myfile.close()

# Reading of the train data as lists (csv.reader handles the quoted fields written above)
with open('Input_Article_Data.csv', 'r', newline='', encoding='utf-8') as f:
    for l, name in csv.reader(f):
        l = re.sub('[^A-Za-z0-9.]+', ' ', l).lower()
        # l = porter_stemmer.stem(l)  # Reduces accuracy from 50% to 37%
        if name != "Category":
            art_i.append([l])
            class_i.append(name)
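
The commented-out stemming line refers to a porter_stemmer object that is never created in this snippet; if it were re-enabled it would need something along these lines (a sketch, not the author's code):

from nltk.stem import PorterStemmer

porter_stemmer = PorterStemmer()

# PorterStemmer.stem() expects a single token, so a whole cleaned line
# would be stemmed word by word before being appended to art_i.
def stem_text(text):
    return ' '.join(porter_stemmer.stem(tok) for tok in text.split())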
Example #6
"""
# CANADA emotion resource
emotions_dict = pd.read_csv("emolex.csv")
emotions_dict = emotions_dict.set_index('Spanish (es)')

# SEL emotion resource
sel_emotions_dict = pd.read_csv("SEL_full.txt", sep='\t', encoding = "ISO-8859-1")
sel_emotions_dict = sel_emotions_dict.set_index('Palabra')
"""

from nltk.corpus.reader import CategorizedPlaintextCorpusReader

# Read the tweets corpus
reader = CategorizedPlaintextCorpusReader('./',
                                          r'mex.*\.txt',
                                          cat_pattern=r'(\w+)/*')

tweets_train = reader.raw('mex_train.txt').split('\n')[:-1]
labels_train = reader.raw('mex_train_labels.txt').split('\n')[:-1]
labels_train = list(map(int, labels_train))

tweets_val = reader.raw('mex_val.txt').split('\n')[:-1]
labels_val = reader.raw('mex_val_labels.txt').split('\n')[:-1]
labels_val = list(map(int, labels_val))

tweets_test = reader.raw('mex_test.txt').split('\n')[:-1]
"""
corpus_palabras = []
for doc in tweets_train:
    corpus_palabras += doc.split()
fdist = nltk.FreqDist(corpus_palabras)

V = sortFreqDict(fdist)
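
sortFreqDict is used in the commented block but never defined in the snippet; a minimal guess at such a helper (an assumption, not the original implementation), returning (count, word) pairs sorted by descending frequency:

def sortFreqDict(fdist):
    # Assumed helper: sort the frequency distribution by descending count.
    return sorted(((count, word) for word, count in fdist.items()), reverse=True)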