def test(self):
    """Print term counts and TF-IDF output for a small corpus sample.

    Takes the first five file ids returned by ``machado.fileids()``, stems
    every document with ``PreProcessing.clean_and_stem``, builds a bag of
    words from the combined tokens and prints the (term, count) pairs sorted
    by descending count. It then prints the result of
    ``PreProcessing.compute_tfidf`` over the raw texts of the same files.

    The method makes no assertions; it only prints.
    """
    import operator

    preprocessor = PreProcessing([], [], [])
    file_ids = machado.fileids()[:5]

    # Flatten the stemmed tokens of every sampled document into one list.
    stemmed = []
    for file_id in file_ids:
        stemmed.extend(preprocessor.clean_and_stem(machado.raw(file_id)))

    bow, feature_names = preprocessor.build_bow(stemmed)
    # Column-wise sum gives one total per feature name.
    totals = np.sum(bow.toarray(), axis=0)
    term_counts = dict(zip(feature_names, totals))
    ranked = sorted(term_counts.items(),
                    key=operator.itemgetter(1),
                    reverse=True)
    print(ranked)

    raw_by_id = {file_id: machado.raw(file_id) for file_id in file_ids}
    tfidf_terms = preprocessor.compute_tfidf(raw_by_id.values(),
                                             top_n=10,
                                             eliminate_zeros=True)
    print(tfidf_terms)
def test_corpus(self):
    """Dump bag-of-words counts and TF-IDF output for the CSV corpus.

    Reads the ``nnp``, ``terms`` and ``patterns`` lists from
    ``../data/pt_BR``, loads the documents from the two ``sel*.csv`` files
    through ``LoadData``, and writes two files in the current directory:

    * ``bow``   -- the number of distinct terms immediately followed by the
      (term, count) pairs sorted by descending count;
    * ``terms`` -- the string form of ``PreProcessing.compute_tfidf``'s
      result over the loaded documents.

    The method makes no assertions; it only writes these files.
    """
    import operator

    def _read_lines(path):
        # One entry per line, with trailing whitespace removed.
        with open(path) as handle:
            return [row.rstrip() for row in handle]

    nnp = _read_lines("../data/pt_BR/nnp")
    term_list = _read_lines("../data/pt_BR/terms")
    patterns = _read_lines("../data/pt_BR/patterns")

    data = LoadData(['../corpus/sel1.csv', '../corpus/sel2.csv']).load()
    preprocessor = PreProcessing(nnp, term_list, patterns)

    # Flatten the stemmed tokens of every document into one list.
    stemmed = []
    for document in data.values():
        stemmed.extend(preprocessor.clean_and_stem(document))

    bow, feature_names = preprocessor.build_bow(stemmed)
    # Column-wise sum gives one total per feature name.
    totals = np.sum(bow.toarray(), axis=0)
    term_counts = dict(zip(feature_names, totals))
    ranked = sorted(term_counts.items(),
                    key=operator.itemgetter(1),
                    reverse=True)

    with open("bow", "w") as out:
        out.write(str(len(term_counts)))
        out.write(str(ranked))

    tfidf_terms = preprocessor.compute_tfidf(data.values(),
                                             eliminate_zeros=True)
    with open("terms", "w") as out:
        out.write(str(tfidf_terms))
def test_should_compute_tdidf(self):
    """Print the TF-IDF output for three short Portuguese sample texts.

    Builds a ``PreProcessing`` instance with two names, no extra terms and
    two regex patterns, runs ``compute_tfidf`` over the sample texts, then
    prints the raw result followed by its items sorted by descending value.

    The method makes no assertions; it only prints.
    """
    import operator

    # Raw string for the digit pattern: "\d" in a normal literal is an
    # invalid escape sequence that newer Python versions warn about. The
    # runtime value of the pattern is unchanged.
    p = PreProcessing(["joao", "maria"], [], [r"\d+", "nomeemp*"])

    text_1 = ("O técnico João foi até a casa da cliente Maria (NOMEEMPRESA) e solucionou o problema. "
              "Ele não foi solucionado? NomeempProd")
    text_2 = ("A cliente Maria disse que continua sem sinal de Internet e "
              "reclamou que o problema não foi resolvido, ela continua sem sinal")
    text_3 = "Maria solicitou reparo, cliente reclama que esta sem sinal de Internet e Telefone após chuva"
    texts = [text_1, text_2, text_3]

    terms = p.compute_tfidf(texts)
    print(terms)
    print(sorted(terms.items(), key=operator.itemgetter(1), reverse=True))