def test_corpus(self):
    """Run the preprocessing pipeline over the two ``sel*.csv`` corpora.

    Loads the ``nnp``, ``terms`` and ``patterns`` line lists from
    ``../data/pt_BR``, builds a bag of words from the cleaned and stemmed
    corpus documents and writes two files to the current directory:

    * ``bow``   -- the vocabulary size immediately followed by the
      ``(term, count)`` pairs sorted by descending count;
    * ``terms`` -- the string form of ``compute_tfidf`` over the documents.

    Nothing is asserted; the output files are meant for manual inspection.
    """
    from operator import itemgetter

    def read_lines(path):
        # One entry per line of the file, trailing whitespace removed.
        with open(path) as handle:
            return [raw_line.rstrip() for raw_line in handle]

    nnp = read_lines("../data/pt_BR/nnp")
    terms = read_lines("../data/pt_BR/terms")
    patterns = read_lines("../data/pt_BR/patterns")

    data = LoadData(['../corpus/sel1.csv', '../corpus/sel2.csv']).load()
    p = PreProcessing(nnp, terms, patterns)

    # Flatten the tokens of every document into a single list.
    tokens = []
    for document in data.values():
        tokens.extend(p.clean_and_stem(document))

    bow, bow_features_names = p.build_bow(tokens)
    # Column-wise sum of the bag-of-words array: one total per feature name.
    dist = np.sum(bow.toarray(), axis=0)
    tbow = dict(zip(bow_features_names, dist))

    ranked = sorted(tbow.items(), key=itemgetter(1), reverse=True)
    with open("bow", "w") as out:
        # The size and the ranking are written back to back, with no separator.
        out.write(str(len(tbow)))
        out.write(str(ranked))

    tfidf_terms = p.compute_tfidf(data.values(), eliminate_zeros=True)
    with open("terms", "w") as out:
        out.write(str(tfidf_terms))
def test(self):
    """Run the preprocessing pipeline on the first five ``machado`` texts.

    Builds a bag of words from the cleaned and stemmed tokens of the first
    five ``machado`` file ids and prints the ``(term, count)`` pairs sorted
    by descending count, then prints the result of ``compute_tfidf``
    (``top_n=10``, ``eliminate_zeros=True``) over the same raw texts.

    Nothing is asserted; the printed output is meant for manual inspection.
    """
    import operator

    p = PreProcessing([], [], [])
    cts = machado.fileids()[:5]

    # Read each raw text once and reuse it for both the bag-of-words and
    # the tf-idf passes instead of fetching it from the corpus twice.
    texts = {}
    for c in cts:
        texts[c] = machado.raw(c)

    tokens = []
    for c in cts:
        tokens += p.clean_and_stem(texts[c])

    bow, bow_features_names = p.build_bow(tokens)
    # Column-wise sum of the bag-of-words array: one total per feature name.
    dist = np.sum(bow.toarray(), axis=0)
    tbow = dict(zip(bow_features_names, dist))

    # Single-argument print(...) parses on Python 3 and prints exactly what
    # the bare print statement prints on Python 2.
    print(sorted(tbow.items(), key=operator.itemgetter(1), reverse=True))

    terms = p.compute_tfidf(texts.values(), top_n=10, eliminate_zeros=True)
    print(terms)