示例#1
0
    def test_corpus(self):
        """Smoke-run the preprocessing pipeline over the pt_BR corpus files.

        Reads the ``nnp``, ``terms`` and ``patterns`` line lists, loads the
        two ``sel`` CSV files through ``LoadData`` and writes two files into
        the current working directory:

        * ``bow``   -- the vocabulary size immediately followed (no
          separator) by the ``(term, count)`` pairs sorted by descending
          count;
        * ``terms`` -- the string form of the ``compute_tfidf`` result.

        Nothing is asserted: the test only fails if one of the steps raises.
        All input paths are relative, so the outcome depends on the
        directory the test is started from.
        """
        import operator

        def read_lines(path):
            # One entry per line, trailing whitespace (newline included)
            # stripped; iterating the file avoids the readlines() copy.
            with open(path) as f:
                return [line.rstrip() for line in f]

        nnp = read_lines("../data/pt_BR/nnp")
        terms = read_lines("../data/pt_BR/terms")
        patterns = read_lines("../data/pt_BR/patterns")

        # NOTE(review): `data` is used as a mapping (only .values() is
        # read); the key type is not visible from here.
        data = LoadData(['../corpus/sel1.csv', '../corpus/sel2.csv']).load()
        p = PreProcessing(nnp, terms, patterns)

        tokens = []
        for d in data.values():
            tokens.extend(p.clean_and_stem(d))

        bow, bow_features_names = p.build_bow(tokens)
        # Sum along axis 0 so each feature name is paired with one total.
        dist = np.sum(bow.toarray(), axis=0)
        tbow = dict(zip(bow_features_names, dist))

        ranked = sorted(tbow.items(), key=operator.itemgetter(1), reverse=True)
        with open("bow", "w") as f:
            f.write(str(len(tbow)))
            f.write(str(ranked))

        # Separate name: `terms` above is the input term list handed to
        # PreProcessing, not the TF-IDF output.
        tfidf_terms = p.compute_tfidf(data.values(), eliminate_zeros=True)
        with open("terms", "w") as f:
            f.write(str(tfidf_terms))
示例#2
0
    def test(self):
        """Smoke-run the pipeline on the first five ``machado`` file ids.

        Builds a ``PreProcessing`` instance with empty nnp/terms/patterns
        lists, prints the bag-of-words ``(term, count)`` pairs sorted by
        descending count, then prints the ``compute_tfidf`` result for the
        same texts (``top_n=10``, ``eliminate_zeros=True``).

        Nothing is asserted: the test only fails if one of the steps raises.
        """
        import operator

        p = PreProcessing([], [], [])
        # NOTE(review): `machado` is presumably NLTK's Machado de Assis
        # corpus reader -- confirm against the file's imports.
        cts = machado.fileids()[:5]

        # Read every raw text once and reuse it for both the token stream
        # and the TF-IDF input, instead of calling machado.raw() twice.
        texts = {}
        tokens = []
        for c in cts:
            text = machado.raw(c)
            texts[c] = text
            tokens += p.clean_and_stem(text)

        bow, bow_features_names = p.build_bow(tokens)
        # Sum along axis 0 so each feature name is paired with one total.
        dist = np.sum(bow.toarray(), axis=0)
        tbow = dict(zip(bow_features_names, dist))

        # Single-argument print(...) prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print(sorted(tbow.items(), key=operator.itemgetter(1), reverse=True))

        terms = p.compute_tfidf(texts.values(), top_n=10, eliminate_zeros=True)
        print(terms)