Exemplo n.º 1
0
        for ins in testInstances:
            gold[ins[0]] = ins[1]
        
        # READ PREDICTIONS
        filename = word + ".f"+str(fold)+"."+class_name+"."+fs+".out"
        f = open(results_dir + "/" + word + "/" + filename, "r")
        for ins in f:
            ins = ins.rstrip()
            (insId, sense) = ins.split("\t")
            preds[insId] = sense
        f.close()
    
    p_senses = [preds[insId] for insId in sorted(preds.keys())]
    g_senses = [gold[insId] for insId in sorted(gold.keys())]
    cm = ConfusionMatrix(g_senses, p_senses)
    print("")
    try:
        print(cm.pp(sort_by_count=True, show_percents=percents, truncate=9))
    except:
        print(cm.pretty_format(sort_by_count=True, show_percents=percents, truncate=9))

    defs = utils.read_definitions(defpath)
    for sense in sorted(defs[word].keys()):
        if sense in defs[word]:
            d = defs[word][sense]
        else:
            d = "Not in sense inventory"
        s = ' {0:10s}'.format(sense)
        print(s + ": " + d)
    print("")
Exemplo n.º 2
0
    def process(self, posWords, negWords, posI, negI, tokenizedComm,
        train_file="./resources/train.txt", test_file="./resources/test.txt"):
        """Train a Naive Bayes sentiment classifier and report its metrics.

        Comments rated below 3 are labelled "neg", the rest "pos"; comments
        whose rating equals ``self.Exc`` are skipped in both phases.

        Parameters
        ----------
        posWords, negWords : lexicons forwarded to ``self.feature``.
        posI, negI : intensifier lists forwarded to ``self.feature``.
        tokenizedComm : mapping from comment index to its token list.
        train_file, test_file : paths to the train/test split index files.

        Returns
        -------
        tuple
            ``(Metrics, num)`` where ``Metrics`` maps metric names
            (accuracy, per-class precision/recall, confusion-matrix string)
            to values and ``num`` lists the numeric metrics in the order
            [accuracy, pPos, rPos, pNeg, rNeg].
        """
        comments = self.loadComments()
        (train, test) = self.load_train_test_set(train_file, test_file)

        N = 20
        train_set = []

        for i in train:
            (com, value) = comments[i]
            if value != self.Exc:
                # Rating < 3 -> negative, otherwise positive.  The feature
                # extraction was duplicated in both branches; compute it once.
                label = "neg" if value < 3 else "pos"
                splitted = tokenizedComm[i]
                train_set.append((self.feature(N, com, splitted, posWords,
                    negWords, posI, negI), label))

        # Train the classifier.
        classifier = nltk.NaiveBayesClassifier.train(train_set)

        # NOTE(review): the original (Spanish) comment claimed comments rated
        # exactly 3 are excluded from testing, but the loop below only skips
        # self.Exc -- a rating of 3 is labelled "pos" yet always counted as
        # an ERROR further down.  Preserved as-is; confirm intended handling.

        dev_set = []
        errorComments = []  # (comment, rating) of misclassified items; kept for debugging, not returned
        refset = collections.defaultdict(set)
        testset = collections.defaultdict(set)
        ref_list = []
        test_list = []

        it = 0
        for i in test:
            (com, value) = comments[i]
            if value != self.Exc:
                it = it + 1
                splitted = tokenizedComm[i]
                evaluate = self.feature(N, com, splitted, posWords, negWords,
                    posI, negI)

                # Gold label bookkeeping (same <3 rule as training).
                gold = "neg" if value < 3 else "pos"
                dev_set.append((evaluate, gold))
                refset[gold].add(it)
                ref_list.append(gold)

                # Predicted label bookkeeping.
                res = classifier.classify(evaluate)
                testset[res].add(it)
                test_list.append(res)

                # "neg" is correct when rating < 3; "pos" only when rating > 3.
                if res == "neg":
                    message = "OK" if value < 3 else "ERROR"
                else:
                    message = "OK" if value > 3 else "ERROR"

                if message == "ERROR":
                    errorComments.append((com, value))

        classifier.show_most_informative_features(50)

        # Confusion matrix.  Newer NLTK renamed pp() to pretty_format();
        # fall back the same way the sibling evaluation snippet does.
        cm = ConfusionMatrix(ref_list, test_list)
        try:
            cm = '\n' + cm.pp(sort_by_count=True, show_percents=False,
                truncate=9)
        except AttributeError:
            cm = '\n' + cm.pretty_format(sort_by_count=True,
                show_percents=False, truncate=9)

        # Aggregate metrics.
        accuracy = nltk.classify.accuracy(classifier, dev_set)
        pPos = nltk.metrics.precision(refset['pos'], testset['pos'])
        rPos = nltk.metrics.recall(refset['pos'], testset['pos'])
        pNeg = nltk.metrics.precision(refset['neg'], testset['neg'])
        rNeg = nltk.metrics.recall(refset['neg'], testset['neg'])
        Metrics = {'Accuracy': accuracy,
            'Precision Pos': pPos,
            'Recall Pos': rPos,
            'Precision Neg': pNeg,
            'Recall Neg': rNeg,
            'Confusion Matrix': cm}

        # print() call, not the Python-2 print statement, so this also runs
        # under Python 3 (the rest of the file already uses print(...)).
        for m in sorted(Metrics.keys()):
            print(m, Metrics[m])

        num = [accuracy, pPos, rPos, pNeg, rNeg]
        return (Metrics, num)