from nltk.metrics import ConfusionMatrix
import utils

# The remaining names (testInstances, gold, preds, word, fold, class_name, fs,
# results_dir, percents, defpath) are expected from the enclosing scope.

# Collect the gold sense for each test instance.
for ins in testInstances:
    gold[ins[0]] = ins[1]

# READ PREDICTIONS
filename = word + ".f" + str(fold) + "." + class_name + "." + fs + ".out"
f = open(results_dir + "/" + word + "/" + filename, "r")
for ins in f:
    ins = ins.rstrip()
    (insId, sense) = ins.split("\t")
    preds[insId] = sense
f.close()

# Align predicted and gold senses by instance id.
p_senses = [preds[insId] for insId in sorted(preds.keys())]
g_senses = [gold[insId] for insId in sorted(gold.keys())]

cm = ConfusionMatrix(g_senses, p_senses)
print("")
try:
    # Older NLTK releases expose pp(); newer ones renamed it to pretty_format().
    print(cm.pp(sort_by_count=True, show_percents=percents, truncate=9))
except AttributeError:
    print(cm.pretty_format(sort_by_count=True, show_percents=percents, truncate=9))

# Print the definition of every sense in the inventory for this word.
defs = utils.read_definitions(defpath)
for sense in sorted(defs[word].keys()):
    if sense in defs[word]:
        d = defs[word][sense]
    else:
        d = "Not in sense inventory"
    s = ' {0:10s}'.format(sense)
    print(s + ": " + d)
print("")
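# Illustrative only: a minimal sketch of the shapes the snippet above expects
# from its enclosing scope. The variable names match the snippet; the concrete
# values below are placeholders, not taken from the source.
results_dir = "results"        # one sub-directory per target word
defpath = "definitions.txt"    # sense-definition file read by utils.read_definitions
word, fold, class_name, fs = "hard", 0, "nb", "allfeats"   # placeholder run identifiers
percents = False               # print raw counts rather than percentages
gold, preds = {}, {}
# (instance id, gold sense) pairs; the prediction file read above holds one
# "instanceId<TAB>sense" line per test instance.
testInstances = [("hard.001", "HARD1"), ("hard.002", "HARD2")]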
# Module-level imports required by this method: nltk, collections, and
# ConfusionMatrix from nltk.metrics.
def process(self, posWords, negWords, posI, negI, tokenizedComm,
            train_file="./resources/train.txt", test_file="./resources/test.txt"):
    comments = self.loadComments()
    (train, test) = self.load_train_test_set(train_file, test_file)

    N = 20
    train_set = []
    for i in train:
        (com, value) = comments[i]
        if value != self.Exc:
            splitted = tokenizedComm[i]
            if value < 3:
                train_set.append((self.feature(N, com, splitted, posWords, negWords, posI, negI), "neg"))
            else:
                train_set.append((self.feature(N, com, splitted, posWords, negWords, posI, negI), "pos"))

    # Classifier
    classifier = nltk.NaiveBayesClassifier.train(train_set)

    # Comments scored 3 are excluded from testing, i.e. only positive and
    # negative comments are evaluated.
    dev_set = []
    errorComments = []
    refset = collections.defaultdict(set)
    testset = collections.defaultdict(set)
    ref_list = []
    test_list = []
    it = 0
    for i in test:
        (com, value) = comments[i]
        if value != self.Exc:
            it = it + 1
            splitted = tokenizedComm[i]
            evaluate = self.feature(N, com, splitted, posWords, negWords, posI, negI)
            if value < 3:
                dev_set.append((evaluate, "neg"))
                refset["neg"].add(it)
                ref_list.append("neg")
            else:
                dev_set.append((evaluate, "pos"))
                refset["pos"].add(it)
                ref_list.append("pos")
            res = classifier.classify(evaluate)
            testset[res].add(it)
            test_list.append(res)
            # Keep the misclassified comments for later inspection.
            if res == "neg":
                message = "OK" if value < 3 else "ERROR"
            else:
                message = "OK" if value > 3 else "ERROR"
            if message == "ERROR":
                errorComments.append((com, value))

    classifier.show_most_informative_features(50)

    # Confusion matrix
    cm = ConfusionMatrix(ref_list, test_list)
    try:
        # Older NLTK releases expose pp(); newer ones renamed it to pretty_format().
        cm = '\n' + cm.pp(sort_by_count=True, show_percents=False, truncate=9)
    except AttributeError:
        cm = '\n' + cm.pretty_format(sort_by_count=True, show_percents=False, truncate=9)

    # Data metrics
    accuracy = nltk.classify.accuracy(classifier, dev_set)
    pPos = nltk.metrics.precision(refset['pos'], testset['pos'])
    rPos = nltk.metrics.recall(refset['pos'], testset['pos'])
    pNeg = nltk.metrics.precision(refset['neg'], testset['neg'])
    rNeg = nltk.metrics.recall(refset['neg'], testset['neg'])

    Metrics = {'Accuracy': accuracy,
               'Precision Pos': pPos, 'Recall Pos': rPos,
               'Precision Neg': pNeg, 'Recall Neg': rNeg,
               'Confusion Matrix': cm}
    for m in sorted(Metrics.keys()):
        print(m, Metrics[m])

    num = [accuracy, pPos, rPos, pNeg, rNeg]
    return (Metrics, num)
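# A minimal usage sketch (hypothetical). Only process() itself appears above;
# the class name and the loader helpers are illustrative assumptions, and the
# exact meaning of posI/negI (beyond being extra lexicon inputs to feature())
# is not specified by this snippet.
analyzer = CommentClassifier()                 # hypothetical class exposing process()
posWords = ["good", "excellent"]               # placeholder positive-word lexicon
negWords = ["bad", "terrible"]                 # placeholder negative-word lexicon
posI, negI = [], []                            # placeholder extra lexicon inputs
tokenizedComm = analyzer.tokenizeComments()    # hypothetical helper producing per-comment token lists
metrics, num = analyzer.process(posWords, negWords, posI, negI, tokenizedComm)
print(metrics['Accuracy'])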