def cross_validation(featureset, k, tamanhos, dataAugmentation):
    # Runs k-fold cross validation with a LinearSVC wrapped as an NLTK
    # classifier, optionally oversampling the training folds via
    # aumenta_dados(), and prints the mean class-weighted F-measure.
    # NOTE(review): indentation was reconstructed from a collapsed source
    # line; grouping inside the `if(oversampling)` branches is the most
    # plausible reading -- confirm against the original project.
    soma = 0                      # running sum of per-fold accuracies (only used by dead code below)
    fmeasurePonderadoMedia = 0;   # immediately re-bound to a list further down
    nome_do_dataset = dataAugmentation['nome_do_dataset']
    oversampling = dataAugmentation['oversampling']
    ind = data_augmentation(nome_do_dataset)  # augmentation index for this dataset
    qtdNegativos = 0
    kf = KFold(n_splits=k)
    tam = 10                      # unused
    fmeasureDesvio = []           # unused
    fmeasurePonderadoMedia = []   # per-fold weighted F-measures
    random.Random().shuffle(featureset)
    if(oversampling):
        # Each row carries a third element (balance index); split it off so
        # featureset keeps only (features, label) pairs.
        featuresetComIndice = featureset
        indicesDoBalanceamento = []
        featureset = []
        for features in featuresetComIndice:
            indicesDoBalanceamento.append(features[2])
            featureset.append(features[:2])
    cont = 0
    for train, test in kf.split(featureset):
        qtdNegativos = tamanhos['qtdNegativos']
        refsets = collections.defaultdict(set)   # gold label -> sample ids
        testsets = collections.defaultdict(set)  # predicted label -> sample ids
        train_data = np.array(featureset)[train]
        test_data = np.array(featureset)[test]
        if(oversampling):
            # Augment only the training fold, then reshuffle it.
            train_augmentation = aumenta_dados(indicesDoBalanceamento,ind,dataAugmentation,train_data)
            qtdNegativos += len(train_augmentation)
            train_data = np.concatenate((train_data,train_augmentation),axis=0)
            random.Random().shuffle(train_data)
        classifier = nltk.classify.SklearnClassifier(LinearSVC())
        # classifier = nltk.NaiveBayesClassifier.train(train_data)
        classifier.train(train_data)
        soma += nltk.classify.accuracy(classifier, test_data)
        for i, (feats, label) in enumerate(test_data):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)
        # NOTE(review): 'no' is treated as the positive class and 'yes' as the
        # negative one here -- confirm the label convention.
        fmeasurePositivo = f_measure(refsets['no'], testsets['no'])
        fmeasureNegativo = f_measure(refsets['yes'], testsets['yes'])
        # print("Positivo",fmeasurePositivo,tamanhos['qtdPositivos'])
        # print("Negativo",fmeasureNegativo,qtdNegativos)
        # print("prox")
        if(not fmeasurePositivo or not fmeasureNegativo):
            continue  # skip folds where either F-measure is None or 0
        # Weighted average of the two class F-measures by class size.
        fmeasurePonderadoMedia.append(((fmeasurePositivo*tamanhos['qtdPositivos'])+(fmeasureNegativo*qtdNegativos))/(qtdNegativos+tamanhos['qtdPositivos']))
        cont += 1
    # average = soma/10
    # print(average)
    # fmeasurePositivoMedia = 2 * (precisionPositivoMedia*recallPositivoMedia)/(precisionPositivoMedia+recallPositivoMedia)
    # fmeasureNegativoMedia = 2 * (precisionNegativoMedia*recallNegativoMedia)/(precisionNegativoMedia+recallNegativoMedia)
    # print(cont)
    fmeasurePonderado = np.mean(fmeasurePonderadoMedia)
    print(fmeasurePonderado)
def evaluate_features(feature_select):
    """Train a Naive Bayes sentiment classifier using `feature_select` as the
    feature extractor and print accuracy, precision, recall and F1 per class.

    Fix: converted Python 2 `print` statements to Python 3 calls so this
    block is valid under the Python 3 syntax (f-strings) used elsewhere
    in this file. Output text is unchanged.
    """
    posFeatures = []
    negFeatures = []
    global cnt
    cnt += 1
    # http://stackoverflow.com/questions/367155/splitting-a-string-into-words-and-punctuation
    # Break each sentence into word/punctuation tokens and pair the selected
    # features with a 'pos'/'neg' label.
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords = [feature_select(posWords), 'pos']  # feature dict + label
            posFeatures.append(posWords)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords = [feature_select(negWords), 'neg']
            negFeatures.append(negWords)
    # 3/4 of each class for training, the remaining 1/4 for testing.
    posCutoff = int(math.floor(len(posFeatures) * 3 / 4))
    negCutoff = int(math.floor(len(negFeatures) * 3 / 4))
    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]
    classifier = NaiveBayesClassifier.train(trainFeatures)
    # referenceSets: gold labels; testSets: predictions (sentence ids per label).
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)
    print('train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures)))
    print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    print('pos precision:', precision(referenceSets['pos'], testSets['pos']))
    print('pos recall:', recall(referenceSets['pos'], testSets['pos']))
    print('pos f1-score:', f_measure(referenceSets['pos'], testSets['pos']))
    print('neg precision:', precision(referenceSets['neg'], testSets['neg']))
    print('neg recall:', recall(referenceSets['neg'], testSets['neg']))
    print('neg f1-score:', f_measure(referenceSets['neg'], testSets['neg']))
    classifier.show_most_informative_features(10)
    print('=================================================')
def accuracy_measure(classifier, cross_valid_set):
    """Print precision, recall and F-measure for label 1 ("pos") and
    label 0 ("neg") of `classifier` over `cross_valid_set`.

    Fix: converted Python 2 `print` statements to Python 3 calls;
    output text is unchanged.
    """
    refsets = collections.defaultdict(set)   # gold label -> sample ids
    testsets = collections.defaultdict(set)  # predicted label -> sample ids
    for i, (feats, label) in enumerate(cross_valid_set):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
    print('pos Precision:', precision(refsets[1], testsets[1]))
    print('pos Recall:', recall(refsets[1], testsets[1]))
    print('pos F-measure:', f_measure(refsets[1], testsets[1]))
    print('neg Precision:', precision(refsets[0], testsets[0]))
    print('neg Recall:', recall(refsets[0], testsets[0]))
    print('neg F-measure:', f_measure(refsets[0], testsets[0]))
def evaluate_features(feature_select):
    """Train a Naive Bayes sentiment classifier using `feature_select` as the
    feature extractor and print accuracy, precision, recall and F1 per class.

    Fix: converted Python 2 `print` statements to Python 3 calls so this
    block is valid under the Python 3 syntax used elsewhere in this file.
    Output text is unchanged.
    """
    posFeatures = []
    negFeatures = []
    global cnt
    cnt += 1
    # http://stackoverflow.com/questions/367155/splitting-a-string-into-words-and-punctuation
    # Break each sentence into word/punctuation tokens and pair the selected
    # features with a "pos"/"neg" label.
    with open(RT_POLARITY_POS_FILE, "r") as posSentences:
        for i in posSentences:
            posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords = [feature_select(posWords), "pos"]  # feature dict + label
            posFeatures.append(posWords)
    with open(RT_POLARITY_NEG_FILE, "r") as negSentences:
        for i in negSentences:
            negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords = [feature_select(negWords), "neg"]
            negFeatures.append(negWords)
    # 3/4 of each class for training, the remaining 1/4 for testing.
    posCutoff = int(math.floor(len(posFeatures) * 3 / 4))
    negCutoff = int(math.floor(len(negFeatures) * 3 / 4))
    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]
    classifier = NaiveBayesClassifier.train(trainFeatures)
    # referenceSets: gold labels; testSets: predictions (sentence ids per label).
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)
    print("train on %d instances, test on %d instances" % (len(trainFeatures), len(testFeatures)))
    print("accuracy:", nltk.classify.util.accuracy(classifier, testFeatures))
    print("pos precision:", precision(referenceSets["pos"], testSets["pos"]))
    print("pos recall:", recall(referenceSets["pos"], testSets["pos"]))
    print("pos f1-score:", f_measure(referenceSets["pos"], testSets["pos"]))
    print("neg precision:", precision(referenceSets["neg"], testSets["neg"]))
    print("neg recall:", recall(referenceSets["neg"], testSets["neg"]))
    print("neg f1-score:", f_measure(referenceSets["neg"], testSets["neg"]))
    classifier.show_most_informative_features(10)
    print("=================================================")
def calculate_metrics(self): included_logs = 0 metrics = {} cc = SmoothingFunction() for identifier in self._values: if self._values[identifier].get('target_text', None) is not None: included_logs += 1 target_text = self._values[identifier]['target_text'] output_text = self._values[identifier]['output_text'] metrics['BLEU'] = metrics.get('BLEU', 0) + sentence_bleu( [target_text], output_text, smoothing_function=cc.method4) metrics['accuracy'] = metrics.get('accuracy', 0) + accuracy( target_text, output_text) target_text = set(target_text) output_text = set(output_text) metrics['precision'] = metrics.get('precision', 0) + precision( target_text, output_text) metrics['recall'] = metrics.get('recall', 0) + recall( target_text, output_text) metrics['f_measure'] = metrics.get('f_measure', 0) + f_measure( target_text, output_text) if included_logs != 0: for metric in metrics: metrics[metric] /= included_logs return metrics, included_logs
def me_classifier(exclude_list):
    """Train a MaxEnt ("megam") classifier on `train_data` and evaluate it on
    `test_data`, returning accuracy plus precision/recall/F-measure for the
    '1' and '0' labels.

    Fix: the negative-class recall and F-measure previously compared
    observed['1'] against classified['0'] (an apparent copy-paste error);
    they now use observed['0'], matching the negative-class precision.
    """
    me_classifier = 0
    with open(train_data, 'r', encoding='utf-8', errors='ignore') as csvfile:
        reader = csv.reader(csvfile)
        feature_set = [(feature_set_generator(text, length, label, exclude_list), label)
                       for text, length, label in reader]
        me_classifier = MaxentClassifier.train(feature_set, "megam")
    accuracy = 0.0
    with open(test_data, 'r', encoding='utf-8', errors='ignore') as testcsvfile:
        test_reader = csv.reader(testcsvfile)
        test_feature_set = [(feature_set_generator(text, length, label, exclude_list), label)
                            for text, length, label in test_reader]
        accuracy = classify.accuracy(me_classifier, test_feature_set)
    # observed: gold labels -> row ids; classified: predicted labels -> row ids.
    classified = collections.defaultdict(set)
    observed = collections.defaultdict(set)
    i = 1
    with open(test_data, 'r', encoding='utf-8', errors='ignore') as testcsvfile:
        test_reader = csv.reader(testcsvfile)
        for text, length, label in test_reader:
            observed[label].add(i)
            classified[me_classifier.classify(
                feature_set_generator(text, length, label, exclude_list))].add(i)
            i += 1
    return (accuracy,
            precision(observed["1"], classified["1"]),
            recall(observed['1'], classified['1']),
            f_measure(observed['1'], classified['1']),
            precision(observed['0'], classified['0']),
            recall(observed['0'], classified['0']),
            f_measure(observed['0'], classified['0']))
def evaluate(ref_tags, hyp_tags):
    """Compute per-tag and frequency-weighted overall precision/recall/F1
    between two aligned tag sequences, plus a confusion matrix.

    Undefined metrics are warned about and treated as 0.0. Raises
    ValueError when the sequences differ in length.
    """
    if len(ref_tags) != len(hyp_tags):
        raise ValueError(
            'reference and hypothesis has different number of lines')
    n = len(ref_tags)
    tag_counts = Counter(ref_tags)
    prec_dict = defaultdict(float)
    rec_dict = defaultdict(float)
    f_dict = defaultdict(float)
    for tag in sorted(set(ref_tags)):
        ref_ids = {i for i, t in enumerate(ref_tags) if t == tag}
        hyp_ids = {i for i, t in enumerate(hyp_tags) if t == tag}
        prec_dict[tag] = precision(ref_ids, hyp_ids)
        rec_dict[tag] = recall(ref_ids, hyp_ids)
        f_dict[tag] = f_measure(ref_ids, hyp_ids)
        if prec_dict[tag] is None:
            warn(f'Undefined precision for {tag}; converting to 0.0')
            prec_dict[tag] = 0.
        if rec_dict[tag] is None:
            warn(f'Undefined recall for {tag}; converting to 0.0')
            rec_dict[tag] = 0.
        if f_dict[tag] is None:
            warn(f'Undefined F-score for {tag}; converting to 0.0')
            f_dict[tag] = 0.
        # Overall scores are weighted by each tag's reference frequency.
        prec_dict[OVERALL_KEY] += tag_counts[tag] * prec_dict[tag] / n
        rec_dict[OVERALL_KEY] += tag_counts[tag] * rec_dict[tag] / n
        f_dict[OVERALL_KEY] += tag_counts[tag] * f_dict[tag] / n
    return EvalResult(precision=prec_dict,
                      recall=rec_dict,
                      f1=f_dict,
                      conf_matrix=ConfusionMatrix(ref_tags, hyp_tags,
                                                  sort_by_count=True))
def evaluate_features(feature_select):
    """Train a Naive Bayes sentiment classifier using `feature_select` as the
    feature extractor and print accuracy, precision, recall and F1 per class.

    Fixes: (1) converted Python 2 `print` statements to Python 3 calls;
    (2) the "neg F1" line previously recomputed the *positive*-class
    F-measure (f_measure on the 'pos' sets) -- it now uses the 'neg' sets.
    Original comments translated from Chinese.
    """
    posFeatures = []
    negFeatures = []
    # Break each sentence into word/punctuation tokens and append the
    # 'pos'/'neg' label after each feature set.
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords = [feature_select(posWords), 'pos']
            posFeatures.append(posWords)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords = [feature_select(negWords), 'neg']
            negFeatures.append(negWords)
    # 3/4 of each class for training, 1/4 for testing.
    posCutoff = int(math.floor(len(posFeatures) * 3 / 4))
    negCutoff = int(math.floor(len(negFeatures) * 3 / 4))
    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]
    # Train the Naive Bayes classifier.
    classifier = NaiveBayesClassifier.train(trainFeatures)
    # referenceSets holds gold labels; testSets holds predictions.
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)
    # Print metrics showing how well the feature selection did.
    print('train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures)))
    print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    print('pos precision:', precision(referenceSets['pos'], testSets['pos']))
    print('pos recall:', recall(referenceSets['pos'], testSets['pos']))
    print('pos F1:', f_measure(referenceSets['pos'], testSets['pos']))
    print('neg precision:', precision(referenceSets['neg'], testSets['neg']))
    print('neg recall:', recall(referenceSets['neg'], testSets['neg']))
    print('neg F1:', f_measure(referenceSets['neg'], testSets['neg']))
    classifier.show_most_informative_features(10)
def kset_stat(silvs, golds):
    """Precision/recall/F-measure between the root-mapped gold and silver
    keyword sets; all three fall back to 0 if any metric is falsy/None."""
    gold_roots = set(map(to_root, golds))
    silver_roots = set(map(to_root, silvs))
    p = precision(gold_roots, silver_roots)
    r = recall(gold_roots, silver_roots)
    f = f_measure(gold_roots, silver_roots)
    if p and r and f:
        return {'p': p, 'r': r, 'f': f}
    return {'p': 0, 'r': 0, 'f': 0}
def test_trained_classifier(classifier, test_samples):
    """Prints precision/recall statistics of a NLTK classifier"""
    import collections
    refsets = collections.defaultdict(set)   # gold label -> sample ids
    testsets = collections.defaultdict(set)  # predicted label -> sample ids
    for idx, (sample, gold) in enumerate(test_samples):
        refsets[gold].add(idx)
        testsets[classifier.classify(sample)].add(idx)
    for lbl in ("pos", "neg"):
        print(lbl + " precision:", scores.precision(refsets[lbl], testsets[lbl]))
        print(lbl + " recall:", scores.recall(refsets[lbl], testsets[lbl]))
        print(lbl + " F-measure:", scores.f_measure(refsets[lbl], testsets[lbl]))
def testing(sent_classifier):
    """Print accuracy and per-class precision/recall/F-measure (as
    percentages) for `sent_classifier` on the module-level `testing_set`."""
    refsets = collections.defaultdict(set)   # gold category -> sample ids
    testsets = collections.defaultdict(set)  # predicted category -> sample ids
    for idx, (features, gold) in enumerate(testing_set):
        refsets[gold].add(idx)
        testsets[sent_classifier.classify(features)].add(idx)
    print('Classifier Accuracy: ', (nltk.classify.accuracy(sent_classifier, testing_set)) * 100, "%")
    metric_funcs = (('Precision', scores.precision),
                    ('Recall', scores.recall),
                    ('F-measure', scores.f_measure))
    for cls in ('pos', 'neg'):
        for metric_name, metric in metric_funcs:
            print('Classifier %s %s:' % (cls, metric_name),
                  metric(refsets[cls], testsets[cls]) * 100, "%")
    print('\n')
def showResults(self, classif, clasificador): refsets = collections.defaultdict(set) testsets = collections.defaultdict(set) for i, (feats, label) in enumerate(self.test_data): refsets[label].add(i) observed = classif.classify(feats) testsets[observed].add(i) print("F1 Score del clasificador:", clasificador, f_measure(refsets['"positive"'], testsets['"positive"']))
def fmeasure(questions_list, batch):
    """Token-set F-measure of each generated question against the original
    question in `batch` (final character of the original is dropped,
    presumably trailing punctuation). Returns a list of floats."""
    results = []
    for idx, generated in enumerate(questions_list):
        original = batch['question_text'][idx]
        reference_tokens = set(original[:-1].split(' '))
        generated_tokens = set(generated.split(' '))
        results.append(float(f_measure(reference_tokens, generated_tokens)))
    return results
def classification_result(classifier, test_set):
    """Print accuracy, per-class precision/recall/F-measure (as percentages)
    and a confusion matrix for `classifier` on `test_set` (labels '1'/'0')."""
    refsets = collections.defaultdict(set)   # gold label -> sample ids
    testsets = collections.defaultdict(set)  # predicted label -> sample ids
    reflist = []
    testlist = []
    for idx, (tweet, gold) in enumerate(test_set):
        refsets[gold].add(idx)
        reflist.append(gold)
        predicted = classifier.classify(tweet)
        testsets[predicted].add(idx)
        testlist.append(predicted)
    print(len(refsets['1']))
    print("Accuracy : ", nltk.classify.accuracy(classifier, test_set) * 100)
    for name, lbl in (("Pos", '1'), ("Neg", '0')):
        print("Precision %s: " % name, precision(refsets[lbl], testsets[lbl]) * 100)
        print("Recall %s: " % name, recall(refsets[lbl], testsets[lbl]) * 100)
        print("F Measure %s: " % name, f_measure(refsets[lbl], testsets[lbl]) * 100)
    print("Confusion Metrics : \n", ConfusionMatrix(reflist, testlist))
def sentence_fmeasure(self, references, hypothesis): fmeasure_scores = [] hypothesis_set = set(hypothesis) for reference in references: reference_set = set(reference) fmeasure_score = f_measure(reference_set, hypothesis_set) # we calculate f_measure(set(each_reference), set(hypothesis)) score fmeasure_scores.append(fmeasure_score) fmeasure_final_score = max(fmeasure_scores) # we calculate f_measure(set(closest_reference), set(hypothesis)) score return fmeasure_final_score
def get_results(self, classifier, test_set, target): refsets = collections.defaultdict(set) testsets = collections.defaultdict(set) for i, (feats, label) in enumerate(test_set): refsets[label].add(i) observed = classifier.classify(feats) testsets[observed].add(i) target_precision = precision(refsets[target], testsets[target]) target_recall = recall(refsets[target], testsets[target]) target_f_measure = f_measure(refsets[target], testsets[target]) results = (target_precision, target_recall, target_f_measure) return (results)
def avgOffEval(inpath1, inpath2):
    # Mention-level NER evaluation averaged per abstract: for every .ann file
    # present in both directories, build mention_offset strings for gold and
    # prediction, score them, and print the mean R/P/F1 across files.
    print('\n=============================')
    print(
        'NER evaluation (single entity class/mention-level, full/offsets, avg. of abstract-level)'
    )
    print('=============================')
    print('==> gold', inpath1)
    print('==> pred', inpath2)
    print('=============================')
    recs = []
    pres = []
    fscs = []
    for filename1 in glob.glob(inpath1 + "/*ann"):
        # basename of the gold file (last path component)
        filen1 = filename1.split('/')[len(filename1.split('/')) - 1]
        for filename2 in glob.glob(inpath2 + "/*ann"):
            filen2 = filename2.split('/')[len(filename2.split('/')) - 1]
            if filen1 == filen2:
                preds = set([])
                refrs = set([])
                # NOTE(review): handles are never closed -- consider `with`.
                file1 = codecs.open(filename1, 'r', encoding='utf-8')
                file2 = codecs.open(filename2, 'r', encoding='utf-8')
                for line1 in file1.readlines():
                    if len(line1.split('\t')) > 1:
                        # column 2 = mention text, column 1 = offsets
                        men1 = line1.split('\t')[2].strip()
                        off1 = '-'.join([
                            w.strip() for w in line1.split('\t')[1].split(' ')
                        ])
                        gold = men1 + '_' + off1
                        refrs.add(gold)
                for line2 in file2.readlines():
                    if len(line2.split('\t')) > 1:
                        men2 = line2.split('\t')[2].strip()
                        off2 = '-'.join([
                            w.strip() for w in line2.split('\t')[1].split(' ')
                        ])
                        pred = men2 + '_' + off2
                        preds.add(pred)
                # Score one abstract; empty gold or pred set counts as 0.
                if len(preds) > 0 and len(refrs) > 0:
                    rec = scores.recall(refrs, preds)
                    pre = scores.precision(refrs, preds)
                    fsc = scores.f_measure(refrs, preds)
                else:
                    rec = 0
                    pre = 0
                    fsc = 0
                recs.append(rec)
                pres.append(pre)
                fscs.append(fsc)
    print('average \t R={R} \t P={P} \t F1={F}'.format(R=str(np.mean(recs)),
                                                       P=str(np.mean(pres)),
                                                       F=str(np.mean(fscs))))
    print('=============================\n')
def printEval(realSet, testSet):
    """Print precision, recall and F-measure for the 'pos' and 'neg' classes
    given gold (`realSet`) and predicted (`testSet`) id-set mappings."""
    pos_precision = precision(realSet['pos'], testSet['pos'])
    neg_precision = precision(realSet['neg'], testSet['neg'])
    # Computed but never printed, exactly as in the original implementation.
    neutral_precision = precision(realSet['neutre'], testSet['neutre'])
    pos_recall = recall(realSet['pos'], testSet['pos'])
    neg_recall = recall(realSet['neg'], testSet['neg'])
    pos_f = f_measure(realSet['pos'], testSet['pos'])
    neg_f = f_measure(realSet['neg'], testSet['neg'])
    print("Precision Pos: %f - Neg: %f " % (float(pos_precision), float(neg_precision)))
    print("Recall Pos: %f - Neg: %f " % (float(pos_recall), float(neg_recall)))
    print("F-Mesure Pos: %f - Neg: %f " % (float(pos_f), float(neg_f)))
def main(command, classifier_type):
    """Train ('new') or unpickle ('load') a rating classifier of the given
    type ('decision_tree' or 'maxent') and print per-label P/R/F on a
    held-out split of the ratings corpus.

    Improvements: file handles are managed with `with` (no leak on error)
    and `is None` replaces `== None`. Behavior is otherwise unchanged.
    """
    feature_functions = [unigram_freqs]
    with open('ratings_corpus.json') as corpus_file:
        corpus = json.load(corpus_file)
    feature_representation = [(extract_features(document, feature_functions), label)
                              for document, label in corpus]
    train_set, test_set = split_data(feature_representation)
    classifier = ''
    if command == 'new':
        if classifier_type == 'decision_tree':
            classifier = nltk.classify.DecisionTreeClassifier.train(train_set)
        elif classifier_type == 'maxent':
            classifier = nltk.classify.maxent.MaxentClassifier.train(train_set)
    elif command == 'load':
        if classifier_type == 'decision_tree':
            with open('decisiontree_classifier.pickle', 'rb') as classifier_file:
                classifier = pickle.load(classifier_file)
        elif classifier_type == 'maxent':
            with open('maxent_classifier.pickle', 'rb') as classifier_file:
                classifier = pickle.load(classifier_file)
    predictions = []
    golds = []
    for test_doc, rating in test_set:
        predictions.append(classifier.classify(test_doc))
        golds.append(rating)
    # Per-rating id sets for gold labels and predictions.
    pred_sets = initialize_sets(ALL_RATINGS)
    gold_sets = initialize_sets(ALL_RATINGS)
    for doc_id, rating in enumerate(predictions):
        pred_sets[rating].add(doc_id)
    for doc_id, rating in enumerate(golds):
        gold_sets[rating].add(doc_id)
    for label in ALL_RATINGS:
        r = scores.recall(gold_sets[label], pred_sets[label])
        p = scores.precision(gold_sets[label], pred_sets[label])
        f = scores.f_measure(gold_sets[label], pred_sets[label])
        # Skip labels for which any metric is undefined (empty sets).
        if not (r is None or p is None or f is None):
            f = float(f)
            print('<{}> P: {:.2}, R: {:.2}, F: {:.2}'.format(label, p, r, f))
def assign_clusters_to_works(trials):
    # Assign user/machine tag clusters to each work using WUP similarity,
    # score the machine assignment against the user agreement with F-measure,
    # and write the whole table to results/mca_assignments.csv.
    dest_file = "results/mca_assignments.csv"
    similarity = wup   # similarity function used for tag -> cluster matching
    T = 0.76           # similarity threshold
    cluster_types = ['clusters']
    # Users contribute up to 6 tags per work, machines up to 25.
    for (tag_col_prefix, use_only_n_tags) in [('user', 6), ('machine', 25)]:
        for cluster_type in cluster_types:
            print(' *', 'processing type: {} for {}'.format(tag_col_prefix, cluster_type))
            with codecs.open(f'data/{cluster_type}.json', 'rb', 'utf-8') as f_clusters:
                clusters = preprocess_clusters(json.loads(f_clusters.read()))
            # Mutates `trials` in place, adding per-work cluster columns.
            tags_to_clusters(
                clusters, trials, t=T, similarity=similarity,
                tag_col_prefix=tag_col_prefix, cluster_type=cluster_type,
                use_only_n_tags=use_only_n_tags)
    #
    # score results
    #
    for work in trials:
        for cluster_type in cluster_types:
            machine = "{}_{}_{}".format('machine', cluster_type, "no_scores")
            # NOTE(review): `human` is built but never used -- kept as-is.
            human = "{}_{}_{}".format('user', cluster_type, "no_scores")
            work["{}_fmeasure".format(cluster_type)] = f_measure(set(work['user_aggrement']), set(work[machine]))
    df = pd.DataFrame(trials)
    df.drop(columns=['user_tags_synsets', 'machine_tags_synsets'], inplace=True)
    # move some columns to front
    cols = df.columns.tolist()
    for col in ['user_tags', 'machine_tags', 'title', 'description', 'artist_name']:
        cols.insert(0, cols.pop(cols.index(col)))
    df = df.reindex(columns=cols)
    df.to_csv(dest_file, index=False)
    print(' *', 'written file: {}'.format(dest_file))
    # Report mean F-measure and the share of works with a non-zero score.
    for j, cluster_type in enumerate(cluster_types):
        s = df['{}_fmeasure'.format(cluster_type)]
        print(cluster_type,
              'mean f-measure:', s.mean(),
              'hit percentage:', 100 * s.where(s > 0).count() / len(s)
              )
def assess_classifier(classifier, test_set):
    """Print spam-class precision, recall, F-measure (alpha=0.5) and a rough
    false-positive rate for `classifier` over `test_set`.

    Fix: removed the unused local variable `count`.
    """
    refsets = collections.defaultdict(set)   # gold label -> sample ids
    testsets = collections.defaultdict(set)  # predicted label -> sample ids
    for i, (feats, label) in enumerate(test_set):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
    print('Precision = ' + str(precision(refsets['spam'], testsets['spam'])))
    print('Recall = ' + str(recall(refsets['spam'], testsets['spam'])))
    print('F measure = ' + str(f_measure(refsets['spam'], testsets['spam'], alpha=0.5)))
    # NOTE(review): this computes |#gold-ham - #pred-ham| / #total, which is
    # not a true false-positive rate -- confirm the intended metric.
    print('FP rate = ' + str(
        abs((len(refsets['ham']) - len(testsets['ham'])) / (len(refsets['spam']) + len(refsets['ham'])))))
def main():
    # Train (or reuse pickled) Unigram, TnT and Perceptron taggers on the
    # Brown "news" corpus, report evaluate() accuracy on held-out data, then
    # print set-based P/R/F over the produced tags.
    brown_tagged_sents = brown.tagged_sents(categories='news')
    size = int(len(brown_tagged_sents) * 0.8)  # 80/20 train/test split
    train_data = brown_tagged_sents[:size]
    test_data = brown_tagged_sents[size:]
    # store pickle file (train only when any of the three pickles is missing)
    if not (os.path.isfile('UnigramTagger.pkl') and os.path.isfile('Tnt_Tagger.pkl') and os.path.isfile('PerceptronTagger.pkl')):
        unigram_tagger = unigram_tag(train_data)
        tnt_tagger = tnt_tag(train_data)
        perc_tagger = perceptron_tag(train_data)
        [store_pickle(each_) for each_ in [unigram_tagger, tnt_tagger, perc_tagger]]
    # load pickle file and get each model file with a tuple
    models_files_tuple = [(each_.split('.')[0], retrieve_pickle(each_)) for each_ in ['UnigramTagger.pkl', 'PerceptronTagger.pkl', 'Tnt_Tagger.pkl']]
    # test the loaded models on test data
    print("TESTING LOADED MODELS")
    for tagg_name, tagg_mode in models_files_tuple:
        print("Loaded {tag_name} evaluation results: {evaluate_res}".format(tag_name=tagg_name, evaluate_res=tagg_mode.evaluate(test_data)))
    # Tabulate and calculate accuracies, choose best one based on F1 value
    # gold tags per sentence / raw tokens per sentence
    reference_sentences_lists = [list(map(lambda pair_: pair_[1], each)) for each in test_data]
    test_sentences_lists = [list(map(lambda pair_: pair_[0], each)) for each in test_data]
    reference_lst = list()
    test_lst = list()
    # Flatten the first 1000 sentences into token-level lists.
    [reference_lst.extend(each_lst) for each_lst in reference_sentences_lists[:1000]]
    [test_lst.extend(each_lst) for each_lst in test_sentences_lists[:1000]]
    for tagg_name, tagger_mod in models_files_tuple:
        if tagg_name == "Tnt_Tagger":
            # NOTE(review): this truncation persists for any tagger processed
            # after TnT in the loop -- confirm that is intentional.
            reference_lst = reference_lst[:700]
            test_lst = test_lst[:700]
        result_tokens = tagger_mod.tag(test_lst)
        # Replace None tags (unknown tokens) with a sentinel label.
        result_tokens__ = list(map(lambda pair: 'UNKNOWN' if pair[1] is None else pair[1], result_tokens))
        print("{} Evaluation Results".format(tagg_name))
        # NOTE(review): P/R/F over *sets of tag strings* measures tag-vocabulary
        # overlap, not per-token accuracy -- confirm this is intended.
        print("Precision: ", precision(set(reference_lst), set(result_tokens__)))
        print("Recall: ", recall(set(reference_lst), set(result_tokens__)))
        print("F measure: ", f_measure(set(reference_lst), set(result_tokens__)))
def compute_evaluation_scores(classifier: ClassifierBase,
                              data_set: List[Tuple[Dict, str]],
                              evaluated_class: LikeTypeEnum) \
        -> Dict[str, float]:
    """Evaluate classifier on dataset with common metrics.

    Namely calculates: precision, recall, accuracy, f-measure.
    And adds: tp, fp, np, tn (true/false positives/negatives)."""
    clas_scores: dict = {}
    correctly_classified: int = 0
    # metrics
    refsets: DefaultDict[str, set] = defaultdict(set)   # gold label -> ids
    testsets: DefaultDict[str, set] = defaultdict(set)  # predicted label -> ids
    for i, (fs, label) in enumerate(data_set):
        refsets[label].add(i)
        classified = classifier.classify(fs)
        testsets[classified].add(i)
        if label == classified:
            correctly_classified += 1
    # we don't know how many and what are the values of negative classes
    # therefore we compute union of all and subtract positive elements
    # NOTE(review): reduce() over an empty data_set raises TypeError here.
    negative_test: set = reduce(lambda a, b: a.union(b), testsets.values()) \
        - testsets[evaluated_class.value]
    negative_ref: set = reduce(lambda a, b: a.union(b), refsets.values()) \
        - refsets[evaluated_class.value]
    positive_test: set = testsets[evaluated_class.value]
    positive_ref: set = refsets[evaluated_class.value]
    # Confusion counts expressed as fractions of the whole dataset.
    clas_scores['tp'] = len(positive_test & positive_ref) / len(data_set)
    clas_scores['fp'] = len(positive_test & negative_ref) / len(data_set)
    clas_scores['tn'] = len(negative_test & negative_ref) / len(data_set)
    clas_scores['fn'] = len(negative_test & positive_ref) / len(data_set)
    clas_scores['precision'] = scores.precision(positive_ref, positive_test)
    clas_scores['recall'] = scores.recall(positive_ref, positive_test)
    clas_scores['f_measure'] = scores.f_measure(positive_ref, positive_test)
    # accuracy is true positives and true negatives over all instances
    clas_scores['accuracy'] = correctly_classified / len(data_set)
    return clas_scores
def run_baseline():
    """Random-assignment clustering baseline: shuffle book ids into two
    clusters, score them against the gold standard, then score the trivial
    one-cluster partition.

    Fix: converted Python 2 `print` statements to Python 3 calls; output
    text is unchanged.
    """
    gold_filter = []
    with open(FEATURE_CSV, "r") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            gold_filter += [int(row["book_id"])]
    # Assign each book to one of two clusters uniformly at random.
    clusters = [[], []]
    for id in gold_filter:
        clusters[random.randint(0, len(clusters) - 1)] += [id]
    f_score, recall, precision = score_clusters(clusters, get_gold_standard(gold_filter))
    print("%s,%s,%s" % (f_score, recall, precision))
    # Degenerate baseline: everything in one cluster (plus sentinel 2098).
    f_score, recall, precision = score_clusters([clusters[0] + clusters[1], [2098]],
                                                get_gold_standard(gold_filter))
    gold_standard = get_gold_standard(gold_filter)
    # NOTE(review): f_measure is called with *lists of sets* here -- confirm
    # this matches the metric implementation's expected input.
    print("f-score:", f_measure(
        [set(clusters[0] + clusters[1]), set([])],
        [set(gold_standard[0]), set(gold_standard[1])]
    ))
    print("%s,%s,%s" % (f_score, recall, precision))
def find_scores():
    # Trains a Naive Bayes classifier on pos/neg/neutral text files and
    # returns accuracy plus six pairwise F-measures over the test split.
    # Text formatting to classify: bag-of-words dict for NLTK classifiers.
    def format_text(text):
        return ({word: True for word in nltk.word_tokenize(text)})
    # Load positive categorized text
    pos = []
    with open("./pos.txt", encoding='ISO-8859-1') as f:
        for i in f:
            pos.append([
                format_text(i.encode("utf-8").decode("unicode-escape")),
                'positive'
            ])
    # Load negative categorized text
    neg = []
    with open("./neg.txt", encoding='ISO-8859-1') as f:
        for i in f:
            neg.append([
                format_text(i.encode("utf-8").decode("unicode-escape")),
                'negative'
            ])
    # Load neutral categorized text
    neu = []
    with open("./neu.txt", encoding='ISO-8859-1') as f:
        for i in f:
            neu.append([
                format_text(i.encode("utf-8").decode("unicode-escape")),
                'neutre'
            ])
    # Split data into training(80%) and testing(20%) sets
    training_set = pos[:int((.80) * len(pos))] + neg[:int(
        (.80) * len(neg))] + neu[:int((.80) * len(neu))]
    test_set = pos[int((.80) * len(pos)):] + neg[int(
        (.80) * len(neg)):] + neu[int((.80) * len(neu)):]
    # Training classifier
    classifier = NaiveBayesClassifier.train(training_set)
    # Calculate scores: trueset = gold labels, testset = predictions
    trueset = collections.defaultdict(set)
    testset = collections.defaultdict(set)
    # Test all test-set items using defined classifier
    for i, (text, label) in enumerate(test_set):
        trueset[label].add(i)
        result = classifier.classify(text)
        testset[result].add(i)
    # NOTE(review): the f_measure calls below mix mismatched labels and swap
    # the (reference, test) argument order between calls (e.g. gold-positive
    # vs predicted-negative). This looks like a bug, but the intended pairing
    # cannot be recovered from this code alone -- confirm with the authors.
    return accuracy(classifier, test_set), f_measure(
        trueset['positive'], testset['negative']), f_measure(
            testset['negative'], trueset['positive']), f_measure(
                testset['neutre'], trueset['positive']), f_measure(
                    testset['positive'], trueset['neutre']), f_measure(
                        testset['negative'], trueset['neutre']), f_measure(testset['neutre'], trueset['negative'])
def precision_recall_F_Measure(classifier, testfeats):
    """Return per-label precision, recall and F-measure dicts for an NLTK
    classifier evaluated on `testfeats` ((features, label) pairs)."""
    refsets = collections.defaultdict(set)   # gold label -> sample ids
    testsets = collections.defaultdict(set)  # predicted label -> sample ids
    for idx, (features, gold) in enumerate(testfeats):
        refsets[gold].add(idx)
        testsets[classifier.classify(features)].add(idx)
    precisions, recalls, f1_scores = {}, {}, {}
    for lbl in classifier.labels():
        precisions[lbl] = precision(refsets[lbl], testsets[lbl])
        recalls[lbl] = recall(refsets[lbl], testsets[lbl])
        f1_scores[lbl] = f_measure(refsets[lbl], testsets[lbl])
    return precisions, recalls, f1_scores
def macroOffEval(inpath1, inpath2):
    # Mention-level NER evaluation at corpus level: pools mention_offset
    # strings from every matched .ann file pair into two global sets and
    # scores them once.
    print('\n=============================')
    print(
        'NER evaluation (single entity class/mention-level, full/offsets, corpus-level)'
    )
    print('=============================')
    print('==> gold', inpath1)
    print('==> pred', inpath2)
    print('=============================')
    preds = set([])
    refrs = set([])
    for filename1 in glob.glob(inpath1 + "/*ann"):
        # basename of the gold file (last path component)
        filen1 = filename1.split('/')[len(filename1.split('/')) - 1]
        for filename2 in glob.glob(inpath2 + "/*ann"):
            filen2 = filename2.split('/')[len(filename2.split('/')) - 1]
            if filen1 == filen2:
                # NOTE(review): handles are never closed -- consider `with`.
                file1 = codecs.open(filename1, 'r', encoding='utf-8')
                file2 = codecs.open(filename2, 'r', encoding='utf-8')
                for line1 in file1.readlines():
                    if len(line1.split('\t')) > 1:
                        # column 2 = mention text, column 1 = offsets
                        men1 = line1.split('\t')[2].strip()
                        off1 = '-'.join([
                            w.strip() for w in line1.split('\t')[1].split(' ')
                        ])
                        gold = men1 + '_' + off1
                        refrs.add(gold)
                for line2 in file2.readlines():
                    if len(line2.split('\t')) > 1:
                        men2 = line2.split('\t')[2].strip()
                        off2 = '-'.join([
                            w.strip() for w in line2.split('\t')[1].split(' ')
                        ])
                        pred = men2 + '_' + off2
                        preds.add(pred)
    # NOTE(review): unlike avgOffEval there is no empty-set guard here; the
    # metrics may be None when either pooled set is empty.
    rec = scores.recall(refrs, preds)
    pre = scores.precision(refrs, preds)
    fsc = scores.f_measure(refrs, preds)
    print('macro \t R={R} \t P={P} \t F1={F}'.format(R=str(rec),
                                                     P=str(pre),
                                                     F=str(fsc)))
    print('=============================\n')
def show_metrics(classifier, test_set): description = "" # Given a classifier and a set to test it, it will print metrics for the classifier description = description + "\n" + "Accuracy: " + str( nltk.classify.accuracy(classifier, test_set)) # Creates two sets: one with references (correct results) and other with tests (classifier predictions) # This sets are divided in fact-checkable and non-fact-checkable sets that contain a unique id (integer) # for each sentence refsets = collections.defaultdict(set) testsets = collections.defaultdict(set) for i, (feats, label) in enumerate(test_set): refsets[label].add(i) # 1, neg observed = classifier.classify(feats) #neg testsets[observed].add(i) #1, neg model_precision = int( precision(refsets['fact-checkable'], testsets['fact-checkable']) * 100) model_recall = int( recall(refsets['fact-checkable'], testsets['fact-checkable']) * 100) model_f_measure = int( f_measure(refsets['fact-checkable'], testsets['fact-checkable'], 0.3) * 100) description += "\n" + "PRECISION: Of the sentences predicted fact-checkable, " + str( model_precision) + "% were actually fact-checkable" description += "\n" + "RECALL: Of the sentences that were fact-checkable, " + str( model_recall) + "% were predicted correctly" description += "\n" + "F-MEASURE (balance between precission and recall): " + str( model_f_measure) + "%" # Same for non fact-checkables #print('non-fact-checkable precision:', precision(refsets['non-fact-checkable'], testsets['non-fact-checkable'])) #print('non-fact-checkable recall:', recall(refsets['non-fact-checkable'], testsets['non-fact-checkable'])) #print('non-fact-checkable F-measure:', f_measure(refsets['non-fact-checkable'], testsets['non-fact-checkable'])) print(description) # informative classifier.show_most_informative_features(25) return description
def get_measures(reference, test):
    """Score a predicted tag sequence against a gold one.

    Both arguments are sequences of ``(token, tag)`` pairs compared in
    lockstep; only the tags "O" and "ORG" contribute to the confusion
    counts. Returns ``(accuracy, precision, recall, f_measure, matrix)``
    where ``matrix`` is ``[tp, tn, fp, fn]``.
    """
    tp = tn = fp = fn = 0
    for (_, gold_tag), (_, pred_tag) in zip(reference, test):
        if gold_tag == pred_tag:
            # Agreement: count as true negative for "O", true positive for "ORG".
            if gold_tag == "O":
                tn += 1
            elif gold_tag == "ORG":
                tp += 1
        elif gold_tag == "O" and pred_tag == "ORG":
            fp += 1
        elif gold_tag == "ORG" and pred_tag == "O":
            fn += 1
    matrix = [tp, tn, fp, fn]

    acc = accuracy(reference, test)
    # Set-based metrics operate on the unique (token, tag) pairs of each side.
    gold_pairs = set(reference)
    pred_pairs = set(test)
    pre = precision(gold_pairs, pred_pairs)
    rec = recall(gold_pairs, pred_pairs)
    f = f_measure(gold_pairs, pred_pairs)
    return acc, pre, rec, f, matrix
def run_baseline(): gold_filter = [] with open(FEATURE_CSV, 'r') as csvfile: reader = csv.DictReader(csvfile) for row in reader: gold_filter += [int(row['book_id'])] clusters = [[], []] for id in gold_filter: clusters[random.randint(0, len(clusters) - 1)] += [id] f_score, recall, precision = score_clusters(clusters, get_gold_standard(gold_filter)) print "%s,%s,%s" % (f_score, recall, precision) f_score, recall, precision = score_clusters( [clusters[0] + clusters[1], [2098]], get_gold_standard(gold_filter)) gold_standard = get_gold_standard(gold_filter) print "f-score:", f_measure( [set(clusters[0] + clusters[1]), set([])], [set(gold_standard[0]), set(gold_standard[1])]) print "%s,%s,%s" % (f_score, recall, precision)
def get_performance_dataframe(tagger, test_tag_list):
    """Returns DataFrame with metrics for individual tag combinations.

    For NLTK taggers: tags each word of *test_tag_list* (pairs of
    ``(word, gold_label)``) one at a time and reports Precision / Recall /
    F1 per *predicted* label, one row per label.
    """
    gold_ids = defaultdict(set)
    predicted_ids = defaultdict(set)
    for idx, (word, gold_label) in enumerate(test_tag_list):
        # tag() takes a sentence; we tag the single word and unpack its label.
        predicted_label = tagger.tag([word])[0][1]
        gold_ids[gold_label].add(idx)
        predicted_ids[predicted_label].add(idx)

    # One metrics row per label the tagger actually predicted.
    metrics = {
        label: {
            'Precision': precision(gold_ids[label], predicted_ids[label]),
            'Recall': recall(gold_ids[label], predicted_ids[label]),
            'F1': f_measure(gold_ids[label], predicted_ids[label]),
        }
        for label in predicted_ids.keys()
    }
    return pd.DataFrame(metrics).T
def compute_pairwise(hashed_er_anns_df):
    """
    Returns pairwise comparison between users (user_a & user_b)
    that have completed similar documents
    """
    # Unique annotators present in the frame.
    unique_users = set(hashed_er_anns_df.user_id)

    rows = []
    for user_a, user_b in itertools.combinations(unique_users, 2):
        # Documents each user has completed.
        docs_a = set(hashed_er_anns_df[
            hashed_er_anns_df['user_id'] == user_a].document_pk)
        docs_b = set(hashed_er_anns_df[
            hashed_er_anns_df['user_id'] == user_b].document_pk)

        # Restrict the comparison to documents both users completed.
        shared_pmids = docs_a.intersection(docs_b)
        if not shared_pmids:
            continue

        shared_df = hashed_er_anns_df[
            hashed_er_anns_df['document_pk'].isin(shared_pmids)]
        ref_hashes = set(shared_df[shared_df['user_id'] == user_a].hash)
        test_hashes = set(shared_df[shared_df['user_id'] == user_b].hash)

        # Precision / recall / F based on the unique annotation hashes.
        rows.append((user_a, user_b, len(shared_pmids),
                     nltk_scoring.precision(ref_hashes, test_hashes),
                     nltk_scoring.recall(ref_hashes, test_hashes),
                     nltk_scoring.f_measure(ref_hashes, test_hashes)))

    return pd.DataFrame(rows,
                        columns=('user_a', 'user_b', 'docs_compared',
                                 'precision', 'recall', 'f-score'))
def scores(classifier, test, ids):
    """Print and return accuracy plus macro-averaged P/R/F1 for *classifier*.

    Args:
        classifier: an NLTK-style classifier with a ``classify`` method.
        test: iterable of ``(features, label)`` pairs.
        ids: the sense labels to average the per-label metrics over.

    Returns:
        dict with keys 'precision', 'recall', 'f_1', 'accuracy'.
    """
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(test):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
    accuracy = nltk.classify.accuracy(classifier, test)
    print("accuracy: " + str(accuracy))
    # Bug fix: in Python 3, filter() returns a lazy iterator, so the original
    # len(p) / len(r) / len(f_1) calls raised TypeError. Materialize the
    # filtered (non-None) per-label metrics before averaging.
    p = list(filter(partial(is_not, None),
                    [precision(refsets[sense], testsets[sense])
                     for sense in ids]))
    p = sum(p) / len(p)
    print("precision: " + str(p))
    r = list(filter(partial(is_not, None),
                    [recall(refsets[sense], testsets[sense])
                     for sense in ids]))
    r = sum(r) / len(r)
    print("recall: " + str(r))
    f_1 = list(filter(partial(is_not, None),
                      [f_measure(refsets[sense], testsets[sense])
                       for sense in ids]))
    f_1 = sum(f_1) / len(f_1)
    print("f-1 score: " + str(f_1))
    return ({"precision": p, "recall": r, "f_1": f_1, "accuracy": accuracy})
def compute_pairwise(hashed_annotations_df):
    '''
    Returns pairwise comparison between users (user_a & user_b)
    that have completed similar documents
    '''
    # Every annotator that appears in the frame, deduplicated.
    annotators = set(hashed_annotations_df.user)

    pair_rows = []
    # Score every unordered pair of annotators.
    for user_a, user_b in itertools.combinations(annotators, 2):
        completed_a = set(hashed_annotations_df[
            hashed_annotations_df['user'] == user_a].document_id)
        completed_b = set(hashed_annotations_df[
            hashed_annotations_df['user'] == user_b].document_id)

        # Only documents both annotators completed are comparable.
        pmid_set = completed_a & completed_b
        if len(pmid_set) == 0:
            continue

        overlap_df = hashed_annotations_df[
            hashed_annotations_df['document_id'].isin(pmid_set)]
        ref_set = set(overlap_df[overlap_df['user'] == user_a].hash)
        test_set = set(overlap_df[overlap_df['user'] == user_b].hash)

        # Agreement metrics computed over the unique annotation hashes.
        pair_rows.append((
            user_a,
            user_b,
            len(pmid_set),
            nltk_scoring.precision(ref_set, test_set),
            nltk_scoring.recall(ref_set, test_set),
            nltk_scoring.f_measure(ref_set, test_set),
        ))

    return pd.DataFrame(pair_rows,
                        columns=('user_a', 'user_b', 'docs_compared',
                                 'precision', 'recall', 'f-score'))
def f_measure(self, label, alpha=0.5):
    """Return the F-measure for *label*, weighting precision by *alpha*.

    Delegates to scores.f_measure over the stored reference/test id sets.
    """
    reference_ids = self._referenceSets[label]
    test_ids = self._testSets[label]
    return scores.f_measure(reference_ids, test_ids, alpha)
model.class_prior = [1-categorized_proportion, categorized_proportion] else: model.class_prior = [categorized_proportion, 1-categorized_proportion] classifier.train(train_set) # test classifier test_results = classifier.classify_many([feat for (feat, label) in test_set]) pos_test_set = set(i for i, result in enumerate(test_results) if result == category) reference_values = [label for (feat, label) in test_set] pos_ref_set = set(i for i, (feat, label) in enumerate(test_set) if label == category) accuracy = scores.accuracy(reference_values, test_results) accuracies.append(accuracy) precision = scores.precision(pos_ref_set, pos_test_set) recall = scores.recall(pos_ref_set, pos_test_set) f1 = scores.f_measure(pos_ref_set, pos_test_set) f1_scores.append(f1) print "%s: accuracy %s, precision %s, recall %s, F1 %s" % (colored(category, "blue"), colored(accuracy, "yellow"), colored(precision, "yellow"), colored(recall, "yellow"), colored(f1, "yellow")) ## print(nltk.classify.accuracy(classifier, test_set)) # classifier.show_most_informative_features(5) # print "" # save trained classifier and word features to file dump_file = open("classifiers/%s.pickle" % category, "wb") pickle.dump({ "classifier": classifier, "word_features": word_features }, dump_file) dump_file.close()
def getFMeasure(self):
    """Return the F-measure for the positive ('POS') class."""
    positive_refs = self._refsets['POS']
    positive_preds = self._testsets['POS']
    return f_measure(positive_refs, positive_preds)
def test_iteration(i, train_set, test_dict, feature_sets_by_match, classifier_type='decision_tree'):
    """Performs one iteration of the k-fold cross validation, returing a dict
    containing overall micro and macro score averages, in addition to scores
    for each label.

    Args:
        i: the iteration of the k-fold cross validation.
        train_set: a list containing feature, rating pairs
        test_dict: a dicitonary containing feature and rating information
            for the test set.
        feature_sets_by_match: feature respresentations of documents
            organized by match.
        classifier_type: the type of classifier to use.

    Returns:
        A dict containing overall micro and macro score averages, in
        addition to scores for each label.
    """
    # Train the requested classifier type (sklearn models wrapped for NLTK).
    # NOTE(review): if classifier_type is none of the three options below,
    # classifier stays '' and classify() below raises AttributeError.
    classifier = ''
    if classifier_type == 'decision_tree':
        #classifier = nltk.classify.DecisionTreeClassifier.train(train_set)
        classifier = nltk.classify.scikitlearn.SklearnClassifier(tree.DecisionTreeClassifier(random_state=8246)).train(train_set)
    elif classifier_type == 'maxent':
        #classifier = nltk.classify.maxent.MaxentClassifier.train(train_set)
        classifier = nltk.classify.scikitlearn.SklearnClassifier(linear_model.LogisticRegression()).train(train_set)
    elif classifier_type == 'svr':
        classifier = nltk.classify.scikitlearn.SklearnClassifier(svm.SVR()).train(train_set)
    # Per-label doc-id sets for gold and predicted ratings.
    pred_sets = initialize_sets(ALL_RATINGS)
    gold_sets = initialize_sets(ALL_RATINGS)
    pred_list = []
    gold_list = []
    # Classify predictions and add them to relevant dicts and lists.
    # Side effect: each test_dict entry gets a 'pred' key with the prediction.
    for match in test_dict:
        for doc_id in test_dict[match]:
            test_doc = test_dict[match][doc_id]['features']
            pred = classifier.classify(test_doc)
            gold = test_dict[match][doc_id]['gold']
            test_dict[match][doc_id]['pred'] = pred
            gold_list.append(str(gold))
            pred_list.append(str(pred))
            gold_sets[gold].add(doc_id)
            pred_sets[pred].add(doc_id)
    # Calculate pairwise ranking accuracy: for every pair of docs in a match,
    # count the pair correct when predictions order (or tie) the two docs the
    # same way the gold ratings do.
    # NOTE(review): raises ZeroDivisionError if no match has >= 2 documents.
    correct= 0
    total = 0
    for match in test_dict:
        for pl1, pl2 in combinations(test_dict[match].keys(), 2):
            p1 = test_dict[match][pl1]
            p2 = test_dict[match][pl2]
            if p1['gold'] > p2['gold'] and p1['pred'] > p2['pred']:
                correct += 1
            elif p1['gold'] < p2['gold'] and p1['pred'] < p2['pred']:
                correct += 1
            elif p1['gold'] == p2['gold'] and p1['pred'] == p2['pred']:
                correct += 1
            total += 1
    print('Pairwise ranking accuracy: ' + str(correct/total))
    fold_scores = {'micro': '',
                   'macro': '',
                   'by_label': {rating: {'p': 0, 'r': 0, 'f': 0} for rating in ALL_RATINGS}
                   }
    # Micro / macro P-R-F over the stringified label lists (sklearn).
    prf_micro = precision_recall_fscore_support(gold_list, pred_list, average='micro')
    print(prf_micro)
    fold_scores['micro'] = prf_micro
    prf_macro = precision_recall_fscore_support(gold_list, pred_list, average='macro')
    print(prf_macro)
    fold_scores['macro'] = prf_macro
    # Per-label P/R/F via NLTK set-based metrics; None (undefined metric,
    # e.g. empty prediction set) is coerced to 0.0.
    for label in ALL_RATINGS:
        r = scores.recall(gold_sets[label], pred_sets[label])
        p = scores.precision(gold_sets[label], pred_sets[label])
        f = scores.f_measure(gold_sets[label], pred_sets[label])
        if r == None:
            r = 0.0
        if p == None:
            p = 0.0
        if f == None:
            f = 0.0
        fold_scores['by_label'][label]['p'] = p
        fold_scores['by_label'][label]['r'] = r
        fold_scores['by_label'][label]['f'] = f
        # float() so the {:.3} format spec below accepts the value.
        f = float(f)
        print('<{}> P: {:.3}, R: {:.3}, F: {:.3}'.format(label, p, r, f))
    return fold_scores