# Assumed imports for these excerpts, inferred from usage below rather than
# copied from the original file header.
import scipy as sp
import scipy.stats
from discourse.hypergraph import s2i, recover_order


def bigram_acc(transitions):
    """
    Compute the bigram overlap (accuracy) for a list of predicted
    Transitions.

    transitions -- A list of discourse.hypergraph.Transition objects.

    returns bigram overlap (accuracy)
    """
    ntrans = len(transitions)

    # Get predicted bigrams.
    pred_bg = set([(s2i(t.sentences[1]), s2i(t.sentences[0], end='end'))
                   for t in recover_order(transitions)])

    # Create gold bigrams.
    gold = set([(i, i + 1) for i in range(-1, ntrans - 2)])
    gold.add((ntrans - 2, 'end'))

    # If either set is empty, return None.
    if len(pred_bg) == 0 or len(gold) == 0:
        return None

    nbigrams = len(gold)
    acc = len(pred_bg & gold) / float(nbigrams)
    return acc
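# A minimal sketch of the bigram convention bigram_acc relies on; the helper
# below is a hypothetical illustration and not part of the original module.
# Index -1 stands for the START marker and the string 'end' closes the
# ordering, so a two-sentence document (transitions START->0, 0->1, 1->END,
# i.e. ntrans = 3) yields exactly the gold bigrams {(-1, 0), (0, 1), (1, 'end')}.
def _example_gold_bigrams():
    """Hypothetical illustration: gold bigrams for ntrans = 3 transitions."""
    ntrans = 3
    gold = set([(i, i + 1) for i in range(-1, ntrans - 2)])
    gold.add((ntrans - 2, 'end'))
    assert gold == set([(-1, 0), (0, 1), (1, 'end')])
    # A prediction recovering 2 of these 3 bigrams scores 2 / 3.0 under
    # bigram_acc.
    return gold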
def oso_acc(transitions):
    """
    Return 1 if the predicted ordering exactly matches the gold
    ordering, otherwise 0.

    transitions -- A list of discourse.hypergraph.Transition objects.
    """
    ntrans = len(transitions)

    # Get the predicted sentence order (the END marker maps to
    # ntrans - 1).
    pred = [s2i(t.sentences[0], end=ntrans - 1)
            for t in recover_order(transitions)]

    if tuple(pred) == tuple([i for i in range(ntrans)]):
        return 1
    else:
        return 0
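# Illustrative note: oso_acc is an exact-match score, returning 1 only when
# the predicted order reproduces the gold order exactly. The helper below is
# a hypothetical restatement of that comparison, not part of the original
# module; e.g. _example_oso_check([0, 1, 2, 3]) == 1 while
# _example_oso_check([0, 2, 1, 3]) == 0.
def _example_oso_check(pred_order):
    """Hypothetical sketch: 1 if pred_order is exactly [0, 1, ..., n-1]."""
    return 1 if tuple(pred_order) == tuple(range(len(pred_order))) else 0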
def kendalls_tau(transitions):
    """
    Compute Kendall's tau and p-value for a list of
    discourse.hypergraph.Transition objects.

    transitions -- A list of discourse.hypergraph.Transition objects.

    returns (kt, pval)
    """
    # Get the list of sentence indices implied by the transition set.
    indices = [s2i(t.sentences[0])
               for t in recover_order(transitions)[:-1]]

    # Get gold indices.
    gold = [i for i in range(len(indices))]

    # Compute Kendall's tau for these two sequences.
    kt, pval = sp.stats.kendalltau(indices, gold)
    return kt, pval
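# A self-contained sketch of the scipy call kendalls_tau depends on, using
# hypothetical toy orderings: a perfect prediction gives tau = 1.0, and a
# single adjacent swap in a four-sentence document drops it to about 0.667.
# This helper is illustrative only and not part of the original module.
def _example_kendalls_tau_values():
    """Hypothetical illustration of scipy.stats.kendalltau on toy orderings."""
    import scipy.stats  # local import keeps this sketch self-contained
    gold = [0, 1, 2, 3]
    perfect, _ = scipy.stats.kendalltau([0, 1, 2, 3], gold)
    swapped, _ = scipy.stats.kendalltau([0, 2, 1, 3], gold)
    return perfect, swapped  # approximately (1.0, 0.667)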
# Script excerpt: print per-document diagnostics (Kendall's tau, bigram
# accuracy, and the gold transition sequence) for the first 20 test items.
print_model_metrics(cutoff_trainY, cutoff_testY)

for i, datum in enumerate(izip(testX, gtestY[0:20], ptestY), 1):
    print u'TEST NO: {:3}\n============\n'.format(i)
    testx, gtesty, ptesty = datum

    kt, pval = evaluation.kendalls_tau(ptesty)
    print u'Kendall\'s Tau : {:.3f} (pval {:.3f})'.format(kt, pval)
    print u'Bigram Acc.   : {:.3f}'.format(evaluation.bigram_acc(ptesty))
    print

    print u'GOLD ORDERING\n==================\n'
    print unicode(testx.trans2str(gtesty))
    print

    for t in hypergraph.recover_order(gtesty):
        print u'TRANSITION: {}'.format(unicode(t))
        print u'=' * 79

        idx1 = hypergraph.s2i(t.sents[1])
        sent1 = testx[idx1] if idx1 > -1 else u'START'
        idx2 = hypergraph.s2i(t.sents[0])
        sent2 = testx[idx2] if idx2 is not None else u'END'

        print textwrap.fill(u'({:3}) {}'.format(idx1, unicode(sent1)))
        print u' |\n V'
        # s2i returns None for the final END transition, and None does not
        # accept the '{:3}' format spec, so fall back to a plain label.
        idx2_label = u'{:3}'.format(idx2) if idx2 is not None else u'END'
        print textwrap.fill(u'({}) {}\n'.format(idx2_label, unicode(sent2)))

        evaluation.explain_transition(t, model, testx)
        print
def eval_against_baseline(testX, baselineY, newY, baseline_model, new_model,
                          base_feats, new_feats,
                          baseline_pred_trainY=None, new_pred_trainY=None):
    """
    Evaluate differences between two models. Prints a per-instance
    analysis of the transitions predicted by the baseline and new models.

    testX -- A list of corenlp.Document objects to evaluate on.

    baselineY -- A list of lists of discourse.hypergraph.Transition
        objects predicted by the baseline model for the documents in
        testX.

    newY -- A list of lists of discourse.hypergraph.Transition objects
        predicted by the new model for the documents in testX.

    baseline_model -- A discourse.perceptron.Perceptron object trained
        on the features in base_feats.

    new_model -- A discourse.perceptron.Perceptron object trained on
        the features in new_feats.

    base_feats -- A dict of feature names to boolean values, indicating
        the features active in the baseline model.

    new_feats -- A dict of feature names to boolean values, indicating
        the features active in the new model.

    baseline_pred_trainY -- Optional list of lists of Transition objects
        predicted by the baseline model for the training documents.

    new_pred_trainY -- Optional list of lists of Transition objects
        predicted by the new model for the training documents.
    """
    # Limit text output to 80 chars and wrap nicely.
    wrapper = textwrap.TextWrapper(subsequent_indent='\t')

    print u'OVERALL STATS FOR TEST DOCUMENTS'

    # Print macro-averaged Kendall's Tau and p-values for the baseline
    # and new models.
    bl_avg_kt, bl_avg_pval = avg_kendalls_tau(baselineY)
    new_avg_kt, new_avg_pval = avg_kendalls_tau(newY)
    print u'\t | BASELINE | NEW'
    print u'{:14} {:.3f} ({:.3f}) | {:.3f} ({:.3f})\n'.format(
        u'Kendalls Tau', bl_avg_kt, bl_avg_pval, new_avg_kt, new_avg_pval)

    # Print bigram gold sequence overlap (accuracy) for the baseline and
    # new models.
    bl_bg_acc = mac_avg_bigram_acc(baselineY)
    new_bg_acc = mac_avg_bigram_acc(newY)
    print u'\t | BASELINE | NEW'
    print u'{:12} | {:.3f} | {:.3f} \n'.format(
        u'bigram acc', bl_bg_acc, new_bg_acc)

    if baseline_pred_trainY is not None or new_pred_trainY is not None:

        if baseline_pred_trainY is not None:
            bl_avg_kt_train, bl_avg_pval_train = avg_kendalls_tau(
                baseline_pred_trainY)
            bl_bg_acc_train = mac_avg_bigram_acc(baseline_pred_trainY)
        else:
            bl_avg_kt_train = float('nan')
            bl_avg_pval_train = float('nan')
            bl_bg_acc_train = float('nan')

        if new_pred_trainY is not None:
            new_avg_kt_train, new_avg_pval_train = avg_kendalls_tau(
                new_pred_trainY)
            new_bg_acc_train = mac_avg_bigram_acc(new_pred_trainY)
        else:
            new_avg_kt_train = float('nan')
            new_avg_pval_train = float('nan')
            new_bg_acc_train = float('nan')

        print u'OVERALL STATS FOR TRAINING DOCUMENTS'
        print u'\t | BASELINE | NEW'
        print u'{:14} {:.3f} ({:.3f}) | {:.3f} ({:.3f})\n'.format(
            u'Kendalls Tau', bl_avg_kt_train, bl_avg_pval_train,
            new_avg_kt_train, new_avg_pval_train)

        print u'\t | BASELINE | NEW'
        print u'{:12} | {:.3f} | {:.3f} \n'.format(
            u'bigram acc', bl_bg_acc_train, new_bg_acc_train)

    # Print stats for individual test instances.
    for test_idx, datum in enumerate(izip(testX, baselineY, newY), 1):
        testx, baseliney, newy = datum

        print u'TEST NO. {:4}\n=============\n'.format(test_idx)

        # Print Kendall's Tau and p-value for the baseline and new
        # models for this test instance.
        bl_kt, bl_pval = kendalls_tau(baseliney)
        new_kt, new_pval = kendalls_tau(newy)
        print u'\t | BASELINE | NEW'
        print u'{:14} {:.3f} ({:.3f}) | {:.3f} ({:.3f})\n'.format(
            u'K. Tau', bl_kt, bl_pval, new_kt, new_pval)

        # Print bigram gold sequence overlap (accuracy) for the baseline
        # and new models.
        bl_acc = bigram_acc(baseliney)
        new_acc = bigram_acc(newy)
        print u'\t | BASELINE | NEW'
        print u'{:12} | {:.3f} | {:.3f} \n'.format(
            u'bigram acc', bl_acc, new_acc)

        # Print document sentences in correct order.
        print u'GOLD TEXT\n=========\n'
        for i, s in enumerate(testx):
            print wrapper.fill(u'({:3}) {}'.format(i, unicode(s)))
        print u'\n\n'

        # Print document sentences in baseline order.
        print u'BASELINE TEXT\n=========\n'
        indices = [s2i(t.sents[0]) for t in recover_order(baseliney)[:-1]]
        for i in indices:
            print wrapper.fill(u'({}) {}'.format(i, unicode(testx[i])))
        print u'\n\n'

        # Print document sentences in new model order.
        print u'NEW MODEL TEXT\n=========\n'
        indices = [s2i(t.sents[0]) for t in recover_order(newy)[:-1]]
        for i in indices:
            print wrapper.fill(u'({}) {}'.format(i, unicode(testx[i])))
        print u'\n\n'

        # Get predicted transitions in order for both models.
        # NOTE: The predict function of the Perceptron object returns
        # the predicted transitions in no particular order.
        # When in doubt, use recover_order on any predicted output
        # if you want to iterate over it as if you were traversing the
        # graph of sentence transitions.
        baseline_trans = discourse.hypergraph.recover_order(baseliney)
        new_trans = discourse.hypergraph.recover_order(newy)

        # Map the tail sentence of a transition to the transition.
        p2t_baseline = _position2transition_map(baseline_trans)
        p2t_new = _position2transition_map(new_trans)

        # For each transition leaving the same sentence, if the models
        # disagree on what the next sentence is, print an analysis of
        # the model features.
        for pos, t_bl in p2t_baseline.items():
            if p2t_new[pos].sents[0] != t_bl.sents[0]:
                t_new = p2t_new[pos]

                # Print the tail sentence.
                if pos > -1:
                    pos_str = unicode(testx[pos])
                else:
                    pos_str = u'START'
                print u'=' * 80
                print wrapper.fill(u'({:3}) {}'.format(pos, pos_str))
                print u'-' * 80
                print u' |\n V'

                # Print the baseline head sentence.
                if s2i(t_bl.sents[0]) is not None:
                    bl_str = unicode(testx[s2i(t_bl.sents[0])])
                else:
                    bl_str = u'END'
                print wrapper.fill(u'(OLD) {}\n'.format(bl_str)) + u'\n'

                # Print baseline model features for the predicted
                # baseline transition.
                explain(t_bl, baseline_model, new_model, testx,
                        base_feats, new_feats)

                # Print the new model head sentence.
                if s2i(t_new.sents[0]) is not None:
                    new_str = unicode(testx[s2i(t_new.sents[0])])
                else:
                    new_str = u'END'
                print wrapper.fill(u'(NEW) {}\n'.format(new_str)) + u'\n'

                # Print new model features for the predicted new
                # model transition.
                explain(t_new, baseline_model, new_model, testx,
                        base_feats, new_feats)

                # Print the gold head sentence, that is, the sentence
                # the models should have selected.
                if pos + 1 < len(testx):
                    gstr = u'(GLD) {}\n'.format(unicode(testx[pos + 1]))
                    print wrapper.fill(gstr) + u'\n'

                if pos + 1 == s2i(t_bl.sents[0], end=len(testx)):
                    print 'OLD MODEL IS CORRECT\n'
                if pos + 1 == s2i(t_new.sents[0], end=len(testx)):
                    print 'NEW MODEL IS CORRECT\n'

                print
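# A hedged usage sketch for eval_against_baseline. The feature-flag names
# below are hypothetical placeholders, and predict() is the Perceptron
# method referenced in the NOTE above; this is one plausible driver, not the
# original experiment code.
def _example_eval_against_baseline(testX, baseline_model, new_model):
    """Hypothetical driver comparing two trained models on testX."""
    base_feats = {'featA': True, 'featB': False}  # hypothetical flags
    new_feats = {'featA': True, 'featB': True}    # hypothetical flags
    baselineY = [baseline_model.predict(x) for x in testX]
    newY = [new_model.predict(x) for x in testX]
    eval_against_baseline(testX, baselineY, newY,
                          baseline_model, new_model,
                          base_feats, new_feats)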