writer.writerow([unicode(x) for x in ['id'] + [x[0] for x in labels] + ['diff', 'snippet']]) else: writer.writerow([unicode(x) for x in ['id', 'predicted', 'coded', 'confidence', 'correct?', 'diff', 'snippet']]) vecs = map(lambda x: x.vector, vectors) output = {} for (lname, labs) in labels: m = models[lname] if m == None: print >>sys.stderr, lname continue print lname + ': ' lab,acc,val = liblinear.linearutil.predict(labs, vecs, m, '-b 1') # print performances and failure cases pn = pn_t({True: 0, False: 0}, {True: 0, False: 0}) for (i,pred) in enumerate(lab): ok = bool(pred) == labs[i] res = 'Yes' if ok else 'No' if labs[i] == None: res = 'Unknown' else: if pred > 0: pn.p[ok] += 1 else: pn.n[ok] += 1 revid = vectors[i].raw['id']['rev_id'] if vectors[i].raw['id'].has_key('rev_id') else None link = 'http://enwp.org/?diff=prev&oldid=%s' % revid ls = [lname, repr(vectors[i].raw['id']), bool(pred),
dest='verbose', action='store_true', default=False, help='turn on verbose message output') parser.add_argument('input', nargs='+', type=lambda x: open(x)) options = parser.parse_args() # load raw table of coded examples csv.field_size_limit(1000000000) table = [] for f in options.input: t = list(csv.reader(f, delimiter=options.delimiter)) table += t[1:] pns = {} for cols in table: lab,pred,code = [cols[x] for x in [options.label, options.pred, options.code]] pn = pns.setdefault(lab, pn_t({True: 0, False: 0}, {True: 0, False: 0})) if code == options.ignore: None else: ok = (pred == code) if pred == options.positive: pn.p[ok] += 1 else: pn.n[ok] += 1 for (lab,pn) in sorted(pns.items(), key=lambda x: x[0]): print lab numcorrect = pn.p[True] + pn.n[True] numwrong = pn.p[False] + pn.n[False] print ' accuracy = %f (%d/%d)' % (float(numcorrect) / (numcorrect + numwrong) if numcorrect + numwrong > 0 else float('nan'), numcorrect, (numcorrect + numwrong))