示例#1
0
    query = {'vector': {'$exists': True}}
    query.update(ast.literal_eval(options.find))
    cursor = db.find(query)
    print >>sys.stderr, 'labeld examples: %s out of %s' % (cursor.count(), db.count())

    vectors = []
    labels = {}
    for x in models.keys():
        labels[x] = []
    for ent in cursor:
        for name in labels.keys():
            value = None
            if ent.has_key('labels') and ent['labels'].has_key(name):
                value = ent['labels'][name] if 1 else -1
            labels.setdefault(name, []).append(value)
        vectors.append(entry_t(ent['entry'], ent['features'], myutils.map_key_dict(int, ent['vector'])))

    for (name,vals) in labels.items():
        assert len(vectors) == len(vals), [len(vectors), len(vals), name]

    labels = sorted(labels.items(), key=lambda x: x[0])

    writer = csv.writer(options.output, delimiter='\t')
    if options.aggregate:
        writer.writerow([unicode(x) for x in ['id'] + [x[0] for x in labels] + ['diff', 'snippet']])
    else:
        writer.writerow([unicode(x) for x in ['id', 'predicted', 'coded', 'confidence', 'correct?', 'diff', 'snippet']])
    vecs = map(lambda x: x.vector, vectors)
    output = {}
    for (lname, labs) in labels:
        m = models[lname]
    # establish MongoDB connection
    collection = myutils.get_mongodb_collection(options.hosts, options.database)

    # load models for each label
    models = test.load_models(collection['models'], ast.literal_eval(options.model))

    cursor = myutils.get_mysql_connection(options.host, options.db).cursor()
    # construct the testing set from the MediaWiki table
    vectors = []
    for ent in wikilove_revs.get_entries(cursor, options.start, options.end, options.window, options.limit, newest=True):
        features = extract_features.extract_features({'entry': {'content': {'added': [ent.others.message], 'removed':[]},
                                                                'comment': ''}})
        vector = myutils.map_key_dict(int, extract_features.extract_vector(features, options.bits))
        if ent.receiver_id != ent.sender_id:
            vectors.append(myutils.entry_t(ent, features, vector))

    labels = sorted(models.keys())
    
    vecs = [x.vector for x in vectors]
    predictions = [[[] for y in xrange(0, len(labels))] for x in xrange(0,len(vectors))]
    for (n,lname) in enumerate(labels):
        lab,_,val = liblinear.linearutil.predict([0]*len(vecs), vecs, models[lname], '-b 1')
        for (i,(pred,score)) in enumerate(zip(lab,val)):
            predictions[i][n] = score[1] # get the confidence for the label being 'True'

    print >>options.output, '<style type="text/css">.prediction{text-align: right;} td{vertical-align: top;} li{border: 1px solid; list-style: none inside; margin: 0.2em;} ul{padding: 0;} blockquote{ font: normal italic  100% serif; }</style>'
    print >>options.output, '<body style="background: #EEE;">Generated at %s.' % str(datetime.now())
    print >>options.output, '<table style="background: white; width: 100%"><tr>'
    for (i,x) in enumerate(labels):
        print >>options.output, '<th>%s: %d out of %d (>%f)</th>' % (x, len(filter(lambda x: x[i] > options.threshold, predictions)), len(predictions), options.threshold)