Пример #1
0
def _read_layer(prompt):
    """Prompt until the user enters a usable layer of indices.

    Returns a set of ints parsed from one whitespace-separated input line,
    or None when the user submits an empty line (the "done" signal).
    A line containing a non-integer token is rejected and the prompt is
    repeated, so one typo does not throw away the layers entered so far.
    """
    while True:
        line = raw_input(prompt)
        if not line:
            return None
        try:
            return set(int(token) for token in line.split())
        except ValueError:
            print("Invalid input %r: enter whitespace-separated integer indices." % line)


def generate_diff_data(entities, person_mat, org_mat, loc_mat):
    """Interactively build pairwise training data for each entity type.

    For PERSON, ORGANIZATION and LOCATION (in that order) the indices
    returned by manual_ranker are printed next to entities[type][i][0],
    and the user is asked repeatedly for "Good results:". Each non-empty
    answer becomes one layer (a set of indices); an empty answer ends the
    prompting, and every shown index not yet mentioned forms the last layer.

    For every item a in an earlier layer and every item b in a later layer,
    two samples are produced: dist_vector(mat[a], mat[b]) labelled 1 and
    dist_vector(mat[b], mat[a]) labelled -1.

    Returns a list of three (Xs, ys) tuples, one per entity type in the
    order above.
    """
    person_result, org_result, loc_result = manual_ranker(entities, person_mat, org_mat, loc_mat)
    data_pair = [
        ("PERSON", person_mat, person_result),
        ("ORGANIZATION", org_mat, org_result),
        ("LOCATION", loc_mat, loc_result),
    ]
    ret = []
    for entity_type, mat, result in data_pair:
        print("\n\nShowing %s" % entity_type)
        for i in result:
            print("%d: %s" % (i, entities[entity_type][i][0]))

        # Collect user-defined layers; whatever is never mentioned ends up
        # in the final layer.
        remainder = set(result)
        layers = []
        while True:
            layer = _read_layer("Good results:")
            if layer is None:
                break
            layers.append(layer)
            remainder -= layer
        layers.append(remainder)

        # Emit both orientations of every cross-layer pair so the labels
        # are balanced between 1 and -1.
        Xs, ys = [], []
        for pos, earlier in enumerate(layers):
            for later in layers[pos + 1:]:
                for a in earlier:
                    for b in later:
                        Xs.append(dist_vector(mat[a], mat[b]))
                        ys.append(1)
                        Xs.append(dist_vector(mat[b], mat[a]))
                        ys.append(-1)
        ret.append((Xs, ys))
    return ret
Пример #2
0
def ml_ranker(entities, person_mat, org_mat, loc_mat):
    """Rank the rows of each matrix using pairwise classifiers.

    Three classifiers (person, organization, location) are unpickled from
    "classifiers.dat". For each matrix, every unordered pair of rows (i, j)
    is judged twice: once via clf.predict(dist_vector(row_i, row_j)), where
    a prediction equal to [1] is a vote for i and anything else a vote for
    j, and once with the arguments swapped, where a prediction equal to
    [-1] is a vote for i and anything else a vote for j.

    Parameters:
        entities: not used by this function; kept in the signature.
        person_mat, org_mat, loc_mat: indexable collections of rows; each
            is passed to normalize() before scoring (presumably normalized
            in place, since the return value is ignored - TODO confirm).

    Returns:
        (ret, scores) where ret holds, per matrix, the row indices sorted
        by descending vote count, and scores holds the per-row vote counts.
    """
    # NOTE(review): unpickling can execute arbitrary code; classifiers.dat
    # must come from a trusted source.
    # The mode stays "r" to match the original behaviour: pickle text/binary
    # modes have to agree with however the file was written, and the writer
    # is not visible here.
    with open("classifiers.dat", "r") as clf_file:
        clf_person, clf_org, clf_loc = pickle.load(clf_file)

    data_set = [(person_mat, clf_person), (org_mat, clf_org), (loc_mat, clf_loc)]
    ret = []
    scores = []
    for data, clf in data_set:
        normalize(data)
        score = [0] * len(data)
        for i in xrange(len(data) - 1):
            for j in xrange(i + 1, len(data)):
                # Forward orientation: [1] means i beats j.
                if clf.predict(dist_vector(data[i], data[j])) == [1]:
                    score[i] += 1
                else:
                    score[j] += 1
                # Reversed orientation: [-1] means i still beats j.
                if clf.predict(dist_vector(data[j], data[i])) == [-1]:
                    score[i] += 1
                else:
                    score[j] += 1
        # Highest vote count first; ties keep ascending index order because
        # sorted() is stable.
        result = sorted(range(len(data)), key=lambda i: -score[i])
        ret.append(result)
        scores.append(score)
    return ret, scores