# Exemplo n.º 1 (example marker from source aggregation)
def predict_attr(data_set, org_attributes, adj_vecs, attr_vecs, output_file):
    adj_attr_cosine = np.dot(adj_vecs, attr_vecs.T)
    index_to_max_att_per_adj = np.argmax(
        adj_attr_cosine,
        axis=1)  # take the row index of the max score for each attribute

    lcs_attr_idx = []
    for idx, samp in enumerate(data_set):
        attr_lcs = [lcs(attr, samp.adj) for attr in org_attributes]
        lcs_len = max(attr_lcs)
        attr_idx = attr_lcs.index(lcs_len)
        # attr = org_attributes[attr_idx]
        lcs_attr_idx.append(attr_idx)
        if lcs_len > 4:
            # if samp.attr == org_attributes[attr_idx]  :
            index_to_max_att_per_adj[idx] = attr_idx
            # elif org_attributes[index_to_max_att_per_adj[idx]]== samp.attr:
            #     print " ".join ([samp.attr.upper(), samp.adj, samp.noun,org_attributes[attr_idx].upper()])

    correct = 0.0
    results = []
    for samp_id, attr_idx in enumerate(index_to_max_att_per_adj):
        if data_set[samp_id].attr == org_attributes[attr_idx]:
            correct += 1
        res = AdjNounAttribute(data_set[samp_id].adj, data_set[samp_id].noun,
                               org_attributes[attr_idx])
        results.append(res)
    print "According to adj-attr similarity:"
    print "Total samples: [{}]. Correct: [{}]. Accuracy: [{}]".format(
        len(data_set), correct, correct / len(data_set))
    with open(output_file, 'w') as f:
        for res in results:
            row = '\t'.join([res.attr.upper(), res.adj, res.noun]) + '\n'
            f.write(row)
def read_HeiPLAS_data(file_path):
    """Read a HeiPLAS data file into a list of AdjNounAttribute tuples.

    Each line is expected to look like ``ATTRIBUTE adjective noun``; the
    attribute is lowercased.  Blank or truncated lines are skipped instead
    of raising IndexError.
    """
    data = []
    with open(file_path) as f:
        for line in f:  # stream the file instead of materializing readlines()
            parts = line.split()
            if len(parts) < 3:  # tolerate blank/malformed lines
                continue
            data.append(AdjNounAttribute(parts[1], parts[2], parts[0].lower()))
    return data
# Exemplo n.º 3 (example marker from source aggregation)
def test():

    weights = nn_model.linear_1.weight.data.numpy()

    filtered_test_samp = [
        samp for samp in test_triplets if samp.adj in we_wrapper.vocab
        and samp.noun in we_wrapper.vocab and samp.attr in we_wrapper.vocab
    ]
    print "after filter missing words, testing samples: " + str(
        len(filtered_test_samp))

    x_test = np.array([
        we_wrapper.adj_vec_by_context(samp.adj, samp.noun)
        for samp in filtered_test_samp
    ])
    y_test = np.array(
        [we_wrapper.word_vec(samp.attr) for samp in filtered_test_samp])
    attr_vecs = {
        attr: we_wrapper.word_vec(attr)
        for attr in attributes if attr in we_wrapper.vocab
    }

    print "attr_vecs size = {}".format(len(attr_vecs))
    print "x test shape: " + str(x_test.shape)
    print "y_test: " + str(y_test.shape)
    print "weights shape: {}".format(weights.shape)

    x_test_matrix = np.dot(weights, np.transpose(x_test))
    print "x_test matrix shape = {}".format(x_test_matrix.shape)

    # check P@1 and P@5 accuracy
    correct = 0.0
    top_5_correct = 0.0
    correct_pred = []
    false_pred = []
    results = []
    for i in xrange(0, x_test_matrix.shape[1]):
        y_pred = x_test_matrix[:, [i]]

        #TODO: adapt this from huang to my senses
        # cosine_sims = {attr: max( [(1-spatial.distance.cosine(y_pred.T, attr_vecs[attr]))
        #                         for attr in attr_vecs.keys()])}
        # sorted_sims = dict(sorted(cosine_sims.iteritems(), key=operator.itemgetter(1), reverse=True)[:5])
        # most_sim_attr = max(sorted_sims, key=lambda i: sorted_sims[i])

        #calculate cosine similarity for normalized vectors
        cosine_sims = {
            attr: np.dot(y_pred.T, attr_vecs[attr])
            for attr in attr_vecs.keys()
        }
        sorted_sims = dict(
            sorted(cosine_sims.iteritems(),
                   key=operator.itemgetter(1),
                   reverse=True)[:5])
        most_sim_attr = max(sorted_sims, key=lambda i: sorted_sims[i])
        if most_sim_attr == filtered_test_samp[i].attr:
            correct += 1
            correct_pred.append(filtered_test_samp[i])
        else:
            false_pred.append((filtered_test_samp[i], most_sim_attr))
        if filtered_test_samp[i].attr in sorted_sims.keys():
            top_5_correct += 1
        results.append((filtered_test_samp[i], most_sim_attr))
    print "supervised results"
    print "correct: {} from total: {}. Accuracy: {}".format(
        correct, y_test.shape[0], correct / y_test.shape[0])
    print "top 5 correct: {} from total: {}. Accuracy: {}".format(
        top_5_correct, y_test.shape[0], top_5_correct / y_test.shape[0])

    file = open(args.output_folder + '/' + correct_predictions_file, 'w')
    for item in correct_pred:
        string = ' '.join([item.attr.upper(), item.adj, item.noun])
        print >> file, string

    file = open(args.output_folder + '/' + false_prediction_file, 'w')
    for item in false_pred:
        string = ' '.join(
            [item[0].attr.upper(), item[0].adj, item[0].noun, item[1].upper()])
        print >> file, string

    file = open(args.output_folder + '/' + test_results, 'w')
    for item in results:
        string = ' '.join(
            [item[0].attr.upper(), item[0].adj, item[0].noun, item[1].upper()])
        print >> file, string

    correct = 0.0
    correct_in_K = 0.0
    predictions = []
    unique_attributes = attr_vecs.keys()
    # attr_vecs_2 = np.array([we_wrapper.org_model.word_vec(attr) for attr in unique_attributes]).squeeze()
    attr_vecs_2 = np.array(
        [we_wrapper.word_vec(attr) for attr in unique_attributes]).squeeze()
    for test in filtered_test_samp:
        # print "{} {}".format(test.adj, test.noun)

        adj_label = we_wrapper.get_adj_name(test.adj, test.noun)
        adj_vec = we_wrapper.adj_vec_by_context(test.adj, test.noun)
        # adj_vec = we_wrapper.org_model.word_vec(test.adj)
        sim = np.dot(adj_vec, attr_vecs_2.T)
        attr_ids = sim.argsort()[-K:][::-1]
        adj_preds = [unique_attributes[i] for i in attr_ids]

        predictions.append(
            (AdjNounAttribute(test.adj, test.noun,
                              test.attr), adj_preds[0], adj_label))
        if adj_preds[0] == test.attr:
            correct += 1
        if test.attr in adj_preds:
            correct_in_K += 1

    file = open(args.output_folder + '/' + unsupervised_results, 'w')
    for item in predictions:
        string = ' '.join([
            item[0].attr.upper(), item[0].adj, item[0].noun, item[1].upper(),
            item[2]
        ])
        print >> file, string
    print "----unsupervised results-----"
    print "correct = {}, total: {}, accuracy: {}".format(
        correct, len(filtered_test_samp), correct / len(filtered_test_samp))
    print "correct_in_{} = {}, total: {}, accuracy: {}".format(
        K, correct_in_K, len(filtered_test_samp),
        correct_in_K / len(filtered_test_samp))