Example #1
def vectorize(examples, word_dicts, args, print_allowed=False, labels=None):
    """
        Vectorize `examples`.
        in_x1, in_x2: sequences for document and question respecitvely.
        in_y: label
        in_l: whether the entity label occurs in the document.
    """
    inxs = []
    masks = []
    if print_allowed:
        ATTRIBUTES = config.get_columns(args.experiment)  # column names, used only for printing
        all_givens = []
        print_me = "<h4>ENTITY: "
        for j in range(len(examples)):
            logging.info(len(examples[j]))
            this_row = {}
            for i in range(args.relations):
                if examples[j][i] != "":
                    print_me += "%s=%s, " % (ATTRIBUTES[i], examples[j][i])
                    this_row[ATTRIBUTES[i]] = examples[j][i]
            all_givens.append(this_row)
        if len(examples):
            print(print_me + '</h4>')
        with open('given.pkl', 'wb') as p:
            pickle.dump(all_givens, p)
    for i in range(args.relations):
        in_data = []
        mask = []
        for j in range(len(examples)):
            if examples[j][i] in word_dicts[i]:
                in_data += [word_dicts[i][examples[j][i]]]
            else:
                in_data += [1]  # index 1 is reserved for unknown values
            if examples[j][i].strip() == "":
                mask += [0]  # attribute value missing for this example
            else:
                mask += [1]
        inxs += [in_data]
        masks += [mask]
    return np.array(inxs).astype('int32'), np.array(masks).astype('float32')
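
A minimal usage sketch of `vectorize` on hypothetical toy data (not from the source); the `args` object is stubbed with just the fields the function reads:

import numpy as np
from types import SimpleNamespace

# Hypothetical toy setup: 2 relations (attributes), 3 example rows.
args = SimpleNamespace(relations=2, experiment=False)
word_dicts = [{'Q1': 2, 'Q2': 3}, {'Q5': 2}]          # index 1 is reserved for <unk>
examples = [['Q1', 'Q5'], ['Q2', ''], ['Q9', 'Q5']]   # 'Q9' is out of vocabulary

inxs, masks = vectorize(examples, word_dicts, args)
print(inxs.shape)   # (2, 3): one row of indices per relation
print(masks)        # 0.0 wherever the attribute value was empty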
Example #2
            y_batch += [yrow]
            cnt += 1
    if cnt % batch_size:
        print('remaining %d' % (cnt % batch_size))
        yield np.array(x_batch), np.array(y_batch)


if __name__ == "__main__":

    entity_type = sys.argv[1]
    if entity_type == 'american':
        attributes = config.get_columns(True)
    else:
        attributes = config.get_columns()[:-1]  # drop the final embeddings column
    N = len(attributes)

    print("%d attributes" % N)

    batch_size = 32768

    data_path = '../data/%s' % entity_type
    print("Data path: %s" % data_path)
    mapping = pickle.load(open('%s/train_dicts.pickle' % data_path, 'rb'))
    inv_mapping = pickle.load(
        open('%s/train_inv_dicts.pickle' % data_path, 'rb'))
    train_path = '%s/train.txt' % data_path
    test_path = '%s/test.txt' % data_path
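
Example #2 is the tail of a batch generator. A self-contained sketch of the same pattern, with illustrative names that are not taken from the source:

import numpy as np

def iterate_batches(rows, batch_size):
    """Yield (x, y) arrays of at most `batch_size` rows; the last, smaller
    batch is flushed at the end, mirroring the `cnt % batch_size` check above."""
    x_batch, y_batch, cnt = [], [], 0
    for xrow, yrow in rows:
        x_batch.append(xrow)
        y_batch.append(yrow)
        cnt += 1
        if cnt % batch_size == 0:
            yield np.array(x_batch), np.array(y_batch)
            x_batch, y_batch = [], []
    if cnt % batch_size:
        print('remaining %d' % (cnt % batch_size))
        yield np.array(x_batch), np.array(y_batch)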
Example #3
import numpy as np
import theano
import theano.tensor as T
import lasagne
import pickle
from collections import defaultdict

import sys
import time
import utils
import config
import logging
import nn_layers
import lasagne.layers as L

ATTRIBUTES = config.get_columns()
MAX_NO_PROGRESS = 10
labels_file = 'labels.pkl'
#['educated at', 'sex or gender', 'country of citizenship', 'native language', 'notable work', 'award received', "religion", 'participant of', 'member of political party', 'member of sports team']


def gen_examples(data, mask, batch_size, concat=False):
    """
        Divide examples into batches of size `batch_size`.
    """
    minibatches = utils.get_minibatches(len(data[0]), batch_size)
    all_ex = []
    for minibatch in minibatches:
        dm = []
        for d in data:
            dm += [d[minibatch]]  # gather this minibatch's rows from each array
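
The snippet breaks off mid-loop; assuming the function goes on to collect `dm` into `all_ex` and return it, a call chaining it to `vectorize` from Example #1 might look like:

# `inxs` and `masks` as returned by `vectorize`: one row per relation, one
# column per example; `utils.get_minibatches` is assumed to yield arrays of
# example indices.
batches = gen_examples(list(inxs), list(masks), batch_size=32)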
Example #4
        attribute_values = vals[attribute]
        sorted_av = sorted(attribute_values.items(),
                           key=operator.itemgetter(1))
        max_vals[attribute] = {sorted_av[-1][0]: 0}

    print(max_vals)

    return max_vals


if __name__ == "__main__":
    entity_type = sys.argv[1]
    data_path = '../data/%s' % entity_type
    word_dicts = pickle.load(open('%s/train_dicts.pickle' % data_path, 'rb'))
    if entity_type == 'american':
        cols = config.get_columns(True)
    else:
        cols = config.get_columns()[:-1]
    num_attr = len(cols)  # excludes the embeddings column dropped above
    max_vals = get_most_frequent_values_train(data_path, num_attr, word_dicts)

    test_path = '%s/test.txt' % data_path
    filled = defaultdict(int)
    acc = {}
    with open(test_path, 'r') as data:
        reader = csv.reader(data, delimiter='\t', quotechar='"')
        for row in reader:
            for i in range(num_attr):
                if row[i]:
                    filled[i] += 1
                    if row[i] in max_vals[i]:
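
Example #4 also breaks off mid-statement. The baseline it evaluates (always predict the most frequent training value of each attribute) can be restated as a self-contained sketch; `majority_baseline` and its arguments are hypothetical names, not from the source:

from collections import Counter

def majority_baseline(train_rows, test_rows, num_attr):
    # Most frequent non-empty training value per attribute.
    most_common = []
    for i in range(num_attr):
        counts = Counter(r[i] for r in train_rows if r[i])
        most_common.append(counts.most_common(1)[0][0] if counts else None)
    # Accuracy over the test rows where the attribute is filled in.
    acc = {}
    for i in range(num_attr):
        filled = [r for r in test_rows if r[i]]
        if filled:
            acc[i] = sum(r[i] == most_common[i] for r in filled) / len(filled)
    return acc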
Example #5
def eval_acc(test_fn,
             all_examples,
             inv_word_dicts,
             topk_acc=1,
             print_allowed=False,
             labels_data=None):
    """
        Evaluate accuracy on `all_examples`.
    """
    acc = np.zeros((args.relations))
    n_examples = np.zeros((args.relations))
    my_rel_num = 10  # attribute singled out for the influence analysis below
    acc_per_num = {}
    attr_acc = {}
    influences = {}
    ATTRIBUTES = config.get_columns(args.experiment)
    count_per_total = defaultdict(int)
    all_predictions = defaultdict(list)
    for i in range(len(ATTRIBUTES)):
        acc_per_num[i] = {'c': 0, 'i': 0}
        attr_acc[i] = {'c': 0, 'i': 0}
        influences[i] = {'c': 0, 'i': 0}
    print('SIZE', len(all_examples))
    for inps in all_examples:
        if args.complete_AE:
            tot_acc = test_fn(*inps)
            tot_acc = tot_acc[:args.relations]
        else:
            tot_acc = []
            totals = defaultdict(int)
            corrects = defaultdict(int)
            attr_totals = defaultdict(int)
            attr_corrects = defaultdict(int)
            present = defaultdict(int)
            for i in range(args.relations):
                for index, val in enumerate(inps[i + args.relations]):
                    if val > 0:
                        totals[index] += 1
            for k in range(args.relations):
                # mask out relation k and predict it from the remaining attributes
                new_inp = []
                for i in range(args.relations):
                    if i == k:
                        new_inp += [
                            np.zeros((inps[0].shape[0], )).astype("int32")
                        ]
                    else:
                        #new_inp += [inps[i] * np.random.binomial(1, args.dropout_rate, inps[i].shape).astype("int32")]
                        new_inp += [inps[i]]
                new_inp += inps[args.relations:]
                tmp_acc_prob = test_fn(*new_inp)
                prob = tmp_acc_prob[k + args.relations]
                sort_index = prob.argsort(axis=1)  # ascending: best prediction last
                real_acc = 0
                for j in range(prob.shape[0]):
                    if print_allowed:
                        to_print = "<ol>"
                        top_predictions = []
                        print "System predictions for %s:" % ATTRIBUTES[k]
                        for prediction in range(
                                1, min(topk_acc + 1, len(sort_index[j]))):
                            my_label = ''
                            choice = inv_word_dicts[k][str(
                                sort_index[j][-prediction])]
                            choice_prob = prob[j][sort_index[j][-prediction]]
                            #if choice.strip()!='':
                            #    my_label=labels_data[k][choice.split('/')[-1]].encode('utf-8')
                            to_print += "%s (%s)" % (choice, choice_prob
                                                     )  #my_label)
                            top_predictions.append(tuple([choice,
                                                          choice_prob]))
                        all_predictions[ATTRIBUTES[k]].append(top_predictions)
                        to_print += "</ol>"
                        try:
                            print to_print
                        except Exception as exc:
                            print exc
                    if k == my_rel_num and inps[k + args.relations][j] == 1:
                        s = 0
                        present_attrs = set()
                        for kr in range(args.relations):
                            s += inps[kr + args.relations][j]
                            if inps[kr + args.relations][j] > 0:
                                present_attrs.add(kr)
                        attr_totals[j] = s
                        attr_corrects[j] = 0
                        present[j] = present_attrs
                    # count a hit when the gold value's probability is at least
                    # the k-th largest predicted probability (top-k accuracy)
                    if inps[k + args.relations][j] == 1 and prob[j][
                            inps[k][j]] >= prob[j][sort_index[j][
                                (-1) * min(topk_acc, len(sort_index[j]))]]:
                        real_acc += 1
                        corrects[j] += 1
                        if k == my_rel_num:
                            attr_corrects[j] += 1
                tot_acc.append(real_acc)
            for row, total in totals.items():
                acc_per_num[total - 1]['c'] += corrects[row]
                acc_per_num[total - 1]['i'] += total - corrects[row]
                count_per_total[total] += 1
            for row, total in attr_totals.items():
                attr_acc[total - 1]['c'] += attr_corrects[row]
                attr_acc[total - 1]['i'] += (1 - attr_corrects[row])
                c_or_i = 'c' if attr_corrects[row] > 0 else 'i'
                for elem in present[row]:
                    if elem != my_rel_num:
                        influences[elem][c_or_i] += 1
        for k in range(args.relations):
            n_examples[k] += inps[k + args.relations].sum()
        acc += np.array(tot_acc)
    logging.info(acc)
    logging.info(acc_per_num)
    logging.info(count_per_total)
    with open('predicted.pkl', 'wb') as p:
        pickle.dump(all_predictions, p)
    my_graph = {}
    for k in attr_acc:
        if attr_acc[k]['c'] + attr_acc[k]['i'] >= 50:
            my_graph[k] = attr_acc[k]['c'] * 1.0 / (attr_acc[k]['c'] +
                                                    attr_acc[k]['i'])
    logging.info(attr_acc)
    logging.info(my_graph)
    return (acc * 100.0 / n_examples).tolist()
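
The correctness test buried in the middle of `eval_acc` (a gold value counts as a hit when its probability is at least the k-th largest) is easier to see in isolation; `in_top_k` is a hypothetical name for illustration:

import numpy as np

def in_top_k(prob_row, gold_index, k):
    """True iff prob_row[gold_index] is among the k largest probabilities,
    the same comparison eval_acc makes against sort_index."""
    sort_index = prob_row.argsort()  # ascending
    kth_largest = prob_row[sort_index[-min(k, len(sort_index))]]
    return prob_row[gold_index] >= kth_largest

print(in_top_k(np.array([0.1, 0.6, 0.3]), gold_index=2, k=2))  # True: 0.3 is 2nd largest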
Example #6
def create_data(entity_type, embeddings, experiment):
    if embeddings:
        fn = "../data/%s_emb_data.tsv" % entity_type
    else:
        fn = "../data/tabular_%s_data.tsv" % entity_type

    inp = open(fn, "r")
    meta = inp.readline().strip()
    meta = meta.split("\t")
#    if meta[0]=='instance uri':
    meta = ['unnamed'] + meta
   
    #if meta[0]=='instance uri': 
    #    meta=meta[1:]
    #dob_index=meta.index("date of birth")

    # Needed for all experiments except the current ('gvamerican') one.
    if experiment != 'gvamerican':
        try:
            dob_index = meta.index('century')
        except ValueError:
            dob_index = meta.index('date of birth')
        ls_index = meta.index("lifespan")
        print(dob_index, ls_index)

    cols = get_row(config.get_columns(experiment), meta)
    data = []
    print(meta)
    print(cols)
    for i, st in enumerate(inp.readlines()):
        if st.strip() == '':
            continue
        st = st.strip('\n')
        info = st.split("\t")
        # skip rows without an embeddings value when embeddings are required
        if embeddings and (len(info) <= meta.index("embeddings") or info[meta.index('embeddings')].strip() == ''):
            continue
        d = []
        for col in cols:
            try:
                st = info[col]
            except IndexError:
                st = ''
                print(col, 'missing column in row')
            if st != '':
                # Enable the next 6 lines for experiment #1:
#                if col==dob_index: # date of birth
#                    period_group=int(float(st[:4]))//time_slicing_factor
#                    st='%d-%d' % (period_group*time_slicing_factor, (period_group+1)*time_slicing_factor)
#                elif col==ls_index: # lifespan
#                    lifespan_group=int(float(st))//lifespan_factor
#                    st='%d-%d' % (lifespan_group*lifespan_factor, (lifespan_group+1)*lifespan_factor)
                if st.find("}") != -1: # if there are multiple values, get the one with the lowest id
                    #pos = st.rfind(",")
                    #rpos = st.rfind("\'")
                    #st = st[pos + 3: rpos].strip()
                    line = st[1:-1].split(", ")
                    min_pos = 0
                    for j in range(len(line)):
                        if len(line[j]) < len(line[min_pos]) or len(line[j]) == len(line[min_pos]) and line[j] < line[min_pos]:
                            min_pos = j
                    st = line[min_pos][1:-1]
            d += [st]
        data += [d]
    random.shuffle(data)
    if embeddings:
        dir_path = "../data/%smini/" % entity_type
    else:
        dir_path = "../data/%s/" % entity_type
    sets = ["train", "dev", "test"]
    outdirs = [dir_path + sets[i] + ".txt" for i in range(len(sets))]
    ratio = [0.85, 0.15, 0.0]
    cnt = 0
    for i in range(len(sets)):
        with open(outdirs[i], "w") as oup:
            for j in range(int(len(data) * ratio[i])):
                oup.write("\t".join(data[cnt]) + "\n")
                cnt += 1
    print(outdirs)
    return outdirs[0], dir_path
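
A hypothetical invocation ('person' is an illustrative entity type, and the experiment name is a placeholder; the 85/15/0 split ratios are fixed inside the function):

# Splits ../data/tabular_person_data.tsv into train/dev/test files under
# ../data/person/ and returns the train file path plus the output directory.
train_path, dir_path = create_data('person', embeddings=False, experiment='default')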