def vectorize(examples, word_dicts, args, print_allowed=False, labels=[]):
    """
    Vectorize `examples`.
    inxs: one sequence of value ids per relation (unknown/empty cells map to the unk id 1).
    masks: per-relation 0/1 flags for whether the attribute value is present in the row.
    """
    inxs = []
    masks = []
    ATTRIBUTES = config.get_columns(args.experiment)
    if print_allowed:
        all_givens = []
        print_me = "<h4>ENTITY: "
        for j in range(len(examples)):
            logging.info(len(examples[j]))
            this_row = {}
            for i in range(args.relations):
                if examples[j][i] != "":
                    #print_me+="%s=<a href=\"%s\">%s</a>, " % (ATTRIBUTES[i], examples[j][i], examples[j][i])  # labels[i][examples[j][i].split('/')[-1]])
                    print_me += "%s=%s, " % (ATTRIBUTES[i], examples[j][i])
                    this_row[ATTRIBUTES[i]] = examples[j][i]
            all_givens.append(this_row)
            #print(examples[j])
        if len(examples):
            print print_me + '</h4>'
        with open('given.pkl', 'wb') as p:
            pickle.dump(all_givens, p)
    for i in range(args.relations):
        in_data = []
        mask = []
        for j in range(len(examples)):
            if examples[j][i] in word_dicts[i]:
                in_data += [word_dicts[i][examples[j][i]]]
            else:
                in_data += [1]  # unk
            if examples[j][i].strip() == "":
                mask += [0]
            else:
                mask += [1]
        inxs += [in_data]
        masks += [mask]
    return np.array(inxs).astype('int32'), np.array(masks).astype('float32')
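# Usage sketch for `vectorize` (illustrative only, not part of the pipeline).
# The rows, the tiny word_dicts and the `args` fields below are made up, and it
# is assumed that `config.get_columns` accepts the same flag the __main__
# blocks elsewhere in the repo pass to it.
def _vectorize_usage_example():
    from argparse import Namespace
    demo_args = Namespace(relations=2, experiment=True)      # hypothetical flags
    demo_dicts = [{'male': 2, 'female': 3}, {'writer': 2}]   # one value->id dict per attribute
    demo_rows = [['male', 'writer'], ['', 'writer']]
    inxs, masks = vectorize(demo_rows, demo_dicts, demo_args)
    # inxs: int32 array of shape (relations, n_rows); empty or unseen cells map to 1 (unk)
    # masks: float32 array of the same shape; 1.0 where the cell is non-empty, else 0.0
    return inxs, masks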
        y_batch += [yrow]
        cnt += 1
    if cnt % batch_size:
        print('remaining %d' % (cnt % batch_size))
        yield np.array(x_batch), np.array(y_batch)
    #x+=[x_batch]
    #y+=[y_batch]
    #return len(x), list(all_classes)


if __name__ == "__main__":
    entity_type = sys.argv[1]
    if entity_type == 'american':
        attributes = config.get_columns(True)
    else:
        attributes = config.get_columns()[:-1]
    N = len(attributes)
    print("%d attributes" % N)
    batch_size = 32768
    data_path = '../data/%s' % entity_type
    print("Data path: %s" % data_path)
    mapping = pickle.load(open('%s/train_dicts.pickle' % data_path, 'rb'))
    inv_mapping = pickle.load(
        open('%s/train_inv_dicts.pickle' % data_path, 'rb'))
    train_path = '%s/train.txt' % data_path
    test_path = '%s/test.txt' % data_path
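# Standalone sketch of the batching pattern above (toy generator with a made-up
# name, not the repo's actual reader): full batches are emitted inside the main
# loop, and whatever is left over (cnt % batch_size rows) is flushed as one
# final, smaller batch.
def _toy_batch_generator(rows, batch_size):
    x_batch = []
    for row in rows:
        x_batch.append(row)
        if len(x_batch) == batch_size:
            yield np.array(x_batch)
            x_batch = []
    if x_batch:  # remainder batch with len(rows) % batch_size rows
        yield np.array(x_batch)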
import numpy as np
import theano
import theano.tensor as T
import lasagne
import pickle
from collections import defaultdict
import sys
import time
import utils
import config
import logging
import nn_layers
import lasagne.layers as L

ATTRIBUTES = config.get_columns()
MAX_NO_PROGRESS = 10
labels_file = 'labels.pkl'
#['educated at', 'sex or gender', 'country of citizenship', 'native language', 'notable work', 'award received', "religion", 'participant of', 'member of political party', 'member of sports team']


def gen_examples(data, mask, batch_size, concat=False):
    """
    Divide examples into batches of size `batch_size`.
    """
    minibatches = utils.get_minibatches(len(data[0]), batch_size)
    all_ex = []
    for minibatch in minibatches:
        dm = []
        for d in data:
            dm += [d[minibatch]]
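# `utils.get_minibatches` is defined elsewhere in the repo; the stand-in below
# is an assumption about its behaviour (blocks of row indices covering
# 0..n_examples-1), shown only to make the fancy indexing `d[minibatch]` in
# `gen_examples` easier to follow.
def _get_minibatches_sketch(n_examples, batch_size):
    indices = np.arange(n_examples)
    return [indices[i:i + batch_size] for i in range(0, n_examples, batch_size)]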
        attribute_values = vals[attribute]
        sorted_av = sorted(attribute_values.items(), key=operator.itemgetter(1))
        max_vals[attribute] = {sorted_av[-1][0]: 0}
    print(max_vals)
    return max_vals


if __name__ == "__main__":
    entity_type = sys.argv[1]
    data_path = '../data/%s' % entity_type
    word_dicts = pickle.load(open('%s/train_dicts.pickle' % data_path, 'rb'))
    if entity_type == 'american':
        cols = config.get_columns(True)
    else:
        cols = config.get_columns()[:-1]
    num_attr = len(cols)  # -1 for the embeddings thingie
    max_vals = get_most_frequent_values_train(data_path, num_attr, word_dicts)
    test_path = '%s/test.txt' % data_path
    filled = defaultdict(int)
    acc = {}
    with open(test_path, 'rb') as data:
        spamreader = csv.reader(data, delimiter='\t', quotechar='"')
        for row in spamreader:
            for i in xrange(num_attr):
                if row[i]:
                    filled[i] += 1
                    if row[i] in max_vals[i]:
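# Sketch of the most-frequent-value baseline this script computes (standalone
# toy values, not the repo's data): for each attribute, always predict the
# value seen most often in training, then score it only on test cells that are
# actually filled.
def _most_frequent_baseline_sketch():
    train_values = ['male', 'male', 'female', 'male']
    test_values = ['male', 'female', '', 'male']
    most_frequent = max(set(train_values), key=train_values.count)  # 'male'
    filled_cells = [v for v in test_values if v]
    correct = sum(1 for v in filled_cells if v == most_frequent)
    return correct * 1.0 / len(filled_cells)  # 2/3 accuracy on the filled cells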
def eval_acc(test_fn, all_examples, inv_word_dicts, topk_acc=1,
             print_allowed=False, labels_data=[]):
    """
    Evaluate accuracy on `all_examples`.
    """
    acc = np.zeros((args.relations))
    n_examples = np.zeros((args.relations))
    #if print_allowed:
    my_rel_num = 10
    acc_per_num = {}
    attr_acc = {}
    influences = {}
    ATTRIBUTES = config.get_columns(args.experiment)
    count_per_total = defaultdict(int)
    all_predictions = defaultdict(list)
    for i in range(len(ATTRIBUTES)):
        acc_per_num[i] = {'c': 0, 'i': 0}
        attr_acc[i] = {'c': 0, 'i': 0}
        influences[i] = {'c': 0, 'i': 0}
    print('SIZE', len(all_examples))
    for inps in all_examples:
        if args.complete_AE:
            #if True:
            tot_acc = test_fn(*inps)
            tot_acc = tot_acc[:args.relations]
        else:
            #logging.info(inps[3])
            tot_acc = []
            totals = defaultdict(int)
            corrects = defaultdict(int)
            attr_totals = defaultdict(int)
            attr_corrects = defaultdict(int)
            present = defaultdict(int)
            for i in range(args.relations):
                for index, val in enumerate(inps[i + args.relations]):
                    if val > 0:
                        totals[index] += 1
            correct = 0
            for k in range(args.relations):
                new_inp = []
                #if print_allowed:
                for i in range(args.relations):
                    if i == k:
                        new_inp += [
                            np.zeros((inps[0].shape[0], )).astype("int32")
                        ]
                    else:
                        #new_inp += [inps[i] * np.random.binomial(1, args.dropout_rate, inps[i].shape).astype("int32")]
                        new_inp += [inps[i]]
                #logging.info(inps[k])
                new_inp += inps[args.relations:]
                tmp_acc_prob = test_fn(*new_inp)
                prob = tmp_acc_prob[k + args.relations]
                sort_index = prob.argsort(axis=1)
                #logging.info(sort_index)
                real_acc = 0
                #logging.info(prob)
                for j in range(prob.shape[0]):
                    if print_allowed:
                        to_print = "<ol>"
                        top_predictions = []
                        print "System predictions for %s:" % ATTRIBUTES[k]
                        for prediction in range(
                                1, min(topk_acc + 1, len(sort_index[j]))):
                            my_label = ''
                            choice = inv_word_dicts[k][str(
                                sort_index[j][-prediction])]
                            choice_prob = prob[j][sort_index[j][-prediction]]
                            #if choice.strip()!='':
                            #    my_label=labels_data[k][choice.split('/')[-1]].encode('utf-8')
                            to_print += "%s (%s)" % (choice, choice_prob)  #my_label)
                            top_predictions.append(tuple([choice, choice_prob]))
                        all_predictions[ATTRIBUTES[k]].append(top_predictions)
                        to_print += "</ol>"
                        try:
                            print to_print
                        except Exception as exc:
                            print exc
                    if k == my_rel_num and inps[k + args.relations][j] == 1:
                        s = 0
                        present_attrs = set()
                        for kr in range(args.relations):
                            s += inps[kr + args.relations][j]
                            if inps[kr + args.relations][j] > 0:
                                present_attrs.add(kr)
                        attr_totals[j] = s
                        attr_corrects[j] = 0
                        present[j] = present_attrs
                    if inps[k + args.relations][j] == 1 and prob[j][
                            inps[k][j]] >= prob[j][sort_index[j][
                                (-1) * min(topk_acc, len(sort_index[j]))]]:
                        real_acc += 1
                        corrects[j] += 1
                        if k == my_rel_num:
                            attr_corrects[j] += 1
                            #logging.info(sort_index[j][-1])
                            #logging.info("System was correct for attribute 3 in row %d. Value: %d" % (j, inps[k][j]))
                            #logging.info(prob[j][inps[k][j]])
                            #logging.info(prob[j][sort_index[j][-3]])
                #if print_allowed:
                tot_acc.append(real_acc)
            for row, total in totals.items():
                acc_per_num[total - 1]['c'] += corrects[row]
                acc_per_num[total - 1]['i'] += total - corrects[row]
                count_per_total[total] += 1
            for row, total in attr_totals.items():
                attr_acc[total - 1]['c'] += attr_corrects[row]
                attr_acc[total - 1]['i'] += (1 - attr_corrects[row])
                c_or_i = ''
                if attr_corrects[row] > 0:
                    c_or_i = 'c'
                else:
                    c_or_i = 'i'
                for elem in present[row]:
                    if elem != my_rel_num:
                        influences[elem][c_or_i] += 1
        for k in range(args.relations):
            n_examples[k] += inps[k + args.relations].sum()
        acc += np.array(tot_acc)
    #if print_allowed:
    logging.info(acc)
    logging.info(acc_per_num)
    logging.info(count_per_total)
    with open('predicted.pkl', 'wb') as p:
        pickle.dump(all_predictions, p)
    my_graph = {}
    for k in attr_acc:
        if attr_acc[k]['c'] + attr_acc[k]['i'] >= 50:
            my_graph[k] = attr_acc[k]['c'] * 1.0 / (attr_acc[k]['c'] +
                                                    attr_acc[k]['i'])
    logging.info(attr_acc)
    logging.info(my_graph)
    #logging.info(influences)
    #logging.info('Attribute 3 stats. Total of %d examples, %d correct, %d incorrect' % (n_examples[3], attr_correct, attr_incorrect))
    return (acc * 100.0 / n_examples).tolist()
def create_data(entity_type, embeddings, experiment):
    if embeddings:
        fn = "../data/%s_emb_data.tsv" % entity_type
    else:
        fn = "../data/tabular_%s_data.tsv" % entity_type
    inp = open(fn, "r")
    meta = inp.readline().strip()
    meta = meta.split("\t")
    # if meta[0]=='instance uri':
    meta = ['unnamed'] + meta
    #if meta[0]=='instance uri':
    #    meta=meta[1:]
    #dob_index=meta.index("date of birth")
    # NEEDED FOR MOST EXPERIMENTS MINUS THE CURRENT ONE
    if experiment != 'gvamerican':
        try:
            dob_index = meta.index('century')
        except:
            dob_index = meta.index('date of birth')
        ls_index = meta.index("lifespan")
        print(dob_index, ls_index)
    cols = get_row(config.get_columns(experiment), meta)
    data = []
    print meta
    print cols
    for i, st in enumerate(inp.readlines()):
        if st.strip() == '':
            continue
        st = st.strip('\n')
        info = st.split("\t")
        #print(i, meta.index("embeddings"), len(info))
        if embeddings and (len(info) <= meta.index("embeddings") or info[meta.index('embeddings')].strip() == ''):
            continue
        d = []
        for col in cols:
            try:
                st = info[col]
            except:
                st = ''
                print(col, 'yo')
            if st != '':
                # Enable the next 6 lines for the experiment #1
                # if col==dob_index:  # date of birth
                #     period_group=int(float(st[:4]))/time_slicing_factor
                #     st='%d-%d' % (period_group*time_slicing_factor, (period_group+1)*time_slicing_factor)
                # elif col==ls_index:  # lifespan
                #     lifespan_group=int(float(st))/lifespan_factor
                #     st='%d-%d' % (lifespan_group*lifespan_factor, (lifespan_group+1)*lifespan_factor)
                if st.find("}") != -1:
                    # if there are multiple values, get the one with the lowest id
                    #pos = st.rfind(",")
                    #rpos = st.rfind("\'")
                    #st = st[pos + 3: rpos].strip()
                    line = st[1:-1].split(", ")
                    min_pos = 0
                    for j in range(len(line)):
                        if len(line[j]) < len(line[min_pos]) or len(line[j]) == len(line[min_pos]) and line[j] < line[min_pos]:
                            min_pos = j
                    st = line[min_pos][1:-1]
            d += [st]
        data += [d]
    random.shuffle(data)
    if embeddings:
        dir_path = "../data/%smini/" % entity_type
    else:
        dir_path = "../data/%s/" % entity_type
    sets = ["train", "dev", "test"]
    outdirs = [dir_path + sets[i] + ".txt" for i in range(len(sets))]
    ratio = [0.85, 0.15, 0.0]
    cnt = 0
    for i in range(len(sets)):
        oup = open(outdirs[i], "w")
        for j in range(int(len(data) * ratio[i])):
            oup.write("\t".join(data[cnt]) + "\n")
            cnt += 1
        oup.close()
    print(outdirs)
    return outdirs[0], dir_path
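# Standalone sketch of the multi-value handling in `create_data` (toy cell, not
# real data): a cell that serialises a set, e.g. "{'Q123', 'Q42'}", is reduced
# to the element with the shortest, then lexicographically smallest, id and the
# surrounding quotes are stripped.
def _pick_lowest_id_sketch(cell):
    line = cell[1:-1].split(", ")  # drop the braces, split into quoted elements
    min_pos = 0
    for j in range(len(line)):
        if len(line[j]) < len(line[min_pos]) or \
                (len(line[j]) == len(line[min_pos]) and line[j] < line[min_pos]):
            min_pos = j
    return line[min_pos][1:-1]     # strip the surrounding quotes
# _pick_lowest_id_sketch("{'Q123', 'Q42'}") == 'Q42'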