# NOTE: the imports below are inferred from usage in this module; the
# project-local modules (figer_data_multi_label_batcher, evaluate,
# data_utility, seen_type_dot_distance_label_matrix) must be on PYTHONPATH.
import pickle

import joblib
import numpy as np
import tensorflow as tf
from sklearn.metrics import roc_auc_score

import data_utility
import evaluate
import figer_data_multi_label_batcher
import seen_type_dot_distance_label_matrix
from seen_type_dot_distance_label_matrix import get_CV_info


def get_test_type_occurrence():
    figer_test = figer_data_multi_label_batcher.figer_data_multi_label(
        entity_file='data/state_of_the_art_test_word_with_context.txt',
        context_file='data/state_of_the_art_test_tagged_context.txt',
        feature_file='data/state_of_the_art_test_Feature.npy',
        entity_type_feature_file='data/state_of_the_art_test_et_features.npy',
        entity_type_exact_feature_file='data/state_of_the_art_test_exact_et_features.npy',
        type_file='data/state_of_the_art_test_Types_with_context.npy')
    batch_data = figer_test.next_batch(range(0, 113))

    # Count how often each of the 113 types occurs in the test batch and
    # store (type_id, count) rows.
    a = np.sum(batch_data[-1], 0)
    b = np.zeros((113, 2))
    for i in range(0, 113):
        b[i][0] = i
        b[i][1] = a[i]

    dicts = joblib.load(
        '/home/zys133/knowledge_base/NFGEC/data/Wiki/dicts_figer.pkl')
    # Pickle needs a binary file handle.
    with open('data/test_type_count.pkl', 'wb') as f:
        pickle.dump(b, f)

    # Sort types by descending count and print the ones that actually occur.
    b = b[b[:, 1].argsort()[::-1]]
    for i in range(0, 113):
        if b[i][1] > 0.0:
            print('{}\t{}'.format(dicts['id2label'][int(b[i][0])], b[i][1]))
def get_train_type_freq():
    figer = figer_data_multi_label_batcher.figer_data_multi_label()
    total_count = np.zeros(113)
    for i in range(0, 200):
        Ys = figer.next_batch(range(0, 113))[-1]
        print(Ys.shape)
        # Accumulate per-type label counts over the training batches.
        for j in range(0, Ys.shape[0]):
            for k in range(0, 113):
                total_count[k] += Ys[j][k]
    with open('data/train_type_count.pkl', 'wb') as f:
        pickle.dump(total_count, f)
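# A minimal sketch of how to inspect the pickled counts written by the two
# functions above. The array layouts ((113, 2) rows of [type_id, count] for
# the test counts, a flat (113,) vector for the train counts) are taken from
# the code above; the function name is ours, not part of the project.
def inspect_type_counts():
    with open('data/test_type_count.pkl', 'rb') as f:
        test_counts = pickle.load(f)   # shape (113, 2): [type_id, count]
    with open('data/train_type_count.pkl', 'rb') as f:
        train_counts = pickle.load(f)  # shape (113,): count per type id
    print('test total: {}'.format(np.sum(test_counts[:, 1])))
    print('train total: {}'.format(np.sum(train_counts)))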
def loc_error_analysis():
    dicts = joblib.load(
        '/home/zys133/knowledge_base/NFGEC/data/Wiki/dicts_figer.pkl')
    with open('./type_f1_file/attention_0_0_0_0_4.pickle', 'rb') as data_file:
        data = pickle.load(data_file)
    print(data[4].test_scores.shape)
    print(data[4].test_trues.shape)
    print(roc_auc_score(
        np.transpose(data[4].test_trues)[0],
        np.transpose(data[4].test_scores)[0]))
    # Flatten to one score per test example.
    data[4].test_scores = np.reshape(data[4].test_scores, (563))

    figer_test = figer_data_multi_label_batcher.figer_data_multi_label(
        entity_file='data/state_of_the_art_test_word_with_context.txt',
        context_file='data/state_of_the_art_test_tagged_context.txt',
        feature_file='data/state_of_the_art_test_Feature.npy',
        entity_type_feature_file='data/state_of_the_art_test_et_features.npy',
        entity_type_exact_feature_file='data/state_of_the_art_test_exact_et_features.npy',
        type_file='data/state_of_the_art_test_Types_with_context.npy')
    batch_data = figer_test.next_batch(range(0, 113))

    # Walk the examples in order of descending location score and print each
    # example's correct labels followed by its entity mention words.
    vob = figer_data_multi_label_batcher.Vocabulary()
    indices = np.argsort(data[4].test_scores)[::-1]
    for i in range(0, len(indices)):
        print('---')
        for j in range(0, 113):
            if batch_data[-1][indices[i]][j] != 0.0:
                print(dicts['id2label'][j])
        for e in batch_data[0][indices[i]]:
            if vob.i2w(int(e)) == '_my_null_':
                continue
            print(vob.i2w(int(e)))
        if i >= 102:
            break
def get_hearst_feature_predict():
    figer_test = figer_data_multi_label_batcher.figer_data_multi_label(
        entity_file='data/state_of_the_art_test_word_with_context.txt',
        context_file='data/state_of_the_art_test_tagged_context.txt',
        feature_file='data/state_of_the_art_test_Feature.npy',
        entity_type_feature_file='data/state_of_the_art_test_et_features.npy',
        entity_type_exact_feature_file='data/state_of_the_art_test_exact_et_features.npy',
        type_file='data/state_of_the_art_test_Types_with_context.npy')
    # Accumulate a 2x2 count matrix over 10 test batches. get_counts() is
    # assumed to be defined elsewhere in the project; it is not in this file.
    total_count = np.zeros((2, 2))
    for i in range(0, 10):
        batch_data = figer_test.next_batch(range(0, 113))
        t_count = get_counts(batch_data)
        total_count += t_count
    print(total_count)
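# Since get_counts() is not defined in this file, here is a hypothetical
# sketch of a 2x2 contingency count it might produce (rows: gold label
# present/absent; columns: Hearst feature fired/not). This is purely
# illustrative and not the project's actual implementation.
def get_counts_sketch(labels, feature_fired):
    counts = np.zeros((2, 2))
    for y, x in zip(labels, feature_fired):
        counts[int(y)][int(x)] += 1
    return counts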
def cap_ratio():
    np.set_printoptions(suppress=True)
    with open('data/train_type_count.pkl', 'rb') as f:
        total_count = pickle.load(f)
    print(total_count)

    # Count capitalized mention words for label ids 0 and 1 (the two
    # label ids requested from next_batch below).
    vob = figer_data_multi_label_batcher.Vocabulary()
    figer = figer_data_multi_label_batcher.figer_data_multi_label()
    loc_cap_count = 0
    person_cap_count = 0
    for i in range(200):
        batch_data = figer.next_batch(range(0, 2))
        for j in range(0, batch_data[0].shape[0]):
            if batch_data[-1][j][0] != 0.0:
                if vob.i2w(int(batch_data[0][j][0]))[0].isupper():
                    loc_cap_count += 1
            if batch_data[-1][j][1] != 0.0:
                if vob.i2w(int(batch_data[0][j][1]))[0].isupper():
                    person_cap_count += 1
    print(loc_cap_count)
    print(person_cap_count)
def type_freq_webisa_feature():
    # Gather the test type ids from all 10 CV folds.
    all_test_id = []
    train_ids, dev_ids, test_ids = seen_type_dot_distance_label_matrix.get_CV_info(
        'CV_output.txt')
    for i in range(0, 10):
        all_test_id += test_ids[i]
    print(all_test_id)

    with open('data/test_type_count.pkl', 'rb') as f:
        b = pickle.load(f)
    figer = figer_data_multi_label_batcher.figer_data_multi_label()
    type_only = figer.next_batch(range(0, 113))[-2]
    dicts = joblib.load(
        '/home/zys133/knowledge_base/NFGEC/data/Wiki/dicts_figer.pkl')
    print('label, freq, pattern_matches, URL')
    for e in all_test_id:
        print('{}\t{}\t{:.4f}\t{:.4f}'.format(dicts['id2label'][e], b[e][1],
                                              type_only[0][e][0],
                                              type_only[0][e][2]))
def state_of_the_art_average_emb():
    window_size = 10
    entity_ids = tf.placeholder(tf.int32, [None, None], name='entity_ids')
    entity_raw_lens = tf.placeholder(tf.float32, [None], name='entity_raw_lens')
    l_context_ids = tf.placeholder(tf.int32, [None, window_size], name='l_context_ids')
    l_context_raw_lens = tf.placeholder(tf.float32, [None], name='l_context_raw_lens')
    r_context_ids = tf.placeholder(tf.int32, [None, window_size], name='r_context_ids')
    r_context_raw_lens = tf.placeholder(tf.float32, [None], name='r_context_raw_lens')

    word_emb = np.load('./data/word_emb.npy')
    word_emb_lookup_table = tf.Variable(word_emb, dtype=tf.float32, trainable=False,
                                        name='word_emb_lookup_table')
    l_context_embs = tf.nn.embedding_lookup(word_emb_lookup_table, l_context_ids)
    r_context_embs = tf.nn.embedding_lookup(word_emb_lookup_table, r_context_ids)
    entity_embs = tf.nn.embedding_lookup(word_emb_lookup_table, entity_ids)

    entity_lens = tf.reshape(entity_raw_lens, [-1, 1], name='entity_lens')
    l_context_lens = tf.reshape(l_context_raw_lens, [-1, 1], name='l_context_lens')
    r_context_lens = tf.reshape(r_context_raw_lens, [-1, 1], name='r_context_lens')
    drop_out = tf.placeholder(tf.float32)

    # The context windows are summed (an earlier length-averaged variant was
    # abandoned); only the entity mention is length-averaged.
    l_context_embs_sum = tf.reduce_sum(l_context_embs, 1)
    r_context_embs_sum = tf.reduce_sum(r_context_embs, 1)
    context_embs_sum = tf.concat([l_context_embs_sum, r_context_embs_sum], 1,
                                 name='context_embs_ave')
    entity_embs_sum = tf.reduce_sum(entity_embs, 1)
    entity_embs_ave = entity_embs_sum / entity_lens
    entity_embs_ave_dropout = tf.nn.dropout(entity_embs_ave, 1.0 - drop_out)

    # W maps the 900-d feature (entity average plus left/right context sums,
    # assuming 300-d word embeddings) to the 113 FIGER types.
    Xs = tf.concat([entity_embs_ave_dropout, context_embs_sum], 1)
    Ys = tf.placeholder(tf.float32, [None, 113], name='Ys')
    W = tf.Variable(tf.random_uniform([900, 113], minval=-0.01, maxval=0.01), name='W')
    logit = tf.matmul(Xs, W)
    predict_y = tf.sigmoid(logit)
    loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(labels=Ys, logits=logit))
    train_step = tf.train.AdamOptimizer(0.001).minimize(loss)

    figer = figer_data_multi_label_batcher.figer_data_multi_label()
    figer_test = figer_data_multi_label_batcher.figer_data_multi_label(
        entity_file='data/state_of_the_art_test_word_with_context.txt',
        context_file='data/state_of_the_art_test_tagged_context.txt',
        type_file='data/state_of_the_art_test_Types_with_context.npy')

    config = tf.ConfigProto(device_count={'GPU': 0})  # CPU only
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(0, 5):
            figer.shuffle()
            for i in range(0, 2000):
                batch_entity_ids, batch_entity_raw_lens, batch_l_context_ids, \
                    batch_l_context_lens, batch_r_context_ids, \
                    batch_r_context_lens, batch_ys = figer.next_batch()
                # The dropout placeholder takes a scalar rate.
                _, print_loss = sess.run(
                    [train_step, loss],
                    feed_dict={entity_ids: batch_entity_ids,
                               entity_raw_lens: batch_entity_raw_lens,
                               l_context_ids: batch_l_context_ids,
                               l_context_raw_lens: batch_l_context_lens,
                               r_context_ids: batch_r_context_ids,
                               r_context_raw_lens: batch_r_context_lens,
                               Ys: batch_ys,
                               drop_out: 0.0})
                with open('temp.txt', 'w') as f:
                    f.write('training epoch: {} batch {} : {}\n'.format(
                        epoch, i, print_loss))

            # Rewind the test batcher and evaluate on one test batch.
            figer_test.train_pos = 0
            batch_entity_ids, batch_entity_raw_lens, batch_l_context_ids, \
                batch_l_context_lens, batch_r_context_ids, \
                batch_r_context_lens, batch_ys = figer_test.next_batch()
            print_entity_embs_ave, print_predict_y = sess.run(
                [entity_embs_ave, predict_y],
                feed_dict={entity_ids: batch_entity_ids,
                           entity_raw_lens: batch_entity_raw_lens,
                           l_context_ids: batch_l_context_ids,
                           l_context_raw_lens: batch_l_context_lens,
                           r_context_ids: batch_r_context_ids,
                           r_context_raw_lens: batch_r_context_lens,
                           drop_out: 0.0})
            evaluate.acc_hook(print_predict_y, batch_ys, epoch)
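# A small numpy sketch (not part of the training graph) of what
# entity_embs_ave computes above: padded id rows are embedded, summed over
# the word axis, and divided by the true mention length. It assumes padding
# ids map to a zero embedding row, so padding contributes nothing to the sum.
def average_entity_embedding_demo():
    emb = np.array([[0.0, 0.0],    # id 0: padding
                    [1.0, 2.0],    # id 1
                    [3.0, 4.0]])   # id 2
    entity_ids = np.array([[1, 2, 0]])    # one mention: two real words + pad
    entity_lens = np.array([[2.0]])       # true length, reshaped to (-1, 1)
    summed = emb[entity_ids].sum(axis=1)  # (1, 2); the pad row adds 0
    print(summed / entity_lens)           # [[2.0, 3.0]]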
def pattern_baseline():
    # Collect the unseen (test) label ids from all 10 CV folds.
    train_ids, dev_ids, test_ids = get_CV_info('CV_output.txt')
    all_unseen_label_ids = []
    for cv_id in range(0, 10):
        for e in test_ids[cv_id]:
            all_unseen_label_ids.append(e)
    unseen_label_ids = np.sort(all_unseen_label_ids)

    figer_test = figer_data_multi_label_batcher.figer_data_multi_label(
        entity_file='data/state_of_the_art_test_word_with_context.txt',
        context_file='data/state_of_the_art_test_tagged_context.txt',
        feature_file='data/state_of_the_art_test_Feature.npy',
        entity_type_feature_file='data/state_of_the_art_test_et_features.npy',
        entity_type_exact_feature_file='data/state_of_the_art_test_exact_et_features.npy',
        type_file='data/state_of_the_art_test_Types_with_context.npy')

    # Score a single test batch with the pattern-feature baseline.
    predict_ys = np.zeros([0, len(unseen_label_ids)])
    truth_ys = np.zeros([0, len(unseen_label_ids)])
    for i in range(0, 1):
        batch_data = figer_test.next_batch(unseen_label_ids)
        print_predict_y = baseline_predict(batch_data[-3])
        predict_ys = np.vstack((predict_ys, print_predict_y))
        truth_ys = np.vstack((truth_ys, batch_data[-1]))
    print(predict_ys.shape)
    print(truth_ys.shape)

    F1, test_type_f1s, _ = evaluate.acc_hook_f1_break_down(
        predict_ys, truth_ys, 0, 0, 'baseline.txt')

    # Weight each type's F1 by its relative frequency in the test set.
    with open('data/test_type_count.pkl', 'rb') as f:
        b = pickle.load(f)
    count_sum = 0
    for e in unseen_label_ids:
        count_sum += b[e][1]
    F1_weighted = 0.0
    p = 0
    all_test_id = data_utility.get_all_unseen_types()
    for i in all_test_id:
        if i in unseen_label_ids:
            F1_weighted += (b[i][1] / count_sum) * test_type_f1s[p]
            p += 1
    print('pattern baseline type averaged macro = {}'.format(F1_weighted))
    print('pattern baseline type micro = {}'.format(F1[2][2]))

    epoch_type_f1_info = data_utility.type_f1s()
    epoch_type_f1_info.add_test_auc(unseen_label_ids, batch_data[-1],
                                    print_predict_y)
    auc_array = []
    for k in epoch_type_f1_info.test_auc_dict:
        auc_array.append(epoch_type_f1_info.test_auc_dict[k])
    print('auc ave = {}'.format(np.average(auc_array)))
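# A worked sketch of the frequency-weighted macro-F1 used in
# pattern_baseline(): each type's F1 is weighted by its share of the test
# occurrences. The numbers here are made up for illustration.
def weighted_macro_f1_demo():
    counts = np.array([10.0, 30.0, 60.0])  # test occurrences per type
    f1s = np.array([0.5, 0.2, 0.8])        # per-type F1 scores
    weights = counts / counts.sum()        # 0.1, 0.3, 0.6
    print(np.dot(weights, f1s))            # 0.05 + 0.06 + 0.48 = 0.59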
def emb_baseline():
    # my_argparse() and create_model() are project-local helpers assumed to
    # be importable into this module's scope; they are not defined here.
    model_flag, feature_flag, entity_type_feature_flag, exact_entity_type_feature_flag, \
        type_only_feature_flag, id_select_flag, log_path, log_head = my_argparse()
    with open(log_path, 'w') as f:
        f.write('{}\n'.format(log_head))

    # 106/7 split: hold out 7 types for test; train (and dev) on the rest.
    if id_select_flag == 0:
        test_unseen_label_ids = [1, 0, 11, 76, 18, 13, 9]
        seen_label_ids = []
        dev_unseen_label_ids = []
        for i in range(0, 113):
            if i not in test_unseen_label_ids:
                seen_label_ids.append(i)
                dev_unseen_label_ids.append(i)
    elif id_select_flag == 1:
        test_unseen_label_ids = [2, 3, 8, 20, 5, 24, 38]
        seen_label_ids = []
        dev_unseen_label_ids = []
        for i in range(0, 113):
            if i not in test_unseen_label_ids:
                seen_label_ids.append(i)
                dev_unseen_label_ids.append(i)
    # all types seen
    elif id_select_flag == 2:
        seen_label_ids = range(0, 113)
        dev_unseen_label_ids = range(0, 113)
        test_unseen_label_ids = range(0, 113)
    # cross-validation splits
    elif id_select_flag >= 10 and id_select_flag < 20:
        train_ids, dev_ids, test_ids = get_CV_info('CV_output.txt')
        cv_id = id_select_flag % 10
        seen_label_ids = train_ids[cv_id]
        dev_unseen_label_ids = dev_ids[cv_id]
        test_unseen_label_ids = test_ids[cv_id]
    # cross-validation, dev on the seen types
    elif id_select_flag >= 20 and id_select_flag < 30:
        train_ids, dev_ids, test_ids = get_CV_info('CV_output.txt')
        cv_id = id_select_flag % 10
        seen_label_ids = train_ids[cv_id]
        dev_unseen_label_ids = seen_label_ids
        test_unseen_label_ids = test_ids[cv_id]

    seen_label_ids = np.sort(seen_label_ids)
    dev_unseen_label_ids = np.sort(dev_unseen_label_ids)
    test_unseen_label_ids = np.sort(test_unseen_label_ids)
    test_prior = np.full(test_unseen_label_ids.shape, 0.03)

    with tf.variable_scope("foo"):
        train_placeholders, train_train_step, train_loss, train_predict_y = \
            create_model(1, seen_label_ids=seen_label_ids, model_flag=model_flag,
                         feature_flag=feature_flag,
                         entity_type_feature_flag=entity_type_feature_flag,
                         exact_entity_type_feature_flag=exact_entity_type_feature_flag,
                         type_only_feature_flag=type_only_feature_flag)
        tf.get_variable_scope().reuse_variables()
        test_placeholders, _, test_loss, test_predict_y = \
            create_model(2, test_unseen_label_ids=test_unseen_label_ids,
                         model_flag=model_flag, feature_flag=feature_flag,
                         entity_type_feature_flag=entity_type_feature_flag,
                         exact_entity_type_feature_flag=exact_entity_type_feature_flag,
                         type_only_feature_flag=type_only_feature_flag)
        dev_placeholders, _, dev_loss, dev_predict_y = \
            create_model(3, dev_unseen_label_ids=dev_unseen_label_ids,
                         model_flag=model_flag, feature_flag=feature_flag,
                         entity_type_feature_flag=entity_type_feature_flag,
                         exact_entity_type_feature_flag=exact_entity_type_feature_flag,
                         type_only_feature_flag=type_only_feature_flag)

    figer = figer_data_multi_label_batcher.figer_data_multi_label()
    figer_test = figer_data_multi_label_batcher.figer_data_multi_label(
        entity_file='data/state_of_the_art_test_word_with_context.txt',
        context_file='data/state_of_the_art_test_tagged_context.txt',
        feature_file='data/state_of_the_art_test_Feature.npy',
        entity_type_feature_file='data/state_of_the_art_test_et_features.npy',
        entity_type_exact_feature_file='data/state_of_the_art_test_exact_et_features.npy',
        type_file='data/state_of_the_art_test_Types_with_context.npy')
    figer_dev = figer_data_multi_label_batcher.figer_data_multi_label(
        entity_file='data/state_of_the_art_dev_word_with_context.txt',
        context_file='data/state_of_the_art_dev_tagged_context.txt',
        feature_file='data/state_of_the_art_dev_Feature.npy',
        entity_type_feature_file='data/state_of_the_art_dev_et_features.npy',
        entity_type_exact_feature_file='data/state_of_the_art_dev_exact_et_features.npy',
        type_file='data/state_of_the_art_dev_Types_with_context.npy')

    training_F1s = []
    dev_F1s = []
    test_F1s = []
    config = tf.ConfigProto(device_count={'GPU': 0},
                            intra_op_parallelism_threads=4,
                            inter_op_parallelism_threads=4)
    type_f1_info = []
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(0, 5):
            figer.shuffle()
            epoch_type_f1_info = data_utility.type_f1s()
            # The two trailing feeds are the dropout rates (0.5 for training,
            # 0.0 for evaluation).
            for i in range(0, 2000):
                batch_data = figer.next_batch(seen_label_ids)
                feed_dict = dict(zip(train_placeholders,
                                     list(batch_data) + [0.5] + [0.0]))
                _, print_loss = sess.run([train_train_step, train_loss], feed_dict)
                with open('temp.txt', 'w') as f:
                    f.write('training epoch: {} batch {} : {}\n'.format(
                        epoch, i, print_loss))

            # training performance
            predict_ys = np.zeros([0, len(seen_label_ids)])
            truth_ys = np.zeros([0, len(seen_label_ids)])
            for i in range(0, 200):
                batch_data = figer.next_batch(seen_label_ids)
                feed_dict = dict(zip(train_placeholders,
                                     list(batch_data) + [0.0] + [0.0]))
                print_predict_y = sess.run(train_predict_y, feed_dict)
                predict_ys = np.vstack((predict_ys, print_predict_y))
                truth_ys = np.vstack((truth_ys, batch_data[-1]))
                with open('temp.txt', 'w') as f:
                    f.write('testing train batch {} : {}\n'.format(i, print_loss))
            F1, train_type_f1s, train_F1_num = evaluate.acc_hook_f1_break_down(
                predict_ys, truth_ys, epoch, 1, log_path)
            training_F1s.append(F1)
            epoch_type_f1_info.add_f1s(seen_label_ids, train_type_f1s,
                                       train_F1_num, 0)

            # dev performance
            predict_ys = np.zeros([0, len(dev_unseen_label_ids)])
            truth_ys = np.zeros([0, len(dev_unseen_label_ids)])
            for i in range(0, 10):
                batch_data = figer_dev.next_batch(dev_unseen_label_ids)
                feed_dict = dict(zip(dev_placeholders,
                                     list(batch_data) + [0.0] + [0.0]))
                print_predict_y = sess.run(dev_predict_y, feed_dict)
                predict_ys = np.vstack((predict_ys, print_predict_y))
                truth_ys = np.vstack((truth_ys, batch_data[-1]))
                with open('temp.txt', 'w') as f:
                    f.write('testing dev batch {} : {}\n'.format(i, print_loss))
            F1, dev_type_f1s, dev_F1_num = evaluate.acc_hook_f1_break_down(
                predict_ys, truth_ys, epoch, 2, log_path)
            dev_F1s.append(F1)
            epoch_type_f1_info.add_f1s(dev_unseen_label_ids, dev_type_f1s,
                                       dev_F1_num, 1)

            # test performance
            batch_data = figer_test.next_batch(test_unseen_label_ids)
            feed_dict = dict(zip(test_placeholders,
                                 list(batch_data) + [0.0] + [0.0]))
            print_predict_y = sess.run(test_predict_y, feed_dict)
            np.save('temp_predict', print_predict_y)
            np.save('temp_truth', batch_data[-1])
            F1, test_type_f1s, test_F1_num = evaluate.acc_hook_f1_break_down(
                print_predict_y, batch_data[-1], epoch, 0, log_path,
                prior=test_prior)
            test_F1s.append(F1)
            epoch_type_f1_info.add_f1s(test_unseen_label_ids, test_type_f1s,
                                       test_F1_num, 2)
            epoch_type_f1_info.add_test_auc(test_unseen_label_ids,
                                            batch_data[-1], print_predict_y)
            type_f1_info.append(epoch_type_f1_info)

    if log_path != 'loss_record.txt':
        print('./base_line_type_f1_file/' + log_path[22:-4] + '.pickle')
        with open('./base_line_type_f1_file/' + log_path[22:-4] + '.pickle',
                  'wb') as outfile:
            pickle.dump(type_f1_info, outfile)

    # Select the epoch with the best dev micro-F1 and report its test scores,
    # along with the best test scores over all epochs.
    dev_max_id = np.argmax(dev_F1s, 0)[2][2]
    max_dev_test_micro = test_F1s[dev_max_id][2][2]
    max_test_micro = np.amax(test_F1s, 0)[2][2]
    max_dev_test_macro = test_F1s[dev_max_id][1][2]
    max_test_macro = np.amax(test_F1s, 0)[1][2]
    with open(log_path, 'a') as f:
        f.write('max__d_e_v__t_e_s_t__macro__micro= {:.4f}\t{:.4f}\n'.format(
            max_dev_test_macro, max_dev_test_micro))
        f.write('max__t_e_s_t__macro__micro= {:.4f}\t{:.4f}\n'.format(
            max_test_macro, max_test_micro))
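# A minimal entry-point sketch; which routine to run depends on the
# experiment, so the call below is only an example.
if __name__ == '__main__':
    emb_baseline()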