import json
import os
import subprocess as sp

import matplotlib.pyplot as plt
import numpy as np

# The helper functions called below (merge_two_dicts, transform_data_and_train,
# load_gold, load_dictionary, prepare_indri_query_top_k_features,
# get_query_prefix, get_query_suffix, get_metric_values) are assumed to be
# defined elsewhere in this module.


def main():
    # Variant: sweep the Indri query window size over a fixed set of values
    # and average the metrics over the first `number_of_files` query files.
    # Load the training feature data.
    dir_path = os.path.dirname(os.path.realpath(__file__))
    data = json.load(open(dir_path + "/../../../feats_all_ng_dep_prior.json"))
    output_file = open("window_size_output_file.tsv", "w")
    query_folder = "/home/smsarwar/PycharmProjects/civilian_killing/data/query_configurations/iterative_query_lr_dir"
    onlyfiles = [
        os.path.join(query_folder, file)
        for file in os.listdir(query_folder)
        if os.path.isfile(os.path.join(query_folder, file))
    ]
    number_of_files = 50
    window_sizes = np.asarray([2, 3, 5, 10, 15, 20, 25, 30])
    # res accumulates P@10, P@20, P@30, and #civilians per window size,
    # summed over query files; running averages are printed per file below.
    res = np.zeros((4, len(window_sizes)))
    for idx, file in enumerate(onlyfiles[:number_of_files]):
        print file
        lines = [line.strip() for line in open(file).readlines()]
        # Use the sentence ids on line 21 of the query-configuration file.
        sentence_ids = lines[20].split()

        # allfeats holds one binary feature dict per selected training sentence.
        allfeats = []
        for item in data.keys():
            if item in sentence_ids:
                feats = {}
                for feature in data[item]:
                    feats[feature] = 1
                allfeats.append(feats)

        # prior_feats contains every feature that occurs in the training data.
        prior_feats = {}
        for item in data.keys():
            prior_feats = merge_two_dicts(prior_feats, data[item])
        allfeats.append(prior_feats)

        features, weights = transform_data_and_train(allfeats)
        name_set = load_gold()
        id_dict, train_id_dict = load_dictionary()

        terms = []
        p10 = []
        p20 = []
        p30 = []
        num_civilian = []
        query_number = 0
        number_of_query_terms = min(len(features), 200)
        for window_size in window_sizes:
            # Write a one-query Indri parameter file for this window size.
            file_temp = open("indri_lr_query_file.xml", "w")
            file_temp.write(get_query_prefix())
            query = prepare_indri_query_top_k_features(
                features, weights, query_number,
                window_size=window_size,
                topk=number_of_query_terms, prf=False)
            file_temp.write(query + '\n')
            query_number += 1
            file_temp.write(get_query_suffix())
            file_temp.close()

            print "now querying"
            cmd = "IndriRunQuery indri_lr_query_file.xml"
            output = sp.check_output(cmd.split())
            precs10, precs20, precs30, number_of_civilians_found, name_of_civilians = get_metric_values(
                output.split("\n"), id_dict, name_set, train_id_dict)
            terms.append(window_size)
            p10.append(precs10)
            p20.append(precs20)
            p30.append(precs30)
            num_civilian.append(number_of_civilians_found)

        res[0] += np.asarray(p10)
        res[1] += np.asarray(p20)
        res[2] += np.asarray(p30)
        res[3] += np.asarray(num_civilian)

        # Print and log the running averages over the files processed so far.
        res_P10 = ''
        res_P20 = ''
        res_P30 = ''
        res_num_civilian = ''
        for index in np.arange(len(window_sizes)):
            res_P10 += str(res[0][index] / (idx + 1)) + "\t"
            res_P20 += str(res[1][index] / (idx + 1)) + "\t"
            res_P30 += str(res[2][index] / (idx + 1)) + "\t"
            res_num_civilian += str(res[3][index] / (idx + 1)) + "\t"
        print window_sizes
        print res_P10.strip()
        print res_P20.strip()
        print res_P30.strip()
        print res_num_civilian.strip()
        output_file.write(str(window_sizes) + "\n")
        output_file.write(res_P10.strip() + "\n")
        output_file.write(res_P20.strip() + "\n")
        output_file.write(res_P30.strip() + "\n")
        output_file.write(res_num_civilian.strip() + "\n")
        output_file.write("-----------------------------------------------------\n")
    output_file.close()
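# merge_two_dicts is not defined in this section. A minimal sketch of what it
# is assumed to do, based on the standard Python 2 dict-merge idiom; the real
# implementation may differ (e.g., in how duplicate keys are resolved):
def merge_two_dicts(x, y):
    """Return a new dict with y's entries layered over x's."""
    z = x.copy()
    z.update(y)
    return z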
def main():
    # Variant: write one batch Indri parameter file per query-configuration
    # file, with one query per line of sentence ids. The querying and
    # evaluation steps are disabled in this variant (see the comment block
    # at the end of the loop).
    plotting = False
    # Load the training feature data.
    dir_path = os.path.dirname(os.path.realpath(__file__))
    data = json.load(open(dir_path + "/../../../feats_all_ng_dep_prior.json"))
    query_folder = "/home/smsarwar/PycharmProjects/civilian_killing/data/query_configurations/iterative_query_lr_dir"
    onlyfiles = [
        os.path.join(query_folder, file)
        for file in os.listdir(query_folder)
        if os.path.isfile(os.path.join(query_folder, file))
    ]
    number_of_files = 50  # unused in this variant; all files are processed
    # results would accumulate P@10, P@20, P@30, and #civilians per query;
    # it stays zero here because the evaluation block is disabled.
    results = np.zeros((30, 4))
    for idx, file in enumerate(onlyfiles):
        query_number = 1
        file_temp = open("indri_lr_query_file.xml", "w")
        file_temp.write(get_query_prefix())
        for line in open(file):
            sentence_ids = line.strip().split()

            # allfeats holds one binary feature dict per selected sentence.
            allfeats = []
            for item in data.keys():
                if item in sentence_ids:
                    feats = {}
                    for feature in data[item]:
                        feats[feature] = 1
                    allfeats.append(feats)

            # prior_feats contains every feature in the training data.
            prior_feats = {}
            for item in data.keys():
                prior_feats = merge_two_dicts(prior_feats, data[item])
            allfeats.append(prior_feats)

            features, weights = transform_data_and_train(allfeats)
            name_set = load_gold()
            id_dict, train_id_dict = load_dictionary()
            number_of_query_terms = min(len(features), 200)

            query = prepare_indri_query_top_k_features(
                features, weights, query_number,
                topk=number_of_query_terms, prf=False)
            file_temp.write(query + '\n')
            query_number += 1
        file_temp.write(get_query_suffix())
        file_temp.close()

        # Disabled: run IndriRunQuery on the batch file, save the run under
        # data/runs/iterative_lr_ir_run_dir/<file>.run, score it with
        # get_metric_values (or entity_level_evaluation.py), and accumulate
        # the precision/civilian counts into `results`.
        # cmd = "IndriRunQuery indri_lr_query_file.xml"
        # output = sp.check_output(cmd.split())
        # run_file = open("../../../data/runs/iterative_lr_ir_run_dir/" +
        #                 os.path.basename(file) + ".run", "w")
        # run_file.write(output)
        # run_file.close()
    print results
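# The Indri parameter-file helpers are also not shown here. Hypothetical
# sketches, for illustration only: get_query_prefix/get_query_suffix are
# assumed to emit the surrounding <parameters> XML that IndriRunQuery expects
# (the real templates likely also set the index path, result count, etc.),
# and prepare_indri_query_top_k_features is assumed to wrap the top-k
# weighted features in a #weight query inside a <query> element (the real
# function also handles the window_size and prf arguments).
def get_query_prefix():
    return "<parameters>\n"


def get_query_suffix():
    return "</parameters>\n"


def prepare_indri_query_top_k_features(features, weights, query_number=0,
                                       window_size=None, topk=200, prf=False):
    # Rank features by descending weight and keep the top k.
    ranked = sorted(zip(features, weights), key=lambda fw: -fw[1])[:topk]
    body = " ".join("%f %s" % (w, f) for f, w in ranked)
    return ("<query><number>%d</number><text>#weight( %s )</text></query>"
            % (query_number, body))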
def main():
    # Variant: train on the full feature set, then sweep the number of query
    # terms and plot the retrieval metrics against it.
    # Load the training feature data.
    dir_path = os.path.dirname(os.path.realpath(__file__))
    data = json.load(open(dir_path + "/../../../feats_all_ng_dep_prior.json"))

    # allfeats holds one binary feature dict per training sentence.
    allfeats = []
    print len(data.keys())
    for item in data.keys():
        feats = {}
        for feature in data[item]:
            feats[feature] = 1
        allfeats.append(feats)

    # prior_feats contains every feature that occurs in the training data.
    prior_feats = {}
    for item in data.keys():
        prior_feats = merge_two_dicts(prior_feats, data[item])
    print 'length of the feature set ' + str(len(prior_feats))
    allfeats.append(prior_feats)

    features, weights = transform_data_and_train(allfeats)
    name_set = load_gold()
    id_dict, train_id_dict = load_dictionary()

    # Sweep the number of query terms: 1, 101, 201, ..., 1001.
    query_terms = np.arange(1, 1002, 100)
    terms = []
    p10 = []
    p20 = []
    p30 = []
    num_civilian = []
    for number_of_query_terms in query_terms:
        # `query_file` avoids shadowing the Python 2 builtin `file`.
        query_file = open("indri_lr_query_file.xml", "w")
        query_file.write(prepare_indri_query_top_k_features(
            features, weights, topk=number_of_query_terms))
        query_file.close()

        print "now querying"
        cmd = "IndriRunQuery indri_lr_query_file.xml"
        output = sp.check_output(cmd.split())
        precs10, precs20, precs30, number_of_civilians_found, name_of_civilians = get_metric_values(
            output.split("\n"), id_dict, name_set, train_id_dict)
        terms.append(number_of_query_terms)
        p10.append(precs10)
        p20.append(precs20)
        p30.append(precs30)
        num_civilian.append(number_of_civilians_found)

    print p10
    print p20
    print p30
    print num_civilian

    # Plot the four metrics against the number of query terms.
    f, axarr = plt.subplots(2, 2)
    axarr[0, 0].plot(terms, p10, marker='o', markerfacecolor='black',
                     markersize=6, color='darkgray', linewidth=2,
                     label='P@10', linestyle='-.')
    axarr[0, 0].set_title('Precision@10')
    axarr[0, 1].plot(terms, p20, marker='v', markerfacecolor='blue',
                     markersize=6, color='gray', linewidth=2,
                     label='P@20', linestyle='--')
    axarr[0, 1].set_title('Precision@20')
    axarr[1, 0].plot(terms, p30, marker='x', markerfacecolor='red',
                     markersize=6, color='black', linewidth=2,
                     label='P@30', linestyle=':')
    axarr[1, 0].set_title('Precision@30')
    axarr[1, 1].plot(terms, num_civilian, marker='s', markerfacecolor='red',
                     markersize=6, color='black', linewidth=2,
                     label='#civilians', linestyle=':')
    axarr[1, 1].set_title('Relevant entities')
    f.text(0.5, 0.005, 'Number of Features in Query', ha='center', fontsize=10)
    plt.legend()
    plt.show()
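# Entry point. Note that the module defines main() several times above as
# alternative experiment drivers; only the last definition is bound to the
# name at run time, so comment variants in or out to select an experiment.
if __name__ == '__main__':
    main()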