def train_data_generation_samestructure_graphq(propertys, files, qid_abstractquestions):
    """Build path-match training data (gold path + negatives) for GraphQuestions.

    For every structure file, grounded candidate paths with the same edge
    count as the gold path are collected as negatives, then topped up with
    random property combinations until ``model_parameters.neg_size`` is
    reached.  The result is written as JSON under
    ``fn_graph_file.path_match_dir``.

    :param propertys: list of KB property names used for random negatives
    :param files: structure-file paths; the qid is derived from the file name
    :param qid_abstractquestions: qid -> abstract question text
    """
    data_for_train_list = list()
    for i, file in enumerate(files):
        print(i, file)
        data = read_structure_file(file)
        qid = file.split('/')[-1].split('.')[0]
        if len(qid_abstractquestions[qid]) == 0:
            continue
        negatives = list()
        j = 0  # number of negatives collected so far
        # join=True
        for structure in data:
            gold_path = []
            predicates = []
            for edge in structure.gold_sparql_query['edges']:
                gold_path.append(edge['relation'])
                predicates.append(edge['relation'])
            # canonical (sorted) order so path comparison is order-insensitive
            gold_path.sort()
            gold_path = '\t'.join(gold_path)
            for ungrounded_graph in structure.ungrounded_graph_forest:
                for grounded_graph in ungrounded_graph.grounded_graph_forest:
                    ps = grounded_graph.key_path.split('\t')
                    ps.sort()
                    path = '\t'.join(ps)
                    # a negative must have the gold edge count but differ from gold
                    if j < model_parameters.neg_size and len(ps) == len(
                            predicates) and path != gold_path:
                        negatives.append(path)
                        j += 1
        if j > 0:
            if j < model_parameters.neg_size:
                # Top up with random property combinations of gold arity.
                # NOTE(review): this can spin for a long time if propertys
                # offers too few distinct combinations — confirm inputs.
                while j < model_parameters.neg_size:
                    # BUGFIX: the original loop reused ``i`` and shadowed the
                    # enumerate index of the file loop; use a throwaway name.
                    candidate = [random.choice(propertys)
                                 for _ in range(len(predicates))]
                    candidate.sort()
                    candidate = "\t".join(candidate)
                    if candidate != gold_path and candidate not in negatives:
                        negatives.append(candidate)
                        j += 1
            one = dict()
            one["qid"] = qid
            one["abstractquestion"] = (qid_abstractquestions[qid])
            one["gold_path"] = gold_path
            one["negatives"] = negatives
            data_for_train_list.append(one)
        else:
            print('not join', qid)
    write_json(
        data_for_train_list,
        fn_graph_file.path_match_dir + "data_for_trainorval_list_samestructure.json")
def generate_graphq_gold_triple_path_annotation(graphq_annotation_list, graphq_gold_nodes, output_file):
    """Attach gold path and reverse-path variants to GraphQuestions node annotations.

    Mutates the matching entries of ``graphq_gold_nodes`` in place (their
    'gold' dict gains 'path' and 'reverse_paths_list') and writes the whole
    list to ``fn_graph_file.score12_match + output_file``.

    :param graphq_annotation_list: structs with .qid, .nodes and .edges
    :param graphq_gold_nodes: dicts with 'qid' and a 'gold' sub-dict
    :param output_file: output file name under fn_graph_file.score12_match
    """
    qid_graphq_nodes = {}
    for one in graphq_gold_nodes:
        qid_graphq_nodes[one['qid']] = one
    # IDIOM: iterate the list directly instead of range(len(...)) indexing.
    for graphq_struct_one in graphq_annotation_list:
        qid = graphq_struct_one.qid
        print(qid)
        if qid not in qid_graphq_nodes:
            continue
        triples_ = score12_utils.get_triples_by_grounded_graph_edges_graphq(
            nodes=graphq_struct_one.nodes, edges=graphq_struct_one.edges)
        reverse_triples_list = triple_enum.get_all_reverse_triples(
            triples=triples_, property_reverse_dict=property_reverse_dict)
        reverse_paths_list = [
            score12_utils.triples_to_path_list(triples=reverse_triples, _root_id='?x')
            for reverse_triples in reverse_triples_list
        ]
        qid_graphq_nodes[qid]['gold']['path'] = score12_utils.triples_to_path_list(
            triples=triples_, _root_id='?x')
        qid_graphq_nodes[qid]['gold']['reverse_paths_list'] = reverse_paths_list
    write_json(graphq_gold_nodes, fn_graph_file.score12_match + output_file)
def generate_trainset():
    """Build question-match training pairs and write them to trainset.json.

    For every predicate, questions sharing that predicate form positive pairs
    (label 1); each positive is accompanied by 50 randomly sampled negatives
    (label 0) drawn from abstract questions of other predicates.
    """
    trainset = []
    train_predicate_qids = read_json(data_question_match + 'train_predicate_qids.json')
    qid_abstractquestions = read_json(data_question_match + 'qid_abstractquestion.json')
    abstractquestion_all = set()
    for predicate in train_predicate_qids:
        for qid in train_predicate_qids[predicate]:
            if qid in qid_abstractquestions:
                abstractquestion_all.add(qid_abstractquestions[qid])
    for k, predicate in enumerate(train_predicate_qids):
        print(k, predicate)
        same_abstractquestions = set()
        for qid in train_predicate_qids[predicate]:
            if qid in qid_abstractquestions:
                same_abstractquestions.add(qid_abstractquestions[qid])
        # negatives pool: everything NOT sharing this predicate
        residu_abstractquestions = list(abstractquestion_all - same_abstractquestions)
        # cap positives per predicate to limit pair explosion
        same_abstractquestions = list(same_abstractquestions)[:10]
        # IDIOM: the original enumerate indices (first/second) were unused.
        for current in same_abstractquestions:
            for gold in same_abstractquestions:
                if current != gold:
                    random.shuffle(residu_abstractquestions)
                    neg_samples = residu_abstractquestions[:50]
                    trainset.append([current, gold, 1])
                    for neg in neg_samples:
                        trainset.append([current, neg, 0])
    write_json(trainset, data_question_match + 'trainset.json')
def generate_graphq_gold_node_annotation_w_deppath(is_deppath=False):
    """Attach gold node (and optionally dependency-path) annotations to the
    GraphQuestions train/test splits and write the results to JSON.

    :param is_deppath: when True also compute an abstract-question dependency
        path from gold nodes ('Skeleton' mode)
    """
    from method_sp.parsing import node_recognition
    train_result = []
    test_result = []
    questionnormal_function_abstract_question_nodes = dict()
    for one in annotation_node_questions_json:
        # IDIOM: entity and literal mentions were kept by two identical
        # branches; merged into one condition (uri must be non-None either way).
        nodes = [
            node for node in one['node_mention_nju']
            if node['tag'] in ('entity', 'literal') and node['uri'] is not None
        ]
        ann_dict = {
            'function': 'none',
            'abstract_question': one['abstract_question'],
            'nodes': nodes
        }
        if is_deppath:
            ungrounded_nodes = node_recognition.generate_gold_nodes(
                question_normal=one['question_normal'])
            abstract_question_deppath_list = dep_to_path.get_deppath_list(
                question_normal=one['question_normal'],
                ungrounded_nodes=ungrounded_nodes,
                isSkeletonorDep='Skeleton')
            ann_dict['abstract_question_deppath'] = abstract_question_deppath_list
        questionnormal_function_abstract_question_nodes[one['question_normal']] = ann_dict
    # NOTE(review): only structs whose question has an annotation are kept,
    # so every emitted entry carries a 'gold' dict — downstream code indexes
    # ['gold'] unconditionally; confirm this matches the intended coverage.
    for one in train_graph_questions_struct:
        new_one = {}
        new_one['qid'] = one.qid
        new_one['question_normal'] = one.question
        if one.question in questionnormal_function_abstract_question_nodes:
            new_one['gold'] = questionnormal_function_abstract_question_nodes[one.question]
            new_one['gold']['function'] = one.function
            train_result.append(new_one)
    for one in test_graph_questions_struct:
        new_one = {}
        new_one['qid'] = one.qid
        new_one['question_normal'] = one.question
        if new_one['question_normal'] in questionnormal_function_abstract_question_nodes:
            new_one['gold'] = questionnormal_function_abstract_question_nodes[one.question]
            new_one['gold']['function'] = one.function
            test_result.append(new_one)
    write_json(train_result,
               fn_graph_file.score12_match + "train_graphq_gold_node_0124.json")
    write_json(test_result,
               fn_graph_file.score12_match + "test_graphq_gold_node_0124.json")
def generate_cwq_gold_triple_path_annotation(cwq_annotation_list,
                                             qid_to_grounded_graph_dict,
                                             cwq_gold_nodes,
                                             output_file,
                                             is_deppath=False):
    """Attach gold path, reverse paths and (optionally) dependency paths to
    ComplexWebQuestions node annotations, then write them to JSON.

    Mutates the matching entries of ``cwq_gold_nodes`` in place and writes the
    whole list to ``fn_cwq_file.score12_match + output_file``.

    :param cwq_annotation_list: question structs with .ID and .question
    :param qid_to_grounded_graph_dict: qid -> gold grounded graph
    :param cwq_gold_nodes: dicts with 'qid' and a 'gold' sub-dict
    :param output_file: output file name under fn_cwq_file.score12_match
    :param is_deppath: when True also compute dependency paths ('Dep' mode)
    """
    from method_sp.parsing import node_recognition
    qid_graphq_nodes = {}
    for one in cwq_gold_nodes:
        qid_graphq_nodes[one['qid']] = one
    print(len(cwq_annotation_list))
    # IDIOM: enumerate instead of range(len(...)) indexing.
    for i, graphq_struct_one in enumerate(cwq_annotation_list):
        qid = graphq_struct_one.ID
        if qid not in qid_graphq_nodes:
            continue
        if qid not in qid_to_grounded_graph_dict:
            continue
        print(i, qid)
        if is_deppath:
            question_normal = graphq_struct_one.question
            # gold variant (disabled):
            # ungrounded_nodes = node_recognition.generate_gold_nodes(question_normal=question_normal)
            # system variant: nodes predicted from whitespace tokens
            tokens = parsing_utils.create_tokens(question_normal.split(" "))
            ungrounded_nodes = node_recognition.generate_nodes(
                question_normal=question_normal, tokens=tokens)
            abstract_question_deppath_list = dep_to_path.get_deppath_list(
                question_normal=question_normal,
                ungrounded_nodes=ungrounded_nodes,
                isSkeletonorDep='Dep')
            qid_graphq_nodes[qid]['gold'][
                'abstract_question_deppath'] = abstract_question_deppath_list
        gold_grounded_graph = qid_to_grounded_graph_dict[qid]
        triples_ = score12_utils.get_triples_by_grounded_graph_edges(
            nodes=gold_grounded_graph.nodes, edges=gold_grounded_graph.edges)
        reverse_triples_list = triple_enum.get_all_reverse_triples(
            triples=triples_, property_reverse_dict=property_reverse_dict)
        # IDIOM: the original enumerate index was unused.
        reverse_paths_list = [
            score12_utils.triples_to_path_list(triples=reverse_triples, _root_id='?x')
            for reverse_triples in reverse_triples_list
        ]
        qid_graphq_nodes[qid]['gold']['path'] = score12_utils.triples_to_path_list(
            triples=triples_, _root_id='?x')
        qid_graphq_nodes[qid]['gold']['reverse_paths_list'] = reverse_paths_list
    write_json(cwq_gold_nodes, fn_cwq_file.score12_match + output_file)
def run_lcquad(data_type, output_file):
    """Dump IR annotation data (gold info plus 1-hop/2-hop candidates) for LC-QuAD.

    :param data_type: 'train' or 'test' — selects the LC-QuAD split
    :param output_file: path of the JSON file written at the end
    """
    from datasets_interface.question_interface import lcquad_1_0_interface
    ann_data_list = []
    lcquad_list = []
    if data_type == 'train':
        lcquad_list = lcquad_1_0_interface.lcquad_train_list
    elif data_type == 'test':
        lcquad_list = lcquad_1_0_interface.lcquad_test_list
    # IDIOM: the original enumerate index was unused.
    for lcquad_struct in lcquad_list:
        question_normal = lcquad_struct.question_normal
        print(lcquad_struct.qid)
        entities_list = lcquad_1_0_interface.get_topic_entities_by_question(question_normal)
        abstract_question = lcquad_1_0_interface.get_abstract_question_by_question(question=question_normal)
        parsed_sparql = lcquad_struct.parsed_sparql
        sparql = lcquad_struct.sparql
        gold_triples = ir_online_utils.get_triples_by_sparql_json(parsed_sparql)
        gold_path = ir_online_utils.convert_triples_to_path(triples=gold_triples)
        gold = {}
        gold['question_type'] = ir_online_utils.get_question_type_by_sparql_json(sparql_json=parsed_sparql)
        gold['topic_entities'] = entities_list
        gold['aggregation_function'] = ir_online_utils.get_aggregation_function_by_sparql_json(sparq_json=parsed_sparql)
        gold['type_constraints'] = ir_online_utils.get_type_constraints_by_sparql_json(sparql_json=parsed_sparql)
        gold['gold_path'] = gold_path
        gold['gold_triples'] = gold_triples
        gold['sparql'] = sparql
        data = {
            'qid': lcquad_struct.qid,
            'question_normal': lcquad_struct.question_normal,
            'abstract_question': abstract_question,
            'gold': gold,
            'no_positive_path': True,
            'hop1': [],
            'hop2': []
        }
        topic_entities_with_types = ir_online_utils.topic_entities_with_t(entities_list=entities_list)
        # hop1, hop2 = _get_hop1_hop2_by_enum_grounded_graphs(topic_entities_with_types)
        try:
            hop1, hop2 = _get_hop1_hop2_by_online(topic_entities_with_types=topic_entities_with_types)
        except Exception as e:
            hop1, hop2 = [], []
            # BUGFIX: the caught exception was silently discarded; include it
            # so failures are diagnosable, not just the failing question.
            print('Error:\t', question_normal, e)
        data['hop1'] = hop1
        data['hop2'] = hop2
        # NOTE(review): the result of is_exist_gold_path is stored under the
        # key 'no_positive_path' — the name looks inverted; confirm consumers.
        data['no_positive_path'] = ir_online_utils.is_exist_gold_path(hop_list=hop1 + hop2, gold_path=gold_path)
        ann_data_list.append(data)
    write_json(ann_data_list, output_file)
def grounded_graphes_by_score_standard_ywsun_prediction_test(input_file):
    """For every question structure file, emit the denotation of the grounded
    query with the highest total_score and write all predictions to JSON."""
    from common.hand_files import write_json
    prediction_list = []
    for structure_path in os.listdir(input_file):
        print(structure_path)
        structure_list = read_structure_file(input_file + structure_path)
        score_to_query_ids = collections.defaultdict(list)
        query_id_to_denotation = collections.defaultdict(set)
        qid = None
        for structure in structure_list:
            qid = structure.qid
            for ungrounded_graph in structure.ungrounded_graph_forest:
                for grounded_graph in ungrounded_graph.get_grounded_graph_forest():
                    # word-level matcher ranking uses total_score
                    score_to_query_ids[grounded_graph.total_score].append(
                        grounded_graph.grounded_query_id)
                    query_id_to_denotation[
                        grounded_graph.grounded_query_id] = grounded_graph.denotation
        # rank score groups best-first; only the top group's first query is kept
        ranked_groups = sorted(score_to_query_ids.items(),
                               key=lambda kv: kv[0], reverse=True)
        answers = []
        if ranked_groups:
            best_ids = ranked_groups[0][1]
            if best_ids:
                answers = query_id_to_denotation[best_ids[0]]
        q_dict = dict()
        q_dict['ID'] = qid
        q_dict['answers_id'] = answers
        prediction_list.append(q_dict)
    write_json(prediction_list, './20191113_cwq_wo_wordlevel_prediction_test.json')
def run_cwq(data_type, output_file):
    """Dump IR annotation data (gold info plus 1-hop/2-hop candidates) for
    ComplexWebQuestions.

    :param data_type: 'train', 'test' or 'dev' — selects the CWQ split
    :param output_file: path of the JSON file written at the end
    """
    from datasets_interface.question_interface import complexwebquestion_interface
    ann_data_list = []
    complexwebq_struct_list = []
    if data_type == 'train':
        complexwebq_struct_list = complexwebquestion_interface.complexwebq_train_list
    elif data_type == 'test':
        complexwebq_struct_list = complexwebquestion_interface.complexwebq_test_list
    elif data_type == 'dev':
        complexwebq_struct_list = complexwebquestion_interface.complexwebq_dev_list
    # IDIOM: the original enumerate index was unused.
    for complexwebq_struct in complexwebq_struct_list:
        question_normal = complexwebq_struct.question
        print(complexwebq_struct.ID)
        entities_list = complexwebquestion_interface.get_topic_entities_by_question(question_normal)
        abstract_question = complexwebquestion_interface.get_abstract_question_by_question(question=question_normal)
        parsed_sparql = complexwebq_struct.parsed_sparql
        sparql = complexwebq_struct.sparql
        gold_triples = ir_online_utils.get_triples_by_sparql_json(parsed_sparql)
        gold_path = ir_online_utils.convert_triples_to_path(triples=gold_triples)
        gold = {}
        gold['question_type'] = ir_online_utils.get_question_type_by_sparql_json(sparql_json=parsed_sparql)
        gold['topic_entities'] = entities_list
        gold['aggregation_function'] = ir_online_utils.get_aggregation_function_by_sparql_json(sparq_json=parsed_sparql)
        gold['type_constraints'] = ir_online_utils.get_type_constraints_by_sparql_json(sparql_json=parsed_sparql)
        gold['gold_path'] = gold_path
        gold['gold_triples'] = gold_triples
        gold['sparql'] = sparql
        data = {
            'qid': complexwebq_struct.ID,
            'question_normal': question_normal,
            'abstract_question': abstract_question,
            'gold': gold,
            'no_positive_path': True,
            'hop1': [],
            'hop2': []
        }
        topic_entities_with_types = ir_online_utils.topic_entities_with_t(entities_list=entities_list)
        # hop1, hop2 = _get_hop1_hop2_by_enum_grounded_graphs(topic_entities_with_types)
        hop1, hop2 = _get_hop1_hop2_by_online(topic_entities_with_types=topic_entities_with_types)
        data['hop1'] = hop1
        data['hop2'] = hop2
        # NOTE(review): key name 'no_positive_path' vs is_exist_gold_path
        # looks inverted — confirm downstream use (same pattern as run_lcquad).
        data['no_positive_path'] = ir_online_utils.is_exist_gold_path(hop_list=hop1 + hop2, gold_path=gold_path)
        ann_data_list.append(data)
        # BUGFIX: removed a leftover debug ``break`` that stopped the loop
        # after the first question; the sibling run_lcquad/run_graphquestions
        # process every item of their split.
    write_json(ann_data_list, output_file)
def generate_cwq_train_candidates_paths_from_structure(
        cwq_gold_path_list, train_candidates_sp_path_top_path, output_file):
    """Merge candidate hop1..hop4 paths from structure files into the gold
    annotations and resolve the gold path against its reverse variants.

    :param cwq_gold_path_list: gold annotations holding 'gold'.'path' and
        'gold'.'reverse_paths_list'
    :param train_candidates_sp_path_top_path: directory of '<qid>.json'
        structure files with candidate grounded graphs
    :param output_file: file name written under fn_cwq_file.score12_match
    """
    files = os.listdir(train_candidates_sp_path_top_path)
    new_cwq_path_list = []
    for one in cwq_gold_path_list:
        print(one['qid'])
        if str(one['qid']) + '.json' not in files:
            continue
        if 'path' not in one['gold']:
            continue
        new_one = dict()
        new_one['qid'] = one['qid']
        new_one['question_normal'] = one['question_normal']
        new_one['gold'] = one['gold']
        test_candidates_sp = read_structure_file(
            train_candidates_sp_path_top_path + str(one['qid']) + '.json')
        test_candidates_sp = test_candidates_sp[0]
        # the last ungrounded graph carries the final candidate forest
        ungrounded_graph = test_candidates_sp.ungrounded_graph_forest[-1]
        hop1, hop2, hop3, hop4 = score12_utils.grounded_graph_list_to_path_list(
            ungrounded_graph.get_grounded_graph_forest())
        hops = []
        if len(hop1) > 0:
            new_one['gold']['hop1'] = hop1
            hops += hop1
        if len(hop2) > 0:
            new_one['gold']['hop2'] = hop2
            hops += hop2
        if len(hop3) > 0:
            new_one['gold']['hop3'] = hop3
            hops += hop3
        if len(hop4) > 0:
            new_one['gold']['hop4'] = hop4
            hops += hop4
        # Align the gold path with whichever reverse variant occurs among the
        # candidate hops.  The outer scan is deliberately not short-circuited,
        # so the LAST matching variant wins (kept for behavioural parity).
        # IDIOM: the original enumerate index over reverse_paths_list was unused.
        goldpath = None
        for hop in hops:
            for temp_goldpath in new_one['gold']['reverse_paths_list']:
                if score12_utils.eq_paths(temp_goldpath, hop):
                    goldpath = temp_goldpath
                    break
        if goldpath is not None:
            new_one['gold']['path'] = goldpath
        # reverse variants are an intermediate artefact; drop them from output
        del new_one['gold']['reverse_paths_list']
        new_cwq_path_list.append(new_one)
    write_json(new_cwq_path_list, fn_cwq_file.score12_match + output_file)
def show_f1_given_qids(input_file, qids):
    """Collect, for each requested qid, the best F1 over all of its grounded
    graphs and write the qid -> max-F1 map to 'qid_f1.json'."""
    qid_f1 = dict()
    for filename in os.listdir(input_file):
        question_id = filename.split('.')[0]
        if question_id not in qids:
            continue
        structure_list = read_structure_file(input_file + filename)
        print(filename)
        best_f1 = 0
        for structure in structure_list:
            for ungrounded_graph in structure.ungrounded_graph_forest:
                for grounded_graph in ungrounded_graph.get_grounded_graph_forest():
                    best_f1 = max(best_f1, grounded_graph.f1_score)
        qid_f1[question_id] = best_f1
    write_json(qid_f1, 'qid_f1.json')
def run_graphquestions(data_type, output_file):
    """Dump IR annotation data (gold info plus 1-hop/2-hop candidates) for
    GraphQuestions and write it to ``output_file`` as JSON.

    :param data_type: 'train' or 'test' — selects the GraphQuestions split
    :param output_file: path of the JSON file written at the end
    """
    from datasets_interface.question_interface import graphquestion_interface
    graphq_struct_list = []
    if data_type == 'train':
        graphq_struct_list = graphquestion_interface.train_graph_questions_struct
    elif data_type == 'test':
        graphq_struct_list = graphquestion_interface.test_graph_questions_struct
    ann_data_list = []
    for graphq_struct in graphq_struct_list:
        question_normal = graphq_struct.question
        print(graphq_struct.qid)
        entities_list = graphquestion_interface.get_topic_entities_by_question(question_normal)
        abstract_question = graphquestion_interface.get_abstract_question_by_question(question=question_normal)
        parsed_sparql = graphq_struct.parsed_sparql
        sparql = graphq_struct.sparql_query
        # gold triples come from the annotated graph, not the parsed SPARQL
        gold_triples = ir_online_utils.get_triples_by_grounded_graph_edges(
            nodes=graphq_struct.nodes, edges=graphq_struct.edges)
        gold_path = ir_online_utils.convert_triples_to_path(triples=gold_triples)
        gold = {
            'question_type': ir_online_utils.get_question_type_by_sparql_json(sparql_json=parsed_sparql),
            'topic_entities': entities_list,
            'aggregation_function': ir_online_utils.get_aggregation_function_by_sparql_json(sparq_json=parsed_sparql),
            'type_constraints': ir_online_utils.get_type_constraints_by_sparql_json(sparql_json=parsed_sparql),
            'gold_path': gold_path,
            'gold_triples': gold_triples,
            'sparql': sparql,
        }
        data = {
            'qid': graphq_struct.qid,
            'question_normal': graphq_struct.question,
            'abstract_question': abstract_question,
            'gold': gold,
            'no_positive_path': True,
            'hop1': [],
            'hop2': []
        }
        topic_entities_with_types = ir_online_utils.topic_entities_with_t(entities_list=entities_list)
        hop1, hop2 = _get_hop1_hop2_by_online(topic_entities_with_types)
        data['hop1'] = hop1
        data['hop2'] = hop2
        data['no_positive_path'] = ir_online_utils.is_exist_gold_path(
            hop_list=hop1 + hop2, gold_path=gold_path)
        ann_data_list.append(data)
    write_json(ann_data_list, output_file)
def investigate_denotation_same():
    """Inspect denotations obtained by transferring grounded nodes from the
    best BERT-matched train question to each test question, and dump the
    accumulated testqid -> train-qid denotations cache to JSON.
    """
    testqid_trainqid_bertmax = read_json(data_question_match + 'testqid_trainqid_bertmax.json')
    qmi = QuestionMatchInterface()
    structure_2_2_files = '/2.2_test_span_transfer_wo_wordlevel/'
    all_data_path = os.listdir(output_path + structure_2_2_files)
    for path in all_data_path:
        print(path)
        # qid is the file stem, prefixed to match the match-table keys
        test_qid = path.split('.')[0]
        test_qid = 'test_' + str(test_qid)
        # if 'test_'+str(test_qid) not in testqid_trainqid_bertmax:
        if test_qid not in testqid_trainqid_bertmax:
            continue
        # structure_with_grounded_graphq_file = output_path + structure_2_2_files + path
        structure_list = read_structure_file(output_path + structure_2_2_files + path)
        for structure in structure_list:
            for ungrounded_graph in structure.ungrounded_graph_forest:
                nodes = []
                # only the first grounded graph's nodes are used
                for groundedgraph in ungrounded_graph.get_grounded_graph_forest():
                    nodes = groundedgraph.nodes
                    break
                # print(test_qid)
                # denotation = set(qmi.get_denotation_by_testqid_nodes(test_qid, nodes))
                denotation = set(
                    qmi.get_denotation_by_testqid_nodes_freebase(
                        test_qid, nodes))
                print('denotations:', denotation)
                # gold_mids = set()
                # for one in structure.gold_answer:
                #     gold_mids.add(one['answer_id'])
                # if (len(denotation-gold_mids)==0 and len(gold_mids-denotation)==0):
                #     print('oh no',test_qid)
                #     if test_qid in qmunique_qids:
                #         print('double oh no')
    # the interface accumulates per-qid denotations as a side effect; persist it
    write_json(
        qmi.testqid_correspondingtrainqid_denotations,
        data_question_match + 'testqid_correspondingtrainqid_denotations.json')
def generate_testset():
    """Pair every test abstract question with every train abstract question
    and write the cross product to testset.json."""
    test_2_1 = read_structure_file(test_structure_with_2_1_grounded_graph_file)
    train_predicate_qids = read_json(data_question_match + 'train_predicate_qids.json')
    qid_abstractquestions = read_json(data_question_match + 'qid_abstractquestion.json')
    # all distinct abstract questions seen on the train side
    train_abstractquestion = set()
    for predicate in train_predicate_qids:
        for qid in train_predicate_qids[predicate]:
            if qid in qid_abstractquestions:
                train_abstractquestion.add(qid_abstractquestions[qid])
    # all distinct abstract questions on the test side (keys carry a prefix)
    test_abstractquestions = set()
    for one in test_2_1:
        key = 'test_' + str(one.qid)
        if key in qid_abstractquestions:
            test_abstractquestions.add(qid_abstractquestions[key])
    testset = [[abstractquestion, ta]
               for abstractquestion in test_abstractquestions
               for ta in train_abstractquestion]
    write_json(testset, data_question_match + 'testset.json')
def generate_qid_abstractquestion():
    """Build and persist a qid -> abstract-question map.

    Entity mentions (by friendly name) are masked with '<e>' in the question
    text; only the first ungrounded graph of each structure is consulted.
    Returns the map after writing it to qid_abstractquestion.json.
    """
    # dev_2_1 = read_structure_file(dev_structure_with_2_1_grounded_graph_file)
    test_2_1 = read_structure_file(test_structure_with_2_1_grounded_graph_file)
    train_2_1 = read_structure_file(train_structure_with_2_1_grounded_graph_file)
    qid_abstractquestion = dict()
    splits = {'train': train_2_1, 'test': test_2_1}  # 'dev': dev_2_1
    for split_name, structures in splits.items():
        for one in structures:
            qid = split_name + "_" + str(one.qid)
            for ungrounded_graph in one.ungrounded_graph_forest:
                masked_question = one.question
                for node in ungrounded_graph.nodes:
                    if node.node_type == 'entity':
                        masked_question = masked_question.replace(node.friendly_name, '<e>')
                qid_abstractquestion[qid] = masked_question
                break  # only the first ungrounded graph is used
    # print(len(qid_abstractquestions))
    write_json(qid_abstractquestion,
               data_question_match + 'qid_abstractquestion.json')
    return qid_abstractquestion
mid_dict['answer_id'] = instance_str if isinstance(instance_str, str): # mid = 'm.02hwgbx' labels = freebase_kb_interface.get_names(instance_str) mid_dict['answer'] = list(labels) alias = freebase_kb_interface.get_alias(instance_str) mid_dict['aliases'] = list(alias) else: mid_dict['answer'] = [instance_str] mid_dict['aliases'] = [instance_str] mid_to_names_dict[instance_str] = mid_dict return mid_dict def write_cache_json(): write_json(mid_to_names_dict, fn_cwq_file.cache_mid_to_names) if __name__ == '__main__': cwq_prediction_test_json = read_json( './2020.01.11_output_cwq_IR5_all.json') for cwq_test_json in cwq_prediction_test_json: answers = [] for answer_id in cwq_test_json['answers_id']: print(cwq_test_json['ID'], '\t', answer_id) answers.append(get_names(answer_id)) cwq_test_json['answers'] = answers write_json(cwq_prediction_test_json, './2020.01.11_output_cwq_IR5_all_with_names.json') write_json(mid_to_names_dict, './cache_mid_to_names.json')
def get_top_k_grounded_graphs_by_score_standard(input_file):
    """Report top-1 F1 and record in which top-k band (1/3/5/10) a perfect
    (F1 == 1.0) grounded query first appears, per question structure file.

    Prints the summed top-1 F1 over all files and writes four JSON files
    './correctqids_top{1,3,5,10}_.json'.

    :param input_file: directory containing one structure file per question
    """
    count_number = 0   # number of question files processed
    all_f1_score = 0   # running sum of the top-1 F1 scores
    correctqids_top1 = list()
    correctqids_top3 = list()
    correctqids_top5 = list()
    correctqids_top10 = list()
    for structure_path in os.listdir(input_file):
        count_number += 1
        structure_list = read_structure_file(input_file + structure_path)
        # score -> [grounded_query_id, ...]
        totalscore_queryid_sparql = collections.defaultdict(list)
        grounded_graph_list = []  # NOTE(review): collected but never read
        # gold_answer_mid_set = set()
        # NOTE(review): despite the name, this maps grounded_query_id -> f1_score,
        # not to a denotation.
        grounded_query_id_denotation = collections.defaultdict(set)
        f1_1_query_id_set = set()  # ids of queries with a perfect F1
        for structure in structure_list:
            # gold_answers = structure.gold_answer
            # for gold_answer_dict in gold_answers:
            #     gold_answer_mid_set.add(gold_answer_dict['answer_id'])
            for ungrounded_graph in structure.ungrounded_graph_forest:
                # ungrounded_graph_edges_num = len(ungrounded_graph.edges)
                for grounded_graph in ungrounded_graph.get_grounded_graph_forest():
                    # edge-count constraint (disabled):
                    # if grounded_graph_edges_num != ungrounded_graph_edges_num: continue
                    totalscore_queryid_sparql[grounded_graph.score].append(
                        grounded_graph.grounded_query_id)
                    grounded_query_id_denotation[
                        grounded_graph.
                        grounded_query_id] = grounded_graph.f1_score
                    # totalscore_queryid_sparql[grounded_graph.total_score].append(grounded_graph.grounded_query_id)
                    if grounded_graph.f1_score == 1.0:
                        f1_1_query_id_set.add(grounded_graph.grounded_query_id)
                    grounded_graph_list.append(grounded_graph)
        # rank score groups best-first
        totalscore_queryid_sparql = dict(
            sorted(totalscore_queryid_sparql.items(),
                   key=lambda d: d[0],
                   reverse=True))
        # top-1: both breaks fire after the very first id of the best group
        for totalscore, grounded_query_ids in totalscore_queryid_sparql.items(
        ):
            for grounded_query_id in grounded_query_ids:
                f1_score = grounded_query_id_denotation[grounded_query_id]
                all_f1_score += f1_score
                print(('%s\t%s\t%s\t%s') %
                      (structure_path, f1_score, grounded_query_id,
                       f1_1_query_id_set))
                break
            break
        # walk the ranked candidates (at most 10) and note the band in which
        # the first perfect hit appears
        num = 0        # rank position examined so far
        find = False   # set once an F1 == 1 candidate is found
        now = []       # [qid, grounded_query_id] pairs accumulated so far
        for totalscore, grounded_query_ids in totalscore_queryid_sparql.items(
        ):
            if num >= 10 or find:
                break
            for grounded_query_id in grounded_query_ids:
                if num >= 10 or find:
                    break
                f1_score = grounded_query_id_denotation[grounded_query_id]
                now.append([structure_path.split('.')[0], grounded_query_id])
                if f1_score == 1:
                    find = True
                    # NOTE(review): the whole accumulated ``now`` prefix is
                    # appended, not just the hit — confirm this is intended.
                    if num < 1:
                        correctqids_top1.append(now)
                    elif num < 3:
                        correctqids_top3.append(now)
                    elif num < 5:
                        correctqids_top5.append(now)
                    elif num < 10:
                        correctqids_top10.append(now)
                num += 1
    print('#all_f1_score:\t', all_f1_score)
    print('#count_number:\t', count_number)
    write_json(correctqids_top1, './correctqids_top1_.json')
    write_json(correctqids_top3, './correctqids_top3_.json')
    write_json(correctqids_top5, './correctqids_top5_.json')
    write_json(correctqids_top10, './correctqids_top10_.json')
def get_denotations_by_score_standard_prediction(
    input_file,
    q_mode='cwq',
    output_file='./2020.01.21_output_cwq_IR5_withnames_all_nonull_comparative.json'
):
    """Write, per question, the denotation (with entity names/aliases) of the
    best-scoring grounded query whose answers have a non-empty name or alias.

    :param input_file: directory of structure files, one per question
    :param q_mode: 'cwq' or 'graphq' — selects the mid-to-names module
    :param output_file: JSON file the prediction list is written to
    """
    from common.hand_files import write_json
    assert q_mode in ['cwq', 'graphq']
    if q_mode == 'cwq':
        from evaluation.CWQ import _01_mid_to_label_alias_names
    elif q_mode == 'graphq':
        from evaluation.GraphQuestions import _01_mid_to_label_alias_names
    prediction_list = []
    for structure_path in os.listdir(input_file):
        print(structure_path)
        structure_list = read_structure_file(input_file + structure_path)
        # score -> [grounded_query_id, ...] and per-query metric lookups
        score_to_queryid_sparql = collections.defaultdict(list)
        grounded_query_id_to_f1 = collections.defaultdict(set)
        grounded_query_id_to_recall = collections.defaultdict(set)
        grounded_query_id_to_precision = collections.defaultdict(set)
        grounded_query_id_to_denotation = collections.defaultdict(set)
        qid = None
        for structure in structure_list:
            qid = structure.qid
            # only the LAST ungrounded graph carries the final candidates
            for j, ungrounded_graph in enumerate(
                    structure.ungrounded_graph_forest):
                if j != len(structure.ungrounded_graph_forest) - 1:
                    continue
                for grounded_graph in ungrounded_graph.get_grounded_graph_forest(
                ):
                    score_to_queryid_sparql[grounded_graph.score].append(
                        grounded_graph.grounded_query_id
                    )  # score total_score, combine_score
                    grounded_query_id_to_denotation[
                        grounded_graph.
                        grounded_query_id] = grounded_graph.denotation
                    grounded_query_id_to_f1[
                        grounded_graph.
                        grounded_query_id] = grounded_graph.f1_score
                    grounded_query_id_to_recall[
                        grounded_graph.
                        grounded_query_id] = grounded_graph.recall_score
                    grounded_query_id_to_precision[
                        grounded_graph.
                        grounded_query_id] = grounded_graph.precision_score
        answers_ids = []
        answers = [
        ]  # "[{ "answer_id": "m.034tl", "answer": ["Guam"],"aliases": []},]"
        f1_score, recall_score, precision_score = 0, 0, 0
        # stop at the first candidate whose answers have a non-empty name or alias
        is_name_null = True
        score_to_queryid_sparql = dict(
            sorted(score_to_queryid_sparql.items(),
                   key=lambda d: d[0],
                   reverse=True))
        for totalscore, grounded_query_ids in score_to_queryid_sparql.items():
            for grounded_query_id in grounded_query_ids:
                answers_ids = grounded_query_id_to_denotation[
                    grounded_query_id]
                f1_score = grounded_query_id_to_f1[grounded_query_id]
                recall_score = grounded_query_id_to_recall[grounded_query_id]
                precision_score = grounded_query_id_to_precision[
                    grounded_query_id]
                answers = []
                for answer_id in answers_ids:
                    names_dict = _01_mid_to_label_alias_names.get_names(
                        answer_id)
                    if len(names_dict['answer']) > 0 or len(
                            names_dict['aliases']) > 0:
                        is_name_null = False
                    answers.append(names_dict)
                if not is_name_null:
                    break
            if not is_name_null:
                break
        q_dict = dict()
        q_dict['ID'] = qid
        q_dict['answers_id'] = answers_ids
        q_dict['answers'] = answers
        q_dict['f1_score'] = f1_score
        q_dict['recall_score'] = recall_score
        q_dict['precision_score'] = precision_score
        prediction_list.append(q_dict)
    # persist the mid -> names cache accumulated by get_names
    _01_mid_to_label_alias_names.write_cache_json()
    write_json(prediction_list, output_file)
def get_denotations_by_score_standard_binglie(
        input_file,
        output_file='./e2e_2021.01.20_lcquad_predict_IR5_update.json'):
    """Pick one grounded query per question and write predictions (LC-QuAD e2e).

    For 'ask' questions the boolean answer is decided by a softmax threshold
    over predicted scores.  Otherwise only the top combine_score group is
    considered and, among ties whose predicates share the same local names,
    a query whose predicates are all dbpedia.org/ontology is preferred.

    :param input_file: directory of structure files, one per question
    :param output_file: JSON file the prediction list is written to
    """
    prediction_list = []
    for structure_path in os.listdir(input_file):
        question_normal = None
        question_type = None
        question_qid = None
        print(structure_path)
        totalscore_queryid_sparql = collections.defaultdict(list)
        grounded_query_id_denotation = collections.defaultdict(set)
        grounded_query_id_predictscore = collections.OrderedDict()
        grounded_query_id_f1 = collections.defaultdict(set)
        grounded_query_id_to_recall = collections.defaultdict(set)
        grounded_query_id_to_precision = collections.defaultdict(set)
        grounded_query_id_keypath = collections.defaultdict()
        for structure in read_structure_file(input_file + structure_path):
            question_normal = structure.question
            question_qid = structure.qid
            question_type = structure.compositionality_type
            # only the LAST ungrounded graph carries the final candidates
            for j, ungrounded_graph in enumerate(structure.ungrounded_graph_forest):
                if j != len(structure.ungrounded_graph_forest) - 1:
                    continue
                for grounded_graph in ungrounded_graph.get_grounded_graph_forest():
                    totalscore_queryid_sparql[grounded_graph.combine_score].append(
                        grounded_graph.grounded_query_id)  # score total_score combine_score
                    grounded_query_id_denotation[
                        grounded_graph.grounded_query_id] = grounded_graph.denotation
                    grounded_query_id_predictscore[
                        grounded_graph.grounded_query_id] = grounded_graph.score
                    grounded_query_id_f1[
                        grounded_graph.grounded_query_id] = grounded_graph.f1_score
                    grounded_query_id_to_recall[
                        grounded_graph.grounded_query_id] = grounded_graph.recall_score
                    grounded_query_id_to_precision[
                        grounded_graph.grounded_query_id] = grounded_graph.precision_score
                    grounded_query_id_keypath[
                        grounded_graph.grounded_query_id] = grounded_graph.key_path
        if question_type == 'ask':
            predict_denotation = [0]
            predict_score_list = list(grounded_query_id_predictscore.values())
            # ROBUSTNESS: guard against questions with no candidates at all —
            # argmax on an empty array would raise.
            if predict_score_list:
                score_list = softmax(np.array(predict_score_list))
                max_index = score_list.argmax()
                # answer True only when the best softmax mass clears the threshold
                if score_list[max_index] >= lcquad_ask_thresold:
                    predict_denotation = [1]
            q_dict = collections.OrderedDict()
            q_dict['ID'] = question_qid
            q_dict['question_normal'] = question_normal
            q_dict['question_type'] = question_type
            q_dict['answers_id'] = predict_denotation
            q_dict['answers'] = []
            q_dict['f1_score'] = predict_denotation[0]
            q_dict['recall_score'] = predict_denotation[0]
            q_dict['precision_score'] = predict_denotation[0]
            prediction_list.append(q_dict)
        else:  # bgp, count
            totalscore_queryid_sparql = dict(
                sorted(totalscore_queryid_sparql.items(),
                       key=lambda d: d[0],
                       reverse=True))
            for totalscore, grounded_query_ids in totalscore_queryid_sparql.items():
                # among ties in the top score group, prefer an all-dbo query
                system_grounded_query_id = None
                system_new_key_path_ = None
                system_predicates = None
                for grounded_query_id in grounded_query_ids:
                    predicates = grounded_query_id_keypath[grounded_query_id].split('\t')
                    new_key_path_ = [predicate.split('/')[-1] for predicate in predicates]
                    if system_grounded_query_id is None:
                        system_grounded_query_id = grounded_query_id
                        system_new_key_path_ = new_key_path_
                        system_predicates = predicates
                    elif system_new_key_path_ == new_key_path_:
                        # BUGFIX: the original compared the list against
                        # '\t'.join(new_key_path_) (a string), which is always
                        # False, so this dbo preference never triggered;
                        # compare the local-name lists directly.
                        is_system_all_dbo = all(
                            'http://dbpedia.org/ontology/' in p
                            for p in system_predicates)
                        is_current_all_dbo = all(
                            'http://dbpedia.org/ontology/' in p
                            for p in predicates)
                        if is_system_all_dbo:
                            break
                        elif is_current_all_dbo:
                            system_grounded_query_id = grounded_query_id
                            break
                q_dict = collections.OrderedDict()
                q_dict['ID'] = question_qid
                q_dict['question_normal'] = question_normal
                q_dict['question_type'] = question_type
                q_dict['answers_id'] = grounded_query_id_denotation[system_grounded_query_id]
                q_dict['answers'] = []
                q_dict['f1_score'] = grounded_query_id_f1[system_grounded_query_id]
                q_dict['recall_score'] = grounded_query_id_to_recall[system_grounded_query_id]
                q_dict['precision_score'] = grounded_query_id_to_precision[system_grounded_query_id]
                prediction_list.append(q_dict)
                break  # only the highest score group is used
    write_json(prediction_list, pathfile=output_file)
def train_data_generation_samestructure_wq(train_qid_to_grounded_graph_dict,
                                           propertys,
                                           files,
                                           train_qid_abstractquestions,
                                           mode='cwq'):
    """Build (abstract question, gold path, negatives) training samples.

    For every structure file whose qid has a non-empty abstract question and
    a gold grounded graph, collect up to ``model_parameters.neg_size``
    negative key paths of the same length as the gold path; when the
    candidate forest does not provide enough, pad with random predicate
    combinations drawn from ``propertys``.  The resulting list is written to
    ``<root>/dataset_cwq_1_1/data_path_match/data_for_trainorval_list_samestructure.json``.

    :param train_qid_to_grounded_graph_dict: qid -> gold grounded graph
    :param propertys: predicate vocabulary used for random padding
    :param files: structure files, one per question, named ``<qid>.json``
    :param train_qid_abstractquestions: qid -> collection of abstract questions
    :param mode: kept for interface compatibility; not used in the body
    """
    data_for_train_list = list()
    for file_index, file in enumerate(files):
        print(file_index, file)
        data = read_structure_file(file)
        qid = file.split('/')[-1].split('.')[0]
        # Skip questions without a usable (non-empty) abstract question.
        if len(train_qid_abstractquestions[qid]) == 0:
            continue
        elif len(list(train_qid_abstractquestions[qid])[0]) == 0:
            continue
        if qid not in train_qid_to_grounded_graph_dict:
            print('do not exist: ' + qid)
            continue
        gold_graph = train_qid_to_grounded_graph_dict[qid]
        # Canonical gold path: sorted predicate names joined by tabs.
        predicates = sorted(edge.friendly_name for edge in gold_graph.edges)
        gold_path = '\t'.join(predicates)
        negatives = list()
        j = 0
        for structure in data:
            for ungrounded_graph in structure.ungrounded_graph_forest:
                for grounded_graph in ungrounded_graph.grounded_graph_forest:
                    # Canonicalize the candidate key path the same way as
                    # the gold path so string comparison is meaningful.
                    ps = grounded_graph.key_path.split('\t')
                    ps.sort()
                    path = '\t'.join(ps)
                    if j < model_parameters.neg_size and len(ps) == len(
                            predicates) and path != gold_path:
                        negatives.append(path)
                        j += 1
        if j > 0:
            # Pad with random predicate combinations until neg_size is
            # reached.  BUGFIX: this loop previously reused the outer loop
            # variable ``i``; it now uses its own throwaway index.
            # NOTE(review): assumes ``propertys`` is rich enough to produce
            # neg_size distinct paths, otherwise this loop never terminates
            # (same as the original behavior) -- confirm upstream.
            while j < model_parameters.neg_size:
                candidate = sorted(
                    random.choice(propertys) for _ in range(len(predicates)))
                candidate = "\t".join(candidate)
                if candidate != gold_path and candidate not in negatives:
                    negatives.append(candidate)
                    j += 1
            one = dict()
            one["qid"] = qid
            one["abstractquestion"] = list(train_qid_abstractquestions[qid])[0]
            one["gold_path"] = gold_path
            one["negatives"] = negatives
            data_for_train_list.append(one)
        else:
            # No same-length negative ever joined: drop the question.
            print('not join', qid)
    write_json(
        data_for_train_list,
        root + '/dataset_cwq_1_1/data_path_match/data_for_trainorval_list_samestructure.json'
    )
return names_list if len(answers_json) > 1: answer_json = random.sample(answers_json, 1)[0] else: #1 answer_json = answers_json[0] for ans in answer_json['answer']: names_list.append(proprocess(str(ans).lower().strip())) if len(names_list) == 0: for alias in answer_json['aliases']: names_list.append(proprocess(str(alias).lower().strip())) return names_list if __name__ == '__main__': prediction_file_path = './sparqa_results/2021.03.04_output_cwq_IR_9_v0.1_wo_agg_withnames_all_nonull.json' output_file_path = './sparqa_results/2021.03.04_output_cwq_IR_9_v0.1_wo_agg_withnames_all_nonull_seed9.json' # [{"ID": "WebQTest-832_c334509bb5e02cacae1ba2e80c176499", "answer": "2012 world series"}, sparql_result_list = [] with open(prediction_file_path) as prediction_file: predictions = json.load(prediction_file) for index, prediction in enumerate(predictions): # 遍历prediction one_sparql_result = dict() one_sparql_result['ID'] = prediction['ID'] system_answer_names = get_answers_names(prediction['answers']) one_sparql_result['answer'] = system_answer_names[0] if len(system_answer_names) > 0 else "" sparql_result_list.append(one_sparql_result) write_json(sparql_result_list, output_file_path)
def generate_cwq_test_e2e_candidate_paths_from_structure(
        cwq_gold_path_list, test_candidates_sp_path_top_path, output_file):
    """Attach end-to-end candidate paths (hop1..hop4) to CWQ gold entries.

    Reads each question's candidate structure file, records its abstract
    question, grounded nodes and per-hop candidate paths, and matches the
    candidates against the gold reverse paths; the number of matched
    questions is printed at the end.  The enriched list is written to
    ``fn_cwq_file.score12_match + output_file``.

    :param cwq_gold_path_list: list of gold dicts with qid/question/gold keys
    :param test_candidates_sp_path_top_path: directory of '<qid>.json' files
    :param output_file: output file name under fn_cwq_file.score12_match
    """

    def get_node(grounded_graph_pattern):
        # Collect entity/literal nodes of the top-ranked grounded graph.
        nodes = []
        for node in grounded_graph_pattern.nodes:
            if node.node_type == 'entity':
                nodes.append({'tag': 'entity', 'uri': node.id})
            elif node.node_type == 'literal':
                nodes.append({'tag': 'literal', 'uri': node.id})
        return nodes

    def get_abstract_q(question_normal_, sequence_ner_tag_dict_):
        # Replace every tagged span ("start\tend" keys) with a single
        # placeholder token: '<e>' for entities, '<l>' for literals.
        question_words = question_normal_.split()
        placeholders = {'entity': '<e>', 'literal': '<l>'}
        for span, tag in sequence_ner_tag_dict_.items():
            if tag not in placeholders:
                continue
            start, end = span.split('\t')
            start = int(start)
            end = int(end)
            question_words[start] = placeholders[tag]
            for k in range(start + 1, end + 1):
                question_words[k] = '$$$'
        # BUGFIX: the old manual concatenation left a trailing space when a
        # masked span ended the question; a filtered join cannot.
        return ' '.join(word for word in question_words if word != '$$$')

    files = os.listdir(test_candidates_sp_path_top_path)
    new_cwq_gold_path_list = []
    count = 0
    for one in cwq_gold_path_list:
        qid = one['qid']
        print(qid)
        if str(one['qid']) + '.json' not in files:
            continue
        new_one = dict()
        new_one['qid'] = one['qid']
        new_one['question_normal'] = one['question_normal']
        new_one['gold'] = one['gold']
        question_normal = one['question_normal']
        test_candidates_sp = read_structure_file(
            test_candidates_sp_path_top_path + str(one['qid']) + '.json')
        ungrounded_graph = test_candidates_sp[0].ungrounded_graph_forest[-1]
        grounded_graph_forest = ungrounded_graph.get_grounded_graph_forest()
        # NOTE(review): eval() on serialized data is unsafe if structure
        # files can come from an untrusted source -- consider
        # ast.literal_eval; left as-is to preserve behavior.
        sequence_ner_tag_dict = eval(ungrounded_graph.sequence_ner_tag_dict)
        new_one['pred'] = {
            'abstract_question':
            get_abstract_q(question_normal, sequence_ner_tag_dict),
            'nodes':
            get_node(grounded_graph_pattern=grounded_graph_forest[0]),
            'function':
            one['gold']['function']
        }
        hop1, hop2, hop3, hop4 = score12_utils.grounded_graph_list_to_path_list(
            ungrounded_graph.get_grounded_graph_forest())
        hops = []
        # Record each non-empty hop bucket and accumulate all candidates.
        for hop_name, hop_list in (('hop1', hop1), ('hop2', hop2),
                                   ('hop3', hop3), ('hop4', hop4)):
            if len(hop_list) > 0:
                new_one['pred'][hop_name] = hop_list
                hops += hop_list
        goldpath = None
        for hop in hops:
            for temp_goldpath in new_one['gold']['reverse_paths_list']:
                if score12_utils.eq_paths(temp_goldpath, hop):
                    goldpath = temp_goldpath
                    count += 1
                    break
        if goldpath is not None:
            new_one['gold']['path'] = goldpath
        # NOTE(review): reconstructed from collapsed source -- assumes the
        # reverse-path list is always dropped and every entry is kept;
        # confirm against the original file's indentation.
        del new_one['gold']['reverse_paths_list']
        new_cwq_gold_path_list.append(new_one)
    write_json(new_cwq_gold_path_list, fn_cwq_file.score12_match + output_file)
    print(count)
def generate_predicate_qids():
    """Group train/test qids by their (sorted, tab-joined) gold predicates.

    Writes ``train_predicate_qids.json`` and ``test_predicate_qids.json``
    under ``data_question_match`` and prints how many test questions share
    a predicate signature with some train question (historically 2718).
    """

    def _collect(qid_to_grounded_graph, prefix, qid_abstractquestions):
        # One pass shared by the train and test splits (previously two
        # copy-pasted loops): predicate signature -> [prefixed qids].
        predicate_qids = collections.defaultdict(list)
        for qid, grounded_graph in qid_to_grounded_graph.items():
            qid = prefix + qid
            if qid not in qid_abstractquestions:
                continue
            predicates = sorted(edge.friendly_name
                                for edge in grounded_graph.edges)
            predicate = '\t'.join(predicates)
            # Only keep qids that actually have abstract questions.
            if len(qid_abstractquestions[qid]) > 0:
                predicate_qids[predicate].append(qid)
        return predicate_qids

    train_qid_to_grounded_graph_dict = questions_utils.extract_grounded_graph_from_jena_freebase(
        train_cwq_bgp_filepath)
    test_qid_to_grounded_graph_dict = questions_utils.extract_grounded_graph_from_jena_freebase(
        test_cwq_bgp_filepath)
    qid_abstractquestions = read_json(data_question_match +
                                      'qid_abstractquestion.json')

    train_predicate_qids = _collect(train_qid_to_grounded_graph_dict,
                                    'train_', qid_abstractquestions)
    write_json(train_predicate_qids,
               data_question_match + 'train_predicate_qids.json')

    test_predicate_qids = _collect(test_qid_to_grounded_graph_dict, 'test_',
                                   qid_abstractquestions)
    write_json(test_predicate_qids,
               data_question_match + 'test_predicate_qids.json')

    # Count test questions whose predicate signature also occurs in train.
    num_intersect = 0  # 2718
    for predicate in test_predicate_qids:
        if predicate in train_predicate_qids:
            num_intersect += len(test_predicate_qids[predicate])
    print(num_intersect)
def write_cache_json():
    """Persist the in-memory mid -> names cache dict to its JSON cache file."""
    write_json(mid_to_names_dict, fn_graph_file.cache_mid_to_names)
def score_testquestion_bert():
    """Score test questions against train questions with BERT similarities.

    Loads BERT abstract-question-pair scores from a log file, ranks the
    train questions for every test question by similarity, and writes both
    the full ranking (``test_qid_trainqid_pro_bert``) and the single best
    train qid per test qid (``testqid_trainqid_bertmax.json``) under
    ``data_question_match``.
    """

    def reverse(path):
        # Invert {predicate: [qids]} into {qid: predicate}; on collision the
        # last predicate seen wins (same as the original).
        data = read_json(path)
        res = dict()
        for key in data:
            for val in data[key]:
                res[val] = key
        return res

    def read_abstractquestionpair_pro():
        # Parse the BERT scoring log: cols[1]/cols[2] are the abstract
        # question pair, cols[4] the similarity; keep positive scores only.
        # (Cleanup: the mmap is now a context manager; the old redundant
        # f.close() inside the with-block is gone.)
        diction = dict()
        with open(data_question_match + '09_03_cwq_test_gpu.log',
                  'r') as f:  #'05_10_test.log'
            with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
                line = mm.readline()
                while line:
                    cols = line.decode().strip().split('\t')
                    abstractquestion_pair = '\t'.join([cols[1], cols[2]])
                    if float(cols[4]) > 0:
                        diction[abstractquestion_pair] = float(cols[4])
                    line = mm.readline()
        return diction

    abstractquestionpair_pro = read_abstractquestionpair_pro()
    testqid_trainqidmax = dict()
    test_qid_trainqid_pro = dict()
    qid_abstractquestion = read_json(data_question_match +
                                     'qid_abstractquestion.json')
    test_2_1 = read_structure_file(test_structure_with_2_1_grounded_graph_file)
    train_2_1 = read_structure_file(
        train_structure_with_2_1_grounded_graph_file)
    test_qid_predicate = reverse(data_question_match +
                                 'test_predicate_qids.json')
    train_qid_predicate = reverse(data_question_match +
                                  'train_predicate_qids.json')
    for one in test_2_1:
        qid = 'test_' + str(one.qid)
        print(qid)
        if qid not in qid_abstractquestion:
            continue
        abstractquestion = qid_abstractquestion[qid]
        trainqid_pro = dict()
        for train_one in train_2_1:
            train_one_qid = 'train_' + str(train_one.qid)
            if train_one_qid not in qid_abstractquestion:
                continue
            train_abstractquestion = qid_abstractquestion[train_one_qid]
            # Build the lookup key once instead of re-joining per access.
            pair_key = '\t'.join([abstractquestion, train_abstractquestion])
            if pair_key in abstractquestionpair_pro:
                trainqid_pro[train_one_qid] = float(
                    abstractquestionpair_pro[pair_key])
        # Rank candidate train qids by descending similarity.
        trainqid_pro = dict(
            sorted(trainqid_pro.items(), key=lambda d: d[1], reverse=True))
        if len(trainqid_pro) == 0:
            continue
        best_trainqid = list(trainqid_pro.keys())[0]
        # Diagnostic only: report when the best match shares the gold
        # predicate signature with the test question.
        if qid in test_qid_predicate and best_trainqid in train_qid_predicate:
            if test_qid_predicate[qid] == train_qid_predicate[best_trainqid]:
                print('yeah')
        test_qid_trainqid_pro[qid] = trainqid_pro
        # (The original re-checked non-emptiness here; it is guaranteed by
        # the `continue` above.)
        testqid_trainqidmax[qid] = best_trainqid
    # NOTE(review): the first output name has no .json suffix in the
    # original -- kept as-is; confirm readers expect that.
    write_json(test_qid_trainqid_pro,
               data_question_match + 'test_qid_trainqid_pro_bert')
    write_json(testqid_trainqidmax,
               data_question_match + 'testqid_trainqid_bertmax.json')