Example #1
def train_data_generation_samestructure_graphq(propertys, files,
                                               qid_abstractquestions):
    data_for_train_list = list()
    for i, file in enumerate(files):
        print(i, file)
        data = read_structure_file(file)
        qid = file.split('/')[-1].split('.')[0]

        if len(qid_abstractquestions[qid]) == 0:
            continue

        negatives = list()
        j = 0
        # join=True
        for structure in data:
            # note: gold_path/predicates are rebound on every structure; the
            # negative-padding code after this loop uses the values from the
            # last structure, assuming all structures in a file share one gold query
            gold_path = []
            predicates = []
            # for edge in structure.gold_graph_query.edges:
            #     gold_path.append(edge.relation)
            #     predicates.append(edge.relation)
            for edge in structure.gold_sparql_query['edges']:
                gold_path.append(edge['relation'])
                predicates.append(edge['relation'])

            gold_path.sort()
            gold_path = '\t'.join(gold_path)
            for ungrounded_graph in structure.ungrounded_graph_forest:
                for grounded_graph in ungrounded_graph.grounded_graph_forest:
                    path = grounded_graph.key_path
                    ps = path.split('\t')
                    ps.sort()
                    path = '\t'.join(ps)
                    if j < model_parameters.neg_size and len(ps) == len(
                            predicates) and path != gold_path:
                        negatives.append(path)
                        j += 1
        if j > 0:
            if j < model_parameters.neg_size:
                while j < model_parameters.neg_size:
                    # random.choice replaces the manual randint indexing; '_'
                    # avoids shadowing the file-loop index i used above
                    candidate = [random.choice(propertys) for _ in range(len(predicates))]
                    candidate.sort()
                    candidate = "\t".join(candidate)
                    if candidate != gold_path and candidate not in negatives:
                        negatives.append(candidate)
                        j += 1
            one = dict()
            one["qid"] = qid
            one["abstractquestion"] = (qid_abstractquestions[qid])
            one["gold_path"] = gold_path
            one["negatives"] = negatives
            data_for_train_list.append(one)
        else:
            print('not join', qid)
    write_json(
        data_for_train_list, fn_graph_file.path_match_dir +
        "data_for_trainorval_list_samestructure.json")
def generate_graphq_gold_triple_path_annotation(graphq_annotation_list,
                                                graphq_gold_nodes,
                                                output_file):
    qid_graphq_nodes = {}
    for one in graphq_gold_nodes:
        qid_graphq_nodes[one['qid']] = one
    for i in range(len(graphq_annotation_list)):
        graphq_struct_one = graphq_annotation_list[i]
        qid = graphq_struct_one.qid
        print(qid)
        if qid not in qid_graphq_nodes:
            continue
        triples_ = score12_utils.get_triples_by_grounded_graph_edges_graphq(
            nodes=graphq_struct_one.nodes, edges=graphq_struct_one.edges)
        reverse_triples_list = triple_enum.get_all_reverse_triples(
            triples=triples_, property_reverse_dict=property_reverse_dict)
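        # enumerate forward/inverse variants of the gold triples (via
        # property_reverse_dict) so a candidate path can match either direction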
        reverse_paths_list = []
        for reverse_triples in reverse_triples_list:
            reverse_paths_list.append(
                score12_utils.triples_to_path_list(triples=reverse_triples,
                                                   _root_id='?x'))
        qid_graphq_nodes[qid]['gold'][
            'path'] = score12_utils.triples_to_path_list(triples=triples_,
                                                         _root_id='?x')
        qid_graphq_nodes[qid]['gold'][
            'reverse_paths_list'] = reverse_paths_list
    write_json(graphq_gold_nodes, fn_graph_file.score12_match + output_file)
def generate_trainset():
    trainset = []
    train_predicate_qids = read_json(data_question_match +
                                     'train_predicate_qids.json')
    qid_abstractquestions = read_json(data_question_match +
                                      'qid_abstractquestion.json')

    abstractquestion_all = set()
    for predicate in train_predicate_qids:
        for qid in train_predicate_qids[predicate]:
            #"train_WebQTrn-3513_7c4117891abf63781b892537979054c6",
            if qid in qid_abstractquestions:
                abstractquestion_all.add(qid_abstractquestions[qid])

    for k, predicate in enumerate(train_predicate_qids):
        print(k, predicate)

        same_abstractquestions = set()
        for qid in train_predicate_qids[predicate]:
            if qid in qid_abstractquestions:
                same_abstractquestions.add(qid_abstractquestions[qid])

        residu_abstractquestions = (list(abstractquestion_all -
                                         same_abstractquestions))
        same_abstractquestions = list(same_abstractquestions)[:10]

        for first, current in enumerate(same_abstractquestions):
            for second, gold in enumerate(same_abstractquestions):
                if current != gold:
                    random.shuffle(residu_abstractquestions)
                    neg_samples = residu_abstractquestions[:50]
                    trainset.append([current, gold, 1])
                    for neg in neg_samples:
                        trainset.append([current, neg, 0])

    write_json(trainset, data_question_match + 'trainset.json')
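# Each trainset row is [abstract_question, paired_abstract_question, label]:
# label 1 when both questions share the same gold predicate path, label 0 for
# a randomly sampled negative (50 negatives per positive pair).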
def generate_graphq_gold_node_annotation_w_deppath(is_deppath=False):
    from method_sp.parsing import node_recognition
    train_result = []
    test_result = []
    questionnormal_function_abstract_question_nodes = dict()
    for one in annotation_node_questions_json:
        nodes = []
        for node in one['node_mention_nju']:
            if node['tag'] == 'entity' and node['uri'] is not None:
                nodes.append(node)
            elif node['tag'] == 'literal' and node['uri'] is not None:
                nodes.append(node)
        ann_dict = {
            'function': 'none',
            'abstract_question': one['abstract_question'],
            'nodes': nodes
        }
        if is_deppath:
            ungrounded_nodes = node_recognition.generate_gold_nodes(
                question_normal=one['question_normal'])
            abstract_question_deppath_list = dep_to_path.get_deppath_list(
                question_normal=one['question_normal'],
                ungrounded_nodes=ungrounded_nodes,
                isSkeletonorDep='Skeleton')
            ann_dict[
                'abstract_question_deppath'] = abstract_question_deppath_list
        questionnormal_function_abstract_question_nodes[
            one['question_normal']] = ann_dict
    for one in train_graph_questions_struct:
        new_one = {}
        new_one['qid'] = one.qid
        new_one['question_normal'] = one.question
        if one.question in questionnormal_function_abstract_question_nodes:
            new_one['gold'] = questionnormal_function_abstract_question_nodes[
                one.question]
            new_one['gold']['function'] = one.function
            train_result.append(new_one)
    for one in test_graph_questions_struct:
        new_one = {}
        new_one['qid'] = one.qid
        new_one['question_normal'] = one.question
        if new_one[
                'question_normal'] in questionnormal_function_abstract_question_nodes:
            new_one['gold'] = questionnormal_function_abstract_question_nodes[
                one.question]
            new_one['gold']['function'] = one.function
            test_result.append(new_one)
    write_json(
        train_result,
        fn_graph_file.score12_match + "train_graphq_gold_node_0124.json")
    write_json(test_result,
               fn_graph_file.score12_match + "test_graphq_gold_node_0124.json")
Example #5
def generate_cwq_gold_triple_path_annotation(cwq_annotation_list,
                                             qid_to_grounded_graph_dict,
                                             cwq_gold_nodes,
                                             output_file,
                                             is_deppath=False):
    from method_sp.parsing import node_recognition
    qid_graphq_nodes = {}
    for one in cwq_gold_nodes:
        qid_graphq_nodes[one['qid']] = one
    print(len(cwq_annotation_list))
    for i in range(len(cwq_annotation_list)):
        graphq_struct_one = cwq_annotation_list[i]
        qid = graphq_struct_one.ID
        if qid not in qid_graphq_nodes:
            continue
        if qid not in qid_to_grounded_graph_dict:
            continue
        print(i, qid)
        if is_deppath:
            question_normal = graphq_struct_one.question
            '''gold'''
            # ungrounded_nodes = node_recognition.generate_gold_nodes(question_normal=question_normal)
            '''system'''
            tokens = parsing_utils.create_tokens(question_normal.split(" "))
            ungrounded_nodes = node_recognition.generate_nodes(
                question_normal=question_normal, tokens=tokens)
            abstract_question_deppath_list = dep_to_path.get_deppath_list(
                question_normal=question_normal,
                ungrounded_nodes=ungrounded_nodes,
                isSkeletonorDep='Dep')
            qid_graphq_nodes[qid]['gold'][
                'abstract_question_deppath'] = abstract_question_deppath_list
        gold_grounded_graph = qid_to_grounded_graph_dict[qid]
        triples_ = score12_utils.get_triples_by_grounded_graph_edges(
            nodes=gold_grounded_graph.nodes, edges=gold_grounded_graph.edges)
        reverse_triples_list = triple_enum.get_all_reverse_triples(
            triples=triples_, property_reverse_dict=property_reverse_dict)
        reverse_paths_list = []
        for index, reverse_triples in enumerate(reverse_triples_list):
            reverse_paths_list.append(
                score12_utils.triples_to_path_list(triples=reverse_triples,
                                                   _root_id='?x'))
        qid_graphq_nodes[qid]['gold'][
            'path'] = score12_utils.triples_to_path_list(triples=triples_,
                                                         _root_id='?x')
        qid_graphq_nodes[qid]['gold'][
            'reverse_paths_list'] = reverse_paths_list
    write_json(cwq_gold_nodes, fn_cwq_file.score12_match + output_file)
def run_lcquad(data_type, output_file):
    from datasets_interface.question_interface import lcquad_1_0_interface
    ann_data_list = []
    lcquad_list = []
    if data_type == 'train':
        lcquad_list = lcquad_1_0_interface.lcquad_train_list
    elif data_type == 'test':
        lcquad_list = lcquad_1_0_interface.lcquad_test_list
    for i, lcquad_struct in enumerate(lcquad_list):
        question_normal = lcquad_struct.question_normal
        print(lcquad_struct.qid)
        entities_list = lcquad_1_0_interface.get_topic_entities_by_question(question_normal)
        abstract_question = lcquad_1_0_interface.get_abstract_question_by_question(question=question_normal)
        parsed_sparql = lcquad_struct.parsed_sparql
        sparql = lcquad_struct.sparql
        gold_triples = ir_online_utils.get_triples_by_sparql_json(parsed_sparql)
        gold_path = ir_online_utils.convert_triples_to_path(triples=gold_triples)
        gold = {}
        gold['question_type'] = ir_online_utils.get_question_type_by_sparql_json(sparql_json=parsed_sparql)
        gold['topic_entities'] = entities_list
        gold['aggregation_function'] = ir_online_utils.get_aggregation_function_by_sparql_json(sparq_json=parsed_sparql)
        gold['type_constraints'] = ir_online_utils.get_type_constraints_by_sparql_json(sparql_json=parsed_sparql)
        gold['gold_path'] = gold_path
        gold['gold_triples'] = gold_triples
        gold['sparql'] = sparql
        data = {
            'qid': lcquad_struct.qid,
            'question_normal': lcquad_struct.question_normal,
            'abstract_question': abstract_question,
            'gold': gold,
            'no_positive_path': True,
            'hop1': [],
            'hop2': []
        }
        topic_entities_with_types = ir_online_utils.topic_entities_with_t(entities_list=entities_list)
        # hop1, hop2 = _get_hop1_hop2_by_enum_grounded_graphs(topic_entities_with_types)
        try:
            hop1, hop2 = _get_hop1_hop2_by_online(topic_entities_with_types=topic_entities_with_types)
        except Exception:
            hop1, hop2 = [], []
            print('Error:\t', question_normal)
        data['hop1'] = hop1
        data['hop2'] = hop2
        data['no_positive_path'] = ir_online_utils.is_exist_gold_path(hop_list=hop1 + hop2, gold_path=gold_path)  # NOTE: stores the raw result despite the negated key name; check polarity
        ann_data_list.append(data)
    write_json(ann_data_list, output_file)
Example #7
def grounded_graphes_by_score_standard_ywsun_prediction_test(input_file):
    from common.hand_files import write_json
    all_structure_path = os.listdir(input_file)
    # all_f1_score = 0
    prediction_list = []
    for structure_path in all_structure_path:
        print(structure_path)
        structure_list = read_structure_file(input_file + structure_path)
        score_to_queryid_sparql = collections.defaultdict(list)
        # grounded_query_id_to_f1_denotation = collections.defaultdict(set)
        grounded_query_id_to_denotation = collections.defaultdict(set)
        qid = None
        for structure in structure_list:
            qid = structure.qid
            for ungrounded_graph in structure.ungrounded_graph_forest:
                # ungrounded_graph_edges_num = len(ungrounded_graph.edges)
                for grounded_graph in ungrounded_graph.get_grounded_graph_forest(
                ):
                    # grounded_graph_edges_num = len(grounded_graph.edges)
                    # edge constraints
                    # if grounded_graph_edges_num != ungrounded_graph_edges_num: continue
                    # score_to_queryid_sparql[grounded_graph.score].append(grounded_graph.grounded_query_id) #word level matcher
                    score_to_queryid_sparql[grounded_graph.total_score].append(
                        grounded_graph.grounded_query_id)
                    # grounded_query_id_to_f1_denotation[grounded_graph.grounded_query_id] = grounded_graph.f1_score
                    grounded_query_id_to_denotation[
                        grounded_graph.
                        grounded_query_id] = grounded_graph.denotation
        answers = []
        score_to_queryid_sparql = dict(
            sorted(score_to_queryid_sparql.items(),
                   key=lambda d: d[0],
                   reverse=True))
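        # top-1 selection: take the denotation of the first query id in the
        # highest-score bucket, then stop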
        for totalscore, grounded_query_ids in score_to_queryid_sparql.items():
            for grounded_query_id in grounded_query_ids:
                answers = grounded_query_id_to_denotation[grounded_query_id]
                # all_f1_score += f1_score
                # top1id = grounded_query_id
                break
            break
        q_dict = dict()
        q_dict['ID'] = qid
        q_dict['answers_id'] = answers
        prediction_list.append(q_dict)
    write_json(prediction_list,
               './20191113_cwq_wo_wordlevel_prediction_test.json')
def run_cwq(data_type, output_file):
    from datasets_interface.question_interface import complexwebquestion_interface
    ann_data_list = []
    complexwebq_struct_list = []
    if data_type == 'train':
        complexwebq_struct_list = complexwebquestion_interface.complexwebq_train_list
    elif data_type == 'test':
        complexwebq_struct_list = complexwebquestion_interface.complexwebq_test_list
    elif data_type == 'dev':
        complexwebq_struct_list = complexwebquestion_interface.complexwebq_dev_list
    for i, complexwebq_struct in enumerate(complexwebq_struct_list):
        question_normal = complexwebq_struct.question
        print(complexwebq_struct.ID)
        entities_list = complexwebquestion_interface.get_topic_entities_by_question(question_normal)
        abstract_question = complexwebquestion_interface.get_abstract_question_by_question(question=question_normal)
        parsed_sparql = complexwebq_struct.parsed_sparql
        sparql = complexwebq_struct.sparql
        gold_triples = ir_online_utils.get_triples_by_sparql_json(parsed_sparql)
        gold_path = ir_online_utils.convert_triples_to_path(triples=gold_triples)
        gold = {}
        gold['question_type'] = ir_online_utils.get_question_type_by_sparql_json(sparql_json=parsed_sparql)
        gold['topic_entities'] = entities_list
        gold['aggregation_function'] = ir_online_utils.get_aggregation_function_by_sparql_json(sparq_json=parsed_sparql)
        gold['type_constraints'] = ir_online_utils.get_type_constraints_by_sparql_json(sparql_json=parsed_sparql)
        gold['gold_path'] = gold_path
        gold['gold_triples'] = gold_triples
        gold['sparql'] = sparql
        data = {
            'qid': complexwebq_struct.ID,
            'question_normal': question_normal,
            'abstract_question': abstract_question,
            'gold': gold,
            'no_positive_path': True,
            'hop1': [],
            'hop2': []
        }
        topic_entities_with_types = ir_online_utils.topic_entities_with_t(entities_list=entities_list)
        # hop1, hop2 = _get_hop1_hop2_by_enum_grounded_graphs(topic_entities_with_types)
        hop1, hop2 = _get_hop1_hop2_by_online(topic_entities_with_types=topic_entities_with_types)
        data['hop1'] = hop1
        data['hop2'] = hop2
        data['no_positive_path'] = ir_online_utils.is_exist_gold_path(hop_list=hop1 + hop2, gold_path=gold_path)
        ann_data_list.append(data)
        break  # NOTE: stops after the first question; looks like a debugging leftover
    write_json(ann_data_list, output_file)
Example #9
def generate_cwq_train_candidates_paths_from_structure(
        cwq_gold_path_list, train_candidates_sp_path_top_path, output_file):
    files = os.listdir(train_candidates_sp_path_top_path)
    new_cwq_path_list = []
    for one in cwq_gold_path_list:
        print(one['qid'])
        if str(one['qid']) + '.json' not in files:
            continue
        if 'path' not in one['gold']:
            continue
        new_one = dict()
        new_one['qid'] = one['qid']
        new_one['question_normal'] = one['question_normal']
        new_one['gold'] = one['gold']
        test_candidates_sp = read_structure_file(
            train_candidates_sp_path_top_path + str(one['qid']) + '.json')
        test_candidates_sp = test_candidates_sp[0]
        ungrounded_graph = test_candidates_sp.ungrounded_graph_forest[-1]
        hop1, hop2, hop3, hop4 = score12_utils.grounded_graph_list_to_path_list(
            ungrounded_graph.get_grounded_graph_forest())
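        # hop1..hop4: candidate predicate paths grouped by hop count (1-4)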
        hops = []
        if len(hop1) > 0:
            new_one['gold']['hop1'] = hop1
            hops += hop1
        if len(hop2) > 0:
            new_one['gold']['hop2'] = hop2
            hops += hop2
        if len(hop3) > 0:
            new_one['gold']['hop3'] = hop3
            hops += hop3
        if len(hop4) > 0:
            new_one['gold']['hop4'] = hop4
            hops += hop4
        goldpath = None
        for hop in hops:
            for i, temp_goldpath in enumerate(
                    new_one['gold']['reverse_paths_list']):
                if score12_utils.eq_paths(temp_goldpath, hop):
                    goldpath = temp_goldpath
                    break
        if goldpath is not None:
            new_one['gold']['path'] = goldpath
        del new_one['gold']['reverse_paths_list']
        new_cwq_path_list.append(new_one)
    write_json(new_cwq_path_list, fn_cwq_file.score12_match + output_file)
Example #10
def show_f1_given_qids(input_file, qids):
    qid_f1 = dict()
    all_data_path = os.listdir(input_file)
    for path in all_data_path:
        if path.split('.')[0] in qids:
            structure_with_grounded_graphq_file = input_file + path
            structure_list = read_structure_file(
                structure_with_grounded_graphq_file)
            print(path)
            max_f1 = 0
            for structure in structure_list:
                for ungrounded_graph in structure.ungrounded_graph_forest:
                    for grounded_graph in ungrounded_graph.get_grounded_graph_forest(
                    ):
                        if max_f1 < grounded_graph.f1_score:
                            max_f1 = grounded_graph.f1_score
            qid_f1[path.split('.')[0]] = max_f1
    write_json(qid_f1, 'qid_f1.json')
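# Hedged usage sketch (paths and qids are illustrative):
#   show_f1_given_qids('./output/2.2_test_structures/', {'WebQTest-1', 'WebQTest-2'})
# writes the best candidate F1 per qid to qid_f1.json.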
def run_graphquestions(data_type, output_file):
    from datasets_interface.question_interface import graphquestion_interface
    ann_data_list = []
    graphq_struct_list = []
    if data_type == 'train':
        graphq_struct_list = graphquestion_interface.train_graph_questions_struct
    elif data_type == 'test':
        graphq_struct_list = graphquestion_interface.test_graph_questions_struct
    for i, graphq_struct in enumerate(graphq_struct_list):
        question_normal = graphq_struct.question
        print(graphq_struct.qid)
        entities_list = graphquestion_interface.get_topic_entities_by_question(question_normal)
        abstract_question = graphquestion_interface.get_abstract_question_by_question(question=question_normal)
        parsed_sparql = graphq_struct.parsed_sparql
        sparql = graphq_struct.sparql_query
        # gold_triples = generate_utils.get_triples_by_sparql_json(parsed_sparql)
        gold_triples = ir_online_utils.get_triples_by_grounded_graph_edges(nodes=graphq_struct.nodes, edges=graphq_struct.edges)
        gold_path = ir_online_utils.convert_triples_to_path(triples=gold_triples)
        gold = {}
        gold['question_type'] = ir_online_utils.get_question_type_by_sparql_json(sparql_json=parsed_sparql)
        gold['topic_entities'] = entities_list
        gold['aggregation_function'] = ir_online_utils.get_aggregation_function_by_sparql_json(sparq_json=parsed_sparql)
        gold['type_constraints'] = ir_online_utils.get_type_constraints_by_sparql_json(sparql_json=parsed_sparql)
        gold['gold_path'] = gold_path
        gold['gold_triples'] = gold_triples
        gold['sparql'] = sparql
        data = {
            'qid': graphq_struct.qid,
            'question_normal': graphq_struct.question,
            'abstract_question': abstract_question,
            'gold': gold,
            'no_positive_path': True,
            'hop1': [],
            'hop2': []
        }
        topic_entities_with_types = ir_online_utils.topic_entities_with_t(entities_list=entities_list)
        # hop1, hop2 = _get_hop1_hop2_by_enum_grounded_graphs(topic_entities_with_types)
        hop1, hop2 = _get_hop1_hop2_by_online(topic_entities_with_types)
        data['hop1'] = hop1
        data['hop2'] = hop2
        data['no_positive_path'] = ir_online_utils.is_exist_gold_path(hop_list=hop1 + hop2, gold_path=gold_path)
        ann_data_list.append(data)
    write_json(ann_data_list, output_file)
def investigate_denotation_same():

    testqid_trainqid_bertmax = read_json(data_question_match +
                                         'testqid_trainqid_bertmax.json')
    qmi = QuestionMatchInterface()
    structure_2_2_files = '/2.2_test_span_transfer_wo_wordlevel/'
    all_data_path = os.listdir(output_path + structure_2_2_files)
    for path in all_data_path:
        print(path)
        test_qid = path.split('.')[0]
        test_qid = 'test_' + str(test_qid)
        # if 'test_'+str(test_qid) not in testqid_trainqid_bertmax:
        if test_qid not in testqid_trainqid_bertmax:
            continue
        # structure_with_grounded_graphq_file = output_path + structure_2_2_files + path
        structure_list = read_structure_file(output_path +
                                             structure_2_2_files + path)
        for structure in structure_list:
            for ungrounded_graph in structure.ungrounded_graph_forest:
                nodes = []
                for groundedgraph in ungrounded_graph.get_grounded_graph_forest(
                ):
                    nodes = groundedgraph.nodes
                    break
                # print(test_qid)
                # denotation = set(qmi.get_denotation_by_testqid_nodes(test_qid, nodes))
                denotation = set(
                    qmi.get_denotation_by_testqid_nodes_freebase(
                        test_qid, nodes))
                print('denotations:', denotation)
                # gold_mids = set()
                # for one in structure.gold_answer:
                #     gold_mids.add(one['answer_id'])
                #
                # if  (len(denotation-gold_mids)==0 and len(gold_mids-denotation)==0):
                #     print('oh no',test_qid)
                #     if test_qid in qmunique_qids:
                #         print('double oh no')
    write_json(
        qmi.testqid_correspondingtrainqid_denotations,
        data_question_match + 'testqid_correspondingtrainqid_denotations.json')
def generate_testset():
    testset = []
    test_2_1 = read_structure_file(test_structure_with_2_1_grounded_graph_file)
    train_predicate_qids = read_json(data_question_match +
                                     'train_predicate_qids.json')
    qid_abstractquestions = read_json(data_question_match +
                                      'qid_abstractquestion.json')
    train_abstractquestion = set()
    for predicate in train_predicate_qids:
        for qid in train_predicate_qids[predicate]:
            if qid in qid_abstractquestions:
                train_abstractquestion.add(qid_abstractquestions[qid])
    test_abstractquestions = set()
    for one in test_2_1:
        if 'test_' + str(one.qid) in qid_abstractquestions:
            abstractquestion = qid_abstractquestions['test_' + str(one.qid)]
            test_abstractquestions.add(abstractquestion)
    for abstractquestion in test_abstractquestions:
        for ta in train_abstractquestion:
            testset.append([abstractquestion, ta])
    write_json(testset, data_question_match + 'testset.json')
def generate_qid_abstractquestion():
    # dev_2_1 = read_structure_file(dev_structure_with_2_1_grounded_graph_file)
    test_2_1 = read_structure_file(test_structure_with_2_1_grounded_graph_file)
    train_2_1 = read_structure_file(
        train_structure_with_2_1_grounded_graph_file)
    qid_abstractquestion = dict()
    any_2_1_dict = {'train': train_2_1, 'test': test_2_1}  #'dev': dev_2_1
    for key in any_2_1_dict:
        any_2_1 = any_2_1_dict[key]
        for one in any_2_1:
            qid = key + "_" + str(one.qid)
            question = one.question
            for ungrounded_graph in one.ungrounded_graph_forest:
                question_ = question
                for node in ungrounded_graph.nodes:
                    if node.node_type == 'entity':
                        question_ = question_.replace(node.friendly_name,
                                                      '<e>')
                qid_abstractquestion[qid] = question_
                break
    # print(len(qid_abstractquestions))
    write_json(qid_abstractquestion,
               data_question_match + 'qid_abstractquestion.json')
    return qid_abstractquestion
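# Illustrative output entry (hypothetical): {"train_3": "what is the capital of <e>"}
# Entity mentions are replaced by '<e>' using each node's friendly_name.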
def get_names(instance_str):
    # NOTE: the opening of this function was truncated in this excerpt; the
    # cache lookup below is a reconstruction (assumption) based on how
    # mid_to_names_dict is populated and returned.
    if instance_str in mid_to_names_dict:
        return mid_to_names_dict[instance_str]
    else:
        mid_dict = dict()
        mid_dict['answer_id'] = instance_str
        if isinstance(instance_str, str):  # mid = 'm.02hwgbx'
            labels = freebase_kb_interface.get_names(instance_str)
            mid_dict['answer'] = list(labels)
            alias = freebase_kb_interface.get_alias(instance_str)
            mid_dict['aliases'] = list(alias)
        else:
            mid_dict['answer'] = [instance_str]
            mid_dict['aliases'] = [instance_str]
        mid_to_names_dict[instance_str] = mid_dict
    return mid_dict


def write_cache_json():
    write_json(mid_to_names_dict, fn_cwq_file.cache_mid_to_names)


if __name__ == '__main__':
    cwq_prediction_test_json = read_json(
        './2020.01.11_output_cwq_IR5_all.json')
    for cwq_test_json in cwq_prediction_test_json:
        answers = []
        for answer_id in cwq_test_json['answers_id']:
            print(cwq_test_json['ID'], '\t', answer_id)
            answers.append(get_names(answer_id))
        cwq_test_json['answers'] = answers

    write_json(cwq_prediction_test_json,
               './2020.01.11_output_cwq_IR5_all_with_names.json')
    write_json(mid_to_names_dict, './cache_mid_to_names.json')
Example #16
def get_top_k_grounded_graphs_by_score_standard(input_file):
    count_number = 0
    all_f1_score = 0
    correctqids_top1 = list()
    correctqids_top3 = list()
    correctqids_top5 = list()
    correctqids_top10 = list()
    for structure_path in os.listdir(input_file):
        count_number += 1
        structure_list = read_structure_file(input_file + structure_path)

        totalscore_queryid_sparql = collections.defaultdict(list)
        grounded_graph_list = []
        # gold_answer_mid_set = set()
        grounded_query_id_denotation = collections.defaultdict(set)
        # denotations_all=set()

        f1_1_query_id_set = set()
        for structure in structure_list:
            # gold_answers = structure.gold_answer
            # for gold_answer_dict in gold_answers:
            #     gold_answer_mid_set.add(gold_answer_dict['answer_id'])
            for ungrounded_graph in structure.ungrounded_graph_forest:
                # ungrounded_graph_edges_num = len(ungrounded_graph.edges)
                for grounded_graph in ungrounded_graph.get_grounded_graph_forest(
                ):
                    # edge constraints
                    # grounded_graph_edges_num = len(grounded_graph.edges)
                    # if grounded_graph_edges_num != ungrounded_graph_edges_num: continue
                    # totalscore_queryid_sparql[grounded_graph.total_score].append(grounded_graph.grounded_query_id)
                    totalscore_queryid_sparql[grounded_graph.score].append(
                        grounded_graph.grounded_query_id)
                    grounded_query_id_denotation[
                        grounded_graph.
                        grounded_query_id] = grounded_graph.f1_score
                    # totalscore_queryid_sparql[grounded_graph.total_score].append(grounded_graph.grounded_query_id)

                    if grounded_graph.f1_score == 1.0:
                        f1_1_query_id_set.add(grounded_graph.grounded_query_id)

                    grounded_graph_list.append(grounded_graph)

        totalscore_queryid_sparql = dict(
            sorted(totalscore_queryid_sparql.items(),
                   key=lambda d: d[0],
                   reverse=True))
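        # top-1: report the F1 of the first query id in the highest score bucket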
        for totalscore, grounded_query_ids in totalscore_queryid_sparql.items(
        ):
            for grounded_query_id in grounded_query_ids:
                f1_score = grounded_query_id_denotation[grounded_query_id]
                all_f1_score += f1_score
                print(('%s\t%s\t%s\t%s') %
                      (structure_path, f1_score, grounded_query_id,
                       f1_1_query_id_set))
                break
            break

        num = 0
        find = False
        now = []
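        # scan up to the 10 best-ranked candidates, collecting the scanned
        # (qid, query_id) pairs until a perfect-F1 query is found, then bucket
        # by the rank at which it appeared (top-1/3/5/10)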
        for totalscore, grounded_query_ids in totalscore_queryid_sparql.items(
        ):
            if num >= 10 or find:
                break

            for grounded_query_id in grounded_query_ids:
                if num >= 10 or find:
                    break
                f1_score = grounded_query_id_denotation[grounded_query_id]
                now.append([structure_path.split('.')[0], grounded_query_id])
                if f1_score == 1:
                    find = True
                    if num < 1:
                        correctqids_top1.append(now)
                    elif num < 3:
                        correctqids_top3.append(now)
                    elif num < 5:
                        correctqids_top5.append(now)
                    elif num < 10:
                        correctqids_top10.append(now)
                num += 1

    print('#all_f1_score:\t', all_f1_score)
    print('#count_number:\t', count_number)
    write_json(correctqids_top1, './correctqids_top1_.json')
    write_json(correctqids_top3, './correctqids_top3_.json')
    write_json(correctqids_top5, './correctqids_top5_.json')
    write_json(correctqids_top10, './correctqids_top10_.json')
def get_denotations_by_score_standard_prediction(
    input_file,
    q_mode='cwq',
    output_file='./2020.01.21_output_cwq_IR5_withnames_all_nonull_comparative.json'
):
    from common.hand_files import write_json
    assert q_mode in ['cwq', 'graphq']
    if q_mode == 'cwq':
        from evaluation.CWQ import _01_mid_to_label_alias_names
    elif q_mode == 'graphq':
        from evaluation.GraphQuestions import _01_mid_to_label_alias_names
    prediction_list = []
    for structure_path in os.listdir(input_file):
        print(structure_path)
        structure_list = read_structure_file(input_file + structure_path)
        score_to_queryid_sparql = collections.defaultdict(list)
        grounded_query_id_to_f1 = collections.defaultdict(set)
        grounded_query_id_to_recall = collections.defaultdict(set)
        grounded_query_id_to_precision = collections.defaultdict(set)
        grounded_query_id_to_denotation = collections.defaultdict(set)
        qid = None
        for structure in structure_list:
            qid = structure.qid
            for j, ungrounded_graph in enumerate(
                    structure.ungrounded_graph_forest):
                if j != len(structure.ungrounded_graph_forest) - 1:
                    continue
                for grounded_graph in ungrounded_graph.get_grounded_graph_forest(
                ):
                    score_to_queryid_sparql[grounded_graph.score].append(
                        grounded_graph.grounded_query_id
                    )  #score total_score, combine_score
                    grounded_query_id_to_denotation[
                        grounded_graph.
                        grounded_query_id] = grounded_graph.denotation
                    grounded_query_id_to_f1[
                        grounded_graph.
                        grounded_query_id] = grounded_graph.f1_score
                    grounded_query_id_to_recall[
                        grounded_graph.
                        grounded_query_id] = grounded_graph.recall_score
                    grounded_query_id_to_precision[
                        grounded_graph.
                        grounded_query_id] = grounded_graph.precision_score
        answers_ids = []
        answers = []  # e.g. [{"answer_id": "m.034tl", "answer": ["Guam"], "aliases": []}]
        f1_score, recall_score, precision_score = 0, 0, 0
        # break at the first answer whose name or alias list is non-empty
        is_name_null = True
        score_to_queryid_sparql = dict(
            sorted(score_to_queryid_sparql.items(),
                   key=lambda d: d[0],
                   reverse=True))
        for totalscore, grounded_query_ids in score_to_queryid_sparql.items():
            for grounded_query_id in grounded_query_ids:
                answers_ids = grounded_query_id_to_denotation[
                    grounded_query_id]
                f1_score = grounded_query_id_to_f1[grounded_query_id]
                recall_score = grounded_query_id_to_recall[grounded_query_id]
                precision_score = grounded_query_id_to_precision[
                    grounded_query_id]
                answers = []
                for answer_id in answers_ids:
                    names_dict = _01_mid_to_label_alias_names.get_names(
                        answer_id)
                    if len(names_dict['answer']) > 0 or len(
                            names_dict['aliases']) > 0:
                        is_name_null = False
                        answers.append(names_dict)
                if not is_name_null:
                    break
            if not is_name_null:
                break
        q_dict = dict()
        q_dict['ID'] = qid
        q_dict['answers_id'] = answers_ids
        q_dict['answers'] = answers
        q_dict['f1_score'] = f1_score
        q_dict['recall_score'] = recall_score
        q_dict['precision_score'] = precision_score
        prediction_list.append(q_dict)
    _01_mid_to_label_alias_names.write_cache_json()
    write_json(prediction_list, output_file)
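# `softmax`, `np`, and `lcquad_ask_thresold` are used below but not defined in
# this excerpt; a minimal sketch of the assumed softmax helper:
import numpy as np

def softmax(x):
    # numerically stable softmax over a 1-D array of scores
    e = np.exp(x - np.max(x))
    return e / e.sum()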
def get_denotations_by_score_standard_binglie(
        input_file,
        output_file='./e2e_2021.01.20_lcquad_predict_IR5_update.json'):
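    # 'binglie' (并列) means "tied": among equally-scored candidates, prefer
    # the one whose predicates are all in the dbo namespace.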
    prediction_list = []
    for structure_path in os.listdir(input_file):
        question_normal = None
        question_type = None
        question_qid = None
        print(structure_path)
        totalscore_queryid_sparql = collections.defaultdict(list)
        grounded_query_id_denotation = collections.defaultdict(set)
        grounded_query_id_predictscore = collections.OrderedDict()
        grounded_query_id_f1 = collections.defaultdict(set)
        grounded_query_id_to_recall = collections.defaultdict(set)
        grounded_query_id_to_precision = collections.defaultdict(set)
        grounded_query_id_keypath = collections.defaultdict()
        for structure in read_structure_file(input_file + structure_path):
            question_normal = structure.question
            question_qid = structure.qid
            question_type = structure.compositionality_type

            for j, ungrounded_graph in enumerate(
                    structure.ungrounded_graph_forest):
                if j != len(structure.ungrounded_graph_forest) - 1:
                    continue
                for grounded_graph in ungrounded_graph.get_grounded_graph_forest(
                ):
                    totalscore_queryid_sparql[
                        grounded_graph.combine_score].append(
                            grounded_graph.grounded_query_id
                        )  #score  total_score  combine_score
                    grounded_query_id_denotation[
                        grounded_graph.
                        grounded_query_id] = grounded_graph.denotation
                    grounded_query_id_predictscore[
                        grounded_graph.
                        grounded_query_id] = grounded_graph.score
                    grounded_query_id_f1[
                        grounded_graph.
                        grounded_query_id] = grounded_graph.f1_score
                    grounded_query_id_to_recall[
                        grounded_graph.
                        grounded_query_id] = grounded_graph.recall_score
                    grounded_query_id_to_precision[
                        grounded_graph.
                        grounded_query_id] = grounded_graph.precision_score
                    grounded_query_id_keypath[
                        grounded_graph.
                        grounded_query_id] = grounded_graph.key_path

        if question_type == 'ask':
            predict_denotation = [0]
            predict_score_list = []
            for _, temp_score in grounded_query_id_predictscore.items():
                predict_score_list.append(temp_score)
            score_list = softmax(
                np.array(predict_score_list
                         ))  # np.array([-6.8602, -6.8602, -8.3860, -7.4321])
            max_index = score_list.argmax()
            # print(score_list, max_score, keypath_list[max_index])
            # #[2.55135490e-05 2.49768139e-05 2.20922475e-05 7.50664797e-06, 4.22800427e-04 9.85553435e-01 1.27269936e-02] 0.9855534350037205
            if score_list[max_index] >= lcquad_ask_thresold:
                predict_denotation = [1]
            q_dict = collections.OrderedDict()
            q_dict['ID'] = question_qid
            q_dict['question_normal'] = question_normal
            q_dict['question_type'] = question_type
            q_dict['answers_id'] = predict_denotation
            q_dict['answers'] = []
            q_dict['f1_score'] = predict_denotation[0]
            q_dict['recall_score'] = predict_denotation[0]
            q_dict['precision_score'] = predict_denotation[0]
            prediction_list.append(q_dict)

        else:  #bgp, count
            totalscore_queryid_sparql = dict(
                sorted(totalscore_queryid_sparql.items(),
                       key=lambda d: d[0],
                       reverse=True))
            for totalscore, grounded_query_ids in totalscore_queryid_sparql.items(
            ):
                # pick one among the tied candidates; when tied, prefer the all-dbo one
                system_grounded_query_id = None
                system_new_key_path_ = None
                system_predicates = None
                for grounded_query_id in grounded_query_ids:
                    predicates = grounded_query_id_keypath[
                        grounded_query_id].split('\t')
                    new_key_path_ = []
                    for predicate in predicates:
                        new_key_path_.append(predicate.split('/')[-1])
                    if system_grounded_query_id is None:
                        system_grounded_query_id = grounded_query_id
                        # join once so the elif below compares string to string
                        # (the original kept the list, so that branch never fired)
                        system_new_key_path_ = '\t'.join(new_key_path_)
                        system_predicates = predicates
                    elif system_new_key_path_ == '\t'.join(
                            new_key_path_):  # whichever is all-dbo wins
                        is_system_all_dbo = True
                        for system_predicate in system_predicates:
                            if 'http://dbpedia.org/ontology/' not in system_predicate:
                                is_system_all_dbo = False
                        is_current_all_dbo = True
                        for current_predicate in predicates:
                            if 'http://dbpedia.org/ontology/' not in current_predicate:
                                is_current_all_dbo = False
                        if is_system_all_dbo:
                            break
                        elif is_current_all_dbo:
                            system_grounded_query_id = grounded_query_id
                            break
                q_dict = collections.OrderedDict()
                q_dict['ID'] = question_qid
                q_dict['question_normal'] = question_normal
                q_dict['question_type'] = question_type
                q_dict['answers_id'] = grounded_query_id_denotation[
                    system_grounded_query_id]
                q_dict['answers'] = []
                q_dict['f1_score'] = grounded_query_id_f1[
                    system_grounded_query_id]
                q_dict['recall_score'] = grounded_query_id_to_recall[
                    system_grounded_query_id]
                q_dict['precision_score'] = grounded_query_id_to_precision[
                    system_grounded_query_id]
                prediction_list.append(q_dict)
                break

    write_json(prediction_list, pathfile=output_file)
Example #19
def train_data_generation_samestructure_wq(train_qid_to_grounded_graph_dict,
                                           propertys,
                                           files,
                                           train_qid_abstractquestions,
                                           mode='cwq'):
    data_for_train_list = list()
    for i, file in enumerate(files):
        print(i, file)
        data = read_structure_file(file)
        qid = file.split('/')[-1].split('.')[0]
        if len(train_qid_abstractquestions[qid]) == 0:
            continue
        elif len(list(train_qid_abstractquestions[qid])[0]) == 0:
            continue
        if qid not in train_qid_to_grounded_graph_dict:
            print('do not exist: ' + qid)
            continue
        gold_graph = train_qid_to_grounded_graph_dict[qid]
        predicates = []
        for edge in gold_graph.edges:
            predicates.append(edge.friendly_name)
        predicates.sort()
        gold_path = '\t'.join(predicates)
        negatives = list()
        j = 0
        for structure in data:
            for ungrounded_graph in structure.ungrounded_graph_forest:
                for grounded_graph in ungrounded_graph.grounded_graph_forest:
                    #path
                    path = grounded_graph.key_path
                    ps = path.split('\t')
                    ps.sort()
                    path = '\t'.join(ps)
                    if j < model_parameters.neg_size and len(ps) == len(
                            predicates) and path != gold_path:
                        negatives.append(path)
                        j += 1
        if j > 0:
            if j < model_parameters.neg_size:
                while j < model_parameters.neg_size:
                    # random.choice replaces the manual randint indexing; '_'
                    # avoids shadowing the file-loop index i used above
                    candidate = [random.choice(propertys) for _ in range(len(predicates))]
                    candidate.sort()
                    candidate = "\t".join(candidate)
                    if candidate != gold_path \
                            and candidate not in negatives:
                        negatives.append(candidate)
                        j += 1
            one = dict()
            one["qid"] = qid
            one["abstractquestion"] = list(train_qid_abstractquestions[qid])[0]
            one["gold_path"] = gold_path
            one["negatives"] = negatives
            data_for_train_list.append(one)
        else:
            print('not join', qid)
    write_json(
        data_for_train_list, root +
        '/dataset_cwq_1_1/data_path_match/data_for_trainorval_list_samestructure.json'
    )
def get_answers_names(answers_json):
    # NOTE: the opening of this function was truncated in this excerpt; the
    # empty-input guard below is a reconstruction (assumption) matching the
    # list returned at the end.
    names_list = []
    if len(answers_json) == 0:
        return names_list
    if len(answers_json) > 1:
        answer_json = random.sample(answers_json, 1)[0]
    else:  # exactly one answer
        answer_json = answers_json[0]

    for ans in answer_json['answer']:
        names_list.append(proprocess(str(ans).lower().strip()))

    if len(names_list) == 0:
        for alias in answer_json['aliases']:
            names_list.append(proprocess(str(alias).lower().strip()))
    return names_list


if __name__ == '__main__':
    prediction_file_path = './sparqa_results/2021.03.04_output_cwq_IR_9_v0.1_wo_agg_withnames_all_nonull.json'
    output_file_path = './sparqa_results/2021.03.04_output_cwq_IR_9_v0.1_wo_agg_withnames_all_nonull_seed9.json'
    # [{"ID": "WebQTest-832_c334509bb5e02cacae1ba2e80c176499", "answer": "2012 world series"},
    sparql_result_list = []
    with open(prediction_file_path) as prediction_file:
        predictions = json.load(prediction_file)
        for index, prediction in enumerate(predictions):  # iterate over the predictions
            one_sparql_result = dict()
            one_sparql_result['ID'] = prediction['ID']
            system_answer_names = get_answers_names(prediction['answers'])
            one_sparql_result['answer'] = system_answer_names[0] if len(system_answer_names) > 0 else ""
            sparql_result_list.append(one_sparql_result)

    write_json(sparql_result_list, output_file_path)
Example #21
def generate_cwq_test_e2e_candidate_paths_from_structure(
        cwq_gold_path_list, test_candidates_sp_path_top_path, output_file):
    def get_node(grounded_graph_pattern):
        nodes = []
        for node in grounded_graph_pattern.nodes:
            if node.node_type == 'entity':
                nodes.append({'tag': 'entity', 'uri': node.id})
            elif node.node_type == 'literal':
                nodes.append({'tag': 'literal', 'uri': node.id})
        return nodes

    def get_abstract_q(question_normal_, sequence_ner_tag_dict_):
        question_words = question_normal_.split()
        for key in sequence_ner_tag_dict_:
            if sequence_ner_tag_dict_[key] == 'entity':
                start, end = key.split('\t')
                start = int(start)
                end = int(end)
                question_words[start] = '<e>'
                for i in range(start + 1, end + 1):
                    question_words[i] = '$$$'
            elif sequence_ner_tag_dict_[key] == 'literal':
                start, end = key.split('\t')
                start = int(start)
                end = int(end)
                question_words[start] = '<l>'
                for i in range(start + 1, end + 1):
                    question_words[i] = '$$$'
        abstractquestion = ''
        for i, word in enumerate(question_words):
            if word != '$$$':
                abstractquestion += word
                if i < len(question_words) - 1:
                    abstractquestion += ' '
        return abstractquestion
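    # Illustrative (hypothetical inputs):
    #   get_abstract_q('who plays bilbo baggins', {'2\t3': 'entity'}) -> 'who plays <e>'
    # '$$$' marks mention tokens absorbed into the placeholder.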

    files = os.listdir(test_candidates_sp_path_top_path)
    new_cwq_gold_path_list = []
    count = 0
    for one in cwq_gold_path_list:
        qid = one['qid']
        print(qid)
        if str(one['qid']) + '.json' not in files:
            continue
        new_one = dict()
        new_one['qid'] = one['qid']
        new_one['question_normal'] = one['question_normal']
        new_one['gold'] = one['gold']
        question_normal = one['question_normal']
        test_candidates_sp = read_structure_file(
            test_candidates_sp_path_top_path + str(one['qid']) + '.json')
        ungrounded_graph = test_candidates_sp[0].ungrounded_graph_forest[-1]
        grounded_graph_forest = ungrounded_graph.get_grounded_graph_forest()
        sequence_ner_tag_dict = eval(ungrounded_graph.sequence_ner_tag_dict)  # stored dict string; ast.literal_eval would be safer
        new_one['pred'] = {
            'abstract_question':
            get_abstract_q(question_normal, sequence_ner_tag_dict),
            'nodes':
            get_node(grounded_graph_pattern=grounded_graph_forest[0]),
            'function':
            one['gold']['function']
        }
        hop1, hop2, hop3, hop4 = score12_utils.grounded_graph_list_to_path_list(
            ungrounded_graph.get_grounded_graph_forest())
        hops = []
        if len(hop1) > 0:
            new_one['pred']['hop1'] = hop1
            hops += hop1
        if len(hop2) > 0:
            new_one['pred']['hop2'] = hop2
            hops += hop2
        if len(hop3) > 0:
            new_one['pred']['hop3'] = hop3
            hops += hop3
        if len(hop4) > 0:
            new_one['pred']['hop4'] = hop4
            hops += hop4
        goldpath = None
        for hop in hops:
            for i, temp_goldpath in enumerate(
                    new_one['gold']['reverse_paths_list']):
                if score12_utils.eq_paths(temp_goldpath, hop):
                    goldpath = temp_goldpath
                    count += 1
                    break
        if goldpath is not None:
            new_one['gold']['path'] = goldpath
        del new_one['gold']['reverse_paths_list']
        new_cwq_gold_path_list.append(new_one)
    write_json(new_cwq_gold_path_list, fn_cwq_file.score12_match + output_file)
    print(count)
def generate_predicate_qids():
    train_qid_to_grounded_graph_dict = questions_utils.extract_grounded_graph_from_jena_freebase(
        train_cwq_bgp_filepath)
    # dev_qid_to_grounded_graph_dict = questions_utils.extract_grounded_graph_from_jena_freebase(dev_cwq_bgp_filepath)
    test_qid_to_grounded_graph_dict = questions_utils.extract_grounded_graph_from_jena_freebase(
        test_cwq_bgp_filepath)
    qid_abstractquestions = read_json(data_question_match +
                                      'qid_abstractquestion.json')
    train_predicate_qids = collections.defaultdict(list)
    for qid, grounded_graph in train_qid_to_grounded_graph_dict.items():
        # qid='train_'+str(qid.split('-')[1])
        qid = 'train_' + qid
        if qid not in qid_abstractquestions:
            continue
        predicates = []
        for edge in grounded_graph.edges:
            predicates.append(edge.friendly_name)
        predicates.sort()
        predicate = '\t'.join(predicates)
        # print(qid)
        if len(qid_abstractquestions[qid]) > 0:
            # print('hi',qid)
            # abstractquestion = qid_abstractquestions[qid]
            train_predicate_qids[predicate].append(qid)
    write_json(train_predicate_qids,
               data_question_match + 'train_predicate_qids.json')

    test_predicate_qids = collections.defaultdict(list)
    for qid, grounded_graph in test_qid_to_grounded_graph_dict.items():
        # qid = 'test_' + str(qid.split('-')[1])
        qid = 'test_' + qid
        if qid not in qid_abstractquestions:
            continue
        predicates = []
        for edge in grounded_graph.edges:
            predicates.append(edge.friendly_name)
        predicates.sort()
        predicate = '\t'.join(predicates)
        if len(qid_abstractquestions[qid]) > 0:
            # abstractquestion = qid_abstractquestions[qid]
            test_predicate_qids[predicate].append(qid)
    write_json(test_predicate_qids,
               data_question_match + 'test_predicate_qids.json')

    # dev_predicate_qids = collections.defaultdict(list)
    # for qid, grounded_graph in dev_qid_to_grounded_graph_dict.items():
    #     # qid = 'dev_' + str(qid.split('-')[1])
    #     qid='dev_' + qid
    #     if qid not in qid_abstractquestions:
    #         continue
    #     predicates = []
    #     for edge in grounded_graph.edges:
    #         predicates.append(edge.friendly_name)
    #     predicates.sort()
    #     predicate = '\t'.join(predicates)
    #     if len(qid_abstractquestions[qid]) > 0:
    #         # abstractquestion = qid_abstractquestions[qid]
    #         dev_predicate_qids[predicate].append(qid)
    # write_json(dev_predicate_qids, data_question_match + 'dev_predicate_qids.json')

    num_intersect = 0
    # 2718: value observed for num_intersect on a past run (assumed)
    for predicate in test_predicate_qids:
        if predicate in train_predicate_qids:
            num_intersect += len(test_predicate_qids[predicate])
    print(num_intersect)
Example #23
def write_cache_json():
    write_json(mid_to_names_dict, fn_graph_file.cache_mid_to_names)
def score_testquestion_bert():
    def reverse(path):
        data = read_json(path)
        res = dict()
        for key in data:
            for val in data[key]:
                res[val] = key
        return res
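    # reverse: invert {predicate: [qid, ...]} into {qid: predicate}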

    def read_abstractquestionpair_pro():
        diction = dict()
        with open(data_question_match + '09_03_cwq_test_gpu.log',
                  'r') as f:  #'05_10_test.log'
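            # assumed log layout: tab-separated, cols[1]/cols[2] the paired
            # abstract questions, cols[4] the predicted match probability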
            mm = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
            line = mm.readline()
            while line:
                cols = line.decode().strip().split('\t')
                abstractquestion_pair = '\t'.join([cols[1], cols[2]])
                if float(cols[4]) > 0:
                    diction[abstractquestion_pair] = float(cols[4])
                line = mm.readline()
        mm.close()
        f.close()
        return diction

    abstractquestionpair_pro = read_abstractquestionpair_pro()
    # print(abstractquestionpair_pro)
    testqid_trainqidmax = dict()
    test_qid_trainqid_pro = dict()
    qid_abstractquestion = read_json(data_question_match +
                                     'qid_abstractquestion.json')
    test_2_1 = read_structure_file(test_structure_with_2_1_grounded_graph_file)
    train_2_1 = read_structure_file(
        train_structure_with_2_1_grounded_graph_file)
    test_qid_predicate = reverse(data_question_match +
                                 'test_predicate_qids.json')
    train_qid_predicate = reverse(data_question_match +
                                  'train_predicate_qids.json')

    for one in test_2_1:
        qid = 'test_' + str(one.qid)
        print(qid)
        if qid not in qid_abstractquestion:
            continue
        abstractquestion = qid_abstractquestion[qid]

        trainqid_pro = dict()
        for train_one in train_2_1:
            train_one_qid = 'train_' + str(train_one.qid)
            if train_one_qid not in qid_abstractquestion:
                continue

            train_abstractquestion = qid_abstractquestion[train_one_qid]
            if '\t'.join([abstractquestion,
                          train_abstractquestion]) in abstractquestionpair_pro:
                # print('\t'.join([abstractquestion,train_abstractquestion]))
                sim = abstractquestionpair_pro[('\t'.join(
                    [abstractquestion, train_abstractquestion]))]
                trainqid_pro[train_one_qid] = float(sim)

        trainqid_pro = dict(
            sorted(trainqid_pro.items(), key=lambda d: d[1], reverse=True))
        if len(trainqid_pro) == 0:
            continue

        if qid in test_qid_predicate:
            if list(trainqid_pro.keys())[0] in train_qid_predicate:
                if test_qid_predicate[qid] == train_qid_predicate[list(
                        trainqid_pro.keys())[0]]:
                    print('yeah')  # the best-matched train question shares the gold predicate path

        test_qid_trainqid_pro[qid] = trainqid_pro
        if len(list(trainqid_pro.keys())) > 0:
            testqid_trainqidmax[qid] = list(trainqid_pro.keys())[0]
    write_json(test_qid_trainqid_pro,
               data_question_match + 'test_qid_trainqid_pro_bert')
    write_json(testqid_trainqidmax,
               data_question_match + 'testqid_trainqid_bertmax.json')