Example #1
def count_relations_in_KG(wqsp_obj, no_chain_qids):
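    '''
    Flag questions whose inferential-chain relations are missing from the
    KG. Questions listed in no_chain_qids are skipped; a question is
    flagged only if none of its parses has a chain whose 'fb:'-prefixed
    relations all appear in the loaded relation files. Flagged QuestionIds
    are written to output_file_chain_not_in_KG and returned as a set.
    '''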
    print 'Entering count_relations_in_KG() >>>>>>>>>>>>>>>'
    rs = load_relations(literal_relations_file, set())
    frs = load_relations(data_relations_file, rs)
    qid_rel_not_in_KG = set()
    for next_q in wqsp_obj:
        if next_q['QuestionId'] in no_chain_qids:
            continue
        parse_ct, parse_with_good_rels = 0, -1
        for next_parse in next_q['Parses']:
            rel_present = True
            parse_ct += 1
            if next_parse['InferentialChain'] is None or len(
                    next_parse['InferentialChain']) == 0:
                continue
            chain = next_parse['InferentialChain']
            for next_rel in chain:
                nr = 'fb:' + next_rel
                if nr in frs:
                    continue
                rel_present = False
            if rel_present:
                parse_with_good_rels = parse_ct
                break
        if parse_with_good_rels < 0:
            qid_rel_not_in_KG.add(next_q['QuestionId'])
    my_str = bas_utils.to_string(qid_rel_not_in_KG, '\n')
    op_file = open(output_file_chain_not_in_KG, 'w')
    op_file.write(my_str)
    op_file.close()
    print 'len(qid_rel_not_in_KG)=' + str(len(qid_rel_not_in_KG))
    print 'Exiting count_relations_in_KG() <<<<<<<<<<<<<<'
    return qid_rel_not_in_KG
Example #2
def analyze_data_json(my_data_obj):
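    '''
    Print summary statistics for a list of question rows: the row count,
    the union of keys across rows, and the maximum lengths of the
    'utterance', 'url' and 'targetValue' fields. Rows with unusually long
    or suspicious values are echoed for manual inspection.
    '''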
    row_count = len(my_data_obj)
    keys = set()
    max_utterance_length, max_url_length, max_targetValue_length = 0, 0, 0
    for next_row in my_data_obj:
        keys.update(next_row.keys())
        my_utterance_length = len(next_row['utterance'])
        my_url_length = len(next_row['url'])
        my_targetValue_length = len(next_row['targetValue'])
        max_utterance_length = max(my_utterance_length, max_utterance_length)
        max_url_length = max(my_url_length, max_url_length)
        max_targetValue_length = max(my_targetValue_length,
                                     max_targetValue_length)
        if my_utterance_length > 500:
            print next_row['utterance']
        if my_targetValue_length > 200:
            print 'Q: ' + next_row['utterance']
            print 'A: ' + next_row['targetValue']
        if 'type' in next_row['targetValue']:
            print next_row['targetValue']
    str_keys = my_utils.to_string(keys, ', ')
    print 'Row Count = ' + str(row_count)
    print 'Keys = ' + str_keys
    print 'max_utterance_length = ' + str(max_utterance_length)
    print 'max_url_length = ' + str(max_url_length)
    print 'max_targetValue_length = ' + str(max_targetValue_length)
Example #3
def check_final(qmap):
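    '''
    Tally er_status values (assumed to lie in 0..4) over all questions and
    attach an 'is_ready' flag to each: 1 if rel_seq and rid_seq have equal
    length, 2 if a 'qm' key is also present, 3 if 'src_ent_id' is present
    as well, 0 otherwise. Returns the updated map.
    '''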
    print 'check_final --> Entering'
    my_status = [0, 0, 0, 0, 0]
    for qid in qmap.keys():
        if 'er_status' in qmap[qid].keys():
            my_status[qmap[qid]['er_status']] += 1
        else:
            my_status[0] += 1
        q_obj = qmap[qid]
        rel_seq = q_obj['rel_seq']
        rid_seq = q_obj['rid_seq']
        if len(rel_seq) == len(rid_seq):
            is_ready = 1
        else:
            is_ready = 0
        if 'qm' in q_obj.keys() and is_ready == 1:
            is_ready += 1
        if is_ready > 1 and 'src_ent_id' in q_obj.keys():
            is_ready += 1
        q_obj['is_ready'] = is_ready
        qmap[qid] = q_obj
        if 'er_status' in q_obj.keys() and q_obj['er_status'] == 0:
            print 'ER1 --> ' + qid
    print 'check_final: Summary --> ' + bas_utils.to_string(my_status, ', ')
    print 'check_final --> Exiting'
    return qmap
Example #4
def count_no_core_chain(wqsp_obj):
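    '''
    Count questions that have at least one parse with a non-empty
    InferentialChain. QuestionIds with no such parse are written to
    output_file_no_chain and returned as a set; questions whose first
    chain-bearing parse is not parse 0 are printed for inspection.
    '''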
    print 'Entering count_no_core_chain() >>>>>>>>>>>>>>'
    total_quest, ct = 0, 0
    no_chain_qids = set()
    for next_q in wqsp_obj:
        total_quest += 1
        parse_with_chain, parse_ct = -1, -1
        for next_parse in next_q['Parses']:
            parse_ct += 1
            if next_parse['InferentialChain'] is None or len(
                    next_parse['InferentialChain']) == 0:
                continue
            if parse_with_chain == -1:
                parse_with_chain = parse_ct
        if parse_with_chain == -1:
            no_chain_qids.add(next_q['QuestionId'])
        else:
            ct += 1
        if parse_with_chain > 0:
            print next_q['QuestionId'] + ' - parse containing chain = ' + str(
                parse_with_chain)
    print 'InferentialChain=' + str(ct) + '/' + str(total_quest) + ' - ' + str(
        (total_quest - ct))
    print 'len(no_chain_qids)=' + str(len(no_chain_qids))
    my_str = bas_utils.to_string(no_chain_qids, '\n')
    op_file = open(output_file_no_chain, 'w')
    op_file.write(my_str + '\n')
    op_file.close()
    print 'Exiting count_no_core_chain() <<<<<<<<<<<<<<'
    return no_chain_qids
Example #5
def write_embd(e_mat, my_mode):
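    '''
    Write the embedding map e_mat (word -> vector) to output_file, one
    line per word, with the vector serialised by bas_utils.to_string().
    my_mode is the file-open mode ('w' to overwrite, 'a' to append).
    '''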
    op_file = open(output_file, my_mode)
    my_words = e_mat.keys()
    for w in my_words:
        next_line = w + ' ' + bas_utils.to_string(e_mat[w]) + '\n'
        op_file.write(next_line)
    op_file.close()
Example #6
def er_summary(qid_map, qid_not_in_map):
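    '''
    Print a summary histogram of er_status values (assumed to lie in 0..4)
    over qid_map; questions without an er_status are counted under 0 and
    echoed. qid_not_in_map is currently unused.
    '''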
    my_status = [0, 0, 0, 0, 0]
    for qid in qid_map.keys():
        if 'er_status' in qid_map[qid].keys():
            my_status[qid_map[qid]['er_status']] += 1
        else:
            my_status[0] += 1
            print 'ER1--> ' + qid
    print '2: Summary --> ' + bas_utils.to_string(my_status, ', ')
Example #7
def write_embd(wi, e_mat):
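    '''
    Write embeddings to output_pf: for each word w in the index map wi,
    look up its row e_mat[wi[w]] and emit a 'word v1 v2 ...' line.
    '''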
    op_file = open(output_pf, 'w')
    my_words = wi.keys()
    for w in my_words:
        idx = wi[w]
        e = e_mat[idx]
        next_line = w + ' ' + bas_utils.to_string(my_list=e,
                                                  separator=' ') + '\n'
        op_file.write(next_line)
    print '\nNew Embedding file written to - ' + output_pf
    op_file.close()
Example #8
def save_predictions(my_args, np_y_all_pred, split_name):
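    '''
    Dump predictions for one data split to
    <job_folder>/<split_name>_pred_embeddings.txt. Each output line holds
    one example; every predicted embedding within it is written as
    comma-separated values terminated by ';'.
    '''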
    print 'Entering save_predictions() - for ' + split_name
    print 'len(np_y_all_pred)=' + str(len(np_y_all_pred))
    print 'np_y_all_pred.shape=' + str(np_y_all_pred.shape)
    op_file = bas_utils.open_file(os.path.join(my_args.job_folder, split_name + '_pred_embeddings.txt'))
    for i in range(len(np_y_all_pred)):
        next_pred_list = np_y_all_pred[i]
        for pred_e in next_pred_list:
            op_file.write(bas_utils.to_string(pred_e, ',') + ';')
        op_file.write('\n')
    op_file.close()
    print 'Exiting save_predictions() - for ' + split_name
Example #9
def get_missing_rels(s1, s2):
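    '''
    Write the relations that appear in s1 but not in s2, one per line, to
    output_file_name.
    '''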
    # Relations present in s1 but missing from s2.
    my_set = set(s1) - set(s2)
    new_rels = bas_utils.to_string(my_set, '\n')
    op_file = open(output_file_name, 'w')
    op_file.write(new_rels)
    op_file.close()
    print 'output written to ' + output_file_name
Example #10
def get_qid_rel_map(web_qsp_obj):
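    '''
    Emit one 'qid,relation' line per relation in the first parse's
    InferentialChain of every question, newline-separated, to output_file.
    Questions whose first parse has no chain are skipped.
    '''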
    qid_rel_map = list()
    for nxt_obj in web_qsp_obj['Questions']:
        qid = nxt_obj['QuestionId']
        chain = nxt_obj['Parses'][0]['InferentialChain']
        if chain is None or len(chain) < 1:
            continue
        for r in chain:
            qid_rel_map.append(qid + ',' + r)
    my_str = bas_utils.to_string(qid_rel_map, '\n')
    op_file = open(output_file, 'w')
    op_file.write(my_str + '\n')
    op_file.close()
    print '\nOutput stored in file - ' + output_file
Example #11
def ere_walk(g, all_node_nbrs, walk_length, start_node, walk, op_file):
    '''
    Simulate a random walk starting from start node.
    '''
    while len(walk) < walk_length:
        cur = walk[-1]
        all_node_nbrs, cur_nbrs = get_neighbors(g, all_node_nbrs, cur)
        if len(cur_nbrs) > 0:
            # np.random.randint's upper bound is exclusive, so high must be
            # len(cur_nbrs) or the last neighbour could never be chosen.
            to_traverse = np.random.randint(low=0, high=len(cur_nbrs))
            next_rel = cur_nbrs[to_traverse][0]
            next_ent = cur_nbrs[to_traverse][1]
            walk.append(next_rel)
            walk.append(next_ent)
        else:
            break
    if op_file is None:
        return walk
    else:
        op_file.write(bas_utils.to_string(walk, ' ') + '\n')
    return walk
Example #12
def map_webq_kbqa():
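    '''
    Align WebQuestions examples with KBQA questions: load the train/test
    WebQuestions JSON, the linked entities for each split, and the KBQA
    data, then match every (normalised) WebQuestions utterance against
    every KBQA question. Matching KBQA ids are stored under 'kbqa_id',
    ';'-joined when a question matches more than once.
    '''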
    training_data = read_data(
        os.path.join(web_questions_path, 'webquestions.examples.train.json'))
    test_data = read_data(
        os.path.join(web_questions_path, 'webquestions.examples.test.json'))
    webq_list = merge_data(training_data, test_data)
    trn_ent = load_entities(
        'webquestions.examples.train.e2e.top10.filter.sid.tsv', dict())
    given_ent = load_entities(
        'webquestions.examples.test.e2e.top10.filter.sid.tsv', trn_ent)
    all_ent = load_entities('remain.tsv', given_ent)
    kbqa_list = kbqa_prog.get_kbqa_data()
    for next_webqa in webq_list:
        print next_webqa['id'] + '-' + ub.to_string(all_ent[next_webqa['id']])
        for next_kbqa in kbqa_list:
            wq = re.sub('[^a-zA-Z0-9 ]', '', next_webqa['utterance'])
            is_matched = match_question_pattern(wq, next_kbqa['ques'],
                                                all_ent[next_webqa['id']])
            if is_matched:
                if 'kbqa_id' in next_webqa:
                    # Append further matches instead of overwriting the
                    # first one.
                    next_webqa['kbqa_id'] += ';' + next_kbqa['id']
                else:
                    next_webqa['kbqa_id'] = next_kbqa['id']
Example #13
def main():
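    # Pipeline: load the node/relation maps, enrich the raw train and test
    # JSON with entity mentions, relation ids and source-entity ids,
    # validate with check_final(), then split and write the final
    # train/valid/test JSON files plus the list of unfound relations.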
    nm = load_node_map()
    rm = load_rel_map()
    trn_path_file = os.path.join(data_folder, trn_file)
    trn_json = get_my_json(trn_path_file)
    trn_map = convert_list_2_dict(trn_json)
    trn_mention_pf = os.path.join(stagg_home, trn_mention)
    # Include 'qm' in the JSON if the source entity is present in the node map
    trn_map_nid = replace_ent_in_question(trn_mention_pf, trn_map, nm)
    trn_map_rid, unfound_rels = add_rel_id(rm, trn_map_nid, set())
    trn_map_src = add_src_ids(trn_map_rid, nm)
    trn_map_post = post_process(trn_map_src)
    trn_map_final = check_final(trn_map_post)
    final_trn_list = convert_dict_2_list(trn_map_final)
    print_json(os.path.join(output_path, 'all.kgt_trn.json'), final_trn_list)
    trn, val = split_train_valid(final_trn_list)
    print_json(os.path.join(output_path, 'kgt_trn.json'), trn)
    print_json(os.path.join(output_path, 'kgt_val.json'), val)

    tst_path_file = os.path.join(data_folder, tst_file)
    tst_json = get_my_json(tst_path_file)
    tst_map = convert_list_2_dict(tst_json)
    tst_mention_pf = os.path.join(stagg_home, tst_mention)
    tst_map_nid = replace_ent_in_question(tst_mention_pf, tst_map, nm)
    tst_map_rid, unfound_rels = add_rel_id(rm, tst_map_nid, unfound_rels)
    tst_map_src = add_src_ids(tst_map_rid, nm)
    tst_map_final = check_final(tst_map_src)
    final_tst_list = convert_dict_2_list(tst_map_final)
    print_json(os.path.join(output_path, 'kgt_tst.json'), final_tst_list)
    print 'len(unfound_rels)=' + str(len(unfound_rels))
    new_rel_str = bas_utils.to_string(unfound_rels, '\n')
    f = open(os.path.join(output_path, 'new-rels.txt'), 'w')
    f.write(new_rel_str + '\n')
    f.close()

    all_q = final_trn_list + final_tst_list
    print_quest_4_w2v(all_q)
Example #14
def get_ere_embed(wi, random_embeddings):
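    '''
    Build an embedding matrix for the word-index map wi from
    ere_embedding_file. Words missing from that file whose names start
    with 'r' (presumably relation tokens) get a vector popped from
    random_embeddings; all remaining words keep the zero vector.
    '''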
    print '\nEntering get_ere_embed() - len(wi)=' + str(len(wi.keys()))
    ct, i, idx, word_list = 0, 0, 0, set(wi.keys())
    embedding_matrix = np.zeros((len(wi) + 1, 300))
    with open(ere_embedding_file) as f:
        content = f.readlines()
    for next_line in content:
        i += 1
        msg = 'i=' + str(i) + ', idx=' + str(idx) + ' / ' + str(len(wi))
        bas_utils.print_status(msg + '  get_ere_embed()', ct, 1)
        toks = next_line.split()
        if len(toks) < 5:
            continue
        # Membership test against the shrinking word_list set is O(1) and
        # also skips duplicate embedding lines for the same word.
        if toks[0] not in word_list:
            continue
        word_list.remove(toks[0])
        ct += 1
        idx = wi[toks[0]]
        embedding_matrix[idx] = np.asarray(toks[1:], dtype=float)
    sample = bas_utils.to_string(list(word_list)[:2], ', ')
    print '\nWord count not in ERE=' + str(len(word_list)) + ', e.g., ' + sample
    print '\n - Starting to add Random Embedding after ct=' + str(ct)
    final_pending = set(word_list)
    for w in word_list:
        if not str(w).startswith('r'):
            continue
        print 'Adding Random Embedding for - ' + w
        idx = wi[w]
        e = random_embeddings.pop()
        embedding_matrix[idx] = np.asarray(e, dtype=float)
        ct += 1
        final_pending.remove(w)
    print '\nWord count not in final_embd=' + str(
        len(final_pending)) + ' - ' + bas_utils.to_string(final_pending, ' ')
    print '\nct=' + str(ct) + ', i=' + str(i)
    return embedding_matrix
Example #15
def print_embd(merged_embd, opf):
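    '''
    Write the merged embedding map to opf, one 'word v1 v2 ...' line per
    word.
    '''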
    op_file = open(opf, 'w')
    for w in merged_embd.keys():
        op_file.write(w + ' ' + bas_utils.to_string(merged_embd[w], ' ') + '\n')
    op_file.close()