示例#1
0
def load_data_json(my_args):
    """Load question JSON files listed in the module-level ``fns``.

    :param my_args: argument namespace; ``my_args.input_path`` is the
        directory containing the question JSON files.
    :return: tuple ``(x_all, y_all, src_ent_all, max_sen_len,
        max_op_seq_length, json_objs, qid_seqs)`` — the ``*_all`` and
        ``qid_seqs`` entries hold one list per input file; the two ints
        are maxima taken over every accepted question.
    """
    print('load_data_from_json() Loading Questions from Path - ' + my_args.input_path)
    x_all, y_all, src_ent_all = [], [], []
    json_objs, qid_seqs = [], []
    max_sen_len, max_op_seq_length = 0, 0
    for fn in fns:  # ``fns`` is expected at module level — TODO confirm
        queries, ans_seq, src_ents, qid_seq = [], [], [], []
        json_obj = bas_utils.load_json_file(
            os.path.join(my_args.input_path, fn))
        json_objs.append(bas_utils.convert_list_2_dict(json_obj, 'qid'))
        for nextq in json_obj:
            # Only questions flagged ready (is_ready >= 2) are used.
            if nextq['is_ready'] < 2:
                continue
            if 'qm' not in nextq:
                # NOTE(review): the next line will still raise KeyError for
                # such a question; warning-then-crash kept as in original.
                print('No QM - in ' + nextq['qid'])
            queries.append(nextq['qm'])
            se = nextq['src_ent_id']
            # Normalise the source-entity field to a list.
            src_ents.append(se if isinstance(se, list) else [se])
            ans_seq.append(nextq['rid_seq'])
            max_sen_len = max(len(str(nextq['qm']).split()), max_sen_len)
            max_op_seq_length = max(len(nextq['rid_seq']), max_op_seq_length)
            qid_seq.append(nextq['qid'])
        x_all.append(queries)
        y_all.append(ans_seq)
        src_ent_all.append(src_ents)
        qid_seqs.append(qid_seq)
    print('\n\nInput Sequence Length = ' + str(max_sen_len))
    return x_all, y_all, src_ent_all, max_sen_len, max_op_seq_length, json_objs, qid_seqs
示例#2
0
def get_my_json(web_qsp_file):
    """Convert a WebQSP JSON file into a simplified list of question dicts.

    Questions whose first parse lacks a topic-entity mid or an inferential
    chain are rejected; their ids are logged to reject files under the
    module-level ``output_path``.

    :param web_qsp_file: path of the WebQSP JSON file to convert.
    :return: list of dicts with keys 'qid', 'qo', 'src_ent', 'rel_seq',
        'PotentialTopicEntityMention' and 'TopicEntityName'.
    """
    print('1: web_qsp_file=' + web_qsp_file)
    my_list = list()
    mid2sid = load_sid_mid_mapping()
    json_obj = bas_utils.load_json_file(web_qsp_file)
    no_src_ct, no_chain_ct, no_src_ent_in_kb_ct, total_ct = 0, 0, 0, 0
    pf = os.path.basename(web_qsp_file)
    # `with` guarantees the reject-log files are closed even on error
    # (the old code leaked all three handles on any exception).
    with open(os.path.join(output_path, 'no_src-' + pf + '.txt'), 'w') as no_src_file, \
            open(os.path.join(output_path, 'no_chain-' + pf + '.txt'), 'w') as no_chain_file, \
            open(os.path.join(output_path, 'no_src_in_kb-' + pf + '.txt'), 'w') as no_src_in_kb_file:
        for next_q in json_obj['Questions']:
            total_ct += 1
            parse = next_q['Parses'][0]  # only the first parse is used
            if parse['TopicEntityMid'] is None or len(parse['TopicEntityMid']) < 1:
                no_src_ct += 1
                no_src_file.write(next_q['QuestionId'] + '\n')
                continue
            if parse['InferentialChain'] is None or len(parse['InferentialChain']) < 1:
                no_chain_file.write(next_q['QuestionId'] + '\n')
                no_chain_ct += 1
                continue
            src_ent_mid = 'fb:' + parse['TopicEntityMid']
            myq = dict()
            if src_ent_mid not in mid2sid:
                # Kept (not rejected) even when the entity is missing
                # from the KB mapping; src_ent is left empty.
                no_src_ent_in_kb_ct += 1
                no_src_in_kb_file.write(next_q['QuestionId'] + '\n')
                myq['src_ent'] = ''
            else:
                myq['src_ent'] = mid2sid[src_ent_mid]
            myq['qid'] = next_q['QuestionId']
            myq['qo'] = next_q['RawQuestion'].encode('utf8').lower()
            myq['rel_seq'] = parse['InferentialChain']
            myq['PotentialTopicEntityMention'] = parse['PotentialTopicEntityMention']
            myq['TopicEntityName'] = parse['TopicEntityName']
            my_list.append(myq)
    print('\n1: get_my_json(): no_src_ct(Rejected)=' + str(no_src_ct))
    print('1: get_my_json(): no_chain_ct(Rejected)=' + str(no_chain_ct))
    print('1: get_my_json(): no_src_ent_in_kb_ct=' + str(no_src_ent_in_kb_ct))
    print('1: get_my_json(): Total Rejected =' + str(
        int(no_src_ct + no_chain_ct)))
    print('1: get_my_json(): ' + str(web_qsp_file) + ' --> ' + str(
        len(my_list)) + ' / ' + str(total_ct))
    return my_list
示例#3
0
def get_initial_list():
    """Write the qids (``qid_prefix`` + index) found in the test file to
    '<test file>.qid', one per line, in ascending index order.

    Relies on module-level ``data_folder``, ``tst_file``, ``qid_prefix``
    and ``max_ct``.
    """
    next_file = os.path.join(data_folder, tst_file)
    print('next_file=' + next_file)
    q_list = bas_utils.load_json_file(next_file)['Questions']
    print('\nLoaded...')
    qid_set = {q['QuestionId'] for q in q_list}
    # BUG FIX: the old code did os.path.join(data_folder, next_file + '.qid'),
    # which duplicates data_folder whenever it is a relative path
    # (next_file already contains it). Append the suffix directly.
    output_path_file = next_file + '.qid'
    with open(output_path_file, 'w') as op_file:
        for i in range(max_ct):
            qid = qid_prefix + str(i)
            if qid in qid_set:
                op_file.write(qid + '\n')
示例#4
0
def get_neigbourgood_ents(my_args, g, ent, top_k_rels, all_rel_ids):
    """
    Find the entities connected to ``ent`` through each relation in
    ``top_k_rels`` (at most ``my_args.p['beam_size']`` relations are used).

    Results are memoised in the module-level ``nbr_ent_cache``, lazily
    initialised from ``<job_folder>/../saved_data/nbr_ent_cache.json``
    when that file exists.

    :param my_args: argument namespace; uses ``p['beam_size']`` and
        ``job_folder``.
    :param g: graph exposing ``neighbors(ent)`` and edge data
        ``g[u][v]['relation']`` — presumably networkx-like; verify against
        the caller.
    :param ent: source entity id, or the placeholder '_MYENTMENTION_'.
    :param top_k_rels: strings whose last comma-separated field is the
        relation id.
    :param all_rel_ids: unused here; kept for interface compatibility.
    :return: a list with one entry (set or list of neighbour entities)
        per considered relation.
    """
    global nbr_ent_cache
    # These were recomputed on every loop iteration in the old version;
    # they are loop-invariant, so hoist them out.
    sd_folder = os.path.abspath(
        os.path.join(my_args.job_folder, os.pardir, 'saved_data'))
    cache_file = os.path.join(sd_folder, 'nbr_ent_cache.json')
    if nbr_ent_cache is None:
        if os.path.isfile(cache_file):
            nbr_ent_cache = bas_utils.load_json_file(cache_file)
        else:
            nbr_ent_cache = dict()
    ent_list, ct = list(), 0
    for nxt_rel in top_k_rels:
        rel = nxt_rel.split(',')[-1]  # relation id is the last CSV field
        ct += 1
        if ct > my_args.p['beam_size']:
            break
        key = ent + ';' + rel
        if key in nbr_ent_cache:
            ent_list.append(set(nbr_ent_cache[key]))
            continue
        rel_nbr_list = list()
        if ent == '_MYENTMENTION_':
            # Placeholder entity: propagate the placeholder itself.
            rel_nbr_list.append('_MYENTMENTION_')
        else:
            for nbr in g.neighbors(ent):
                # The edge label packs all relations joined by '_'.
                rel_set = set(g[ent][nbr]['relation'].split('_'))
                if rel in rel_set:
                    rel_nbr_list.append(nbr)
        uniq_nbrs = list(set(rel_nbr_list))
        nbr_ent_cache[key] = uniq_nbrs
        ent_list.append(uniq_nbrs)
    return ent_list
示例#5
0
def load_saved_data(this_args):
    """Reload every artefact previously written to the saved-data folder.

    The folder is ``this_args.p['saved_data']`` when set, otherwise
    ``<job_folder>/../saved_data``. Scalar values (embd_dim, max_sen_len,
    max_osl) come from 'vars.txt' as 'key=value' lines; JSON files listed
    in the module-level ``fns`` are reloaded as well.

    :param this_args: argument namespace with ``p`` (dict) and ``job_folder``.
    :return: tuple matching the original save order — see the return
        statement for the exact element list.
    """
    if 'saved_data' in this_args.p:
        op_path = os.path.abspath(this_args.p['saved_data'])
    else:
        op_path = os.path.abspath(
            os.path.join(this_args.job_folder, os.pardir, 'saved_data'))

    def _pickle(name):
        # Helper: load one pickle file from the saved-data folder.
        return bas_utils.load_pickle_file(os.path.join(op_path, name))

    word_index = _pickle('word_index.pickle')
    embed_mat = _pickle('embedding_matrix.pickle')
    rel_mat = _pickle('rel_embd_mat.pickle')
    x_all_padded = _pickle('x_all_padded.pickle')
    np_y_all = _pickle('np_y_all.pickle')
    y_all = _pickle('y_all.pickle')
    src_ent_all = _pickle('src_ent_all.pickle')
    rel_tok_seqs = _pickle('rel_tok_seqs.pickle')
    rel_ids = _pickle('rel_ids.pickle')
    embd_dim, max_sen_len, max_osl = 0, 0, 0
    with open(os.path.join(op_path, 'vars.txt')) as f:
        for next_line in f:
            key, sep, val = next_line.partition('=')
            if not sep:
                continue  # skip malformed lines without '='
            val = val.strip()
            if key == 'embd_dim':
                embd_dim = int(val)
            elif key == 'max_sen_len':
                max_sen_len = int(val)
            elif key == 'max_osl':
                max_osl = int(val)
    jos = [bas_utils.load_json_file(os.path.join(op_path, fn)) for fn in fns]
    qs = _pickle('qid_sqs.pickle')
    print('Returning from Saved Data ...')
    return embd_dim, word_index, embed_mat, rel_mat, max_sen_len, x_all_padded, np_y_all, y_all, src_ent_all, max_osl, jos, \
           qs, rel_tok_seqs, rel_ids
示例#6
0
# Destination of the qid->relation map written by get_qid_rel_map().
# The train-split path is kept commented out for quick switching.
input_folder = ''
#output_file = '/data/Work-Homes/LOD_HOME/FB_SEMPRE/create-subgraph/docs/qid-rel-map-trn.txt'
output_file = '/data/Work-Homes/LOD_HOME/FB_SEMPRE/create-subgraph/docs/qid-rel-map-tst.txt'

# Copy the output file to /data/Work-Homes/LOD_HOME/WebQSP/anal


def get_qid_rel_map(web_qsp_obj):
    """Write one 'qid,relation' line per relation in each question's first
    parse's inferential chain to the module-level ``output_file``.

    :param web_qsp_obj: parsed WebQSP JSON object with a 'Questions' list.
    """
    qid_rel_map = list()
    for nxt_obj in web_qsp_obj['Questions']:
        qid = nxt_obj['QuestionId']
        chain = nxt_obj['Parses'][0]['InferentialChain']
        # Questions without an inferential chain contribute nothing.
        if chain is None or len(chain) < 1:
            continue
        for rel in chain:
            qid_rel_map.append(qid + ',' + rel)
    my_str = bas_utils.to_string(qid_rel_map, '\n')
    # `with` guarantees the file is closed even if the write raises.
    with open(output_file, 'w') as op_file:
        op_file.write(my_str + '\n')
    print('\nOutput stored in file - ' + output_file)



if __name__ == '__main__':
    # NOTE(review): despite the 'trn_obj' name, this loads the *test*
    # split; the train-file line is kept commented out for switching.
    #trn_obj = bas_utils.load_json_file(os.path.join(data_folder, trn_file))
    trn_obj = bas_utils.load_json_file(os.path.join(data_folder, tst_file))
    get_qid_rel_map(trn_obj)
示例#7
0
import os
import utils.basics as bas_utils

working_dir = '/data/Work-Homes/LOD_HOME/web-questions/data'

# For every WebQuestions JSON file in working_dir, dump each question's
# raw 'utterance' (one per line) into a sibling '<file>.txt'.
for next_file in os.listdir(working_dir):
    if not next_file.endswith('json'):
        continue
    print('next_file=' + next_file)
    obj1 = bas_utils.load_json_file(os.path.join(working_dir, next_file))
    output_path_file = os.path.join(working_dir, next_file + '.txt')
    # `with` closes the output file even on error; the unused
    # `my_subs = set()` from the old version is dropped.
    with open(output_path_file, 'w') as op_file:
        for next_sub in obj1:
            op_file.write(next_sub['utterance'] + '\n')