# Assumes module-level: import os; import utils.basics as bas_utils; and `fns`,
# the list of question JSON file names to load.
def load_data_json(my_args):
    print 'load_data_json() Loading Questions from Path - ' + my_args.input_path
    x_all, y_all, src_ent_all, json_objs, qid_seqs = [], [], [], list(), list()
    max_sen_len, max_op_seq_length = 0, 0
    for fn in fns:
        queries, ans_seq, src_ents, qid_seq = [], [], [], list()
        json_obj = bas_utils.load_json_file(os.path.join(my_args.input_path, fn))
        json_objs.append(bas_utils.convert_list_2_dict(json_obj, 'qid'))
        for nextq in json_obj:
            if nextq['is_ready'] < 2:
                continue
            if 'qm' not in nextq.keys():
                # Skip questions without a machine-readable form; the original code
                # only printed a warning and would then fail on the lookup below.
                print 'No QM - in ' + nextq['qid']
                continue
            queries.append(nextq['qm'])
            se = nextq['src_ent_id']
            # The source entity may be a single id or a list; normalise to a list.
            se_list = se if isinstance(se, list) else [se]
            src_ents.append(se_list)
            ans_seq.append(nextq['rid_seq'])
            max_sen_len = max(len(str(nextq['qm']).split()), max_sen_len)
            max_op_seq_length = max(len(nextq['rid_seq']), max_op_seq_length)
            qid_seq.append(nextq['qid'])
        x_all.append(queries)
        y_all.append(ans_seq)
        src_ent_all.append(src_ents)
        qid_seqs.append(qid_seq)
    print '\n\nInput Sequence Length = ' + str(max_sen_len)
    return x_all, y_all, src_ent_all, max_sen_len, max_op_seq_length, json_objs, qid_seqs
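# Illustrative usage (a sketch, not part of the original code): `my_args` is
# assumed to be an argparse-style namespace with `input_path` pointing at the
# folder holding the per-split question JSON files named in `fns`.
def _example_load_data(my_args):
    x_all, y_all, src_ent_all, max_len, max_osl, json_objs, qid_seqs = \
        load_data_json(my_args)
    print 'splits=' + str(len(x_all)) + ', max_sen_len=' + str(max_len) + \
          ', max_op_seq_length=' + str(max_osl)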
# Assumes module-level: import os; import utils.basics as bas_utils; `output_path`
# and `load_sid_mid_mapping()` defined elsewhere in this module.
def get_my_json(web_qsp_file):
    print '1: web_qsp_file=' + web_qsp_file
    my_list = list()
    mid2sid = load_sid_mid_mapping()
    json_obj = bas_utils.load_json_file(web_qsp_file)
    no_src_ct, no_chain_ct, no_src_ent_in_kb_ct, total_ct = 0, 0, 0, 0
    pf = os.path.basename(web_qsp_file)
    no_src_file = open(os.path.join(output_path, 'no_src-' + pf + '.txt'), 'w')
    no_chain_file = open(os.path.join(output_path, 'no_chain-' + pf + '.txt'), 'w')
    no_src_in_kb_file = open(
        os.path.join(output_path, 'no_src_in_kb-' + pf + '.txt'), 'w')
    for next_q in json_obj['Questions']:
        total_ct += 1
        parse = next_q['Parses'][0]
        # Reject questions with no topic entity or no inferential chain.
        if parse['TopicEntityMid'] is None or len(parse['TopicEntityMid']) < 1:
            no_src_ct += 1
            no_src_file.write(next_q['QuestionId'] + '\n')
            continue
        if parse['InferentialChain'] is None or len(parse['InferentialChain']) < 1:
            no_chain_file.write(next_q['QuestionId'] + '\n')
            no_chain_ct += 1
            continue
        src_ent_mid = 'fb:' + parse['TopicEntityMid']
        myq = dict()
        if src_ent_mid not in mid2sid.keys():
            # Topic entity is missing from the KB mapping; keep the question but
            # leave its source entity empty.
            no_src_ent_in_kb_ct += 1
            no_src_in_kb_file.write(next_q['QuestionId'] + '\n')
            myq['src_ent'] = ''
        else:
            myq['src_ent'] = mid2sid[src_ent_mid]
        myq['qid'] = next_q['QuestionId']
        myq['qo'] = next_q['RawQuestion'].encode('utf8').lower()
        myq['rel_seq'] = parse['InferentialChain']
        myq['PotentialTopicEntityMention'] = parse['PotentialTopicEntityMention']
        myq['TopicEntityName'] = parse['TopicEntityName']
        my_list.append(myq)
    no_src_file.close()
    no_chain_file.close()
    no_src_in_kb_file.close()
    print '\n1: get_my_json(): no_src_ct(Rejected)=' + str(no_src_ct)
    print '1: get_my_json(): no_chain_ct(Rejected)=' + str(no_chain_ct)
    print '1: get_my_json(): no_src_ent_in_kb_ct=' + str(no_src_ent_in_kb_ct)
    print '1: get_my_json(): Total Rejected =' + str(no_src_ct + no_chain_ct)
    print '1: get_my_json(): ' + str(web_qsp_file) + ' --> ' + \
        str(len(my_list)) + ' / ' + str(total_ct)
    return my_list
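# Illustrative usage (a sketch, not part of the original code): convert one WebQSP
# split and dump the simplified question list; the output file name is hypothetical.
def _example_convert_split(web_qsp_file):
    import json
    my_list = get_my_json(web_qsp_file)
    out_file = web_qsp_file + '.simplified.json'
    with open(out_file, 'w') as f:
        json.dump(my_list, f, indent=2)
    print 'Wrote ' + str(len(my_list)) + ' questions to ' + out_file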
# Assumes module-level: import os; import utils.basics as bas_utils; `data_folder`,
# `tst_file`, `max_ct` and `qid_prefix` defined elsewhere in this module.
def get_initial_list():
    next_file = os.path.join(data_folder, tst_file)
    print 'next_file=' + next_file
    q_list = bas_utils.load_json_file(next_file)['Questions']
    print '\nLoaded...'
    qid_set = set()
    for q in q_list:
        qid_set.add(q['QuestionId'])
    # `next_file` is already absolute, so this join resolves to next_file + '.qid'.
    output_path_file = os.path.join(data_folder, next_file + '.qid')
    op_file = open(output_path_file, 'w')
    for i in range(max_ct):
        qid = qid_prefix + str(i)
        if qid in qid_set:
            op_file.write(qid + '\n')
    op_file.close()
def get_neigbourgood_ents(my_args, g, ent, top_k_rels, all_rel_ids):
    """
    Find the entities that are connected to `ent` through any of the relations
    present in `top_k_rels`.
    :param my_args: run configuration (uses p['beam_size'] and job_folder)
    :param g: the knowledge graph; the edge attribute 'relation' holds an
              underscore-joined relation string
    :param ent: the entity whose neighbourhood is expanded
    :param top_k_rels: the top-k relations, each ending with the relation id
                       after the last comma
    :param all_rel_ids: not used in this function
    :return: a list of entity lists, one per relation in the top-k relations
    """
    global nbr_ent_cache
    ent_list, ct = list(), 0
    for nxt_rel in top_k_rels:
        rel = nxt_rel.split(',')[-1]
        ct += 1
        if ct > my_args.p['beam_size']:
            break
        sd_folder = os.path.abspath(
            os.path.join(my_args.job_folder, os.pardir, 'saved_data'))
        key = ent + ';' + rel
        # Lazily initialise the neighbour cache, from disk if a saved copy exists.
        cache_file = os.path.join(sd_folder, 'nbr_ent_cache.json')
        if nbr_ent_cache is None:
            if os.path.isfile(cache_file):
                nbr_ent_cache = bas_utils.load_json_file(cache_file)
            else:
                nbr_ent_cache = dict()
        if key in nbr_ent_cache.keys():
            ent_list.append(set(nbr_ent_cache[key]))
            continue
        rel_nbr_list = list()
        if ent == '_MYENTMENTION_':
            rel_nbr_list.append('_MYENTMENTION_')
        else:
            for nbr in g.neighbors(ent):
                rels = g[ent][nbr]['relation']
                rel_set = set(rels.split('_'))
                if rel in rel_set:
                    rel_nbr_list.append(nbr)
        nbr_set = list(set(rel_nbr_list))
        nbr_ent_cache[key] = nbr_set
        ent_list.append(nbr_set)
    return ent_list
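# Illustrative usage (a sketch, not part of the original code): expand one beam
# step for a single entity. Assumes `g` is a networkx-style graph matching the
# access pattern above and that `top_k_rels` entries look like 'score,relation_id'.
def _example_expand(my_args, g, ent, top_k_rels):
    nbrs_per_rel = get_neigbourgood_ents(my_args, g, ent, top_k_rels, all_rel_ids=None)
    for nxt_rel, nbrs in zip(top_k_rels, nbrs_per_rel):
        print nxt_rel.split(',')[-1] + ' -> ' + str(len(nbrs)) + ' neighbour entities'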
# Assumes module-level: import os; import utils.basics as bas_utils; and `fns`,
# the list of question JSON file names that were saved alongside the pickles.
def load_saved_data(this_args):
    if 'saved_data' in this_args.p.keys():
        op_path = os.path.abspath(this_args.p['saved_data'])
    else:
        op_path = os.path.abspath(
            os.path.join(this_args.job_folder, os.pardir, 'saved_data'))
    word_index = bas_utils.load_pickle_file(os.path.join(op_path, 'word_index.pickle'))
    embed_mat = bas_utils.load_pickle_file(os.path.join(op_path, 'embedding_matrix.pickle'))
    rel_mat = bas_utils.load_pickle_file(os.path.join(op_path, 'rel_embd_mat.pickle'))
    x_all_padded = bas_utils.load_pickle_file(os.path.join(op_path, 'x_all_padded.pickle'))
    np_y_all = bas_utils.load_pickle_file(os.path.join(op_path, 'np_y_all.pickle'))
    y_all = bas_utils.load_pickle_file(os.path.join(op_path, 'y_all.pickle'))
    src_ent_all = bas_utils.load_pickle_file(os.path.join(op_path, 'src_ent_all.pickle'))
    rel_tok_seqs = bas_utils.load_pickle_file(os.path.join(op_path, 'rel_tok_seqs.pickle'))
    rel_ids = bas_utils.load_pickle_file(os.path.join(op_path, 'rel_ids.pickle'))
    # Scalar metadata is stored as 'name=value' lines in vars.txt.
    embd_dim, max_sen_len, max_osl = 0, 0, 0
    with open(os.path.join(op_path, 'vars.txt')) as f:
        content = f.readlines()
    for next_line in content:
        if next_line.startswith('embd_dim='):
            embd_dim = int(next_line.split('=')[1].replace('\n', ''))
        if next_line.startswith('max_sen_len='):
            max_sen_len = int(next_line.split('=')[1].replace('\n', ''))
        if next_line.startswith('max_osl='):
            max_osl = int(next_line.split('=')[1].replace('\n', ''))
    jos = list()
    for fn in fns:
        jos.append(bas_utils.load_json_file(os.path.join(op_path, fn)))
    qs = bas_utils.load_pickle_file(os.path.join(op_path, 'qid_sqs.pickle'))
    print 'Returning from Saved Data ...'
    return embd_dim, word_index, embed_mat, rel_mat, max_sen_len, x_all_padded, \
        np_y_all, y_all, src_ent_all, max_osl, jos, qs, rel_tok_seqs, rel_ids
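# Illustrative usage (a sketch, not part of the original code): restore the cached
# preprocessing artefacts. Assumes `this_args.p` optionally carries a 'saved_data'
# path, as handled above.
def _example_restore(this_args):
    (embd_dim, word_index, embed_mat, rel_mat, max_sen_len, x_all_padded,
     np_y_all, y_all, src_ent_all, max_osl, jos, qs, rel_tok_seqs,
     rel_ids) = load_saved_data(this_args)
    print 'vocab=' + str(len(word_index)) + ', embd_dim=' + str(embd_dim) + \
          ', max_sen_len=' + str(max_sen_len) + ', max_osl=' + str(max_osl)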
input_folder = ''
# output_file = '/data/Work-Homes/LOD_HOME/FB_SEMPRE/create-subgraph/docs/qid-rel-map-trn.txt'
output_file = '/data/Work-Homes/LOD_HOME/FB_SEMPRE/create-subgraph/docs/qid-rel-map-tst.txt'
# Copy the output file to /data/Work-Homes/LOD_HOME/WebQSP/anal


def get_qid_rel_map(web_qsp_obj):
    qid_rel_map = list()
    for nxt_obj in web_qsp_obj['Questions']:
        qid = nxt_obj['QuestionId']
        chain = nxt_obj['Parses'][0]['InferentialChain']
        if chain is None or len(chain) < 1:
            continue
        for r in chain:
            qid_rel_map.append(qid + ',' + r)
    my_str = bas_utils.to_string(qid_rel_map, '\n')
    op_file = open(output_file, 'w')
    op_file.write(my_str + '\n')
    op_file.close()
    print '\nOutput stored in file - ' + output_file


if __name__ == '__main__':
    # trn_obj = bas_utils.load_json_file(os.path.join(data_folder, trn_file))
    trn_obj = bas_utils.load_json_file(os.path.join(data_folder, tst_file))
    get_qid_rel_map(trn_obj)
import os

import utils.basics as bas_utils

working_dir = '/data/Work-Homes/LOD_HOME/web-questions/data'

for next_file in os.listdir(working_dir):
    if not next_file.endswith('json'):
        continue
    print 'next_file=' + next_file
    obj1 = bas_utils.load_json_file(os.path.join(working_dir, next_file))
    output_path_file = os.path.join(working_dir, next_file + '.txt')
    op_file = open(output_path_file, 'w')
    for next_sub in obj1:
        op_file.write(next_sub['utterance'] + '\n')
    op_file.close()