def _load_linkings(self, links_fp):
    with codecs.open(links_fp, 'r', 'utf-8') as br:
        for line in br.readlines():
            if line.startswith('#'):
                continue
            spt = line.strip().split('\t')
            q_idx, st, ed, mention, mid, wiki_name, feats = spt
            q_idx = int(q_idx)
            st = int(st)
            ed = int(ed)
            feat_dict = json.loads(feats)
            for k in feat_dict:
                v = float('%.6f' % feat_dict[k])
                feat_dict[k] = v
            link_data = LinkData(category='Entity', start=st, end=ed,
                                 mention=mention, comp='==', value=mid,
                                 name=wiki_name, link_feat=feat_dict)
            self.p_links_dict.setdefault(q_idx, []).append(link_data)
    LogInfo.logs('%d questions of link data loaded.', len(self.p_links_dict))

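# Hedged sketch of the links-file layout, inferred from the parsing above
# (the concrete values are hypothetical, not taken from the source data).
# Each non-comment line is tab-separated as:
#   q_idx \t start \t end \t mention \t mid \t wiki_name \t feat_json
# e.g.
#   0012 \t 2 \t 4 \t new york \t m.02_286 \t New York City \t {"score": 0.482913}
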
def _load_pop_dict(self, entity_pop_fp, type_pop_fp):
    for pop_fp in [entity_pop_fp, type_pop_fp]:
        LogInfo.logs('Reading popularity from %s ...', pop_fp)
        with codecs.open(pop_fp, 'r', 'utf-8') as br:
            for line in br.readlines():
                spt = line.strip().split('\t')
                self.pop_dict[spt[0]] = int(spt[1])
    LogInfo.logs('%d <mid, popularity> loaded.', len(self.pop_dict))

def _load_pred(self, pred_name_fp):
    with codecs.open(pred_name_fp, 'r', 'utf-8') as br:
        for line in br.readlines():
            spt = line.strip().split('\t')
            if len(spt) < 2:
                continue
            self.pred_set.add(spt[0])
    LogInfo.logs('%d predicates scanned.', len(self.pred_set))

def single_post_candgen(self, p_idx, post, link_fp, schema_fp):
    # =================== Linking first ==================== #
    if os.path.isfile(link_fp):
        gather_linkings = []
        with codecs.open(link_fp, 'r', 'utf-8') as br:
            for line in br.readlines():
                tup_list = json.loads(line.strip())
                ld_dict = {k: v for k, v in tup_list}
                gather_linkings.append(LinkData(**ld_dict))
    else:
        gather_linkings = self.p_links_dict.get(p_idx, [])
    for idx in range(len(gather_linkings)):
        gather_linkings[idx].gl_pos = idx

    # ==================== Save linking results ================ #
    if not os.path.isfile(link_fp):
        with codecs.open(link_fp + '.tmp', 'w', 'utf-8') as bw:
            for gl in gather_linkings:
                bw.write(json.dumps(gl.serialize()) + '\n')
        shutil.move(link_fp + '.tmp', link_fp)

    # ===================== simple predicate finding ===================== #
    sc_list = []
    for gl_data in gather_linkings:
        entity = gl_data.value
        pred_set = self.subj_pred_dict.get(entity, set([]))
        for pred in pred_set:
            sc = Schema()
            sc.hops = 1
            sc.main_pred_seq = [pred]
            sc.raw_paths = [('Main', gl_data, [pred])]
            sc.ans_size = 1
            sc_list.append(sc)
    if len(sc_list) == 0:
        LogInfo.logs("=============q_idx: %d sc_list=0======================" % p_idx)

    # ==================== Save schema results ================ #
    # ans_size, hops, raw_paths
    # raw_paths: (category, gl_pos, gl_mid, pred_seq)
    with codecs.open(schema_fp + '.tmp', 'w', 'utf-8') as bw:
        for sc in sc_list:
            sc_info_dict = {k: getattr(sc, k) for k in ('ans_size', 'hops')}
            opt_raw_paths = []
            for cate, gl, pred_seq in sc.raw_paths:
                opt_raw_paths.append((cate, gl.gl_pos, gl.value, pred_seq))
            sc_info_dict['raw_paths'] = opt_raw_paths
            bw.write(json.dumps(sc_info_dict) + '\n')
    shutil.move(schema_fp + '.tmp', schema_fp)

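# Hedged sketch of one serialized schema line produced above (keys follow
# sc_info_dict; the entity mid and predicate below are hypothetical):
#   {"ans_size": 1, "hops": 1,
#    "raw_paths": [["Main", 0, "m.02_286", ["location.location.containedby"]]]}
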
def _load_fb_subset(self, freebase_fp):
    LogInfo.begin_track('Loading freebase subset from [%s] ...', freebase_fp)
    prefix = 'www.freebase.com/'
    pref_len = len(prefix)
    with codecs.open(freebase_fp, 'r', 'utf-8') as br:
        lines = br.readlines()
        LogInfo.logs('%d lines loaded.', len(lines))
    for line_idx, line in enumerate(lines):
        if line_idx > 0 and line_idx % 500000 == 0:
            LogInfo.logs('Current: %d / %d', line_idx, len(lines))
        s, p, _ = line.strip().split('\t')
        s = s[pref_len:].replace('/', '.')
        self.subj_pred_keys.add(s)
    LogInfo.logs('%d related entities loaded.', len(self.subj_pred_keys))
    LogInfo.end_track()

def _load_type(self, type_name_fp):
    with codecs.open(type_name_fp, 'r', 'utf-8') as br:
        for line in br.readlines():
            spt = line.strip().split('\t')
            if len(spt) < 2:
                continue
            type_mid, type_name = spt[0], spt[1]
            surface = type_name.lower().replace('(s)', '')
            type_prefix = type_mid[:type_mid.find('.')]
            if type_prefix not in self.skip_domain_set:
                self.surface_mid_dict.setdefault(surface, set([])).add(type_mid)
                self.mid_name_dict[type_mid] = type_name
                self.type_set.add(type_mid)
    LogInfo.logs('After scanning %d types, %d <surface, mid_set> loaded.',
                 len(self.type_set), len(self.surface_mid_dict))

def _load_fb_subset(self, fb_fp):
    LogInfo.begin_track('Loading freebase subset from [%s] ...', fb_fp)
    prefix = 'www.freebase.com/'
    pref_len = len(prefix)
    with codecs.open(fb_fp, 'r', 'utf-8') as br:
        lines = br.readlines()
        LogInfo.logs('%d lines loaded.', len(lines))
    for line_idx, line in enumerate(lines):
        if line_idx % 500000 == 0:
            LogInfo.logs('Current: %d / %d', line_idx, len(lines))
        s, p, _ = line.strip().split('\t')
        s = s[pref_len:].replace('/', '.')
        p = p[pref_len:].replace('/', '.')
        self.subj_pred_dict.setdefault(s, set([])).add(p)
    LogInfo.logs('%d related entities and %d <S, P> pairs saved.',
                 len(self.subj_pred_dict),
                 sum([len(v) for v in self.subj_pred_dict.values()]))
    LogInfo.end_track()

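# Hedged sketch: a freebase-FB2M.txt line is assumed to look roughly like
#   www.freebase.com/m/02_286 \t www.freebase.com/location/location/containedby \t www.freebase.com/m/09c7w0 ...
# so that stripping the prefix and replacing '/' with '.' would make
# subj_pred_dict['m.02_286'] contain 'location.location.containedby'.
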
def main(args):
    if args.data_name == "SimpQ":
        qa_list = load_simpq(args.data_dir)
        output_file = "%s/SimpQ.all.links" % args.data_dir
    else:
        qa_list = load_reddit(args.data_dir, mode=args.mode)
        output_file = "%s/Reddit.%s.links" % (args.data_dir, args.mode)
    freebase_path = "%s/freebase-FB2M.txt" % args.fb_dir
    mid_name_path = "%s/S-NAP-ENO-triple.txt" % args.fb_meta_dir
    type_name_path = "%s/TS-name.txt" % args.fb_meta_dir
    pred_name_path = "%s/PS-name.txt" % args.fb_meta_dir
    entity_pop_path = "%s/entity_pop_5m.txt" % args.fb_meta_dir
    type_pop_path = "%s/type_pop.txt" % args.fb_meta_dir
    linker = LukovLinker(freebase_fp=freebase_path,
                         mid_name_fp=mid_name_path,
                         type_name_fp=type_name_path,
                         pred_name_fp=pred_name_path,
                         entity_pop_fp=entity_pop_path,
                         type_pop_fp=type_pop_path)
    LogInfo.begin_track('Linking data saved to: %s' % output_file)
    with codecs.open(output_file, 'w', 'utf-8') as bw:
        for q_idx, qa in enumerate(qa_list):
            q_tokens = qa['tokens']
            if q_idx > 0 and q_idx % 10000 == 0:
                LogInfo.logs('Entering Q-%d', q_idx)
            tup = linker.link_single_question(q_tokens)
            bw.write('%04d\t%d\t%d\t%s\t%s\t%s\t%s\n'
                     % (q_idx, tup.start, tup.end, tup.mention,
                        tup.mid, tup.name, json.dumps(tup.feat_dict)))
    LogInfo.end_track()

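# Each output line written above is tab-separated as
#   q_idx \t start \t end \t mention \t mid \t name \t feat_json
# which appears to match the seven-field layout parsed by _load_linkings.
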
def load_reddit(data_dir, mode='train'):
    LogInfo.logs('Reddit initializing ... ')
    dg_list = []
    corenlp = StanfordCoreNLP(CORENLP_PATH)
    fp = '%s/%s_v3.txt' % (data_dir, mode)
    with open(fp, 'r') as br:
        for line in br:
            dg_line = json.loads(line)
            dialog = {
                'utterance': dg_line['post'].strip(),
                'tokens': dg_line['post'].split(),
                'parse': corenlp.dependency_parse(dg_line['post']),
                'response': dg_line['response'].strip(),
                'corr_responses': dg_line['corr_responses'],
                'all_triples': dg_line['all_triples'],
                'all_entities': dg_line['all_entities']
            }
            dg_list.append(dialog)
            if len(dg_list) % 10000 == 0:
                LogInfo.logs('%d scanned.', len(dg_list))
    pickle_fp = '%s/Reddit.%s.pkl' % (data_dir, mode)
    with open(pickle_fp, 'wb') as bw:
        pickle.dump(dg_list, bw)
    LogInfo.logs('%d Reddit dialogs saved in [%s].' % (len(dg_list), pickle_fp))
    return dg_list

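# Hedged sketch of one input line of <mode>_v3.txt, inferred from the keys read
# above (field contents hypothetical):
#   {"post": "...", "response": "...", "corr_responses": [...],
#    "all_triples": [...], "all_entities": [...]}
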
def load_simpq(data_dir):
    LogInfo.logs('SimpQ initializing ... ')
    qa_list = []
    corenlp = StanfordCoreNLP(CORENLP_PATH)
    for Tvt in ('train', 'valid', 'test'):
        fp = '%s/annotated_fb_data_%s.txt' % (data_dir, Tvt)
        with codecs.open(fp, 'r', 'utf-8') as br:
            for line in br.readlines():
                qa = {}
                s, p, o, q = line.strip().split('\t')
                s = _remove_simpq_header(s)
                p = _remove_simpq_header(p)
                o = _remove_simpq_header(o)
                qa['utterance'] = q
                qa['targetValue'] = (s, p, o)  # different from other datasets
                qa['tokens'] = corenlp.word_tokenize(qa['utterance'])
                qa['parse'] = corenlp.dependency_parse(qa['utterance'])
                qa_list.append(qa)
                if len(qa_list) % 1000 == 0:
                    LogInfo.logs('%d scanned.', len(qa_list))
    pickle_fp = '%s/simpQ.data.pkl' % data_dir
    with open(pickle_fp, 'wb') as bw:
        pickle.dump(qa_list, bw)
    LogInfo.logs('%d SimpleQuestions loaded.' % len(qa_list))
    return qa_list

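# Each annotated_fb_data_*.txt line is assumed to be tab-separated as
#   <subject> \t <predicate> \t <object> \t <question text>
# where the first three fields carry a freebase prefix that
# _remove_simpq_header presumably strips.
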
def main(args):
    data_path = "%s/Reddit.%s.pkl" % (args.data_dir, args.mode)
    freebase_path = "%s/freebase-FB2M.txt" % args.freebase_dir
    links_path = "%s/Reddit.%s.links" % (args.data_dir, args.mode)
    with open(data_path, 'rb') as br:
        dg_list = pickle.load(br)
    LogInfo.logs('%d Reddit dialogs loaded.' % len(dg_list))
    cand_gen = RedditCandidateGenerator(freebase_fp=freebase_path,
                                        links_fp=links_path,
                                        verbose=args.verbose)
    output_dir = args.output_prefix + "_%s" % args.mode
    all_list_fp = output_dir + '/all_list'
    all_lists = []
    for p_idx, post in enumerate(dg_list):
        LogInfo.begin_track('Entering P %d / %d:', p_idx, len(dg_list))
        sub_idx = int(p_idx / 10000) * 10000
        index = 'data/%d-%d/%d_schema' % (sub_idx, sub_idx + 9999, p_idx)
        all_lists.append(index)
        sub_dir = '%s/data/%d-%d' % (output_dir, sub_idx, sub_idx + 9999)
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        schema_fp = '%s/%d_schema' % (sub_dir, p_idx)
        link_fp = '%s/%d_links' % (sub_dir, p_idx)
        if os.path.isfile(schema_fp):
            LogInfo.end_track('Skip this post, already saved.')
            continue
        cand_gen.single_post_candgen(p_idx=p_idx, post=post,
                                     link_fp=link_fp, schema_fp=schema_fp)
        LogInfo.end_track()
    with open(all_list_fp, 'w') as fw:
        for i, idx_str in enumerate(all_lists):
            if i == len(all_lists) - 1:
                fw.write(idx_str)
            else:
                fw.write(idx_str + '\n')

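# Resulting on-disk layout, derived from the path templates above (indices hypothetical):
#   <output_prefix>_<mode>/
#       all_list                    # one 'data/<lo>-<hi>/<p_idx>_schema' entry per post
#       data/0-9999/0_links
#       data/0-9999/0_schema
#       data/10000-19999/10000_links
#       ...
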
def main(args):
    data_path = "%s/simpQ.data.pkl" % args.data_dir
    freebase_path = "%s/freebase-FB2M.txt" % args.freebase_dir
    links_path = "%s/SimpQ.all.links" % args.data_dir
    with open(data_path, 'rb') as br:
        qa_list = pickle.load(br)
    LogInfo.logs('%d SimpleQuestions loaded.' % len(qa_list))
    cand_gen = SimpleQCandidateGenerator(freebase_fp=freebase_path,
                                         links_fp=links_path,
                                         verbose=args.verbose)
    all_list_fp = args.output_dir + '/all_list'
    all_lists = []
    for q_idx, qa in enumerate(qa_list):
        LogInfo.begin_track('Entering Q %d / %d [%s]:',
                            q_idx, len(qa_list), qa['utterance'])
        sub_idx = int(q_idx / 1000) * 1000
        index = 'data/%d-%d/%d_schema' % (sub_idx, sub_idx + 999, q_idx)
        all_lists.append(index)
        sub_dir = '%s/data/%d-%d' % (args.output_dir, sub_idx, sub_idx + 999)
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        schema_fp = '%s/%d_schema' % (sub_dir, q_idx)
        link_fp = '%s/%d_links' % (sub_dir, q_idx)
        if os.path.isfile(schema_fp):
            LogInfo.end_track('Skip this question, already saved.')
            continue
        cand_gen.single_question_candgen(q_idx=q_idx, qa=qa,
                                         link_fp=link_fp, schema_fp=schema_fp)
        LogInfo.end_track()
    with open(all_list_fp, 'w') as fw:
        for i, idx_str in enumerate(all_lists):
            if i == len(all_lists) - 1:
                fw.write(idx_str)
            else:
                fw.write(idx_str + '\n')

def main(args):
    # ==== Loading Necessary Utils ====
    LogInfo.begin_track('Loading Utils ... ')
    wd_emb_util = WordEmbeddingUtil(emb_dir=args.emb_dir, dim_emb=args.dim_emb)
    freebase_helper = FreebaseHelper(meta_dir=args.fb_meta_dir)
    LogInfo.end_track()

    # ==== Loading Dataset ====
    LogInfo.begin_track('Creating Dataset ... ')
    schema_dataset = SchemaDataset(data_dir=args.data_dir,
                                   candgen_dir=args.candgen_dir,
                                   schema_level=args.schema_level,
                                   freebase_helper=freebase_helper)
    schema_dataset.load_all_data()
    active_dicts = schema_dataset.active_dicts
    qa_list = schema_dataset.qa_list
    feature_helper = FeatureHelper(active_dicts, qa_list, freebase_helper,
                                   path_max_size=args.path_max_size,
                                   qw_max_len=args.qw_max_len,
                                   pw_max_len=args.pw_max_len,
                                   pseq_max_len=args.pseq_max_len)
    ds_builder = SchemaBuilder(schema_dataset=schema_dataset,
                               feature_helper=feature_helper,
                               neg_f1_ths=args.neg_f1_ths,
                               neg_max_sample=args.neg_max_sample,
                               neg_strategy=args.neg_strategy)
    LogInfo.end_track()

    # ==== Building Model ====
    LogInfo.begin_track('Building Model and Session ... ')
    model_config = {
        'qw_max_len': args.qw_max_len,
        'pw_max_len': args.pw_max_len,
        'path_max_size': args.path_max_size,
        'pseq_max_len': args.pseq_max_len,
        'dim_emb': args.dim_emb,
        'w_emb_fix': args.w_emb_fix,
        'n_words': args.n_words,
        'n_mids': args.n_mids,
        'n_paths': args.n_paths,
        'drop_rate': args.drop_rate,
        'rnn_config': {
            'cell_class': args.cell_class,
            'num_units': args.num_units,
            'num_layers': args.num_layers
        },
        'att_config': {
            'att_func': args.att_func,
            'dim_att_hidden': args.dim_att_hidden
        },
        'path_usage': args.path_usage,
        'sent_usage': args.sent_usage,
        'seq_merge_mode': args.seq_merge_mode,
        'scoring_mode': args.scoring_mode,
        'final_func': args.final_func,
        'loss_margin': args.loss_margin,
        'optm_name': args.optm_name,
        'learning_rate': args.lr_rate
    }
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    with open("%s/config.json" % args.output_dir, 'w') as fw:
        json.dump(model_config, fw)
    kbqa_model = KbqaModel(**model_config)
    LogInfo.logs('Showing final parameters: ')
    for var in tf.global_variables():
        LogInfo.logs('%s: %s', var.name, var.get_shape().as_list())
    LogInfo.end_track()

    # ==== Focused on specific params ====
    if args.final_func == 'bilinear':
        focus_param_name_list = ['rm_task/rm_forward/bilinear_mat']
    else:  # mlp
        focus_param_name_list = [
            'rm_task/rm_forward/fc1/weights',
            'rm_task/rm_forward/fc1/biases',
            'rm_task/rm_forward/fc2/weights',
            'rm_task/rm_forward/fc2/biases'
        ]
    focus_param_list = []
    with tf.variable_scope('', reuse=tf.AUTO_REUSE):
        for param_name in focus_param_name_list:
            try:
                var = tf.get_variable(name=param_name)
                focus_param_list.append(var)
            except ValueError:
                LogInfo.logs("ValueError occurred for %s!" % param_name)
    LogInfo.begin_track('Showing %d concern parameters: ', len(focus_param_list))
    for name, tensor in zip(focus_param_name_list, focus_param_list):
        LogInfo.logs('%s --> %s', name, tensor.get_shape().as_list())
    LogInfo.end_track()

    # ==== Initializing model ====
    saver = tf.train.Saver()
    gpu_options = tf.GPUOptions(allow_growth=True,
                                per_process_gpu_memory_fraction=args.gpu_fraction)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options,
                                            intra_op_parallelism_threads=8))
    LogInfo.begin_track('Running global_variables_initializer ...')
    start_epoch = 0
    best_valid_f1 = 0.
    resume_flag = False
    model_dir = None
    if args.resume_model_name not in ('', 'None'):
        model_dir = '%s/%s' % (args.output_dir, args.resume_model_name)
        if os.path.exists(model_dir):
            resume_flag = True
    if resume_flag:
        start_epoch, best_valid_f1 = model_util.load_model(saver=saver, sess=sess,
                                                           model_dir=model_dir)
    else:
        dep_simulate = True if args.dep_simulate == 'True' else False
        wd_emb_mat = wd_emb_util.produce_active_word_embedding(
            active_word_dict=schema_dataset.active_dicts['word'],
            dep_simulate=dep_simulate)
        pa_emb_mat = np.random.uniform(
            low=-0.1, high=0.1,
            size=(model_config['n_paths'], model_config['dim_emb'])).astype('float32')
        mid_emb_mat = np.random.uniform(
            low=-0.1, high=0.1,
            size=(model_config['n_mids'], model_config['dim_emb'])).astype('float32')
        LogInfo.logs('%s random path embedding created.', pa_emb_mat.shape)
        LogInfo.logs('%s random mid embedding created.', mid_emb_mat.shape)
        sess.run(tf.global_variables_initializer(),
                 feed_dict={
                     kbqa_model.w_embedding_init: wd_emb_mat,
                     kbqa_model.p_embedding_init: pa_emb_mat,
                     kbqa_model.m_embedding_init: mid_emb_mat
                 })
    LogInfo.end_track('Model build complete.')

    # ==== Running optm / eval ====
    optimizer = Optimizer(model=kbqa_model, sess=sess)
    evaluator = Evaluator(model=kbqa_model, sess=sess)
    optm_data_loader = ds_builder.build_optm_dataloader(
        optm_batch_size=args.optm_batch_size)
    eval_data_list = ds_builder.build_eval_dataloader(
        eval_batch_size=args.eval_batch_size)
    if not os.path.exists('%s/detail' % args.output_dir):
        os.mkdir('%s/detail' % args.output_dir)
    if not os.path.exists('%s/result' % args.output_dir):
        os.mkdir('%s/result' % args.output_dir)

    LogInfo.begin_track('Learning start ...')
    patience = args.max_patience
    for epoch in range(start_epoch + 1, args.max_epoch + 1):
        if patience == 0:
            LogInfo.logs('Early stopping at epoch = %d.', epoch)
            break
        update_flag = False
        disp_item_dict = {'Epoch': epoch}
        LogInfo.begin_track('Epoch %d / %d', epoch, args.max_epoch)
        LogInfo.begin_track('Optimizing ...')
        optimizer.optimize_all(optm_data_loader=optm_data_loader)
        LogInfo.logs('loss = %.6f', optimizer.ret_loss)
        disp_item_dict['rm_loss'] = optimizer.ret_loss
        LogInfo.end_track()

        LogInfo.begin_track('Evaluation:')
        for mark, eval_dl in zip(['train', 'valid', 'test'], eval_data_list):
            LogInfo.begin_track('Eval-%s ...', mark)
            disp_key = '%s_F1' % mark
            detail_fp = '%s/detail/%s.tmp' % (args.output_dir, mark)
            result_fp = '%s/result/%s.%03d.result' % (args.output_dir, mark, epoch)
            disp_item_dict[disp_key] = evaluator.evaluate_all(
                eval_data_loader=eval_dl,
                detail_fp=detail_fp,
                result_fp=result_fp)
            LogInfo.end_track()
        LogInfo.end_track()

        # Display & save states (results, details, params)
        cur_valid_f1 = disp_item_dict['valid_F1']
        if cur_valid_f1 > best_valid_f1:
            best_valid_f1 = cur_valid_f1
            update_flag = True
            patience = args.max_patience
            save_best_dir = '%s/model_best' % args.output_dir
            model_util.delete_dir(save_best_dir)
            model_util.save_model(saver=saver, sess=sess, model_dir=save_best_dir,
                                  epoch=epoch, valid_metric=best_valid_f1)
        else:
            patience -= 1
        LogInfo.logs('Model %s, best valid_F1 = %.6f [patience = %d]',
                     'updated' if update_flag else 'stayed',
                     cur_valid_f1, patience)
        disp_item_dict['Status'] = 'UPDATE' if update_flag else str(patience)
        disp_item_dict['Time'] = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
        status_fp = '%s/status.txt' % args.output_dir
        disp_header_list = model_util.construct_display_header()
        if epoch == 1:
            with open(status_fp, 'w') as bw:
                write_str = ''.join(disp_header_list)
                bw.write(write_str + '\n')
        with open(status_fp, 'a') as bw:
            write_str = ''
            for item_idx, header in enumerate(disp_header_list):
                if header.endswith(' ') or header == '\t':
                    write_str += header
                else:
                    val = disp_item_dict.get(header, '--------')
                    if isinstance(val, float):
                        write_str += '%8.6f' % val
                    else:
                        write_str += str(val)
            bw.write(write_str + '\n')

        LogInfo.logs('Output concern parameters ... ')
        # don't need any feeds, since we focus on parameters
        param_result_list = sess.run(focus_param_list)
        param_result_dict = {}
        for param_name, param_result in zip(focus_param_name_list, param_result_list):
            param_result_dict[param_name] = param_result
        with open(args.output_dir + '/detail/param.%03d.pkl' % epoch, 'wb') as bw:
            pickle.dump(param_result_dict, bw)
        LogInfo.logs('Concern parameters saved.')
        if update_flag:
            with open(args.output_dir + '/detail/param.best.pkl', 'wb') as bw:
                pickle.dump(param_result_dict, bw)
            # save the latest details
            for mode in ['train', 'valid', 'test']:
                src = '%s/detail/%s.tmp' % (args.output_dir, mode)
                dest = '%s/detail/%s.best' % (args.output_dir, mode)
                if os.path.isfile(src):
                    shutil.move(src, dest)
        LogInfo.end_track()  # end of epoch
    LogInfo.end_track()  # end of learning

if __name__ == '__main__':
    LogInfo.begin_track('KBQA running ...')
    _args = parser.parse_args()
    main(_args)
    LogInfo.end_track('All Done.')

def _load_mid(self, mid_name_fp, allow_alias=False):
    LogInfo.begin_track('Loading surface --> mid dictionary from [%s] ...', mid_name_fp)
    with codecs.open(mid_name_fp, 'r', 'utf-8') as br:
        scan = 0
        while True:
            line = br.readline()
            if line is None or line == '':
                break
            spt = line.strip().split('\t')
            if len(spt) < 3:
                continue
            mid = spt[0]
            name = spt[2]
            surface = name.lower()  # store the lowercase surface form as the lookup key
            skip = False  # ignore subjects from certain domains
            mid_prefix_pos = mid.find('.')
            if mid_prefix_pos == -1:
                skip = True
            else:
                mid_prefix = mid[:mid_prefix_pos]
                if mid_prefix in self.skip_domain_set:
                    skip = True
            if not skip:
                if spt[1] == 'type.object.name':
                    self.mid_name_dict[mid] = name
                if spt[1] == 'type.object.name' or allow_alias:
                    self.surface_mid_dict.setdefault(surface, set([])).add(mid)
            scan += 1
            if scan % 100000 == 0:
                LogInfo.logs('%d lines scanned.', scan)
    LogInfo.logs('%d lines scanned.', scan)
    LogInfo.logs('%d <surface, mid_set> loaded.', len(self.surface_mid_dict))
    LogInfo.logs('%d <mid, name> loaded.', len(self.mid_name_dict))
    LogInfo.end_track()

def single_question_candgen(self, q_idx, qa, link_fp, schema_fp):
    # =================== Linking first ==================== #
    if os.path.isfile(link_fp):
        gather_linkings = []
        with codecs.open(link_fp, 'r', 'utf-8') as br:
            for line in br.readlines():
                tup_list = json.loads(line.strip())
                ld_dict = {k: v for k, v in tup_list}
                gather_linkings.append(LinkData(**ld_dict))
        LogInfo.logs('Read %d links from file.', len(gather_linkings))
    else:
        gather_linkings = self.q_links_dict.get(q_idx, [])
    for idx in range(len(gather_linkings)):
        gather_linkings[idx].gl_pos = idx
    LogInfo.begin_track('Show %d E links :', len(gather_linkings))
    if self.verbose >= 1:
        for gl in gather_linkings:
            LogInfo.logs(gl.display())
    LogInfo.end_track()

    # ==================== Save linking results ================ #
    if not os.path.isfile(link_fp):
        with codecs.open(link_fp + '.tmp', 'w', 'utf-8') as bw:
            for gl in gather_linkings:
                bw.write(json.dumps(gl.serialize()) + '\n')
        shutil.move(link_fp + '.tmp', link_fp)
        LogInfo.logs('%d link data saved to file.', len(gather_linkings))

    # ===================== simple predicate finding ===================== #
    gold_entity, gold_pred, _ = qa['targetValue']
    sc_list = []
    for gl_data in gather_linkings:
        entity = gl_data.value
        pred_set = self.subj_pred_dict.get(entity, set([]))
        for pred in pred_set:
            sc = Schema()
            sc.hops = 1
            sc.aggregate = False
            sc.main_pred_seq = [pred]
            sc.raw_paths = [('Main', gl_data, [pred])]
            sc.ans_size = 1
            if entity == gold_entity and pred == gold_pred:
                sc.f1 = sc.p = sc.r = 1.
            else:
                sc.f1 = sc.p = sc.r = 0.
            sc_list.append(sc)

    # ==================== Save schema results ================ #
    # p, r, f1, ans_size, hops, raw_paths, (agg)
    # raw_paths: (category, gl_pos, gl_mid, pred_seq)
    with codecs.open(schema_fp + '.tmp', 'w', 'utf-8') as bw:
        for sc in sc_list:
            sc_info_dict = {k: getattr(sc, k)
                            for k in ('p', 'r', 'f1', 'ans_size', 'hops')}
            if sc.aggregate is not None:
                sc_info_dict['agg'] = sc.aggregate
            opt_raw_paths = []
            for cate, gl, pred_seq in sc.raw_paths:
                opt_raw_paths.append((cate, gl.gl_pos, gl.value, pred_seq))
            sc_info_dict['raw_paths'] = opt_raw_paths
            bw.write(json.dumps(sc_info_dict) + '\n')
    shutil.move(schema_fp + '.tmp', schema_fp)
    LogInfo.logs('%d schemas successfully saved into [%s].', len(sc_list), schema_fp)

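# Hedged sketch of one serialized schema line written above (keys from
# sc_info_dict; the mid and predicate values are hypothetical):
#   {"p": 1.0, "r": 1.0, "f1": 1.0, "ans_size": 1, "hops": 1, "agg": false,
#    "raw_paths": [["Main", 3, "m.0f2y0", ["music.genre.albums"]]]}
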