    # (tail of main(args) in simpq_candgen: qa_list, lex_name and wd_emb_util
    # are prepared earlier in the function)
    cand_gen = SimpleQCandidateGenerator(fb_subset=args.fb_subset, lex_name=lex_name,
                                         wd_emb_util=wd_emb_util, vb=args.verbose)
    for q_idx, qa in enumerate(qa_list):
        LogInfo.begin_track('Entering Q %d / %d [%s]:', q_idx, len(qa_list),
                            qa['utterance'].encode('utf-8'))
        sub_idx = q_idx // 1000 * 1000      # bucket questions into sub-dirs of 1000 (0, 1000, 2000, ...)
        sub_dir = '%s/data/%d-%d' % (args.output_dir, sub_idx, sub_idx + 999)
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        opt_sc_fp = '%s/%d_schema' % (sub_dir, q_idx)
        link_fp = '%s/%d_links' % (sub_dir, q_idx)
        if os.path.isfile(opt_sc_fp):
            LogInfo.end_track('Skip this question, already saved.')
            continue
        Tt.start('single_q')
        cand_gen.single_question_candgen(q_idx=q_idx, qa=qa,
                                         link_fp=link_fp, opt_sc_fp=opt_sc_fp)
        Tt.record('single_q')
        LogInfo.end_track()     # End of Q


if __name__ == '__main__':
    LogInfo.begin_track('[kangqi.task.compQA.candgen_acl18.simpq_candgen] ... ')
    _args = parser.parse_args()
    main(_args)
    LogInfo.end_track('All Done.')
    Tt.display()
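
# Example invocation of the candidate generation script (flag values are
# hypothetical; flag names follow the argparse attributes used above):
#   python -m kangqi.task.compQA.candgen_acl18.simpq_candgen \
#       --fb_subset FB2M --output_dir runnings/candgen_SimpQ --verbose 0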

# =========================================================================== #
# Training script for the compQA relation-matching model. Standard imports
# are listed below; project-internal utilities (LogInfo, Tt, WordEmbeddingUtil,
# QScDataset, the matching kernels, CompqModel, the data loaders,
# load_model / save_model / delete_dir, and the argparse `parser`) come from
# the compQA code base.
# =========================================================================== #
import os
import shutil
from ast import literal_eval
from datetime import datetime

import tensorflow as tf


def main(args):
    LogInfo.begin_track('Learning starts ... ')

    # ==== Loading Necessary Util ==== #
    LogInfo.begin_track('Loading Utils ... ')
    wd_emb_util = WordEmbeddingUtil(wd_emb=args.word_emb, dim_emb=args.dim_emb)
    LogInfo.end_track()

    # ==== Loading Dataset ==== #
    data_config = literal_eval(args.data_config)    # includes data_name, dir, max_length and others
    data_config['wd_emb_util'] = wd_emb_util
    # data_config['kb_emb_util'] = kb_emb_util
    data_config['verbose'] = args.verbose
    dataset = QScDataset(**data_config)
    dataset.load_size()     # load size info

    # ==== Build Model First ==== #
    LogInfo.begin_track('Building Model and Session ... ')
    gpu_options = tf.GPUOptions(allow_growth=True,
                                per_process_gpu_memory_fraction=args.gpu_fraction)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options,
                                            intra_op_parallelism_threads=8))
    rm_config = literal_eval(args.rm_config)        # Relation Matching
    rm_name = rm_config.pop('name')
    assert rm_name in ('Compact', 'Separated')
    rm_config['n_words'] = dataset.word_size
    rm_config['n_mids'] = dataset.mid_size
    rm_config['dim_emb'] = args.dim_emb
    rm_config['q_max_len'] = dataset.q_max_len
    rm_config['sc_max_len'] = dataset.sc_max_len
    rm_config['path_max_len'] = dataset.path_max_len
    rm_config['pword_max_len'] = dataset.path_max_len * dataset.item_max_len
    rm_config['verbose'] = args.verbose
    if rm_name == 'Compact':
        LogInfo.logs('RelationMatchingKernel: Compact')
        rm_kernel = CompactRelationMatchingKernel(**rm_config)
    else:
        LogInfo.logs('RelationMatchingKernel: Separated')
        rm_kernel = SeparatedRelationMatchingKernel(**rm_config)
    el_kernel = EntityLinkingKernel(e_max_size=dataset.e_max_size,
                                    e_feat_len=dataset.e_feat_len,
                                    verbose=args.verbose)

    model_config = literal_eval(args.model_config)
    model_config['sess'] = sess
    model_config['objective'] = args.eval_mode      # relation_only / normal
    model_config['relation_kernel'] = rm_kernel
    model_config['entity_kernel'] = el_kernel
    model_config['extra_len'] = dataset.extra_len
    model_config['verbose'] = args.verbose
    compq_model = CompqModel(**model_config)

    LogInfo.begin_track('Showing final parameters: ')
    for var in tf.global_variables():
        LogInfo.logs('%s: %s', var.name, var.get_shape().as_list())
    LogInfo.end_track()

    saver = tf.train.Saver()
    LogInfo.begin_track('Parameter initializing ... ')
    start_epoch = 0
    best_valid_f1 = 0.
    resume_flag = False
    model_dir = None
    if args.resume_model_name not in ('', 'None'):
        model_dir = '%s/%s' % (args.output_dir, args.resume_model_name)
        if os.path.exists(model_dir):
            resume_flag = True
    if resume_flag:
        start_epoch, best_valid_f1 = load_model(saver=saver, sess=sess, model_dir=model_dir)
    else:
        dataset.load_init_emb()     # load pre-trained parameters for embedding initialization
        LogInfo.logs('Running global_variables_initializer ...')
        sess.run(tf.global_variables_initializer(),
                 feed_dict={rm_kernel.w_embedding_init: dataset.word_init_emb,
                            rm_kernel.m_embedding_init: dataset.mid_init_emb})
    LogInfo.end_track('Start Epoch = %d', start_epoch)
    LogInfo.end_track('Model Built.')
    tf.get_default_graph().finalize()

    # ==== Constructing Data_Loader ==== #
    LogInfo.begin_track('Creating DataLoader ... ')
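    # Negative sampling for the pairwise optimizer is controlled by a string
    # flag of the form Neg-${POOR_CONTRIB}-${POOR_MAX_SAMPLE}, e.g. a
    # hypothetical "--dl_neg_mode Neg-20-100"; it is split apart below.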
    dataset.load_cands()    # first load all the candidates
    if args.eval_mode == 'relation_only':
        ro_change = 0
        for cand_list in dataset.q_cand_dict.values():
            ro_change += add_relation_only_metric(cand_list)    # for "RelationOnly" evaluation
        LogInfo.logs('RelationOnly F1 change: %d schemas affected.', ro_change)

    optm_dl_config = {'dataset': dataset, 'mode': 'train',
                      'batch_size': args.optm_batch_size,
                      'proc_ob_num': 5000, 'verbose': args.verbose}
    eval_dl_config = dict(optm_dl_config)

    spt = args.dl_neg_mode.split('-')       # Neg-${POOR_CONTRIB}-${POOR_MAX_SAMPLE}
    optm_dl_config['poor_contribution'] = int(spt[1])
    optm_dl_config['poor_max_sample'] = int(spt[2])
    optm_dl_config['shuffle'] = False
    optm_train_data = CompqPairDataLoader(**optm_dl_config)

    eval_dl_config['batch_size'] = args.eval_batch_size
    eval_data_group = []
    for mode in ('train', 'valid', 'test'):
        eval_dl_config['mode'] = mode
        eval_data = CompqSingleDataLoader(**eval_dl_config)
        eval_data.renew_data_list()
        eval_data_group.append(eval_data)
    eval_train_data, eval_valid_data, eval_test_data = eval_data_group
    LogInfo.end_track()     # End of loading data & dataset

    # ==== Free memories ==== #
    # Note: the original `for item in (...): del item` only unbound the loop
    # variable; deleting the actual references lets them be garbage-collected.
    del dataset.wd_emb_util
    del wd_emb_util
    del data_config

    # ==== Ready for learning ==== #
    LogInfo.begin_track('Learning start ... ')
    output_dir = args.output_dir
    if not os.path.exists(output_dir + '/detail'):
        os.makedirs(output_dir + '/detail')
    if not os.path.exists(output_dir + '/result'):
        os.makedirs(output_dir + '/result')
    if os.path.isdir(output_dir + '/TB'):
        shutil.rmtree(output_dir + '/TB')
    tf.summary.FileWriter(output_dir + '/TB/optm', sess.graph)      # save model graph information
    # optm_summary_writer = tf.summary.FileWriter(output_dir + '/TB/optm', sess.graph)
    # eval_train_summary_writer = tf.summary.FileWriter(output_dir + '/TB/eval_train', sess.graph)
    # eval_valid_summary_writer = tf.summary.FileWriter(output_dir + '/TB/eval_valid', sess.graph)
    # eval_test_summary_writer = tf.summary.FileWriter(output_dir + '/TB/eval_test', sess.graph)
    # LogInfo.logs('TensorBoard writer defined.')       # TensorBoard information

    status_fp = output_dir + '/status.csv'
    with open(status_fp, 'a') as bw:
        bw.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
            'Epoch', 'T_loss', 'T_F1', 'v_F1', 'Status', 't_F1',
            datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
        ))

    patience = args.max_patience
    for epoch in range(start_epoch + 1, args.max_epoch + 1):
        if patience == 0:
            LogInfo.logs('Early stopping at epoch = %d.', epoch)
            break
        LogInfo.begin_track('Epoch %d / %d', epoch, args.max_epoch)
        update_flag = False

        LogInfo.begin_track('Optimizing ...')
        train_loss = compq_model.optimize(optm_train_data, epoch, ob_batch_num=1)
        LogInfo.end_track('T_loss = %.6f', train_loss)

        LogInfo.begin_track('Eval-Training ...')
        train_f1 = compq_model.evaluate(eval_train_data, epoch, ob_batch_num=50,
                                        detail_fp=output_dir + '/detail/train_%03d.txt' % epoch)
        LogInfo.end_track('T_F1 = %.6f', train_f1)

        LogInfo.begin_track('Eval-Validating ...')
        valid_f1 = compq_model.evaluate(eval_valid_data, epoch, ob_batch_num=50,
                                        detail_fp=output_dir + '/detail/valid_%03d.txt' % epoch)
        LogInfo.logs('v_F1 = %.6f', valid_f1)
        if valid_f1 > best_valid_f1:
            best_valid_f1 = valid_f1
            update_flag = True
            patience = args.max_patience
        else:
            patience -= 1
        LogInfo.logs('Model %s, best v_F1 = %.6f [patience = %d]',
                     'updated' if update_flag else 'stayed', valid_f1, patience)
        LogInfo.end_track()
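
        # Each epoch appends one row to status.csv with the columns written in
        # the header above: Epoch, T_loss, T_F1, v_F1, Status, t_F1, timestamp,
        # where Status is 'UPDATE' on a validation improvement and otherwise
        # the remaining patience count.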
        LogInfo.begin_track('Eval-Testing ... ')
        test_f1 = compq_model.evaluate(eval_test_data, epoch, ob_batch_num=20,
                                       detail_fp=output_dir + '/detail/test_%03d.txt' % epoch,
                                       result_fp=output_dir + '/result/test_schema_%03d.txt' % epoch)
        LogInfo.end_track('t_F1 = %.6f', test_f1)

        with open(status_fp, 'a') as bw:
            bw.write('%d\t%8.6f\t%8.6f\t%8.6f\t%s\t%8.6f\t%s\n' % (
                epoch, train_loss, train_f1, valid_f1,
                'UPDATE' if update_flag else str(patience), test_f1,
                datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
            ))

        save_epoch_dir = '%s/model_epoch_%d' % (output_dir, epoch)
        save_best_dir = '%s/model_best' % output_dir
        if args.save_epoch:
            delete_dir(save_epoch_dir)
            save_model(saver=saver, sess=sess, model_dir=save_epoch_dir,
                       epoch=epoch, valid_metric=valid_f1)
            if update_flag and args.save_best:      # just create a symbolic link to the epoch dir
                delete_dir(save_best_dir)
                os.symlink(save_epoch_dir, save_best_dir)   # symlink at directory level
        elif update_flag and args.save_best:        # no per-epoch saving: store the best model directly
            delete_dir(save_best_dir)
            save_model(saver=saver, sess=sess, model_dir=save_best_dir,
                       epoch=epoch, valid_metric=valid_f1)
        LogInfo.end_track()     # End of epoch

    LogInfo.end_track()     # End of learning

    Tt.display()
    LogInfo.end_track('All Done.')
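

# --------------------------------------------------------------------------- #
# The helpers below are NOT the original compQA implementations; they are a
# minimal sketch of what delete_dir / save_model / load_model plausibly do,
# given how they are called above. The checkpoint layout and the
# 'epoch_status' record file are assumptions for illustration only.
# --------------------------------------------------------------------------- #

def delete_dir(target_dir):
    # Remove a checkpoint directory, or a dangling symlink, if it exists.
    if os.path.islink(target_dir):
        os.unlink(target_dir)
    elif os.path.isdir(target_dir):
        shutil.rmtree(target_dir)


def save_model(saver, sess, model_dir, epoch, valid_metric):
    # Persist the TF checkpoint together with the epoch / validation metric.
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    saver.save(sess, model_dir + '/model')
    with open(model_dir + '/epoch_status', 'w') as bw:
        bw.write('%d\t%.6f\n' % (epoch, valid_metric))


def load_model(saver, sess, model_dir):
    # Restore the checkpoint and return (start_epoch, best_valid_f1).
    saver.restore(sess, model_dir + '/model')
    with open(model_dir + '/epoch_status') as br:
        epoch_str, metric_str = br.readline().strip().split('\t')
    return int(epoch_str), float(metric_str)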