Example #1
    cand_gen = SimpleQCandidateGenerator(fb_subset=args.fb_subset,
                                         lex_name=lex_name,
                                         wd_emb_util=wd_emb_util,
                                         vb=args.verbose)

    for q_idx, qa in enumerate(qa_list):
        LogInfo.begin_track('Entering Q %d / %d [%s]:',
                            q_idx, len(qa_list), qa['utterance'].encode('utf-8'))
        sub_idx = q_idx // 1000 * 1000      # integer division: bucket questions in groups of 1000
        sub_dir = '%s/data/%d-%d' % (args.output_dir, sub_idx, sub_idx + 999)
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        opt_sc_fp = '%s/%d_schema' % (sub_dir, q_idx)
        link_fp = '%s/%d_links' % (sub_dir, q_idx)
        if os.path.isfile(opt_sc_fp):
            LogInfo.end_track('Skip this question, already saved.')
            continue
        Tt.start('single_q')
        cand_gen.single_question_candgen(q_idx=q_idx, qa=qa,
                                         link_fp=link_fp, opt_sc_fp=opt_sc_fp)
        Tt.record('single_q')
        LogInfo.end_track()     # End of Q


if __name__ == '__main__':
    LogInfo.begin_track('[kangqi.task.compQA.candgen_acl18.simpq_candgen] ... ')
    _args = parser.parse_args()
    main(_args)
    LogInfo.end_track('All Done.')
    Tt.display()
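# The snippet above starts mid-function: `parser`, `lex_name`, `wd_emb_util`
# and `qa_list` come from elided code earlier in the file. A minimal sketch of
# the assumed argparse setup, inferred only from the attributes the snippet
# reads (hypothetical, not the repo's actual definition):
#
#     import argparse
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--fb_subset')
#     parser.add_argument('--output_dir')
#     parser.add_argument('--verbose', type=int, default=0)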
Example #2
def main(args):
    LogInfo.begin_track('Learning starts ... ')

    # ==== Loading Necessary Util ==== #
    LogInfo.begin_track('Loading Utils ... ')
    wd_emb_util = WordEmbeddingUtil(wd_emb=args.word_emb, dim_emb=args.dim_emb)
    LogInfo.end_track()

    # ==== Loading Dataset ==== #
    data_config = literal_eval(args.data_config)    # including data_name, dir, max_length and others
    data_config['wd_emb_util'] = wd_emb_util
    # data_config['kb_emb_util'] = kb_emb_util
    data_config['verbose'] = args.verbose
    dataset = QScDataset(**data_config)
    dataset.load_size()  # load size info

    # ==== Build Model First ==== #
    LogInfo.begin_track('Building Model and Session ... ')
    gpu_options = tf.GPUOptions(allow_growth=True,
                                per_process_gpu_memory_fraction=args.gpu_fraction)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options,
                                            intra_op_parallelism_threads=8))
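    # allow_growth=True makes TF allocate GPU memory on demand instead of
    # grabbing it all up front; per_process_gpu_memory_fraction caps the share.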
    rm_config = literal_eval(args.rm_config)    # Relation Matching
    rm_name = rm_config['name']
    del rm_config['name']
    assert rm_name in ('Compact', 'Separated')

    rm_config['n_words'] = dataset.word_size
    rm_config['n_mids'] = dataset.mid_size
    rm_config['dim_emb'] = args.dim_emb
    rm_config['q_max_len'] = dataset.q_max_len
    rm_config['sc_max_len'] = dataset.sc_max_len
    rm_config['path_max_len'] = dataset.path_max_len
    rm_config['pword_max_len'] = dataset.path_max_len * dataset.item_max_len
    rm_config['verbose'] = args.verbose
    if rm_name == 'Compact':
        LogInfo.logs('RelationMatchingKernel: Compact')
        rm_kernel = CompactRelationMatchingKernel(**rm_config)
    else:
        LogInfo.logs('RelationMatchingKernel: Separated')
        rm_kernel = SeparatedRelationMatchingKernel(**rm_config)
    el_kernel = EntityLinkingKernel(
        e_max_size=dataset.e_max_size, e_feat_len=dataset.e_feat_len, verbose=args.verbose)

    model_config = literal_eval(args.model_config)
    model_config['sess'] = sess
    model_config['objective'] = args.eval_mode      # relation_only / normal
    model_config['relation_kernel'] = rm_kernel
    model_config['entity_kernel'] = el_kernel
    model_config['extra_len'] = dataset.extra_len
    model_config['verbose'] = args.verbose
    compq_model = CompqModel(**model_config)

    LogInfo.begin_track('Showing final parameters: ')
    for var in tf.global_variables():
        LogInfo.logs('%s: %s', var.name, var.get_shape().as_list())
    LogInfo.end_track()
    saver = tf.train.Saver()

    LogInfo.begin_track('Parameter initializing ... ')
    start_epoch = 0
    best_valid_f1 = 0.
    resume_flag = False
    model_dir = None
    if args.resume_model_name not in ('', 'None'):
        model_dir = '%s/%s' % (args.output_dir, args.resume_model_name)
        if os.path.exists(model_dir):
            resume_flag = True
    if resume_flag:
        start_epoch, best_valid_f1 = load_model(saver=saver, sess=sess, model_dir=model_dir)
    else:
        dataset.load_init_emb()  # load initial values for the embedding matrices
        LogInfo.logs('Running global_variables_initializer ...')
        sess.run(tf.global_variables_initializer(),
                 feed_dict={rm_kernel.w_embedding_init: dataset.word_init_emb,
                            rm_kernel.m_embedding_init: dataset.mid_init_emb})
    LogInfo.end_track('Start Epoch = %d', start_epoch)
    LogInfo.end_track('Model Built.')
    tf.get_default_graph().finalize()
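    # `load_model` / `save_model` are helpers defined elsewhere in the repo.
    # A minimal sketch of what they presumably do, matching the call sites in
    # this file (an assumption, not the actual implementation):
    #
    #     def save_model(saver, sess, model_dir, epoch, valid_metric):
    #         os.makedirs(model_dir)
    #         saver.save(sess, model_dir + '/model')
    #         with open(model_dir + '/status', 'w') as bw:
    #             bw.write('%d\t%.6f\n' % (epoch, valid_metric))
    #
    #     def load_model(saver, sess, model_dir):
    #         saver.restore(sess, model_dir + '/model')
    #         with open(model_dir + '/status') as br:
    #             epoch, metric = br.readline().strip().split('\t')
    #         return int(epoch), float(metric)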

    # ==== Constructing Data_Loader ==== #
    LogInfo.begin_track('Creating DataLoader ... ')
    dataset.load_cands()  # first load all the candidates
    if args.eval_mode == 'relation_only':
        ro_change = 0
        for cand_list in dataset.q_cand_dict.values():
            ro_change += add_relation_only_metric(cand_list)    # for "RelationOnly" evaluation
        LogInfo.logs('RelationOnly F1 change: %d schemas affected.', ro_change)

    optm_dl_config = {'dataset': dataset, 'mode': 'train',
                      'batch_size': args.optm_batch_size, 'proc_ob_num': 5000, 'verbose': args.verbose}
    eval_dl_config = dict(optm_dl_config)
    spt = args.dl_neg_mode.split('-')       # Neg-${POOR_CONTRIB}-${POOR_MAX_SAMPLE}
    optm_dl_config['poor_contribution'] = int(spt[1])
    optm_dl_config['poor_max_sample'] = int(spt[2])
    optm_dl_config['shuffle'] = False
    optm_train_data = CompqPairDataLoader(**optm_dl_config)
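    # CompqPairDataLoader presumably yields (positive, negative) candidate
    # pairs for a pairwise ranking loss; the CompqSingleDataLoader instances
    # below score candidates one at a time for evaluation.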

    eval_dl_config['batch_size'] = args.eval_batch_size
    eval_data_group = []
    for mode in ('train', 'valid', 'test'):
        eval_dl_config['mode'] = mode
        eval_data = CompqSingleDataLoader(**eval_dl_config)
        eval_data.renew_data_list()
        eval_data_group.append(eval_data)
    (eval_train_data, eval_valid_data, eval_test_data) = eval_data_group
    LogInfo.end_track()  # End of loading data & dataset

    # ==== Free memory ==== #
    # Drop the references so the large embedding matrices can be
    # garbage-collected (note: `del` on a loop variable would free nothing).
    dataset.wd_emb_util = None
    del wd_emb_util, data_config

    # ==== Ready for learning ==== #
    LogInfo.begin_track('Training starts ... ')
    output_dir = args.output_dir
    if not os.path.exists(output_dir + '/detail'):
        os.makedirs(output_dir + '/detail')
    if not os.path.exists(output_dir + '/result'):
        os.makedirs(output_dir + '/result')
    if os.path.isdir(output_dir + '/TB'):
        shutil.rmtree(output_dir + '/TB')
    tf.summary.FileWriter(output_dir + '/TB/optm', sess.graph)      # saving model graph information
    # optm_summary_writer = tf.summary.FileWriter(output_dir + '/TB/optm', sess.graph)
    # eval_train_summary_writer = tf.summary.FileWriter(output_dir + '/TB/eval_train', sess.graph)
    # eval_valid_summary_writer = tf.summary.FileWriter(output_dir + '/TB/eval_valid', sess.graph)
    # eval_test_summary_writer = tf.summary.FileWriter(output_dir + '/TB/eval_test', sess.graph)
    # LogInfo.logs('TensorBoard writer defined.')
    # TensorBoard information

    status_fp = output_dir + '/status.csv'
    with open(status_fp, 'a') as bw:
        bw.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
            'Epoch', 'T_loss', 'T_F1', 'v_F1', 'Status', 't_F1',
            datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
        ))
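    # Note: mode 'a' appends a fresh header row on every (re)start; the
    # 'Status' column records either 'UPDATE' (new best v_F1) or the
    # remaining patience count (see the per-epoch write below).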
    patience = args.max_patience

    for epoch in range(start_epoch + 1, args.max_epoch + 1):
        if patience == 0:
            LogInfo.logs('Early stopping at epoch = %d.', epoch)
            break

        LogInfo.begin_track('Epoch %d / %d', epoch, args.max_epoch)
        update_flag = False

        LogInfo.begin_track('Optimizing ...')
        train_loss = compq_model.optimize(optm_train_data, epoch, ob_batch_num=1)
        LogInfo.end_track('T_loss = %.6f', train_loss)

        LogInfo.begin_track('Eval-Training ...')
        train_f1 = compq_model.evaluate(eval_train_data, epoch, ob_batch_num=50,
                                        detail_fp=output_dir + '/detail/train_%03d.txt' % epoch)
        LogInfo.end_track('T_F1 = %.6f', train_f1)

        LogInfo.begin_track('Eval-Validating ...')
        valid_f1 = compq_model.evaluate(eval_valid_data, epoch, ob_batch_num=50,
                                        detail_fp=output_dir + '/detail/valid_%03d.txt' % epoch)
        LogInfo.logs('v_F1 = %.6f', valid_f1)
        if valid_f1 > best_valid_f1:
            best_valid_f1 = valid_f1
            update_flag = True
            patience = args.max_patience
        else:
            patience -= 1
        LogInfo.logs('Model %s, best v_F1 = %.6f [patience = %d]',
                     'updated' if update_flag else 'stayed',
                     best_valid_f1,
                     patience)
        LogInfo.end_track()

        LogInfo.begin_track('Eval-Testing ... ')
        test_f1 = compq_model.evaluate(eval_test_data, epoch, ob_batch_num=20,
                                       detail_fp=output_dir + '/detail/test_%03d.txt' % epoch,
                                       result_fp=output_dir + '/result/test_schema_%03d.txt' % epoch)
        LogInfo.end_track('t_F1 = %.6f', test_f1)

        with open(status_fp, 'a') as bw:
            bw.write('%d\t%8.6f\t%8.6f\t%8.6f\t%s\t%8.6f\t%s\n' % (
                epoch, train_loss, train_f1, valid_f1,
                'UPDATE' if update_flag else str(patience), test_f1,
                datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
            ))
        save_epoch_dir = '%s/model_epoch_%d' % (output_dir, epoch)
        save_best_dir = '%s/model_best' % output_dir
        if args.save_epoch:
            delete_dir(save_epoch_dir)
            save_model(saver=saver, sess=sess, model_dir=save_epoch_dir, epoch=epoch, valid_metric=valid_f1)
            if update_flag and args.save_best:  # just create a symbolic link
                delete_dir(save_best_dir)
                os.symlink(save_epoch_dir, save_best_dir)  # symlink at directory level
        elif update_flag and args.save_best:
            delete_dir(save_best_dir)
            save_model(saver=saver, sess=sess, model_dir=save_best_dir, epoch=epoch, valid_metric=valid_f1)
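        # `delete_dir` is another repo-level helper; a minimal sketch, assuming
        # it clears both real directories and the symlink created above (not
        # the actual implementation):
        #
        #     def delete_dir(path):
        #         if os.path.islink(path):
        #             os.unlink(path)
        #         elif os.path.isdir(path):
        #             shutil.rmtree(path)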

        LogInfo.end_track()  # End of epoch
    LogInfo.end_track()  # End of learning

    Tt.display()
    LogInfo.end_track('All Done.')
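# Example #2 ends inside main(); the usual __main__ driver is elided here.
# Presumably it mirrors Example #1, e.g. (an assumption):
#
#     if __name__ == '__main__':
#         _args = parser.parse_args()
#         main(_args)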