Example #1
def predict_wrapper(reader, exe, test_prog, test_pyreader, graph_vars, epoch,
                    steps):
    test_sets = args.test_set.split(',')
    save_dirs = args.test_save.split(',')
    assert len(test_sets) == len(save_dirs), (
        'number of test_sets and test_save does not match: got %d vs %d' %
        (len(test_sets), len(save_dirs)))

    batch_size = args.batch_size if args.predict_batch_size is None else args.predict_batch_size
    for test_f, save_f in zip(test_sets, save_dirs):
        test_pyreader.set_batch_generator(
            reader.data_generator(test_f,
                                  batch_size=batch_size,
                                  epoch=1,
                                  dev_count=1,
                                  shuffle=False))

        save_path = save_f + '.' + str(epoch) + '.' + str(steps)
        log.info("testing {}, save to {}".format(test_f, save_path))
        res = predict(exe, test_prog, test_pyreader, graph_vars, dev_count=1)
        save_dir = os.path.dirname(save_path)
        # guard against an empty dirname when save_path has no directory part
        if save_dir and not os.path.exists(save_dir):
            os.makedirs(save_dir)

        tokenizer = reader.tokenizer
        rev_label_map = {v: k for k, v in six.iteritems(reader.label_map)}
        with open(save_path, 'w', encoding='utf8') as f:
            for _id, s, p in res:
                tok_str = ' '.join(tokenizer.convert_ids_to_tokens(_id))
                # probability the model assigned to each predicted label
                prob_str = ' '.join(['%.5f' % pp[ss] for ss, pp in zip(s, p)])
                label_str = ' '.join([rev_label_map[ss] for ss in s])
                f.write('{}\t{}\t{}\n'.format(tok_str, label_str, prob_str))
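
The loop above writes one tab-separated record per example: the recovered tokens, the predicted label names, and the probability the model assigned to each prediction. A minimal standalone sketch of that record format, with toy values standing in for the tokenizer and model outputs:

tokens = ['[CLS]', 'he', 'visited', 'Paris', '[SEP]']
labels = ['O', 'O', 'O', 'B-LOC', 'O']
probs = [0.99, 0.98, 0.97, 0.95, 0.99]  # probability of each predicted label
record = '{}\t{}\t{}'.format(' '.join(tokens), ' '.join(labels),
                             ' '.join('%.5f' % p for p in probs))
print(record)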
Example #2
def predict_wrapper(reader, exe, test_prog, test_pyreader, graph_vars, epoch,
                    steps):
    """predict_wrapper"""
    def label_pred_2_ori(pred_label, ori_2_new_index):
        """Map subword-level predictions back onto the original tokens.
        ori_2_new_index maps original positions to subword positions;
        -1 marks tokens dropped during tokenization."""
        new_label = [u"O"] * len(ori_2_new_index)
        new_index = []
        for k, v in ori_2_new_index.items():
            if v == -1:
                new_index.append(k)
            elif v < len(pred_label):
                new_label[k] = pred_label[v]
        # Repair dropped positions: keep "O" at span boundaries, otherwise
        # extend the neighboring span with an I- tag.
        for index in new_index:
            if index == 0 or new_label[index - 1] == u"O" or index == (
                    len(new_label) - 1):
                new_label[index] = u"O"
            else:
                if new_label[index + 1] == u"O":
                    new_label[index] = u"O"
                else:
                    new_label[index] = u"I-{}".format(new_label[index - 1][2:])
        return new_label
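
    # Example: ori_2_new_index = {0: 0, 1: -1, 2: 1} with
    # pred_label = ["B-LOC", "I-LOC"] yields ["B-LOC", "I-LOC", "I-LOC"]:
    # the dropped token (index 1) inherits an I- tag because both of its
    # neighbors sit inside the same span.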

    def get_pred_text(tokens, labels):
        """Decode BIO labels into role spans of the form
        {"role_type", "start", "end", "text"}."""
        start, end, role_type = -1, -1, u""
        ret = []
        for i, lb in enumerate(labels):
            if lb == u"O" and start == -1 and end == -1:
                continue
            elif lb == u"O" and start > -1 and end > -1:
                ret.append({
                    "role_type": role_type,
                    "start": start,
                    "end": end,
                    "text": u"".join(tokens[start:end + 1])
                })
                start, end, role_type = -1, -1, u""
            else:
                if start == -1:
                    start, end, role_type = i, i, lb[2:]
                elif lb.startswith(u"B-"):
                    # close any open span, then start a new one
                    if start > -1 and end > -1:
                        ret.append({
                            "role_type": role_type,
                            "start": start,
                            "end": end,
                            "text": u"".join(tokens[start:end + 1])
                        })
                    start, end, role_type = i, i, lb[2:]
                elif lb[2:] == role_type:
                    end = i
                else:
                    ret.append({
                        "role_type": role_type,
                        "start": start,
                        "end": end,
                        "text": u"".join(tokens[start:end + 1])
                    })
                    start, end, role_type = i, i, lb[2:]

        # flush a span that runs to the end of the label sequence
        if start >= 0 and end >= 0:
            ret.append({
                "role_type": role_type,
                "start": start,
                "end": end,
                "text": u"".join(tokens[start:end + 1])
            })
        return ret

    batch_size = args.batch_size if args.predict_batch_size is None else args.predict_batch_size
    test_pyreader.decorate_tensor_provider(
        reader.data_generator(args.test_set,
                              batch_size=batch_size,
                              epoch=1,
                              dev_count=1,
                              shuffle=False))

    examples = reader.get_examples_by_file(args.test_set)

    res = predict(exe, test_prog, test_pyreader, graph_vars, dev_count=1)
    tokenizer = reader.tokenizer
    rev_label_map = {v: k for k, v in six.iteritems(reader.label_map)}
    output = []
    print(u"examples {} res {}".format(len(examples), len(res)))

    for example, r in zip(examples, res):
        _id, s = r
        pred_tokens = tokenizer.convert_ids_to_tokens(_id)
        pred_label = [rev_label_map[ss] for ss in s]
        new_label = label_pred_2_ori(pred_label, example.ori_2_new_index)
        pred_ret = get_pred_text(pred_tokens, pred_label)
        pred_2_new_ret = get_pred_text(example.ori_text, new_label)
        output.append(
            json.dumps(
                {
                    "event_id": example.id,
                    "pred_tokens": pred_tokens,
                    "pred_labels": pred_label,
                    "tokens": example.ori_text,
                    "labels": new_label,
                    "sentence": example.sentence,
                    "pred_roles_ret": pred_ret,
                    "roles_ret": pred_2_new_ret
                },
                ensure_ascii=False))
    return output
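
get_pred_text above is a plain BIO decoder: each maximal B-X/I-X run becomes one role span. A compact standalone sketch of the same decoding rule (the helper name and toy data are illustrative, not from the original):

def decode_bio(tokens, labels):
    """Group contiguous B-X/I-X runs into {role_type, start, end, text} spans."""
    spans, start, role = [], -1, ""
    for i, lb in enumerate(labels + ["O"]):  # sentinel "O" flushes the last span
        # A span closes on "O", on a new "B-", or when the role type changes.
        if lb == "O" or lb.startswith("B-") or lb[2:] != role:
            if start > -1:
                spans.append({"role_type": role, "start": start, "end": i - 1,
                              "text": "".join(tokens[start:i])})
            start, role = (i, lb[2:]) if lb != "O" else (-1, "")
    return spans

print(decode_bio(list(u"他在北京"), ["O", "O", "B-LOC", "I-LOC"]))
# -> [{'role_type': 'LOC', 'start': 2, 'end': 3, 'text': '北京'}]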
Example #3
def main(args):
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    reader = task_reader.SequenceLabelReader(
        vocab_path=args.vocab_path,
        label_map_config=args.label_map_config,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
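        # Worked example with illustrative numbers (not from this config):
        # in_tokens=True, batch_size=8192 tokens, max_seq_len=512
        #   -> 8192 // 512 = 16 sentences per step;
        #   max_train_steps = 3 * 10000 // 16 // 2 = 937
        #   (epoch=3, num_train_examples=10000, dev_count=2).
        # in_tokens=False, batch_size=32 sentences:
        #   max_train_steps = 3 * 10000 // 32 // 2 = 468.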
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config)
                scheduled_lr = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    loss_scaling=args.loss_scaling)

                fluid.memory_optimize(
                    input_program=train_program,
                    skip_opt_set=[
                        graph_vars["loss"].name, graph_vars["labels"].name,
                        graph_vars["infers"].name, graph_vars["seq_lens"].name
                    ])

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config)

        test_prog = test_prog.clone(for_test=True)

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "are both set! Only 'init_checkpoint' will be used.")
        if args.init_checkpoint:
            init_checkpoint(
                exe,
                args.init_checkpoint,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(
                exe,
                args.init_pretraining_params,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(
            exe,
            args.init_checkpoint,
            main_program=startup_prog,
            use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            loss_name=graph_vars["loss"].name,
            exec_strategy=exec_strategy,
            main_program=train_program)

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    if args.do_val or args.do_test:
        test_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            main_program=test_prog,
            share_vars_from=train_exe)

    if args.do_train:
        train_pyreader.start()
        steps = 0
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr

        if args.save_log and args.log_path:
            if os.path.exists(args.log_path):
                raise FileExistsError("Logging file already exists!")
            with open(args.log_path, 'w') as logfile:
                logfile.write('%s\n' % time.asctime())
            print('Writing logs into %s' % args.log_path)

        time_begin = time.time()
        while True:
            try:
                steps += 1
                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[])
                else:
                    outputs = evaluate(train_exe, train_program, train_pyreader,
                                       graph_vars, args.num_labels, "train",
                                       dev_count)
                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            outputs["lr"]
                            if warmup_steps > 0 else args.learning_rate)
                        print(verbose)

                    current_example, current_epoch = reader.get_train_progress()
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                          "f1: %f, precision: %f, recall: %f, speed: %f steps/s"
                          % (current_epoch, current_example, num_train_examples,
                             steps, outputs["loss"], outputs["f1"],
                             outputs["precision"], outputs["recall"],
                             args.skip_steps / used_time))

                    if args.save_log and args.log_path:
                        with open(args.log_path, 'a') as logfile:
                            logfile.write("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                                          "f1: %f, precision: %f, recall: %f\n" % (
                                              current_epoch, current_example, num_train_examples,
                                              steps, outputs["loss"], outputs["f1"],
                                              outputs["precision"], outputs["recall"]))

                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        test_pyreader.decorate_tensor_provider(
                            reader.data_generator(
                                args.dev_set,
                                batch_size=args.batch_size,
                                epoch=1,
                                shuffle=False))
                        evaluate(exe, test_prog, test_pyreader, graph_vars,
                                 args.num_labels, "dev")
                    # evaluate test set
                    if args.do_test:
                        test_pyreader.decorate_tensor_provider(
                            reader.data_generator(
                                args.test_set,
                                batch_size=args.batch_size,
                                epoch=1,
                                shuffle=False))
                        evaluate(exe, test_prog, test_pyreader, graph_vars,
                                 args.num_labels, "test")

            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if args.do_val:
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(
                args.dev_set,
                batch_size=args.batch_size,
                epoch=1,
                shuffle=False))
        print("Final validation result:")
        evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels,
                 "dev")
        if args.do_predict:
            print("Saving predicted results...")
            predict(exe, test_prog, test_pyreader, graph_vars, args.label_map_config,
                    "dev", output_dir="./predicted_results")

    # final eval on test set
    if args.do_test:
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(
                args.test_set,
                batch_size=args.batch_size,
                epoch=1,
                shuffle=False))
        print("Final test result:")
        evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels,
                 "test")
        if args.do_predict:
            print("Saving predicted results...")
            predict(exe, test_prog, test_pyreader, graph_vars, args.label_map_config,
                    "test", output_dir="./predicted_results")