def train(args, exe, train_prog, agent, train_data_list, epoch_id):
    collate_fn = MoleculeCollateFunc(
        agent.graph_wrapper,
        task_type='cls',
        with_graph_label=False,  # for unsupervised learning
        with_pos_neg_mask=True)
    data_loader = Dataloader(train_data_list,
                             batch_size=args.batch_size,
                             num_workers=args.num_workers,
                             shuffle=True,
                             collate_fn=collate_fn)

    total_data, trained_data = len(train_data_list), 0
    list_loss = []
    for batch_id, feed_dict in enumerate(data_loader):
        train_loss = exe.run(train_prog,
                             feed=feed_dict,
                             fetch_list=[agent.loss])
        train_loss = np.array(train_loss).mean()
        list_loss.append(train_loss)
        trained_data += feed_dict['graph/num_graph'][0]

        if batch_id % args.log_interval == 0:
            logging.info('Epoch %d [%d/%d] train/loss:%f' % \
                         (epoch_id, trained_data, total_data, train_loss))

    if not args.is_fleet or fleet.worker_index() == 0:
        logging.info('Epoch %d train/loss:%f' % (epoch_id, np.mean(list_loss)))
        sys.stdout.flush()
Example #2
    def run_pipeline_trainer(self, args):
        self.lr = args.lr

        dist_strategy = DistributedStrategy()
        test_program, avg_cost, train_reader, test_reader, batch_acc, predict, data_loader = \
            self.get_model(batch_size=args.batch_size, dist_strategy=dist_strategy)

        device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
        eprint(type(self).__name__, "device_id: %d." % device_id)
        place = fluid.CUDAPlace(device_id)

        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())
        eprint(type(self).__name__, "run worker startup program done.")

        data_loader.set_sample_list_generator(train_reader, place)
        data_loader.start()
        print_to_err(type(self).__name__, "begin to train on trainer")
        out_losses = []
        for i in six.moves.xrange(RUN_STEP):
            loss = exe.run(fluid.default_main_program(), fetch_list=[avg_cost])
            loss = loss[0] if loss else None
            out_losses.append(loss)
            print_to_err(type(self).__name__, "run step %d finished" % i)
        print_to_err(type(self).__name__, "trainer run finished")

        if six.PY2:
            print(pickle.dumps(out_losses))
        else:
            sys.stdout.buffer.write(pickle.dumps(out_losses))

        if args.save_model:
            model_save_dir = "/tmp"
            if fleet.worker_index() == 0:
                model_save_dir_fluid = os.path.join(model_save_dir,
                                                    "fluid_persistables")
                model_save_dir_fleet = os.path.join(model_save_dir,
                                                    "fleet_persistables")
                infer_save_dir_fluid = os.path.join(model_save_dir,
                                                    "fluid_infer")
                infer_save_dir_fleet = os.path.join(model_save_dir,
                                                    "fleet_infer")
            else:
                model_save_dir_fluid = os.path.join(model_save_dir,
                                                    "fluid_persistables_2")
                model_save_dir_fleet = os.path.join(model_save_dir,
                                                    "fleet_persistables_2")
                infer_save_dir_fluid = os.path.join(model_save_dir,
                                                    "fluid_infer_2")
                infer_save_dir_fleet = os.path.join(model_save_dir,
                                                    "fleet_infer_2")
            fluid.io.save_persistables(exe, model_save_dir_fluid,
                                       fleet._origin_program)
            fleet.save_persistables(executor=exe, dirname=model_save_dir_fleet)
            # `feed_var_list` is not defined in this snippet; recover the data
            # variables from the origin program, as in the fleet API trainer example below.
            feed_var_list = [
                var for var in fleet._origin_program.global_block().vars.values()
                if var.is_data
            ]
            feeded_var_names = [var.name for var in feed_var_list]
            fluid.io.save_inference_model(infer_save_dir_fluid,
                                          feeded_var_names, [avg_cost], exe,
                                          fleet._origin_program)
            fleet.save_inference_model(exe, infer_save_dir_fleet,
                                       feeded_var_names, [avg_cost])
Example #3
def default_exe_params(is_distributed, use_cuda, thread_num):
    """
    Set the default execution parameters.
    """
    gpu_id = 0
    trainer_num = 1
    trainer_id = 0
    dist_strategy = None
    places = None
    if is_distributed:
        if use_cuda:
            role = role_maker.PaddleCloudRoleMaker(is_collective=True)
            fleet.init(role)

            gpu_id = int(os.getenv("FLAGS_selected_gpus"))
            trainer_num = fleet.worker_num()
            trainer_id = fleet.worker_index()

            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.use_experimental_executor = True
            exec_strategy.num_threads = 4
            exec_strategy.num_iteration_per_drop_scope = 1

            dist_strategy = DistributedStrategy()
            dist_strategy.exec_strategy = exec_strategy
            dist_strategy.nccl_comm_num = 2
            dist_strategy.fuse_all_reduce_ops = True

            dist_strategy.forward_recompute = True

            dist_strategy.use_amp = True
            dist_strategy.amp_loss_scaling = 12800.0

            places = fluid.cuda_places()
        else:
            print('Only GPU is supported for distributed mode at present.')
            exit(-1)
    else:
        if use_cuda:
            places = fluid.cuda_places()
        else:
            places = fluid.cpu_places(thread_num)
            os.environ['CPU_NUM'] = str(thread_num)

    if use_cuda:
        exe = fluid.Executor(fluid.CUDAPlace(gpu_id))
    else:
        exe = fluid.Executor(fluid.CPUPlace())

    return {
        'exe': exe,
        'trainer_num': trainer_num,
        'trainer_id': trainer_id,
        'gpu_id': gpu_id,
        'dist_strategy': dist_strategy,
        'places': places
    }
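
# A minimal usage sketch (not from the original source) of how the returned
# dictionary is typically consumed; `build_model` is a hypothetical helper that
# defines a network and returns its loss variable, and `fluid`/`fleet` are the
# same modules used by the snippets above.
exe_params = default_exe_params(is_distributed=True, use_cuda=True, thread_num=8)
exe = exe_params['exe']
dist_strategy = exe_params['dist_strategy']

train_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(train_prog, startup_prog):
    loss = build_model()  # hypothetical: any fluid network returning a loss
    opt = fluid.optimizer.Adam(learning_rate=1e-3)
    if dist_strategy is not None:
        # distributed case: default_exe_params already called fleet.init(role)
        opt = fleet.distributed_optimizer(opt, strategy=dist_strategy)
    opt.minimize(loss)
exe.run(startup_prog)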
Example #4
 def _wrapper():
     '''
     Sampling according to the worker index to uniformly separate samples.
     '''
     rank = fleet.worker_index()
     nranks = fleet.worker_num()
     for idx, sample in enumerate(generator()):
         if idx % nranks == rank:
             yield sample
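
# An equivalent way to express the same sharding, shown only as a sketch
# (assuming `fleet` and `generator` are the same objects as above):
# itertools.islice starts at this worker's rank and strides by the number of
# workers, which yields exactly the indices with idx % nranks == rank.
import itertools

def _sharded_samples(generator):
    rank, nranks = fleet.worker_index(), fleet.worker_num()
    return itertools.islice(generator(), rank, None, nranks)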
Example #5
def train(args):
    """
    Train main function.
    """
    if args.is_distributed:
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)

        dev_count = fluid.core.get_cuda_device_count()
        gpu_id = int(os.getenv("FLAGS_selected_gpus"))
        trainers_num = fleet.worker_num()
        trainer_id = fleet.worker_index()
    else:
        dev_count = 1
        gpu_id = 0
        trainers_num = 1
        trainer_id = 0
    place = fluid.CUDAPlace(gpu_id)

    task = tasks.create_task(args)
    model = models.create_model(args, place)
    train_generator = task.reader.data_generator(input_file=args.train_file,
                                                 num_epochs=args.num_epochs,
                                                 num_part=trainers_num,
                                                 part_id=trainer_id,
                                                 phase="train")
    valid_generator = task.reader.data_generator(
        input_file=args.valid_file,
        num_part=dev_count,
        part_id=gpu_id,
        phase="distributed_valid" if args.is_distributed else "valid")

    # run training
    model_timer = Timer()
    for step, data in enumerate(train_generator(), 1):
        model_timer.start()
        metrics = task.train_step(model, data)
        model_timer.pause()
        if step % args.log_steps == 0:
            time_cost = model_timer.pass_time
            current_epoch, current_file_index, total_file = task.reader.get_train_progress(
            )
            print(
                f"[train][{current_epoch}] progress: {current_file_index}/{total_file} "
                f"step: {step}, time: {time_cost:.3f}, "
                f"speed: {args.log_steps / time_cost:.3f} steps/s")
            print("\tcurrent lr:", metrics.pop('scheduled_lr'))
            print("\t" + task.show_metrics(metrics))
            model_timer.reset()

        if step % args.validation_steps == 0:
            evaluate(task, model, valid_generator, args, dev_count, gpu_id)

        if step % args.save_steps == 0:
            save_path = f"{args.save_path}/step_{step}"
            model.save(save_path, is_checkpoint=True)
Example #6
 def prepare_fleet_paddle_cloud(self, is_fleet):
     """
     :param is_fleet: whether training runs under the fleet distributed framework
     :return:
     """
     if not is_fleet:
         self.executor.run(self.startup_program)
     else:
         if fleet.is_worker():
             self.trainer_id = fleet.worker_index()
         if fleet.is_server():
             logging.info("init and run fleet server")
             fleet.init_server()
             fleet.run_server()
         elif fleet.is_worker():
             logging.info("init and run fleet worker")
             fleet.init_worker()
             self.executor.run(self.startup_program)
Example #7
def main(args):
    with open(args.config, 'r') as f:
        config = json.load(f)

    logging.info('Load data ...')
    if len(args.dataset.split(',')) > 1:
        # for large pretraining dataset, ZINC15 and ChEMBL
        # directly load the processed npz files
        train_data_list = []
        for ds in args.dataset.split(','):
            # use processed data.npz
            train_data_list.extend(
                load_data(os.path.join(args.root, ds, 'processed')))
            # dataset = MoleculeDataset(
            #     args.root, ds,
            #     add_symmetry=False,
            #     add_self_loop=False)
            # data_list = dataset.get_data_list()
            # processed_dir = os.path.join(args.root, ds, 'processed')
            # os.makedirs(processed_dir, exist_ok=True)
            # save_data_list_to_npz(
            #     data_list, os.path.join(processed_dir, 'data.npz'))

            # logging.info('Processed {}'.format(ds))
            # train_data_list.extend(data_list)
    else:
        if args.dataset == 'mutag':
            train_data_list, _ = load_mutag_dataset(
                os.path.join(args.root, args.dataset, 'raw'))
        elif args.dataset == 'ptc_mr':
            train_data_list, _ = load_ptc_mr_dataset(
                os.path.join(args.root, args.dataset, 'raw'))
        else:
            raise ValueError('Unsupported dataset')

    if args.is_fleet:
        train_data_list = [
            x for i, x in enumerate(train_data_list)
            if i % fleet.worker_num() == fleet.worker_index()
        ]
    logging.info("Data loaded.")
    logging.info("Train Examples: %s" % len(train_data_list))
    sys.stdout.flush()

    if args.emb_dir is not None:
        os.makedirs(args.emb_dir, exist_ok=True)

    train_prog = F.Program()
    test_prog = F.Program()
    startup_prog = F.Program()
    with F.program_guard(train_prog, startup_prog):
        with F.unique_name.guard():
            agent = create_model(args, config)
            test_prog = train_prog.clone(for_test=True)

            opt = F.optimizer.Adam(learning_rate=args.lr)
            if args.is_fleet:
                dist_strategy = DistributedStrategy()
                role = role_maker.PaddleCloudRoleMaker(is_collective=True)
                fleet.init(role)
                opt = fleet.distributed_optimizer(opt, strategy=dist_strategy)
            opt.minimize(agent.loss)

    place = F.CUDAPlace(0) if args.use_cuda else F.CPUPlace()
    exe = F.Executor(place)
    exe.run(startup_prog)

    if (not args.dont_save_emb) and \
       (not args.is_fleet or fleet.worker_index() == 0):
        save_embedding(args, exe, test_prog, agent, train_data_list, -1)

    for epoch_id in range(args.max_epoch):
        train(args, exe, train_prog, agent, train_data_list, epoch_id)
        if not args.is_fleet or fleet.worker_index() == 0:
            F.io.save_params(exe, '%s/epoch%s' % (args.model_dir, epoch_id),
                             train_prog)
            if not args.dont_save_emb:
                save_embedding(args, exe, test_prog, agent, train_data_list,
                               epoch_id)
Example #8
def main(args):
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        dev_list = fluid.cuda_places()
        place = dev_list[0]
        dev_count = len(dev_list)
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    reader = reader_ce.ClassifyReader(vocab_path=args.vocab_path,
                                      label_map_config=args.label_map_config,
                                      max_seq_len=args.max_seq_len,
                                      total_num=args.train_data_size,
                                      do_lower_case=args.do_lower_case,
                                      in_tokens=args.in_tokens,
                                      random_seed=args.random_seed,
                                      tokenizer=args.tokenizer,
                                      for_cn=args.for_cn,
                                      task_id=args.task_id)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    if args.do_test:
        assert args.test_save is not None
    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.predict_batch_size is None:
        args.predict_batch_size = args.batch_size

    if args.do_train:
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        dev_count = fleet.worker_num()

        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            dev_count=1,
            trainer_id=fleet.worker_index(),
            trainer_num=fleet.worker_num(),
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        log.info("Device count: %d" % dev_count)
        log.info("Num train examples: %d" % num_train_examples)
        log.info("Max train steps: %d" % max_train_steps)
        log.info("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        # use fleet api
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        if args.is_distributed:
            exec_strategy.num_threads = 3

        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        dist_strategy = DistributedStrategy()
        dist_strategy.exec_strategy = exec_strategy
        dist_strategy.nccl_comm_num = 1
        if args.is_distributed:
            dist_strategy.nccl_comm_num = 2
        dist_strategy.use_hierarchical_allreduce = True

        if args.use_mix_precision:
            dist_strategy.use_amp = True

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config)
                scheduled_lr = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio,
                    dist_strategy=dist_strategy)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            log.info("Theoretical memory usage in training: %.3f - %.3f %s" %
                     (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config,
                    is_prediction=True)

        test_prog = test_prog.clone(for_test=True)

    train_program = fleet.main_program

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            log.warning(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog)

    if args.do_train:
        train_exe = exe
        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    test_exe = exe
    #    if args.do_val or args.do_test:
    #        if args.use_multi_gpu_test:
    #            test_exe = fluid.ParallelExecutor(
    #                use_cuda=args.use_cuda,
    #                main_program=test_prog,
    #                share_vars_from=train_exe)

    current_epoch = 0
    steps = 0
    if args.do_train:
        train_pyreader.start()
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr

        ce_info = []
        time_begin = time.time()
        last_epoch = 0
        while True:
            try:
                steps += 1
                #                log.info("step: %d" % steps)

                if fleet.worker_index() != 0:
                    train_exe.run(fetch_list=[], program=train_program)
                    continue

                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[], program=train_program)

                else:
                    outputs = evaluate(train_exe,
                                       train_program,
                                       train_pyreader,
                                       graph_vars,
                                       "train",
                                       metric=args.metric)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            outputs["learning_rate"]
                            if warmup_steps > 0 else args.learning_rate)
                        log.info(verbose)

                    current_example, current_epoch = reader.get_train_progress(
                    )
                    time_end = time.time()
                    used_time = time_end - time_begin

                    log.info(
                        "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                        "ave acc: %f, speed: %f steps/s" %
                        (current_epoch, current_example * dev_count,
                         num_train_examples, steps, outputs["loss"],
                         outputs["accuracy"], args.skip_steps / used_time))
                    ce_info.append(
                        [outputs["loss"], outputs["accuracy"], used_time])

                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path,
                                               fleet._origin_program)

                # if steps % args.validation_steps == 0 or last_epoch != current_epoch:
                if steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        evaluate_wrapper(args, reader, exe, test_prog,
                                         test_pyreader, graph_vars,
                                         current_epoch, steps)

                    if args.do_test:
                        predict_wrapper(args, reader, exe, test_prog,
                                        test_pyreader, graph_vars,
                                        current_epoch, steps)

                if last_epoch != current_epoch:
                    last_epoch = current_epoch

            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path,
                                           fleet._origin_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if args.do_val:
        evaluate_wrapper(args, reader, exe, test_prog, test_pyreader,
                         graph_vars, current_epoch, steps)

    # final eval on test set
    if args.do_test:
        predict_wrapper(args, reader, exe, test_prog, test_pyreader,
                        graph_vars, current_epoch, steps)

    # final eval on diagnostic, hack for glue-ax
    if args.diagnostic:
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(args.diagnostic,
                                  batch_size=args.batch_size,
                                  epoch=1,
                                  dev_count=1,
                                  shuffle=False))

        log.info("Final diagnostic")
        qids, preds, probs = predict(test_exe, test_prog, test_pyreader,
                                     graph_vars)
        assert len(qids) == len(preds), '{} v.s. {}'.format(
            len(qids), len(preds))
        with open(args.diagnostic_save, 'w') as f:
            for id, s, p in zip(qids, preds, probs):
                f.write('{}\t{}\t{}\n'.format(id, s, p))

        log.info("Done final diagnostic, saving to {}".format(
            args.diagnostic_save))
Example #9
def train(args):
    """
    Train main function.
    """
    if args.is_distributed:
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)

        dev_count = fluid.core.get_cuda_device_count()
        gpu_id = int(os.getenv("FLAGS_selected_gpus"))
        trainers_num = fleet.worker_num()
        trainer_id = fleet.worker_index()
    else:
        dev_count = 1
        gpu_id = 0
        trainers_num = 1
        trainer_id = 0
    place = fluid.CUDAPlace(gpu_id)

    task = tasks.create_task(args)
    model = models.create_model(args, place)
    train_generator = task.get_data_loader(model,
                                           input_file=args.train_file,
                                           num_epochs=args.num_epochs,
                                           num_part=trainers_num,
                                           part_id=trainer_id,
                                           phase="train")
    valid_generator = task.get_data_loader(
        model,
        input_file=args.valid_file,
        num_part=dev_count,
        part_id=gpu_id,
        phase="distributed_valid" if args.is_distributed else "valid")

    # run training
    timer = Timer()
    timer.start()
    if args.Model.model == 'NSPModel':
        best_metrics = 0.0
    else:
        best_metrics = 10000
    shuffledatafile()
    for step, data in enumerate(train_generator(), args.start_step + 1):
        outputs = task.train_step(model, data)
        timer.pause()
        if step % args.log_steps == 0:
            time_cost = timer.pass_time
            current_epoch, current_file_index, total_file = task.reader.get_train_progress(
            )
            print(
                f"[train][{current_epoch}] progress: {current_file_index}/{total_file} "
                f"step: {step}, time: {time_cost:.3f}, "
                f"speed: {args.log_steps / time_cost:.3f} steps/s")
            print(f"\tcurrent lr: {outputs.pop('scheduled_lr'):.7f}")
            metrics = task.get_metrics(outputs)
            print("\t" + ", ".join(f"{k}: {v:.4f}"
                                   for k, v in metrics.items()))
            timer.reset()

        if step % args.validation_steps == 0:

            # shuffledatafile()
            metrics = evaluate(task, model, valid_generator, args, dev_count,
                               gpu_id, step)
            if args.Model.model == 'NSPModel' and metrics[
                    'nsp_acc'] > best_metrics:
                best_metrics = metrics['nsp_acc']
                save_path = f"{args.save_path}/step_{step}_{best_metrics}"
                model.save(save_path, is_checkpoint=True)

            elif args.Model.model == 'Plato' and metrics['loss'] < best_metrics:
                best_metrics = metrics['loss']
                save_path = f"{args.save_path}/step_{step}_{best_metrics}"
                model.save(save_path, is_checkpoint=True)
        # if step % args.save_steps == 0 and trainer_id == 0:
        #     save_path = f"{args.save_path}/step_{step}"
        #     model.save(save_path, is_checkpoint=True)
        #     with open(save_path + ".finish", "w") as f:
        #         pass

        timer.start()
Example #10
def train(args):
    """train start"""
    logging.info(args)

    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id)
    dev_count = 1

    exe = fluid.Executor(place)

    train_program = fluid.Program()
    startup_program = fluid.Program()

    # For Distributed Training.
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)
    args.num_trainers = fleet.worker_num()
    args.trainer_id = fleet.worker_index()
    dist_strategy = DistributedStrategy()

    with fluid.program_guard(train_program, startup_program):
        with fluid.unique_name.guard():
            sum_cost, avg_cost, predict, token_num, pyreader = transformer(
                ModelHyperParams.src_vocab_size,
                ModelHyperParams.trg_vocab_size,
                ModelHyperParams.max_length + 1,
                ModelHyperParams.n_layer,
                ModelHyperParams.n_head,
                ModelHyperParams.d_key,
                ModelHyperParams.d_value,
                ModelHyperParams.d_model,
                ModelHyperParams.d_inner_hid,
                ModelHyperParams.prepostprocess_dropout,
                ModelHyperParams.attention_dropout,
                ModelHyperParams.relu_dropout,
                ModelHyperParams.preprocess_cmd,
                ModelHyperParams.postprocess_cmd,
                ModelHyperParams.weight_sharing,
                TrainTaskConfig.label_smooth_eps,
                ModelHyperParams.bos_idx,
                use_py_reader=args.use_py_reader,
                is_test=False)

            optimizer = None
            if args.sync:
                lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
                    ModelHyperParams.d_model, TrainTaskConfig.warmup_steps)

                with fluid.default_main_program()._lr_schedule_guard():
                    learning_rate = lr_decay * TrainTaskConfig.learning_rate

                optimizer = fluid.optimizer.Adam(learning_rate=learning_rate,
                                                 beta1=TrainTaskConfig.beta1,
                                                 beta2=TrainTaskConfig.beta2,
                                                 epsilon=TrainTaskConfig.eps)
            else:
                optimizer = fluid.optimizer.SGD(0.003)
            if args.use_fp16:
                optimizer = decorate(optimizer,
                                     init_loss_scaling=args.loss_scaling)
            optimizer = fleet.distributed_optimizer(optimizer,
                                                    strategy=dist_strategy)
            optimizer.minimize(avg_cost, startup_program)

    train_program = fleet.main_program
    orig_train_program = fleet._origin_program
    train_loop(args, exe, train_program, orig_train_program, startup_program,
               dev_count, sum_cost, avg_cost, token_num, predict, pyreader)
Example #11
    def train_and_eval(self):
        """
        Returns: None
        """
        if self.is_fleet and fleet.is_server():
            logging.debug("is fleet.server, over")
            return
        if self.is_fleet:
            logging.debug("worker_index%d start train...." %
                          fleet.worker_index())

        num_train_examples = self.params.get("num_train_examples", 0)
        if num_train_examples == 0:
            num_train_examples = self.data_set_reader.train_reader.get_num_examples(
            )

        self.data_set_reader.train_reader.run()
        self.curr_step = 1
        time_begin = time.time()
        if 'output_path' in self.params and self.params["output_path"]:
            save_checkpoints_path = os.path.join(self.params["output_path"],
                                                 "save_checkpoints")
            save_inference_model_path = os.path.join(
                self.params["output_path"], "save_inference_model")
        else:
            save_checkpoints_path = "./output/save_checkpoints/"
            save_inference_model_path = "./output/save_inference_model/"
        while True:
            try:
                if self.curr_step % self.params["train_log_step"] != 0:
                    self.run(C.TRAINING, need_fetch=False)
                else:
                    metrics_tensor_value = self.run(C.TRAINING,
                                                    need_fetch=True)
                    current_example, self.curr_epoch = self.data_set_reader.train_reader.get_train_progress(
                    )
                    logging.debug(
                        "epoch {%d} progress {%d}/{%d} pyreader queue size {%d}",
                        self.curr_epoch, current_example, num_train_examples,
                        self.data_set_reader.train_reader.paddle_py_reader.
                        queue.size())

                    fetch_output_dict = OrderedDict()
                    for key, value in zip(self.fetch_list_train_key,
                                          metrics_tensor_value):
                        fetch_output_dict[key] = value
                    time_end = time.time()
                    used_time = time_end - time_begin
                    meta_info = OrderedDict()
                    meta_info[C.STEP] = self.curr_step
                    meta_info[C.GPU_ID] = self.gpu_id
                    meta_info[C.TIME_COST] = used_time
                    meta_info["epoch"] = self.curr_epoch

                    metrics_output = self.model_class.get_metrics(
                        fetch_output_dict, meta_info, C.TRAINING)
                    if self.params.get("visualdl_log", False):
                        assert isinstance(
                            metrics_output, OrderedDict
                        ), "metrics_output is must be OrderedDict"
                        self.visualdl_log(metrics_output,
                                          np.mean(fetch_output_dict[C.LOSS]),
                                          self.curr_step,
                                          phase=C.TRAINING)

                if self.trainer_id == 0 and self.curr_step % self.params[
                        "save_model_step"] == 0:
                    self.save_models(save_checkpoints_path,
                                     save_inference_model_path, self.curr_step)
                if self.curr_step % self.params["eval_step"] == 0:
                    if self.params["is_eval_dev"]:
                        self.evaluate(self.data_set_reader.evaluate_reader,
                                      C.EVALUATE, self.curr_step)
                    if self.params["is_eval_test"]:
                        self.evaluate(self.data_set_reader.test_reader, C.TEST,
                                      self.curr_step)
                if self.curr_step % self.params["train_log_step"] == 0:
                    time_begin = time.time()
                self.curr_step += 1
                if "steps_for_test" in self.params and self.curr_step >= self.params[
                        "steps_for_test"]:
                    self.data_set_reader.train_reader.stop()
                    logging.debug("steps_for_test stop!")
                    break
            except fluid.core.EOFException:
                self.data_set_reader.train_reader.stop()
                break
            except Exception as e:
                logging.error('traceback.format_exc(): %s',
                              traceback.format_exc())
                self.save_models(save_checkpoints_path,
                                 save_inference_model_path, self.curr_step)
                raise e
        if self.params["is_eval_dev"]:
            logging.info("Final evaluate result")
            self.evaluate(self.data_set_reader.evaluate_reader, C.EVALUATE,
                          self.curr_step)
        if self.params["is_eval_test"]:
            logging.info("Final test result")
            self.evaluate(self.data_set_reader.test_reader, C.TEST,
                          self.curr_step)

        self.save_models(save_checkpoints_path, save_inference_model_path,
                         self.curr_step)
Example #12
    def do_training(self, fleet, args):
        """
        begin training.
        Args:
            fleet (Collective): Collective inherited base class Fleet
            args (ArgumentParser): run args to config dist fleet.
        Returns:
            tuple: the value is train losses
        """
        args = parse_args()
        logging.info(args)
        gpu_id = int(os.environ.get('FLAGS_selected_gpus', 4))
        place = fluid.CUDAPlace(gpu_id)
        dev_count = 1
        exe = fluid.Executor(place)
        train_program = fluid.Program()
        startup_program = fluid.Program()
        args.num_trainers = fleet.worker_num()
        args.trainer_id = fleet.worker_index()
        args.run_params = json.loads(args.run_params)
        dist_strategy = DistributedStrategy()
        dist_strategy.enable_inplace = args.run_params['enable_inplace']
        dist_strategy.fuse_all_reduce_ops = args.run_params[
            'fuse_all_reduce_ops']
        dist_strategy.nccl_comm_num = args.run_params['nccl_comm_num']
        dist_strategy.use_local_sgd = args.run_params['use_local_sgd']
        dist_strategy.mode = args.run_params["mode"]
        dist_strategy.collective_mode = args.run_params["collective"]

        with fluid.program_guard(train_program, startup_program):
            with fluid.unique_name.guard():
                sum_cost, avg_cost, predict, token_num, pyreader = transformer(
                    ModelHyperParams.src_vocab_size,
                    ModelHyperParams.trg_vocab_size,
                    ModelHyperParams.max_length + 1,
                    ModelHyperParams.n_layer,
                    ModelHyperParams.n_head,
                    ModelHyperParams.d_key,
                    ModelHyperParams.d_value,
                    ModelHyperParams.d_model,
                    ModelHyperParams.d_inner_hid,
                    ModelHyperParams.prepostprocess_dropout,
                    ModelHyperParams.attention_dropout,
                    ModelHyperParams.relu_dropout,
                    ModelHyperParams.preprocess_cmd,
                    ModelHyperParams.postprocess_cmd,
                    ModelHyperParams.weight_sharing,
                    TrainTaskConfig.label_smooth_eps,
                    ModelHyperParams.bos_idx,
                    use_py_reader=args.use_py_reader,
                    is_test=False)
                optimizer = fluid.optimizer.SGD(0.003)
                if args.run_params["fp16"]:
                    optimizer = decorate(optimizer, init_loss_scaling=64.0)
                optimizer = fleet.distributed_optimizer(optimizer,
                                                        strategy=dist_strategy)
                optimizer.minimize(avg_cost, startup_program)
        train_program = fleet.main_program
        exe.run(startup_program)
        train_data = prepare_data_generator(
            args,
            is_test=False,
            count=dev_count,
            pyreader=pyreader,
            py_reader_provider_wrapper=py_reader_provider_wrapper)

        loss_normalizer = -(
            (1. - TrainTaskConfig.label_smooth_eps) * np.log(
                (1. - TrainTaskConfig.label_smooth_eps)) +
            TrainTaskConfig.label_smooth_eps *
            np.log(TrainTaskConfig.label_smooth_eps /
                   (ModelHyperParams.trg_vocab_size - 1) + 1e-20))

        step_idx = 0
        init_flag = True
        result_loss = []
        result_ppl = []
        train_info = []
        for pass_id in six.moves.xrange(args.num_epochs):
            pass_start_time = time.time()
            if args.use_py_reader:
                pyreader.start()
                data_generator = None
            else:
                data_generator = train_data()
            batch_id = 0
            while True:
                try:
                    feed_dict_list = prepare_feed_dict_list(
                        data_generator, init_flag, dev_count)
                    t1 = time.time()
                    outs = exe.run(program=train_program,
                                   fetch_list=[sum_cost.name, token_num.name]
                                   if step_idx % args.fetch_steps == 0 else [],
                                   feed=feed_dict_list)

                    if step_idx % args.fetch_steps == 0:
                        sum_cost_val, token_num_val = np.array(
                            outs[0]), np.array(outs[1])
                        total_sum_cost = sum_cost_val.sum()
                        total_token_num = token_num_val.sum()
                        total_avg_cost = total_sum_cost / total_token_num
                        result_loss.append(total_avg_cost - loss_normalizer)
                        result_ppl.append(
                            np.exp([min(total_avg_cost, 100)]).item(0))
                        train_info.append(result_loss)
                    init_flag = False
                    batch_id += 1
                    step_idx += 1
                    if batch_id >= 5:
                        break
                except (StopIteration, fluid.core.EOFException):
                    if args.use_py_reader:
                        pyreader.reset()
                    break

            train_info = [round(i, 6) for i in train_info[0]]
            return train_info
Example #13
    def train_and_eval(self):
        """
        :return:
        """
        if self.is_fleet and fleet.is_server():
            logging.debug("is fleet.server, over")
            return
        if self.is_fleet:
            logging.debug("worker_index%d start train...." %
                          fleet.worker_index())

        num_train_examples = self.params.get("num_train_examples", 0)
        if num_train_examples == 0:
            num_train_examples = self.data_set_reader.train_reader.get_num_examples(
            )

        self.data_set_reader.train_reader.run()
        steps = 1
        time_begin = time.time()
        if 'output_path' in self.params.keys() and self.params["output_path"]:
            save_checkpoints_path = os.path.join(self.params["output_path"],
                                                 "save_checkpoints")
            save_inference_model_path = os.path.join(
                self.params["output_path"], "save_inference_model")
        else:
            save_checkpoints_path = "./output/save_checkpoints/"
            save_inference_model_path = "./output/save_inference_model/"
        current_epoch = 0
        last_epoch = 0
        dev_score_history = []
        try:
            while True:
                try:
                    current_example, current_epoch = self.data_set_reader.train_reader.get_train_progress(
                    )
                    if (steps % self.params["train_log_step"] != 0 or self.trainer_id != 0 \
                       and current_epoch == last_epoch):
                        self.run(InstanceName.TRAINING, need_fetch=False)
                    else:
                        metrics_tensor_value = self.run(InstanceName.TRAINING,
                                                        need_fetch=True)
                        logging.info(
                            "epoch {0} progress {1}/{2} pyreader queue size {3}"
                            .format(
                                current_epoch, current_example,
                                num_train_examples,
                                self.data_set_reader.train_reader.
                                paddle_py_reader.queue.size()))

                        current_example, current_epoch = self.data_set_reader.train_reader.get_train_progress(
                        )

                        fetch_output_dict = collections.OrderedDict()
                        for key, value in zip(self.fetch_list_train_key,
                                              metrics_tensor_value):
                            fetch_output_dict[key] = value
                        time_end = time.time()
                        used_time = time_end - time_begin
                        meta_info = collections.OrderedDict()
                        meta_info[InstanceName.STEP] = steps
                        meta_info[InstanceName.GPU_ID] = self.gpu_id
                        meta_info[InstanceName.TIME_COST] = used_time

                        metrics_output = self.model_class.get_metrics(
                            fetch_output_dict, meta_info,
                            InstanceName.TRAINING)
                        if self.params.get("visualdl_log", False):
                            assert isinstance(
                                metrics_output, OrderedDict
                            ), "metrics_output is must be OrderedDict"
                            self.visualdl_log(
                                metrics_output,
                                np.mean(fetch_output_dict[InstanceName.LOSS]),
                                steps,
                                phase=InstanceName.TRAINING)
                        time_begin = time.time()

                    if steps % self.params[
                            "eval_step"] == 0 or last_epoch != current_epoch:
                        if self.params["is_eval_dev"]:
                            rets = self.evaluate_iterface(self.data_set_reader.dev_reader, \
                                          InstanceName.EVALUATE, steps, current_epoch)
                            if self.trainer_id == 0:
                                dev_score_history.append(rets[0]['score'])
                        else:
                            rets = None

                        if self.params["is_eval_test"]:
                            self.predict_iterface(self.data_set_reader.test_reader, \
                                          InstanceName.TEST, steps, current_epoch, rets, \
                                          dev_score_history)
                    if self.trainer_id == 0:
                        if steps % self.params["save_model_step"] == 0:
                            self.save_models(save_checkpoints_path,
                                             save_inference_model_path, steps)
                    steps += 1

                    if last_epoch != current_epoch:
                        last_epoch = current_epoch

                    if "steps_for_test" in self.params and steps >= self.params[
                            "steps_for_test"]:
                        self.data_set_reader.train_reader.stop()
                        logging.debug("steps_for_test stop!")
                        break
                except fluid.core.EOFException:
                    self.data_set_reader.train_reader.stop()
                    break
            if self.params["is_eval_dev"]:
                logging.info("Final evaluate result: ")
                rets = self.evaluate_iterface(self.data_set_reader.dev_reader, \
                                     InstanceName.EVALUATE, steps, current_epoch)
                if self.trainer_id == 0:
                    dev_score_history.append(rets[0]['score'])
            else:
                rets = None

            if self.params["is_eval_test"]:
                logging.info("Final test result: ")
                self.predict_iterface(self.data_set_reader.test_reader, InstanceName.TEST, \
                              steps, current_epoch, rets, dev_score_history)
            if self.params.get("diagnostic", False):
                logging.info("Final test on dianostic: ")
                # TODO
        except Exception as e:
            logging.error('traceback.format_exc():%s' % traceback.format_exc())
            self.save_models(save_checkpoints_path, save_inference_model_path,
                             steps)
            raise e

        self.save_models(save_checkpoints_path, save_inference_model_path,
                         steps)
Example #14
import os

import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.base import role_maker
# `fleet` and `DistributedStrategy` come from the collective fleet API used below.
from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy

# `mlp` (the network definition) and `gen_data` (the feed-dict generator) are
# user-defined helpers assumed to be provided elsewhere in this script.

input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')

cost = mlp(input_x, input_y)
optimizer = fluid.optimizer.SGD(learning_rate=0.01)

dist_strategy = DistributedStrategy()
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)

optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)
optimizer.minimize(cost, fluid.default_startup_program())

train_prog = fleet.main_program

gpu_id = int(os.getenv("FLAGS_selected_gpus", "0"))
place = fluid.CUDAPlace(gpu_id)

exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

step = 1001
for i in range(step):
    cost_val = exe.run(program=train_prog,
                       feed=gen_data(),
                       fetch_list=[cost.name])
    print("worker_index: %d, step%d cost = %f" %
          (fleet.worker_index(), i, cost_val[0]))
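
# Note (an assumption about how this script is launched, not part of the snippet):
# collective examples like this are normally started with Paddle's distributed
# launcher, e.g. `python -m paddle.distributed.launch train.py`, which spawns one
# process per GPU and sets the FLAGS_selected_gpus / PaddleCloud role environment
# variables that PaddleCloudRoleMaker and the code above read.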
Example #15
def main(args):
    """
    Call the configuration function of the model, build the model and load data, then start training.

    model_config:
        a json file  with the  model configurations,such as dropout rate ,learning rate,num tasks and so on;

    context_pooling:
        it means the pooling type of context prediction;
    
    PreGNNContextpredModel:
        It is an unsupervised pretraining model which use subgraphs to predict their surrounding graph structures. Our goal is to pre-train a GNN so that it maps nodes appearing in similar structural contexts to nearby embeddings.

    """
    model_config = json.load(open(args.model_config, 'r'))
    if args.dropout_rate is not None:
        model_config['dropout_rate'] = args.dropout_rate
    model_config['context_pooling'] = args.context_pooling

    ### build model
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = PreGNNContextpredModel(model_config)
            model.forward()
            opt = fluid.optimizer.Adam(learning_rate=args.lr)
            if args.distributed:
                opt = get_distributed_optimizer(opt)
            opt.minimize(model.loss)
    with fluid.program_guard(test_prog, fluid.Program()):
        with fluid.unique_name.guard():
            model = PreGNNContextpredModel(model_config)
            model.forward(is_test=True)

    # Use CUDAPlace for GPU training, or use CPUPlace for CPU training.
    place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0))) \
            if args.use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.init_model is not None and args.init_model != "":
        load_partial_params(exe, args.init_model, train_prog)

    ### load data
    # PreGNNContextPredFeaturizer:
    #     Used along with `PreGNNContextpredModel`. It inherits from the base class
    #     `Featurizer`, which handles feature extraction. The `Featurizer` has two
    #     functions: `gen_features` converts a single raw SMILES string into a single
    #     graph sample, and `collate_fn` aggregates a sublist of graph samples into a
    #     big batch.
    # k is the number of GNN layers; l1 and l2 are the two context sizes, usually with l1 < l2.
    # splitter:
    #     split type of the dataset: random, scaffold, or random scaffold. A random split
    #     is used here. `ScaffoldSplitter` first orders the compounds according to their
    #     Bemis-Murcko scaffolds, then takes the first `frac_train` proportion as the train
    #     set, the next `frac_valid` proportion as the valid set, and the rest as the test
    #     set. `ScaffoldSplitter` better evaluates the generalization ability of the model
    #     on out-of-distribution samples. Note that other splitters such as `RandomSplitter`,
    #     `RandomScaffoldSplitter` and `IndexSplitter` are also available.
    k = model_config['layer_num']
    l1 = k - 1
    l2 = l1 + args.context_size
    featurizer = PreGNNContextPredFeaturizer(
            model.substruct_graph_wrapper, 
            model.context_graph_wrapper, 
            k, l1, l2)
    dataset = load_zinc_dataset(args.data_path, featurizer=featurizer)

    splitter = RandomSplitter()
    train_dataset, _, test_dataset = splitter.split(
            dataset, frac_train=0.9, frac_valid=0, frac_test=0.1)
    if args.distributed:
        indices = list(range(fleet.worker_index(), len(train_dataset), fleet.worker_num()))
        train_dataset = train_dataset[indices]
    print("Train/Test num: %s/%s" % (len(train_dataset), len(test_dataset)))

    ### start train
    # Run training and compute the train loss and test loss in each epoch.
    # The loop runs for `max_epoch` epochs; adjust it if needed.
    # The train and test losses are printed every epoch, and at the end the
    # parameters of the best epoch (lowest test loss) are saved as `epoch_best`.
    list_test_loss = []
    for epoch_id in range(args.max_epoch):
        train_loss = train(args, exe, train_prog, model, train_dataset, featurizer)
        test_loss = evaluate(args, exe, test_prog, model, test_dataset, featurizer)
        if not args.distributed or fleet.worker_index() == 0:
            fluid.io.save_params(exe, '%s/epoch%s' % (args.model_dir, epoch_id), train_prog)
            list_test_loss.append(test_loss)
            print("epoch:%d train/loss:%s" % (epoch_id, train_loss))
            print("epoch:%d test/loss:%s" % (epoch_id, test_loss))

    if not args.distributed or fleet.worker_index() == 0:
        best_epoch_id = np.argmin(list_test_loss)
        fluid.io.load_params(exe, '%s/epoch%d' % (args.model_dir, best_epoch_id), train_prog)
        fluid.io.save_params(exe, '%s/epoch_best' % (args.model_dir), train_prog)
        return list_test_loss[best_epoch_id]
Example #16
    def run_gpu_fleet_api_trainer(self, args):
        assert args.update_method == "nccl2"

        self.lr = args.lr

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = 1

        dist_strategy = DistributedStrategy()
        dist_strategy.exec_strategy = exec_strategy
        dist_strategy.fuse_memory_size = 1  # MB
        dist_strategy.fuse_laryer_size = 1
        if args.use_local_sgd:
            dist_strategy.use_local_sgd = True
        if args.ut4grad_allreduce:
            dist_strategy._ut4grad_allreduce = True
        if args.sync_batch_norm:
            dist_strategy.sync_batch_norm = True

        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        print_to_err("gpu_fleet", "fleet.node_num:")
        # "fleet.node_id:", fleet.node_id(),
        # "fleet.trainer_num:", fleet.worker_num())

        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
            self.get_model(batch_size=args.batch_size, dist_strategy=dist_strategy)

        trainer_prog = fleet._origin_program
        dist_prog = fleet.main_program

        device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
        place = fluid.CUDAPlace(device_id)

        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())
        eprint(type(self).__name__, "run worker startup program done.")

        feed_var_list = [
            var for var in trainer_prog.global_block().vars.values()
            if var.is_data
        ]

        eprint("feed_var_list:", feed_var_list)

        # tmp add this code to pass python35 gcc8 CI
        # Fixme(gongweibao, wangxi), need fix fleet api program order
        if feed_var_list[0].name == 'label':
            feed_var_list = feed_var_list[::-1]

        feeder = fluid.DataFeeder(feed_var_list, place)
        reader_generator = train_reader()

        def get_data():
            origin_batch = next(reader_generator)
            if args.update_method != "local" and args.use_reader_alloc:
                new_batch = []
                for offset, item in enumerate(origin_batch):
                    if offset % 2 == args.trainer_id:
                        new_batch.append(item)
                return new_batch
            else:
                return origin_batch

        print_to_err(type(self).__name__, "begin to train on trainer")
        out_losses = []
        for i in six.moves.xrange(RUN_STEP):
            loss, = exe.run(dist_prog,
                            fetch_list=[avg_cost.name],
                            feed=feeder.feed(get_data()))
            out_losses.append(loss[0])
            print_to_err(type(self).__name__, "run step %d finished" % i)
        print_to_err(type(self).__name__, "trainer run finished")

        if six.PY2:
            print(pickle.dumps(out_losses))
        else:
            sys.stdout.buffer.write(pickle.dumps(out_losses))

        if args.save_model:
            model_save_dir = "/tmp"
            if fleet.worker_index() == 0:
                model_save_dir_fluid = os.path.join(model_save_dir,
                                                    "fluid_persistables")
                model_save_dir_fleet = os.path.join(model_save_dir,
                                                    "fleet_persistables")
                infer_save_dir_fluid = os.path.join(model_save_dir,
                                                    "fluid_infer")
                infer_save_dir_fleet = os.path.join(model_save_dir,
                                                    "fleet_infer")
            else:
                model_save_dir_fluid = os.path.join(model_save_dir,
                                                    "fluid_persistables_2")
                model_save_dir_fleet = os.path.join(model_save_dir,
                                                    "fleet_persistables_2")
                infer_save_dir_fluid = os.path.join(model_save_dir,
                                                    "fluid_infer_2")
                infer_save_dir_fleet = os.path.join(model_save_dir,
                                                    "fleet_infer_2")
            fluid.io.save_persistables(exe, model_save_dir_fluid,
                                       fleet._origin_program)
            fleet.save_persistables(executor=exe, dirname=model_save_dir_fleet)
            feeded_var_names = [var.name for var in feed_var_list]
            fluid.io.save_inference_model(infer_save_dir_fluid,
                                          feeded_var_names, [avg_cost], exe,
                                          fleet._origin_program)
            fleet.save_inference_model(exe, infer_save_dir_fleet,
                                       feeded_var_names, [avg_cost])
def main(args):
    cfg = XiaoduHiConfig()
    cfg.scene_sensor_algo = 'yolov4'

    wae_ndarray = np.load(os.path.join(args.wae_dir, 'raw_wae.npy'))
    start_epoch = 0

    train_program = fluid.Program()
    startup_program = fluid.Program()

    with fluid.program_guard(train_program, startup_program):
        attention_ctrl = AttentionController(
            inputs_type=args.inputs_type,
            num_actions=wae_ndarray.shape[0],
            act_tr_dim=wae_ndarray.shape[1],
            act_emb_ndarray=wae_ndarray,
            num_frames=cfg.ob_window_len,
            tokens_per_frame=cfg.tokens_per_frame,
            visual_token_dim=cfg.visual_token_dim,
            model_dim=args.model_dim,
            num_decoder_blocks=args.num_decoder_blocks,
            num_heads=args.num_heads,
            ffn_dim=args.ffn_dim,
            dropout=args.dropout,
            normalize_before=args.normalize_before,
            frame_emb_trainable=args.frame_emb_trainable,
            trigger_loss_coef=args.trigger_loss_coef,
            obj_loss_coef=args.obj_loss_coef,
            act_loss_coef=args.act_loss_coef,
            use_last_act_loss=args.use_last_act_loss,
            mode='train')
        preds = attention_ctrl.predict()
        test_program = train_program.clone(for_test=True)

        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=args.lr,
            regularization=fluid.regularizer.L2Decay(
                regularization_coeff=0.1))
        if args.distributed_training:
            optimizer = fleet.distributed_optimizer(optimizer)
            role = role_maker.PaddleCloudRoleMaker(is_collective=True)
            fleet.init(role)

        optimizer.minimize(attention_ctrl.loss)

    if args.distributed_training:
        place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0)))
    else:
        place = fluid.CUDAPlace(args.gpu)

    exe = fluid.Executor(place)
    exe.run(startup_program)

    if args.inputs_type.startswith('inst_crop') and \
       args.inputs_type != 'inst_crop_wo_crop':
        fluid.io.load_vars(
            exe, MobileNetV2_Pretrained,
            main_program=train_program,
            predicate=lambda v: os.path.exists(
                os.path.join(MobileNetV2_Pretrained, v.name)))
        print('Loaded weights from {}'.format(MobileNetV2_Pretrained))

    if args.init_params is not None:
        base = os.path.basename(args.init_params)
        if base.startswith('epoch_'):
            start_epoch = int(base[len('epoch_'):]) + 1

        tb_state = os.path.join(args.init_params, 'tb_state.txt')
        if os.path.exists(tb_state):
            global _update_step
            global _eval_step
            with open(tb_state, 'r') as f:
                update_step, eval_step = f.readline().split(' ')
                _update_step = int(update_step)
                _eval_step = int(eval_step)

        fluid.io.load_vars(
            exe, args.init_params,
            main_program=train_program,
            predicate=lambda v: os.path.exists(
                os.path.join(args.init_params, v.name)))
        print('Loaded weights from {}'.format(args.init_params))

    if args.distributed_training:
        train_worker_gpus = [int(os.environ.get('FLAGS_selected_gpus', 0))]
        test_worker_gpus = train_worker_gpus
    else:
        train_worker_gpus = convert_gpu_ids(args.data_worker_gpus_for_train)
        test_worker_gpus = convert_gpu_ids(args.data_worker_gpus_for_test)

    if not os.path.exists(args.save):
        os.makedirs(args.save)

    if not args.use_decord:
        train_dataloader = XiaoduHiDataloaderv2(
            attention_ctrl.feed_list, [place], args.yolov4_model_dir,
            args.video_tracking_dir, args.train_dataset,
            full_neg_txt=args.full_neg_train,
            batch_size=args.bs,
            num_workers=args.data_workers_for_train,
            worker_gpus=train_worker_gpus,
            roi_feat_resolution=cfg.roi_feat_resolution,
            ob_window_len=cfg.ob_window_len,
            interval=cfg.interval,
            tokens_per_frame=cfg.tokens_per_frame,
            visual_token_dim=cfg.visual_token_dim,
            augment=False,
            resample_negs_per_epoch=True)
        test_dataloader = XiaoduHiDataloaderv2(
            attention_ctrl.feed_list, [place], args.yolov4_model_dir,
            args.video_tracking_dir, args.test_dataset,
            full_neg_txt=args.full_neg_test,
            batch_size=args.bs,
            num_workers=args.data_workers_for_test,
            worker_gpus=test_worker_gpus,
            roi_feat_resolution=cfg.roi_feat_resolution,
            ob_window_len=cfg.ob_window_len,
            interval=cfg.interval,
            tokens_per_frame=cfg.tokens_per_frame,
            visual_token_dim=cfg.visual_token_dim,
            augment=False,
            resample_negs_per_epoch=False,
            for_test=True)

        test_dataloader.save_to_txt(
            os.path.join(args.save, 'eval_data.txt'), dt=200)
    else:
        train_dataloader = XiaoduHiDecordLoader(
            attention_ctrl.feed_list, [place],
            args.yolov4_model_dir, args.decord_ds_pkl,
            decord_readers=args.decord_readers,
            yolov4_detectors=args.decord_detectors,
            post_workers=args.decord_post_workers,
            batch_size=args.bs,
            detector_gpus=train_worker_gpus,
            roi_feat_resolution=cfg.roi_feat_resolution,
            tokens_per_frame=cfg.tokens_per_frame,
            visual_token_dim=cfg.visual_token_dim,
            for_test=False)
        test_dataloader = XiaoduHiDecordLoader(
            attention_ctrl.feed_list, [place],
            args.yolov4_model_dir, args.decord_ds_pkl,
            decord_readers=args.decord_readers,
            yolov4_detectors=args.decord_detectors,
            post_workers=args.decord_post_workers,
            batch_size=args.bs,
            detector_gpus=test_worker_gpus,
            roi_feat_resolution=cfg.roi_feat_resolution,
            tokens_per_frame=cfg.tokens_per_frame,
            visual_token_dim=cfg.visual_token_dim,
            for_test=True)

    train_dataloader.start_workers()
    test_dataloader.start_workers()

    train_log = os.path.join(args.save, 'loss.csv')
    eval_log = os.path.join(args.save, 'eval.txt')
    with open(os.path.join(args.save, 'args.txt'), 'w') as f:
        f.write(str(args))

    tb_writer = SummaryWriter(
        logdir=os.path.join(args.save, 'logdir'),
        purge_step=None if _update_step == 0 else _update_step)

    worker_index = None if not args.distributed_training \
        else fleet.worker_index()

    # if worker_index == 0:
    #     eval_model(exe, test_program, preds, attention_ctrl.act_loss,
    #                test_dataloader, -1, log_file=eval_log,
    #                tb_writer=tb_writer, worker_index=worker_index)
    for epoch_id in range(start_epoch, args.epochs):
        print('--------------- Epoch %d ---------------' % epoch_id)
        train_epoch(exe, train_program, attention_ctrl, train_dataloader,
                    log_file=train_log, tb_writer=tb_writer,
                    worker_index=worker_index)

        save_dir = os.path.join(args.save, 'epoch_{}'.format(epoch_id))
        shutil.rmtree(save_dir, ignore_errors=True)
        os.mkdir(save_dir)
        fluid.io.save_params(exe, save_dir, main_program=train_program)

        if epoch_id > 0 and epoch_id % args.run_eval_after_epochs == 0:
            eval_model(exe, test_program, preds, attention_ctrl.act_loss,
                       test_dataloader, epoch_id, log_file=eval_log,
                       tb_writer=tb_writer)

        tb_state = os.path.join(save_dir, 'tb_state.txt')
        with open(tb_state, 'w') as f:
            f.write('{} {}'.format(_update_step, _eval_step))

    if epoch_id % args.run_eval_after_epochs != 0:
        eval_model(exe, test_program, preds, attention_ctrl.act_loss,
                   test_dataloader, epoch_id, log_file=eval_log,
                   tb_writer=tb_writer, worker_index=worker_index)

    train_dataloader.stop_workers()
    test_dataloader.stop_workers()
def main(args):
    """
    Call the configuration function of the model, build the model and load data, then start training.

    model_config:
        A json file with the model configurations, such as dropout rate, learning rate, number of tasks and so on.
    task_num:
        The number of tasks in the filtered ChEMBL dataset.

    PreGNNSupervisedModel:
        The PretrainGNNModel for the supervised pre-training strategy:
        graph-level multi-task supervised pre-training to jointly predict a diverse set of supervised labels of individual graphs.
    """
    model_config = json.load(open(args.model_config, 'r'))
    if args.dropout_rate is not None:
        model_config['dropout_rate'] = args.dropout_rate
    task_num = get_chembl_filtered_task_num()
    model_config['task_num'] = task_num

    ### build model
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = PreGNNSupervisedModel(model_config)
            model.forward()
            opt = fluid.optimizer.Adam(learning_rate=args.lr)
            if args.distributed:
                opt = get_distributed_optimizer(opt)
            opt.minimize(model.loss)
    with fluid.program_guard(test_prog, fluid.Program()):
        with fluid.unique_name.guard():
            model = PreGNNSupervisedModel(model_config)
            model.forward(is_test=True)
    """
    Use CUDAPlace for GPU training, or use CPUPlace for CPU training.

    """

    place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0))) \
            if args.use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.init_model is not None and args.init_model != "":
        load_partial_params(exe, args.init_model, train_prog)

    ### load data
    """
    PreGNNSupervisedFeaturizer:
        It is used along with `PreGNNSupervised`. It inherits from the super class `Featurizer`, which is used for feature extraction. The `Featurizer` has two functions: `gen_features` for converting a single raw SMILES string into a single graph data item, and `collate_fn` for aggregating a sublist of graph data into a big batch.

    splitter:
        The split type of the dataset: random, scaffold, or random scaffold. Here a random split is used.
        `ScaffoldSplitter` will first order the compounds according to their Bemis-Murcko scaffolds,
        then take the first `frac_train` proportion as the train set, the next `frac_valid` proportion as the valid set
        and the rest as the test set. `ScaffoldSplitter` can better evaluate the generalization ability of the model on
        out-of-distribution samples. Note that other splitters like `RandomSplitter`, `RandomScaffoldSplitter`
        and `IndexSplitter` are also available.
    """
    featurizer = PreGNNSupervisedFeaturizer(model.graph_wrapper)
    dataset = load_chembl_filtered_dataset(args.data_path,
                                           featurizer=featurizer)

    splitter = RandomSplitter()
    train_dataset, _, test_dataset = splitter.split(dataset,
                                                    frac_train=0.9,
                                                    frac_valid=0,
                                                    frac_test=0.1)
    if args.distributed:
        indices = list(
            range(fleet.worker_index(), len(train_dataset),
                  fleet.worker_num()))
        train_dataset = train_dataset[indices]
    print("Train/Test num: %s/%s" % (len(train_dataset), len(test_dataset)))

    ### start train
    """
    Run the train and evaluate functions to compute the train loss and test loss in each epoch.
    The number of epochs is `max_epoch`; change it if you need to.

    The train loss and test loss are printed for each epoch.
    Finally, the parameters of the best epoch (the one with the lowest test loss) are re-saved as `epoch_best`.

    """
    list_test_loss = []
    for epoch_id in range(args.max_epoch):
        train_loss = train(args, exe, train_prog, model, train_dataset,
                           featurizer)
        test_loss = evaluate(args, exe, test_prog, model, test_dataset,
                             featurizer)
        if not args.distributed or fleet.worker_index() == 0:
            fluid.io.save_params(exe,
                                 '%s/epoch%s' % (args.model_dir, epoch_id),
                                 train_prog)
            list_test_loss.append(test_loss)
            print("epoch:%d train/loss:%s" % (epoch_id, train_loss))
            print("epoch:%d test/loss:%s" % (epoch_id, test_loss))

    if not args.distributed or fleet.worker_index() == 0:
        best_epoch_id = np.argmin(list_test_loss)  # best epoch = lowest test loss
        fluid.io.load_params(exe,
                             '%s/epoch%d' % (args.model_dir, best_epoch_id),
                             train_prog)
        fluid.io.save_params(exe, '%s/epoch_best' % (args.model_dir),
                             train_prog)
        return list_test_loss[best_epoch_id]
Example #19
def train(args):
    print("pretraining start")
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    with open(args.task_group_json) as f:
        task_group = json.load(f)

    exec_strategy = fluid.ExecutionStrategy()
    if args.use_fast_executor:
        exec_strategy.use_experimental_executor = True
    exec_strategy.num_threads = 4 if args.use_amp else 2
    exec_strategy.num_iteration_per_drop_scope = min(1, args.skip_steps)

    node_nums = int(os.getenv("PADDLE_NODES_NUM"))
    print("args.is_distributed:", args.is_distributed)
    num_trainers = 1
    trainer_id = 0
    
    if args.is_distributed:
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        trainer_id = fleet.worker_index()
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = fleet.worker_endpoints()
        trainers_num = len(worker_endpoints)
        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"
              .format(worker_endpoints, trainers_num, current_endpoint, trainer_id))

        dist_strategy = DistributedStrategy()
        dist_strategy.exec_strategy = exec_strategy
        dist_strategy.remove_unnecessary_lock = False # not useful
        dist_strategy.fuse_all_reduce_ops = True if args.use_fuse else False
        dist_strategy.nccl_comm_num = args.nccl_comm_num

        if args.use_hierarchical_allreduce \
            and trainers_num > args.hierarchical_allreduce_inter_nranks:
                dist_strategy.use_hierarchical_allreduce = args.use_hierarchical_allreduce
                dist_strategy.hierarchical_allreduce_inter_nranks = \
                        args.hierarchical_allreduce_inter_nranks
                assert dist_strategy.hierarchical_allreduce_inter_nranks > 1
                assert trainers_num % dist_strategy.hierarchical_allreduce_inter_nranks == 0
                dist_strategy.hierarchical_allreduce_exter_nranks = \
                        trainers_num // dist_strategy.hierarchical_allreduce_inter_nranks

        if args.use_amp:
            dist_strategy.use_amp = True
            dist_strategy.amp_loss_scaling = args.init_loss_scaling
        if args.use_recompute:
            dist_strategy.forward_recompute = True
            dist_strategy.enable_sequential_execution = True

    else:
        dist_strategy = None

    gpu_id = 0
    gpus = fluid.core.get_cuda_device_count()
    if args.is_distributed:
        gpus = os.getenv("FLAGS_selected_gpus").split(",")
        gpu_id = int(gpus[0])

    if args.use_cuda:
        place = fluid.CUDAPlace(gpu_id)
        dev_count = len(gpus)
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    print("Device count %d, gpu_id:%d" % (dev_count, gpu_id))

    train_program = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            train_pyreader, fetch_vars = create_model(
                pyreader_name='train_reader', ernie_config=ernie_config, task_group=task_group)
            graph_vars = fetch_vars["graph_vars"]
            checkpoints = fetch_vars["checkpoints"]
            total_loss = graph_vars[-1]
            if args.use_recompute:
                dist_strategy.recompute_checkpoints = checkpoints
            scheduled_lr, loss_scaling = optimization(
                loss=total_loss,
                warmup_steps=args.warmup_steps,
                num_train_steps=args.num_train_steps,
                learning_rate=args.learning_rate,
                train_program=train_program,
                startup_prog=startup_prog,
                weight_decay=args.weight_decay,
                scheduler=args.lr_scheduler,
                use_fp16=args.use_amp,
                use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                init_loss_scaling=args.init_loss_scaling,
                incr_every_n_steps=args.incr_every_n_steps,
                decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                incr_ratio=args.incr_ratio,
                decr_ratio=args.decr_ratio,
                dist_strategy=dist_strategy)    

    origin_train_program = train_program
    if args.is_distributed:
        # Wrapped by fleet; need to assign fleet's modified train_program back.
        train_program = fleet.main_program
        origin_train_program = fleet._origin_program

    test_prog = fluid.Program()
    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            test_pyreader, fetch_vars = create_model(
                pyreader_name='test_reader', ernie_config=ernie_config, task_group=task_group)
            graph_vars = fetch_vars["graph_vars"]
            total_loss = graph_vars[-1]

    test_prog = test_prog.clone(for_test=True)
    
    exe = fluid.Executor(place)
    exe.run(startup_prog)
    
    if args.init_checkpoint and args.init_checkpoint != "":
        #init_checkpoint(exe, args.init_checkpoint, origin_train_program, args.use_amp)
        init_pretraining_params(exe, args.init_checkpoint, origin_train_program, args.use_amp)

    data_reader = ErnieDataReader(
        task_group,
        False,
        batch_size=args.batch_size,
        vocab_path=args.vocab_path,
        voc_size=ernie_config['vocab_size'],
        epoch=args.epoch,
        max_seq_len=args.max_seq_len,
        generate_neg_sample=args.generate_neg_sample,
        hack_old_trainset=args.hack_old_data)
    
    #only fleet
    train_exe = exe

    predict = predict_wrapper(
        args,
        exe,
        ernie_config,
        task_group,
        test_prog=test_prog,
        pyreader=test_pyreader,
        fetch_list=[var.name for var in graph_vars])

    train_pyreader.set_batch_generator(data_reader.data_generator())
    train_pyreader.start()
    steps = 112000
    time_begin = time.time()
    node_nums = int(os.getenv("PADDLE_NODES_NUM"))
    while True:  # steps < args.num_train_steps:
        try:
            steps += 1  # node_nums
            skip_steps = args.skip_steps  # * node_nums

            fetch_list = []
            if trainer_id == 0 and steps % skip_steps == 0:
                fetch_list = [var.name for var in graph_vars] + [scheduled_lr.name]
                if args.use_amp:
                    fetch_list.append(loss_scaling.name)

            outputs = train_exe.run(fetch_list=fetch_list, program=train_program)
            time_end = time.time()
            used_time = time_end - time_begin
            
            if outputs:
                each_mask_lm_cost, lm_w = outputs[:2]
                if args.use_amp:
                    each_total_constract_loss, each_total_cost, np_lr, l_scaling = outputs[-4:]
                else:
                    each_total_constract_loss, each_total_cost, np_lr = outputs[-3:]
                acc_list = []
                index = 2
                for task in task_group:
                    each_task_acc = outputs[index]
                    task_w = outputs[index + 1]
                    acc = np.sum(each_task_acc * task_w) / np.sum(task_w)
                    acc_list.append("%s acc: %f" % (task["task_name"], acc))
                    index += 2

                print("feed_queue size", train_pyreader.queue.size())
                epoch, current_file_index, total_file, current_file, mask_type = data_reader.get_progress()
                if args.use_amp:
                    print("current learning_rate:%f, loss scaling:%f" % (np_lr[0], l_scaling[0]))
                else:
                    print("current learning_rate:%f" % np_lr[0])
                print(
                    "epoch: %d, progress: %d/%d, step: %d, constract_loss: %f, loss: %f, "
                    "ppl: %f, %s, speed: %f steps/s, file: %s, mask_type: %s"
                    % (epoch, current_file_index, total_file, steps,
                       np.mean(each_total_constract_loss), np.mean(each_total_cost),
                       np.exp(np.sum(each_mask_lm_cost * lm_w) / np.sum(lm_w)),
                       ", ".join(acc_list), skip_steps / used_time,
                       current_file, mask_type))
                time_begin = time.time()
            elif steps % skip_steps == 0:
                epoch, current_file_index, total_file, current_file, mask_type = data_reader.get_progress(
                )
                print("feed_queue size", train_pyreader.queue.size())
                print("epoch: %d, progress: %d/%d, step: %d, "
                        "speed: %f steps/s, file: %s, mask_type: %s"
                        % (epoch, current_file_index, total_file, steps,
                            skip_steps / used_time, current_file, mask_type))
                time_begin = time.time()

            if not trainer_id == 0:
                continue

            if steps % args.save_steps == 0:
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, origin_train_program)

            if steps % args.validation_steps == 0:
                valid_list = predict()
                print("[validation_set] epoch: %d, step: %d, %s" % \
                      (epoch, steps, ", ".join(valid_list)))

        except fluid.core.EOFException:
            train_pyreader.reset()
            break
def main(args):
    """tbd"""
    model_config = json.load(open(args.model_config, 'r'))
    model_config['context_pooling'] = args.context_pooling

    ### build model
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = PreGNNContextpredModel(model_config)
            model.forward()
            opt = fluid.optimizer.Adam(learning_rate=args.lr)
            if args.distributed:
                opt = get_distributed_optimizer(opt)
            opt.minimize(model.loss)
    with fluid.program_guard(test_prog, fluid.Program()):
        with fluid.unique_name.guard():
            model = PreGNNContextpredModel(model_config)
            model.forward(is_test=True)

    place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0))) \
            if args.use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.init_model is not None and args.init_model != "":
        load_partial_params(exe, args.init_model, train_prog)

    ### load data
    k = model_config['layer_num']
    l1 = k - 1
    l2 = l1 + args.context_size
    featurizer = PreGNNContextPredFeaturizer(
            model.substruct_graph_wrapper, 
            model.context_graph_wrapper, 
            k, l1, l2)
    dataset = load_zinc_dataset(args.data_path, featurizer=featurizer)

    splitter = RandomSplitter()
    train_dataset, _, test_dataset = splitter.split(
            dataset, frac_train=0.9, frac_valid=0, frac_test=0.1)
    if args.distributed:
        # Shard the train set: worker i takes every worker_num()-th sample starting at its index.
        indices = list(range(fleet.worker_index(), len(train_dataset), fleet.worker_num()))
        train_dataset = train_dataset[indices]
    print("Train/Test num: %s/%s" % (len(train_dataset), len(test_dataset)))

    ### start train
    list_test_loss = []
    for epoch_id in range(args.max_epoch):
        train_loss = train(args, exe, train_prog, model, train_dataset, featurizer)
        test_loss = evaluate(args, exe, test_prog, model, test_dataset, featurizer)
        if not args.distributed or fleet.worker_index() == 0:
            fluid.io.save_params(exe, '%s/epoch%s' % (args.model_dir, epoch_id), train_prog)
            list_test_loss.append(test_loss)
            print("epoch:%d train/loss:%s" % (epoch_id, train_loss))
            print("epoch:%d test/loss:%s" % (epoch_id, test_loss))

    if not args.distributed or fleet.worker_index() == 0:
        best_epoch_id = np.argmin(list_test_loss)  # best epoch = lowest test loss
        fluid.io.load_params(exe, '%s/epoch%d' % (args.model_dir, best_epoch_id), train_prog)
        fluid.io.save_params(exe, '%s/epoch_best' % (args.model_dir), train_prog)
        return list_test_loss[best_epoch_id]
Example #21
    def train_and_eval(self):
        """
        :return:
        """
        if self.is_fleet and fleet.is_server():
            logging.debug("is fleet.server, over")
            return
        if self.is_fleet:
            logging.debug("worker_index%d start train...." %
                          fleet.worker_index())
        self.data_set_reader.train_reader.run()
        steps = 1
        save_checkpoints_path = os.path.join(self.params["checkpoints"],
                                             "save_checkpoints")
        time_begin = time.time()
        while True:
            try:
                if steps % self.params["train_log_step"] != 0:
                    self.run(InstanceName.TRAINING, need_fetch=False)
                else:
                    metrics_tensor_value = self.run(InstanceName.TRAINING,
                                                    need_fetch=True)

                    fetch_list_dict = collections.OrderedDict()
                    for key, value in zip(self.fetch_list_train_key,
                                          metrics_tensor_value):
                        fetch_list_dict[key] = value
                    time_end = time.time()
                    used_time = time_end - time_begin
                    meta_info = collections.OrderedDict()
                    """ print train log """
                    log_info = ""
                    each_mask_lm_cost = fetch_list_dict['mask_lm_loss']
                    lm_w = fetch_list_dict['lm_weight']
                    learning_rate = fetch_list_dict["scheduled_lr"]
                    progress_out = self.data_set_reader.train_reader.get_progress(
                    )
                    epoch, current_file_index, total_file, current_file, mask_type = progress_out
                    metric = collections.OrderedDict()
                    metric["epoch"] = epoch
                    metric["progress"] = "{}/{}".format(
                        current_file_index, total_file)
                    metric["step"] = steps
                    metric["loss"] = np.mean(
                        fetch_list_dict[InstanceName.LOSS])
                    metric["ppl"] = np.exp(
                        np.sum(each_mask_lm_cost * lm_w) / np.sum(lm_w))
                    for task in self.model_class.task_group:
                        name = task['task_name']
                        if name == 'mask_language_model':
                            continue
                        each_task_acc = fetch_list_dict["acc_" + name]
                        task_w = fetch_list_dict["task_weight_of_" + name]
                        acc = np.sum(each_task_acc * task_w) / np.sum(task_w)
                        metric["acc_" + name] = acc
                    metric["file"] = current_file
                    metric["mask_type"] = mask_type
                    metric["speed"] = "{} steps/s".format(
                        self.params['train_log_step'] / used_time)
                    log_info += ", ".join(
                        [":".join([k, str(v)]) for k, v, in metric.items()])
                    if self.params['use_fp16']:
                        log_info += ", current_learning_rate:%f, loss_scaling:%f" \
                                    % (fetch_list_dict["scheduled_lr"], fetch_list_dict["loss_scaling"])
                    else:
                        log_info += ", current_learning_rate:{}".format(
                            fetch_list_dict["scheduled_lr"])
                    time_begin = time.time()
                    logging.info(log_info)

                if steps % self.params["eval_step"] == 0:
                    if self.params["is_eval_dev"]:
                        self.evaluate(self.data_set_reader.dev_reader,
                                      InstanceName.EVALUATE, steps)
                    if self.params["is_eval_test"]:
                        self.evaluate(self.data_set_reader.test_reader,
                                      InstanceName.TEST, steps)

                if self.trainer_id == 0:
                    if steps % self.params["save_model_step"] == 0:
                        self.save_models(save_checkpoints_path,
                                         None,
                                         steps,
                                         save_inference=False)
                steps += 1
            except fluid.core.EOFException:
                self.save_models(save_checkpoints_path,
                                 None,
                                 steps,
                                 save_inference=False)
                self.data_set_reader.train_reader.stop()
                break
        if self.params["is_eval_dev"]:
            logging.info("Final evaluate result: ")
            self.evaluate(self.data_set_reader.dev_reader,
                          InstanceName.EVALUATE, steps)
        if self.params["is_eval_test"]:
            logging.info("Final test result: ")
            self.evaluate(self.data_set_reader.test_reader, InstanceName.TEST,
                          steps)

        self.save_models(save_checkpoints_path,
                         None,
                         steps,
                         save_inference=False)
        logging.info("Save checkpoint done!")
        logging.info("train and eval done!")