Example #1
def main(args):
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    config = get_config(args.config, overrides=args.override, show=True)
    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id)

    startup_prog = fluid.Program()
    valid_prog = fluid.Program()
    valid_dataloader, valid_fetchs = program.build(
        config, valid_prog, startup_prog, is_train=False)
    valid_prog = valid_prog.clone(for_test=True)

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    init_model(config, valid_prog, exe)

    valid_reader = Reader(config, 'valid')()
    valid_dataloader.set_sample_list_generator(valid_reader, place)

    compiled_valid_prog = program.compile(config, valid_prog)
    program.run(valid_dataloader, exe, compiled_valid_prog, valid_fetchs, -1,
                'eval', config)
Example #2
def distribute_train(args):
    # Determine, from environment variables, the role this machine/process plays in distributed training,
    # then initialize this node with the fleet API's init() method
    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)

    # The distributed run mode can be further specified through DistributeTranspilerConfig.
    # Below we set the run mode to asynchronous (async) and split the parameters so they can be placed on different nodes
    strategy = DistributeTranspilerConfig()
    strategy.sync_mode = False
    strategy.runtime_split_send_recv = True

    ctr_model = CTR()
    inputs = ctr_model.input_data(args)
    avg_cost, auc_var = ctr_model.net(inputs, args)

    # Configure the distributed optimizer with the strategy above and build the program
    optimizer = fluid.optimizer.Adam(args.learning_rate)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)

    # Run different logic depending on the node's role
    if fleet.is_server():
        # Initialize and run the parameter server node
        fleet.init_server()
        fleet.run_server()

    elif fleet.is_worker():
        # Initialize the worker node
        fleet.init_worker()

        exe = fluid.Executor(fluid.CPUPlace())
        # Run fleet.startup_program, which contains the distributed initialization logic
        exe.run(fleet.startup_program)
        dataset, file_list = get_dataset(inputs, args)
        for epoch in range(args.epochs):
            # Shuffle at file granularity
            random.shuffle(file_list)
            dataset.set_filelist(file_list)

            # Worker nodes run fleet.main_program, which has been pruned for distributed training
            start_time = time.time()
            exe.train_from_dataset(program=fleet.main_program,
                                   dataset=dataset,
                                   fetch_list=[auc_var],
                                   fetch_info=["Epoch {} auc ".format(epoch)],
                                   print_period=100,
                                   debug=False)
            end_time = time.time()
            logger.info("epoch %d finished, use time=%d\n" %
                        ((epoch), end_time - start_time))

            # By default, worker 0 (the first worker) saves the model
            if args.save_model and fleet.is_first_worker():
                model_path = os.path.join(str(args.model_path),
                                          "epoch_" + str(epoch))
                fleet.save_persistables(executor=exe, dirname=model_path)

        fleet.stop_worker()
        logger.info("Distribute Train Success!")
Example #3
def main(args):
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    config = get_config(args.config, overrides=args.override, show=True)
    # assign the place
    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id)

    # startup_prog is used to do some parameter init work,
    # and train prog is used to hold the network
    startup_prog = fluid.Program()
    train_prog = fluid.Program()

    train_dataloader, train_fetchs = program.build(config,
                                                   train_prog,
                                                   startup_prog,
                                                   is_train=True)

    if config.validate:
        valid_prog = fluid.Program()
        valid_dataloader, valid_fetchs = program.build(config,
                                                       valid_prog,
                                                       startup_prog,
                                                       is_train=False)
        # clone to prune some content which is irrelevant in valid_prog
        valid_prog = valid_prog.clone(for_test=True)

    # create the "Executor" with the statement of which place
    exe = fluid.Executor(place=place)
    # only run startup_prog once to init
    exe.run(startup_prog)

    # load model from checkpoint or pretrained model
    init_model(config, train_prog, exe)

    train_reader = Reader(config, 'train')()
    train_dataloader.set_sample_list_generator(train_reader, place)

    if config.validate:
        valid_reader = Reader(config, 'valid')()
        valid_dataloader.set_sample_list_generator(valid_reader, place)
        compiled_valid_prog = program.compile(config, valid_prog)

    compiled_train_prog = fleet.main_program
    for epoch_id in range(config.epochs):
        # 1. train with train dataset
        program.run(train_dataloader, exe, compiled_train_prog, train_fetchs,
                    epoch_id, 'train')
        # 2. validate with validate dataset
        if config.validate and epoch_id % config.valid_interval == 0:
            program.run(valid_dataloader, exe, compiled_valid_prog,
                        valid_fetchs, epoch_id, 'valid')

        # 3. save the persistable model
        if epoch_id % config.save_interval == 0:
            model_path = os.path.join(config.model_save_dir,
                                      config.ARCHITECTURE["name"])
            save_model(train_prog, model_path, epoch_id)
Example #4
def get_distributed_optimizer(optimizer):
    """
    Get the default collective distributed optimizer under fleet.
    """
    dist_strategy = DistributedStrategy()
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)
    return optimizer
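
A hedged usage sketch of the helper above, assuming Paddle 1.x static-graph mode and a script launched so that the PaddleCloud environment variables consumed by fleet are already set; the tiny network below is only a placeholder.

import paddle.fluid as fluid

# Placeholder network; any loss variable is wrapped the same way.
image = fluid.data(name='img', shape=[None, 784], dtype='float32')
label = fluid.data(name='label', shape=[None, 1], dtype='int64')
predict = fluid.layers.fc(input=image, size=10, act='softmax')
avg_cost = fluid.layers.mean(fluid.layers.cross_entropy(input=predict, label=label))

optimizer = fluid.optimizer.Momentum(learning_rate=0.1, momentum=0.9)
# Wrap the local optimizer; note that get_distributed_optimizer() also calls fleet.init().
optimizer = get_distributed_optimizer(optimizer)
optimizer.minimize(avg_cost)
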
Example #5
File: utils.py Project: zzsnow/PaddleHelix
def default_exe_params(is_distributed, use_cuda, thread_num):
    """
    Set the default execute parameters.
    """
    gpu_id = 0
    trainer_num = 1
    trainer_id = 0
    dist_strategy = None
    places = None
    if is_distributed:
        if use_cuda:
            role = role_maker.PaddleCloudRoleMaker(is_collective=True)
            fleet.init(role)

            gpu_id = int(os.getenv("FLAGS_selected_gpus"))
            trainer_num = fleet.worker_num()
            trainer_id = fleet.worker_index()

            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.use_experimental_executor = True
            exec_strategy.num_threads = 4
            exec_strategy.num_iteration_per_drop_scope = 1

            dist_strategy = DistributedStrategy()
            dist_strategy.exec_strategy = exec_strategy
            dist_strategy.nccl_comm_num = 2
            dist_strategy.fuse_all_reduce_ops = True

            dist_strategy.forward_recompute = True

            dist_strategy.use_amp = True
            dist_strategy.amp_loss_scaling = 12800.0

            places = fluid.cuda_places()
        else:
            print('Only gpu is supported for distributed mode at present.')
            exit(-1)
    else:
        if use_cuda:
            places = fluid.cuda_places()
        else:
            places = fluid.cpu_places(thread_num)
            os.environ['CPU_NUM'] = str(thread_num)

    if use_cuda:
        exe = fluid.Executor(fluid.CUDAPlace(gpu_id))
    else:
        exe = fluid.Executor(fluid.CPUPlace())

    return {
        'exe': exe,
        'trainer_num': trainer_num,
        'trainer_id': trainer_id,
        'gpu_id': gpu_id,
        'dist_strategy': dist_strategy,
        'places': places
    }
Example #6
    def test_ps_rolemaker(self):
        os.environ["TRAINING_ROLE"] = "PSERVER"
        os.environ["POD_IP"] = "127.0.0.1"

        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
        ro.generate_role()
        self.assertFalse(ro.is_worker())
        self.assertTrue(ro.is_server())
        self.assertEqual(ro.worker_num(), 2)
Example #7
    def append_additional_args(self, FLAGS):
        """
        append additional args from the existing args
        """
        # dataset_dir and train_dir are defined in paddlecloud and cannot be set by the user
        role = role_maker.PaddleCloudRoleMaker()
        fleet.init(role)

        return super(PaddleCloudFleetTrainer, self).append_additional_args(FLAGS)
Example #8
File: train.py Project: zbtpower/Knover
def train(args):
    """
    Train main function.
    """
    if args.is_distributed:
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)

        dev_count = fluid.core.get_cuda_device_count()
        gpu_id = int(os.getenv("FLAGS_selected_gpus"))
        trainers_num = fleet.worker_num()
        trainer_id = fleet.worker_index()
    else:
        dev_count = 1
        gpu_id = 0
        trainers_num = 1
        trainer_id = 0
    place = fluid.CUDAPlace(gpu_id)

    task = tasks.create_task(args)
    model = models.create_model(args, place)
    train_generator = task.reader.data_generator(input_file=args.train_file,
                                                 num_epochs=args.num_epochs,
                                                 num_part=trainers_num,
                                                 part_id=trainer_id,
                                                 phase="train")
    valid_generator = task.reader.data_generator(
        input_file=args.valid_file,
        num_part=dev_count,
        part_id=gpu_id,
        phase="distributed_valid" if args.is_distributed else "valid")

    # run training
    model_timer = Timer()
    for step, data in enumerate(train_generator(), 1):
        model_timer.start()
        metrics = task.train_step(model, data)
        model_timer.pause()
        if step % args.log_steps == 0:
            time_cost = model_timer.pass_time
            current_epoch, current_file_index, total_file = task.reader.get_train_progress(
            )
            print(
                f"[train][{current_epoch}] progress: {current_file_index}/{total_file} "
                f"step: {step}, time: {time_cost:.3f}, "
                f"speed: {args.log_steps / time_cost:.3f} steps/s")
            print("\tcurrent lr:", metrics.pop('scheduled_lr'))
            print("\t" + task.show_metrics(metrics))
            model_timer.reset()

        if step % args.validation_steps == 0:
            evaluate(task, model, valid_generator, args, dev_count, gpu_id)

        if step % args.save_steps == 0:
            save_path = f"{args.save_path}/step_{step}"
            model.save(save_path, is_checkpoint=True)
Example #9
    def test_tr_rolemaker(self):
        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["PADDLE_TRAINER_ID"] = "0"

        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
        ro.generate_role()

        self.assertTrue(ro.is_worker())
        self.assertFalse(ro.is_server())
        self.assertEqual(ro.worker_num(), 2)
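
The role-maker tests above work by seeding the environment variables that PaddleCloudRoleMaker inspects. Below is a standalone sketch of the same idea, assuming the Paddle 1.x fleet API; the exact set of variables required (and their defaults) varies between Paddle versions, so the list is illustrative rather than exhaustive.

import os
from paddle.fluid.incubate.fleet.base import role_maker

# Environment normally exported by a cluster launcher such as paddlecloud.
os.environ["TRAINING_ROLE"] = "TRAINER"
os.environ["PADDLE_TRAINER_ID"] = "0"
os.environ["PADDLE_TRAINERS_NUM"] = "1"
os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"

ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
ro.generate_role()
print("is_worker:", ro.is_worker(), "worker_num:", ro.worker_num())
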
Example #10
def main(args):
    log.info("start")

    worker_num = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
    num_devices = int(os.getenv("CPU_NUM", 10))

    model = Metapath2vecModel(config=args)
    pyreader = model.pyreader
    loss = model.forward()

    # init fleet
    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)

    train_steps = math.ceil(args.num_nodes * args.epochs / args.batch_size /
                            num_devices / worker_num)
    log.info("Train step: %s" % train_steps)

    real_batch_size = args.batch_size * args.walk_len * args.win_size
    if args.optimizer == "sgd":
        args.lr *= real_batch_size
    optimization(args.lr, loss, train_steps, args.optimizer)

    # init and run server or worker
    if fleet.is_server():
        fleet.init_server(args.warm_start_from_dir)
        fleet.run_server()

    if fleet.is_worker():
        log.info("start init worker done")
        fleet.init_worker()
        #just the worker, load the sample
        log.info("init worker done")

        exe = F.Executor(F.CPUPlace())
        exe.run(fleet.startup_program)
        log.info("Startup done")

        dataset = m2vGraph(args)
        log.info("Build graph done.")

        data_generator = multiprocess_data_generator(args, dataset)

        cur_time = time.time()
        for idx, _ in enumerate(data_generator()):
            log.info("iter %s: %s s" % (idx, time.time() - cur_time))
            cur_time = time.time()
            if idx == 100:
                break

        pyreader.decorate_tensor_provider(data_generator)
        pyreader.start()

        compiled_prog = build_complied_prog(fleet.main_program, loss)
        train_prog(exe, compiled_prog, loss, pyreader, args, train_steps)
Example #11
 def prepare_cpumulti_env(self, is_local):
     """
     :param is_local:
     :return:
     """
     if is_local:
         self.is_fleet = False
     else:
         role = role_maker.PaddleCloudRoleMaker()
         fleet.init(role)
         self.is_fleet = True
         logging.debug("init fleet cpu multi")
Example #12
    def test_distributed_basic(self):
        checker = acp._get_checker()
        fs = HDFSClient(checker.hdfs_home, None)
        fs.delete(checker.hdfs_checkpoint_path)
        self._reset_generator()

        logger.info("begin test_distributed_basic")
        fs = LocalFS()
        save_dir = "./run_save_0"
        fs.delete(save_dir)

        #basic
        exe, main_prog, startup_prog = self._generate()

        compiled, data_loader, optimizer, loss, image, label = \
            self._init_env(exe, main_prog, startup_prog, minimize=False)

        #fleet
        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["PADDLE_TRAINER_ID"] = "0"
        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"

        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)

        with fluid.program_guard(main_prog, startup_prog):
            dist_optimizer = fleet.distributed_optimizer(optimizer)
            dist_optimizer.minimize(loss)

        exe.run(startup_prog)

        o = None
        i = 0
        name = None
        for i in acp.train_epoch_range(3, 0):
            o = acp._get_train_epoch_range()
            name = o.name
            logger.info("_run_save_0 name:{} epoch_no:{}".format(o.name, i))

            for data in data_loader():
                fetch = exe.run(fleet.main_program,
                                feed=data,
                                fetch_list=[loss])

            self.assertEqual(len(o._exe_status), 1)

        o = acp._get_train_epoch_range()
        assert o is None, "train epoch range must not exist now"
        self.assertEqual(i, 2)

        fs.delete(save_dir)

        logger.info("end test_distributed_basic")
Example #13
def main(args):
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    config = get_config(args.config, overrides=args.override, show=True)
    place = env.place()

    startup_prog = fluid.Program()
    train_prog = fluid.Program()

    train_dataloader, train_fetchs = program.build(
        config, train_prog, startup_prog, is_train=True)

    if config.validate:
        valid_prog = fluid.Program()
        valid_dataloader, valid_fetchs = program.build(
            config, valid_prog, startup_prog, is_train=False)
        valid_prog = valid_prog.clone(for_test=True)

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    init_model(config, train_prog, exe)

    train_reader = Reader(config, 'train')()
    train_dataloader.set_sample_list_generator(train_reader, place)

    if config.validate:
        valid_reader = Reader(config, 'valid')()
        valid_dataloader.set_sample_list_generator(valid_reader, place)
        compiled_valid_prog = program.compile(config, valid_prog)

    compiled_train_prog = fleet.main_program
    for epoch_id in range(config.epochs):
        program.run(train_dataloader, exe, compiled_train_prog, train_fetchs,
                    epoch_id, 'train')

        if config.validate and epoch_id % config.valid_interval == 0:
            program.run(valid_dataloader, exe, compiled_valid_prog,
                        valid_fetchs, epoch_id, 'valid')

        if epoch_id % config.save_interval == 0:
            model_path = os.path.join(config.model_save_dir,
                                      config.architecture)
            save_model(train_prog, model_path, epoch_id)
Example #14
 def run_nccl_trainer(self, args):
     """run fleet api"""
     assert args.update_method == "nccl"
     import paddle.fluid as fluid
     import six
     from paddle.fluid.incubate.fleet.collective import fleet
     exec_strategy = fluid.ExecutionStrategy()
     exec_strategy.num_threads = args.run_params['num_threads']
     #dist_strategy = DistributedStrategy()
     #dist_strategy.exec_strategy = exec_strategy
     #dist_strategy.fuse_memory_size = 1  # MB
     #dist_strategy.fuse_laryer_size = 1
     if args.role.upper() != "TRAINER":
         raise ValueError("args role must be TRAINER")
     role = role_maker.PaddleCloudRoleMaker(is_collective=True)
     fleet.init(role)
     strategy = DistributeTranspilerConfig()
     avg_cost = self.net(args)
     losses = self.do_training(fleet,args)
     losses = "" if not losses else losses
     print(losses)
Example #15
 def run_nccl_trainer(self, args):
     """
     run nccl trainer, used for gpu case.
     Args:
         args (ArgumentParser): run args to config dist fleet.
     """
     assert args.update_method == "nccl"
     from paddle.fluid.incubate.fleet.collective import fleet
     exec_strategy = fluid.ExecutionStrategy()
     exec_strategy.num_threads = args.run_params['num_threads']
     #dist_strategy = DistributedStrategy()
     #dist_strategy.exec_strategy = exec_strategy
     #dist_strategy.fuse_memory_size = 1  # MB
     #dist_strategy.fuse_laryer_size = 1
     if args.role.upper() != "TRAINER":
         raise ValueError("args role must be TRAINER")
     role = role_maker.PaddleCloudRoleMaker(is_collective=True)
     fleet.init(role)
     avg_cost = self.net(args)
     losses = self.do_training(fleet, args)
     losses = "" if not losses else losses
     print(losses)
Example #16
    def _test_check_point(self, fs, dir_path):
        file_name = "persistables"

        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["PADDLE_TRAINER_ID"] = "0"
        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"

        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)

        image = fluid.data(name='img', shape=[None, 28, 28], dtype='float32')
        label = fluid.data(name='label', shape=[None, 1], dtype='int64')
        feeder = fluid.DataFeeder(feed_list=[image, label],
                                  place=fluid.CPUPlace())
        predict = fluid.layers.fc(input=image, size=10, act='softmax')
        loss = fluid.layers.cross_entropy(input=predict, label=label)
        avg_loss = fluid.layers.mean(loss)
        optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.001)

        dist_optimizer = fleet.distributed_optimizer(optimizer)
        dist_optimizer.minimize(avg_loss)

        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(fluid.default_startup_program())

        status = TrainStatus(2)
        fleet.save_check_point(exe, dir_path, train_status=status, fs=fs)
        n1 = fleet._get_last_checkpoint_no(dir_path, fs=fs)

        status2 = fleet.load_check_point(exe, dir_path, trainer_id=0, fs=fs)
        self.assertEqual(status2, status)

        fleet.save_check_point(exe, dir_path, train_status=status, fs=fs)
        n2 = fleet._get_last_checkpoint_no(dir_path, fs=fs)
        self.assertEqual(n2, n1 + 1)

        fleet.clean_redundant_check_points(dir_path, fs=fs)
Example #17
 def test_traing_role(self):
     os.environ["TRAINING_ROLE"] = "TEST"
     ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
     self.assertRaises(ValueError, ro.generate_role)
Example #18
File: learner.py Project: zhch8888168/PGL
 def __init__(self):
     role = role_maker.PaddleCloudRoleMaker(is_collective=True)
     cfleet.init(role)
Example #19
def compress(args):
    shuffle = True
    if args.ce_test:
        # set seed
        seed = 111
        paddle.seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        args.num_workers = 0
        shuffle = False

    env = os.environ
    num_trainers = int(env.get('PADDLE_TRAINERS_NUM', 1))
    use_data_parallel = num_trainers > 1

    if use_data_parallel:
        # Fleet step 1: initialize the distributed environment
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)

    train_reader = None
    test_reader = None
    if args.data == "mnist":
        transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
        train_dataset = paddle.vision.datasets.MNIST(
            mode='train', backend="cv2", transform=transform)
        val_dataset = paddle.vision.datasets.MNIST(
            mode='test', backend="cv2", transform=transform)
        class_dim = 10
        image_shape = "1,28,28"
        args.pretrained_model = False
    elif args.data == "cifar10":
        transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
        train_dataset = paddle.vision.datasets.Cifar10(
            mode="train", backend="cv2", transform=transform)
        val_dataset = paddle.vision.datasets.Cifar10(
            mode="test", backend="cv2", transform=transform)
        class_dim = 10
        image_shape = "3, 32, 32"
        args.pretrained_model = False
    elif args.data == "imagenet":
        import imagenet_reader as reader
        train_dataset = reader.ImageNetDataset(mode='train')
        val_dataset = reader.ImageNetDataset(mode='val')
        class_dim = 1000
        image_shape = "3,224,224"
    else:
        raise ValueError("{} is not supported.".format(args.data))
    image_shape = [int(m) for m in image_shape.split(",")]
    assert args.model in model_list, "{} is not in lists: {}".format(args.model,
                                                                     model_list)
    if args.use_gpu:
        places = paddle.static.cuda_places()
    else:
        places = paddle.static.cpu_places()
    place = places[0]
    exe = paddle.static.Executor(place)

    image = paddle.static.data(
        name='image', shape=[None] + image_shape, dtype='float32')
    label = paddle.static.data(name='label', shape=[None, 1], dtype='int64')

    batch_size_per_card = args.batch_size
    batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset,
        batch_size=batch_size_per_card,
        shuffle=shuffle,
        drop_last=True)

    train_loader = paddle.io.DataLoader(
        train_dataset,
        places=place,
        batch_sampler=batch_sampler,
        feed_list=[image, label],
        return_list=False,
        use_shared_memory=True,
        num_workers=args.num_workers)

    valid_loader = paddle.io.DataLoader(
        val_dataset,
        places=place,
        feed_list=[image, label],
        drop_last=False,
        return_list=False,
        use_shared_memory=True,
        batch_size=args.batch_size_for_validation,
        shuffle=False)

    step_per_epoch = int(
        np.ceil(len(train_dataset) * 1. / args.batch_size / num_trainers))

    # model definition
    model = models.__dict__[args.model]()
    out = model.net(input=image, class_dim=class_dim)
    if args.data == 'cifar10':
        label = paddle.reshape(label, [-1, 1])
    cost = paddle.nn.functional.loss.cross_entropy(input=out, label=label)
    avg_cost = paddle.mean(x=cost)
    acc_top1 = paddle.metric.accuracy(input=out, label=label, k=1)
    acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5)

    val_program = paddle.static.default_main_program().clone(for_test=True)

    opt, learning_rate = create_optimizer(args, step_per_epoch)

    # Fleet step 2: distributed strategy
    if use_data_parallel:
        dist_strategy = DistributedStrategy()
        dist_strategy.sync_batch_norm = False
        dist_strategy.exec_strategy = paddle.static.ExecutionStrategy()
        dist_strategy.fuse_all_reduce_ops = False

    train_program = paddle.static.default_main_program()

    if args.pruning_strategy == 'gmp':
        # GMP pruner step 0: define configs for GMP, no need to define configs for the base training.
        configs = {
            'stable_iterations': args.stable_epochs * step_per_epoch,
            'pruning_iterations': args.pruning_epochs * step_per_epoch,
            'tunning_iterations': args.tunning_epochs * step_per_epoch,
            'resume_iteration': (args.last_epoch + 1) * step_per_epoch,
            'pruning_steps': args.pruning_steps,
            'initial_ratio': args.initial_ratio,
        }
    elif args.pruning_strategy == 'base':
        configs = None

    # GMP pruner step 1: initialize a pruner object by calling entry function.
    pruner = create_unstructured_pruner(
        train_program, args, place, configs=configs)

    if use_data_parallel:
        # Fleet step 3: decorate the original optimizer and minimize it
        opt = fleet.distributed_optimizer(opt, strategy=dist_strategy)
    opt.minimize(avg_cost, no_grad_set=pruner.no_grad_set)

    exe.run(paddle.static.default_startup_program())
    if args.last_epoch > -1:
        assert args.checkpoint is not None and os.path.exists(
            args.checkpoint), "Please specify a valid checkpoint path."
        paddle.fluid.io.load_persistables(
            executor=exe, dirname=args.checkpoint, main_program=train_program)

    elif args.pretrained_model:
        assert os.path.exists(
            args.
            pretrained_model), "Pretrained model path {} doesn't exist".format(
                args.pretrained_model)

        def if_exist(var):
            return os.path.exists(os.path.join(args.pretrained_model, var.name))

        _logger.info("Load pretrained model from {}".format(
            args.pretrained_model))
        # NOTE: We are using fluid.io.load_vars() because the pretrained model is from an older version which requires this API. 
        # Please consider using paddle.static.load(program, model_path) when possible
        paddle.fluid.io.load_vars(
            exe, args.pretrained_model, predicate=if_exist)

    def test(epoch, program):
        acc_top1_ns = []
        acc_top5_ns = []

        _logger.info(
            "The current sparsity of the inference model is {}%".format(
                round(100 * UnstructuredPruner.total_sparse(
                    paddle.static.default_main_program()), 2)))
        for batch_id, data in enumerate(valid_loader):
            start_time = time.time()
            acc_top1_n, acc_top5_n = exe.run(
                program, feed=data, fetch_list=[acc_top1.name, acc_top5.name])
            end_time = time.time()
            if batch_id % args.log_period == 0:
                _logger.info(
                    "Eval epoch[{}] batch[{}] - acc_top1: {}; acc_top5: {}; time: {}".
                    format(epoch, batch_id,
                           np.mean(acc_top1_n),
                           np.mean(acc_top5_n), end_time - start_time))
            acc_top1_ns.append(np.mean(acc_top1_n))
            acc_top5_ns.append(np.mean(acc_top5_n))

        _logger.info("Final eval epoch[{}] - acc_top1: {}; acc_top5: {}".format(
            epoch,
            np.mean(np.array(acc_top1_ns)), np.mean(np.array(acc_top5_ns))))

    def train(epoch, program):
        train_reader_cost = 0.0
        train_run_cost = 0.0
        total_samples = 0
        reader_start = time.time()
        for batch_id, data in enumerate(train_loader):
            train_reader_cost += time.time() - reader_start
            train_start = time.time()
            loss_n, acc_top1_n, acc_top5_n = exe.run(
                program,
                feed=data,
                fetch_list=[avg_cost.name, acc_top1.name, acc_top5.name])
            # GMP pruner step 2: step() to update ratios and other internal states of the pruner.
            pruner.step()
            train_run_cost += time.time() - train_start
            total_samples += args.batch_size
            loss_n = np.mean(loss_n)
            acc_top1_n = np.mean(acc_top1_n)
            acc_top5_n = np.mean(acc_top5_n)
            if batch_id % args.log_period == 0:
                _logger.info(
                    "epoch[{}]-batch[{}] lr: {:.6f} - loss: {}; acc_top1: {}; acc_top5: {}; avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec".
                    format(epoch, batch_id,
                           learning_rate.get_lr(), loss_n, acc_top1_n,
                           acc_top5_n, train_reader_cost / args.log_period, (
                               train_reader_cost + train_run_cost
                           ) / args.log_period, total_samples / args.log_period,
                           total_samples / (train_reader_cost + train_run_cost
                                            )))
                train_reader_cost = 0.0
                train_run_cost = 0.0
                total_samples = 0
            learning_rate.step()
            reader_start = time.time()

    if use_data_parallel:
        # Fleet step 4: get the compiled program from fleet
        compiled_train_program = fleet.main_program
    else:
        compiled_train_program = paddle.static.CompiledProgram(
            paddle.static.default_main_program())

    for i in range(args.last_epoch + 1, args.num_epochs):
        train(i, compiled_train_program)
        # GMP pruner step 3: update params before summarizing sparsity, saving the model, or evaluating.
        pruner.update_params()

        _logger.info("The current sparsity of the pruned model is: {}%".format(
            round(100 * UnstructuredPruner.total_sparse(
                paddle.static.default_main_program()), 2)))

        if (i + 1) % args.test_period == 0:
            test(i, val_program)
        if (i + 1) % args.model_period == 0:
            if use_data_parallel:
                fleet.save_persistables(executor=exe, dirname=args.model_path)
            else:
                paddle.fluid.io.save_persistables(
                    executor=exe, dirname=args.model_path)
Example #20
def main():
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)  # new line 3
    fleet.init(role)  # new line 4
    env = os.environ

    num_trainers = int(env.get('PADDLE_TRAINERS_NUM', 0))
    assert num_trainers != 0, "multi-machine training process must be started using distributed.launch..."
    trainer_id = int(env.get("PADDLE_TRAINER_ID", 0))

    # set different seeds for different trainers
    random.seed(trainer_id)
    np.random.seed(trainer_id)

    if FLAGS.enable_ce:
        random.seed(0)
        np.random.seed(0)

    cfg = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    check_config(cfg)
    # check whether use_gpu=True is set with the CPU version of PaddlePaddle
    check_gpu(cfg.use_gpu)
    # check whether the installed PaddlePaddle version satisfies the requirement
    check_version()

    save_only = getattr(cfg, 'save_prediction_only', False)
    if save_only:
        raise NotImplementedError('The config file only supports prediction;'
                                  ' the training stage is not implemented yet')
    main_arch = cfg.architecture

    assert cfg.use_gpu == True, "GPU must be supported for multi-machine training..."
    devices_num = fluid.core.get_cuda_device_count()

    if 'FLAGS_selected_gpus' in env:
        device_id = int(env['FLAGS_selected_gpus'])
    else:
        device_id = 0
    place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    lr_builder = create('LearningRate')
    optim_builder = create('OptimizerBuilder')

    # build program
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    if FLAGS.enable_ce:
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = create(main_arch)
            if FLAGS.fp16:
                assert (getattr(model.backbone, 'norm_type', None)
                        != 'affine_channel'), \
                    '--fp16 currently does not support affine channel, ' \
                    ' please modify backbone settings to use batch norm'

            with mixed_precision_context(FLAGS.loss_scale, FLAGS.fp16) as ctx:
                inputs_def = cfg['TrainReader']['inputs_def']
                feed_vars, train_loader = model.build_inputs(**inputs_def)
                train_fetches = model.train(feed_vars)
                loss = train_fetches['loss']
                if FLAGS.fp16:
                    loss *= ctx.get_loss_scale_var()
                lr = lr_builder()
                optimizer = optim_builder(lr)

                dist_strategy = DistributedStrategy()
                sync_bn = getattr(model.backbone, 'norm_type',
                                  None) == 'sync_bn'
                dist_strategy.sync_batch_norm = sync_bn
                dist_strategy.nccl_comm_num = 1
                exec_strategy = fluid.ExecutionStrategy()
                exec_strategy.num_threads = 3
                exec_strategy.num_iteration_per_drop_scope = 30
                dist_strategy.exec_strategy = exec_strategy
                dist_strategy.fuse_all_reduce_ops = True
                optimizer = fleet.distributed_optimizer(
                    optimizer, strategy=dist_strategy)  # new line 5

                optimizer.minimize(loss)

                if FLAGS.fp16:
                    loss /= ctx.get_loss_scale_var()

            if 'use_ema' in cfg and cfg['use_ema']:
                global_steps = _decay_step_counter()
                ema = ExponentialMovingAverage(cfg['ema_decay'],
                                               thres_steps=global_steps)
                ema.update()

    # parse train fetches
    train_keys, train_values, _ = parse_fetches(train_fetches)
    train_values.append(lr)

    if FLAGS.eval:
        eval_prog = fluid.Program()
        with fluid.program_guard(eval_prog, startup_prog):
            with fluid.unique_name.guard():
                model = create(main_arch)
                inputs_def = cfg['EvalReader']['inputs_def']
                feed_vars, eval_loader = model.build_inputs(**inputs_def)
                fetches = model.eval(feed_vars)
        eval_prog = eval_prog.clone(True)

        eval_reader = create_reader(cfg.EvalReader, devices_num=1)
        # In iterable mode, use set_sample_list_generator(eval_reader, place)
        eval_loader.set_sample_list_generator(eval_reader)

        # parse eval fetches
        extra_keys = []
        if cfg.metric == 'COCO':
            extra_keys = ['im_info', 'im_id', 'im_shape']
        if cfg.metric == 'VOC':
            extra_keys = ['gt_bbox', 'gt_class', 'is_difficult']
        if cfg.metric == 'WIDERFACE':
            extra_keys = ['im_id', 'im_shape', 'gt_bbox']
        eval_keys, eval_values, eval_cls = parse_fetches(
            fetches, eval_prog, extra_keys)

    exe.run(startup_prog)
    compiled_train_prog = fleet.main_program

    if FLAGS.eval:
        compiled_eval_prog = fluid.CompiledProgram(eval_prog)

    fuse_bn = getattr(model.backbone, 'norm_type', None) == 'affine_channel'

    ignore_params = cfg.finetune_exclude_pretrained_params \
                 if 'finetune_exclude_pretrained_params' in cfg else []

    start_iter = 0
    if FLAGS.resume_checkpoint:
        checkpoint.load_checkpoint(exe, train_prog, FLAGS.resume_checkpoint)
        start_iter = checkpoint.global_step()
    elif cfg.pretrain_weights and fuse_bn and not ignore_params:
        checkpoint.load_and_fusebn(exe, train_prog, cfg.pretrain_weights)
    elif cfg.pretrain_weights:
        checkpoint.load_params(exe,
                               train_prog,
                               cfg.pretrain_weights,
                               ignore_params=ignore_params)

    train_reader = create_reader(cfg.TrainReader,
                                 (cfg.max_iters - start_iter) * devices_num,
                                 cfg,
                                 devices_num=devices_num)
    # In iterable mode, use set_sample_list_generator(train_reader, place)
    train_loader.set_sample_list_generator(train_reader)

    # whether output bbox is normalized in model output layer
    is_bbox_normalized = False
    if hasattr(model, 'is_bbox_normalized') and \
            callable(model.is_bbox_normalized):
        is_bbox_normalized = model.is_bbox_normalized()

    # if map_type is not set, use the default '11point'; only used in VOC eval
    map_type = cfg.map_type if 'map_type' in cfg else '11point'

    train_stats = TrainingStats(cfg.log_iter, train_keys)
    train_loader.start()
    start_time = time.time()
    end_time = time.time()

    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)
    time_stat = deque(maxlen=cfg.log_iter)
    best_box_ap_list = [0.0, 0]  #[map, iter]

    # use VisualDL to log data
    if FLAGS.use_vdl:
        assert six.PY3, "VisualDL requires Python >= 3.5"
        from visualdl import LogWriter
        vdl_writer = LogWriter(FLAGS.vdl_log_dir)
        vdl_loss_step = 0
        vdl_mAP_step = 0

    for it in range(start_iter, cfg.max_iters):
        start_time = end_time
        end_time = time.time()
        time_stat.append(end_time - start_time)
        time_cost = np.mean(time_stat)
        eta_sec = (cfg.max_iters - it) * time_cost
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        outs = exe.run(compiled_train_prog, fetch_list=train_values)
        stats = {k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1])}

        # use vdl-paddle to log loss
        if FLAGS.use_vdl:
            if it % cfg.log_iter == 0:
                for loss_name, loss_value in stats.items():
                    vdl_writer.add_scalar(loss_name, loss_value, vdl_loss_step)
                vdl_loss_step += 1

        train_stats.update(stats)
        logs = train_stats.log()
        if it % cfg.log_iter == 0 and trainer_id == 0:
            strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format(
                it, np.mean(outs[-1]), logs, time_cost, eta)
            logger.info(strs)

        # NOTE : profiler tools, used for benchmark
        if FLAGS.is_profiler and it == 5:
            profiler.start_profiler("All")
        elif FLAGS.is_profiler and it == 10:
            profiler.stop_profiler("total", FLAGS.profiler_path)
            return


        if (it > 0 and it % cfg.snapshot_iter == 0 or it == cfg.max_iters - 1) \
           and trainer_id == 0:
            save_name = str(it) if it != cfg.max_iters - 1 else "model_final"
            if 'use_ema' in cfg and cfg['use_ema']:
                exe.run(ema.apply_program)
            checkpoint.save(exe, train_prog, os.path.join(save_dir, save_name))

            if FLAGS.eval:
                # evaluation
                resolution = None
                if 'Mask' in cfg.architecture:
                    resolution = model.mask_head.resolution
                results = eval_run(exe,
                                   compiled_eval_prog,
                                   eval_loader,
                                   eval_keys,
                                   eval_values,
                                   eval_cls,
                                   cfg,
                                   resolution=resolution)
                box_ap_stats = eval_results(results, cfg.metric,
                                            cfg.num_classes, resolution,
                                            is_bbox_normalized,
                                            FLAGS.output_eval, map_type,
                                            cfg['EvalReader']['dataset'])

                # use vdl_paddle to log mAP
                if FLAGS.use_vdl:
                    vdl_writer.add_scalar("mAP", box_ap_stats[0], vdl_mAP_step)
                    vdl_mAP_step += 1

                if box_ap_stats[0] > best_box_ap_list[0]:
                    best_box_ap_list[0] = box_ap_stats[0]
                    best_box_ap_list[1] = it
                    checkpoint.save(exe, train_prog,
                                    os.path.join(save_dir, "best_model"))
                logger.info("Best test box ap: {}, in iter: {}".format(
                    best_box_ap_list[0], best_box_ap_list[1]))

            if 'use_ema' in cfg and cfg['use_ema']:
                exe.run(ema.restore_program)

    train_loader.reset()
Example #21
def main(args):
    cfg = XiaoduHiConfig()
    cfg.scene_sensor_algo = 'yolov4'

    wae_ndarray = np.load(os.path.join(args.wae_dir, 'raw_wae.npy'))
    start_epoch = 0

    train_program = fluid.Program()
    startup_program = fluid.Program()

    with fluid.program_guard(train_program, startup_program):
        attention_ctrl = AttentionController(
            inputs_type=args.inputs_type,
            num_actions=wae_ndarray.shape[0],
            act_tr_dim=wae_ndarray.shape[1],
            act_emb_ndarray=wae_ndarray,
            num_frames=cfg.ob_window_len,
            tokens_per_frame=cfg.tokens_per_frame,
            visual_token_dim=cfg.visual_token_dim,
            model_dim=args.model_dim,
            num_decoder_blocks=args.num_decoder_blocks,
            num_heads=args.num_heads,
            ffn_dim=args.ffn_dim,
            dropout=args.dropout,
            normalize_before=args.normalize_before,
            frame_emb_trainable=args.frame_emb_trainable,
            trigger_loss_coef=args.trigger_loss_coef,
            obj_loss_coef=args.obj_loss_coef,
            act_loss_coef=args.act_loss_coef,
            use_last_act_loss=args.use_last_act_loss,
            mode='train')
        preds = attention_ctrl.predict()
        test_program = train_program.clone(for_test=True)

        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=args.lr,
            regularization=fluid.regularizer.L2Decay(
                regularization_coeff=0.1))
        if args.distributed_training:
            optimizer = fleet.distributed_optimizer(optimizer)
            role = role_maker.PaddleCloudRoleMaker(is_collective=True)
            fleet.init(role)

        optimizer.minimize(attention_ctrl.loss)

    if args.distributed_training:
        place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0)))
    else:
        place = fluid.CUDAPlace(args.gpu)

    exe = fluid.Executor(place)
    exe.run(startup_program)

    if args.inputs_type.startswith('inst_crop') and \
       args.inputs_type != 'inst_crop_wo_crop':
        fluid.io.load_vars(
            exe, MobileNetV2_Pretrained,
            main_program=train_program,
            predicate=lambda v: os.path.exists(
                os.path.join(MobileNetV2_Pretrained, v.name)))
        print('Loaded weights from {}'.format(MobileNetV2_Pretrained))

    if args.init_params is not None:
        base = os.path.basename(args.init_params)
        if base.startswith('epoch_'):
            start_epoch = int(base[len('epoch_'):]) + 1

        tb_state = os.path.join(args.init_params, 'tb_state.txt')
        if os.path.exists(tb_state):
            global _update_step
            global _eval_step
            with open(tb_state, 'r') as f:
                update_step, eval_step = f.readline().split(' ')
                _update_step = int(update_step)
                _eval_step = int(eval_step)

        fluid.io.load_vars(
            exe, args.init_params,
            main_program=train_program,
            predicate=lambda v: os.path.exists(
                os.path.join(args.init_params, v.name)))
        print('Loaded weights from {}'.format(args.init_params))

    if args.distributed_training:
        train_worker_gpus = [int(os.environ.get('FLAGS_selected_gpus', 0))]
        test_worker_gpus = train_worker_gpus
    else:
        train_worker_gpus = convert_gpu_ids(args.data_worker_gpus_for_train)
        test_worker_gpus = convert_gpu_ids(args.data_worker_gpus_for_test)

    if not os.path.exists(args.save):
        os.makedirs(args.save)

    if not args.use_decord:
        train_dataloader = XiaoduHiDataloaderv2(
            attention_ctrl.feed_list, [place], args.yolov4_model_dir,
            args.video_tracking_dir, args.train_dataset,
            full_neg_txt=args.full_neg_train,
            batch_size=args.bs,
            num_workers=args.data_workers_for_train,
            worker_gpus=train_worker_gpus,
            roi_feat_resolution=cfg.roi_feat_resolution,
            ob_window_len=cfg.ob_window_len,
            interval=cfg.interval,
            tokens_per_frame=cfg.tokens_per_frame,
            visual_token_dim=cfg.visual_token_dim,
            augment=False,
            resample_negs_per_epoch=True)
        test_dataloader = XiaoduHiDataloaderv2(
            attention_ctrl.feed_list, [place], args.yolov4_model_dir,
            args.video_tracking_dir, args.test_dataset,
            full_neg_txt=args.full_neg_test,
            batch_size=args.bs,
            num_workers=args.data_workers_for_test,
            worker_gpus=test_worker_gpus,
            roi_feat_resolution=cfg.roi_feat_resolution,
            ob_window_len=cfg.ob_window_len,
            interval=cfg.interval,
            tokens_per_frame=cfg.tokens_per_frame,
            visual_token_dim=cfg.visual_token_dim,
            augment=False,
            resample_negs_per_epoch=False,
            for_test=True)

        test_dataloader.save_to_txt(
            os.path.join(args.save, 'eval_data.txt'), dt=200)
    else:
        train_dataloader = XiaoduHiDecordLoader(
            attention_ctrl.feed_list, [place],
            args.yolov4_model_dir, args.decord_ds_pkl,
            decord_readers=args.decord_readers,
            yolov4_detectors=args.decord_detectors,
            post_workers=args.decord_post_workers,
            batch_size=args.bs,
            detector_gpus=train_worker_gpus,
            roi_feat_resolution=cfg.roi_feat_resolution,
            tokens_per_frame=cfg.tokens_per_frame,
            visual_token_dim=cfg.visual_token_dim,
            for_test=False)
        test_dataloader = XiaoduHiDecordLoader(
            attention_ctrl.feed_list, [place],
            args.yolov4_model_dir, args.decord_ds_pkl,
            decord_readers=args.decord_readers,
            yolov4_detectors=args.decord_detectors,
            post_workers=args.decord_post_workers,
            batch_size=args.bs,
            detector_gpus=test_worker_gpus,
            roi_feat_resolution=cfg.roi_feat_resolution,
            tokens_per_frame=cfg.tokens_per_frame,
            visual_token_dim=cfg.visual_token_dim,
            for_test=True)

    train_dataloader.start_workers()
    test_dataloader.start_workers()

    train_log = os.path.join(args.save, 'loss.csv')
    eval_log = os.path.join(args.save, 'eval.txt')
    with open(os.path.join(args.save, 'args.txt'), 'w') as f:
        f.write(str(args))

    tb_writer = SummaryWriter(
        logdir=os.path.join(args.save, 'logdir'),
        purge_step=None if _update_step == 0 else _update_step)

    worker_index = None if not args.distributed_training \
        else fleet.worker_index()

    # if worker_index == 0:
    #     eval_model(exe, test_program, preds, attention_ctrl.act_loss,
    #                test_dataloader, -1, log_file=eval_log,
    #                tb_writer=tb_writer, worker_index=worker_index)
    for epoch_id in range(start_epoch, args.epochs):
        print('--------------- Epoch %d ---------------' % epoch_id)
        train_epoch(exe, train_program, attention_ctrl, train_dataloader,
                    log_file=train_log, tb_writer=tb_writer,
                    worker_index=worker_index)

        save_dir = os.path.join(args.save, 'epoch_{}'.format(epoch_id))
        shutil.rmtree(save_dir, ignore_errors=True)
        os.mkdir(save_dir)
        fluid.io.save_params(exe, save_dir, main_program=train_program)

        if epoch_id > 0 and epoch_id % args.run_eval_after_epochs == 0:
            eval_model(exe, test_program, preds, attention_ctrl.act_loss,
                       test_dataloader, epoch_id, log_file=eval_log,
                       tb_writer=tb_writer)

        tb_state = os.path.join(save_dir, 'tb_state.txt')
        with open(tb_state, 'w') as f:
            f.write('{} {}'.format(_update_step, _eval_step))

    if epoch_id % args.run_eval_after_epochs != 0:
        eval_model(exe, test_program, preds, attention_ctrl.act_loss,
                   test_dataloader, epoch_id, log_file=eval_log,
                   tb_writer=tb_writer, worker_index=worker_index)

    train_dataloader.stop_workers()
    test_dataloader.stop_workers()
Example #22
    def run_gpu_fleet_api_trainer(self, args):
        assert args.update_method == "nccl2"

        self.lr = args.lr

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = 1

        dist_strategy = DistributedStrategy()
        dist_strategy.exec_strategy = exec_strategy
        dist_strategy.fuse_memory_size = 1  # MB
        dist_strategy.fuse_laryer_size = 1
        if args.use_local_sgd:
            dist_strategy.use_local_sgd = True
        if args.ut4grad_allreduce:
            dist_strategy._ut4grad_allreduce = True
        if args.sync_batch_norm:
            dist_strategy.sync_batch_norm = True

        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        print_to_err("gpu_fleet", "fleet.node_num:")
        # "fleet.node_id:", fleet.node_id(),
        # "fleet.trainer_num:", fleet.worker_num())

        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
            self.get_model(batch_size=args.batch_size, dist_strategy=dist_strategy)

        trainer_prog = fleet._origin_program
        dist_prog = fleet.main_program

        device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
        place = fluid.CUDAPlace(device_id)

        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())
        eprint(type(self).__name__, "run worker startup program done.")

        feed_var_list = [
            var for var in trainer_prog.global_block().vars.values()
            if var.is_data
        ]

        eprint("feed_var_list:", feed_var_list)

        # tmp add this code to pass python35 gcc8 CI
        # Fixme(gongweibao, wangxi), need fix fleet api program order
        if feed_var_list[0].name == 'label':
            feed_var_list = feed_var_list[::-1]

        feeder = fluid.DataFeeder(feed_var_list, place)
        reader_generator = train_reader()

        def get_data():
            origin_batch = next(reader_generator)
            if args.update_method != "local" and args.use_reader_alloc:
                new_batch = []
                for offset, item in enumerate(origin_batch):
                    if offset % 2 == args.trainer_id:
                        new_batch.append(item)
                return new_batch
            else:
                return origin_batch

        print_to_err(type(self).__name__, "begin to train on trainer")
        out_losses = []
        for i in six.moves.xrange(RUN_STEP):
            loss, = exe.run(dist_prog,
                            fetch_list=[avg_cost.name],
                            feed=feeder.feed(get_data()))
            out_losses.append(loss[0])
            print_to_err(type(self).__name__, "run step %d finished" % i)
        print_to_err(type(self).__name__, "trainer run finished")

        if six.PY2:
            print(pickle.dumps(out_losses))
        else:
            sys.stdout.buffer.write(pickle.dumps(out_losses))

        if args.save_model:
            model_save_dir = "/tmp"
            if fleet.worker_index() == 0:
                model_save_dir_fluid = os.path.join(model_save_dir,
                                                    "fluid_persistables")
                model_save_dir_fleet = os.path.join(model_save_dir,
                                                    "fleet_persistables")
                infer_save_dir_fluid = os.path.join(model_save_dir,
                                                    "fluid_infer")
                infer_save_dir_fleet = os.path.join(model_save_dir,
                                                    "fleet_infer")
            else:
                model_save_dir_fluid = os.path.join(model_save_dir,
                                                    "fluid_persistables_2")
                model_save_dir_fleet = os.path.join(model_save_dir,
                                                    "fleet_persistables_2")
                infer_save_dir_fluid = os.path.join(model_save_dir,
                                                    "fluid_infer_2")
                infer_save_dir_fleet = os.path.join(model_save_dir,
                                                    "fleet_infer_2")
            fluid.io.save_persistables(exe, model_save_dir_fluid,
                                       fleet._origin_program)
            fleet.save_persistables(executor=exe, dirname=model_save_dir_fleet)
            feeded_var_names = [var.name for var in feed_var_list]
            fluid.io.save_inference_model(infer_save_dir_fluid,
                                          feeded_var_names, [avg_cost], exe,
                                          fleet._origin_program)
            fleet.save_inference_model(exe, infer_save_dir_fleet,
                                       feeded_var_names, [avg_cost])
예제 #23
0
def train(args):
    print("pretraining start")
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    with open(args.task_group_json) as f:
        task_group = json.load(f)

    exec_strategy = fluid.ExecutionStrategy()
    if args.use_fast_executor:
        exec_strategy.use_experimental_executor = True
    exec_strategy.num_threads = 4 if args.use_amp else 2
    exec_strategy.num_iteration_per_drop_scope = min(1, args.skip_steps)

    node_nums = int(os.getenv("PADDLE_NODES_NUM"))
    print("args.is_distributed:", args.is_distributed)
    num_trainers = 1
    trainer_id = 0
    
    if args.is_distributed:
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        trainer_id = fleet.worker_index()
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = fleet.worker_endpoints()
        trainers_num = len(worker_endpoints)
        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"
              .format(worker_endpoints, trainers_num, current_endpoint, trainer_id))

        dist_strategy = DistributedStrategy()
        dist_strategy.exec_strategy = exec_strategy
        dist_strategy.remove_unnecessary_lock = False # not useful
        dist_strategy.fuse_all_reduce_ops = True if args.use_fuse else False
        dist_strategy.nccl_comm_num = args.nccl_comm_num

        if args.use_hierarchical_allreduce \
            and trainers_num > args.hierarchical_allreduce_inter_nranks:
                dist_strategy.use_hierarchical_allreduce = args.use_hierarchical_allreduce
                dist_strategy.hierarchical_allreduce_inter_nranks = \
                        args.hierarchical_allreduce_inter_nranks
                assert dist_strategy.use_hierarchical_allreduce > 1
                assert trainers_num % dist_strategy.hierarchical_allreduce_inter_nranks == 0
                dist_strategy.hierarchical_allreduce_exter_nranks = \
                         trainers_num / dist_strategy.hierarchical_allreduce_inter_nranks

        if args.use_amp:
            dist_strategy.use_amp = True
            dist_strategy.amp_loss_scaling = args.init_loss_scaling
        if args.use_recompute:
            dist_strategy.forward_recompute = True
            dist_strategy.enable_sequential_execution=True

        trainer_id = fleet.worker_index()
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = fleet.worker_endpoints()
        trainers_num = len(worker_endpoints)
        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"
              .format(worker_endpoints,trainers_num, current_endpoint, trainer_id))
    else:
        dist_strategy = None

    gpu_id = 0
    gpus = fluid.core.get_cuda_device_count()
    if args.is_distributed:
        gpus = os.getenv("FLAGS_selected_gpus").split(",")
        gpu_id = int(gpus[0])

    if args.use_cuda:
        place = fluid.CUDAPlace(gpu_id)
        # `gpus` is a device count (int) locally but a list of selected gpu ids
        # in the distributed case, so derive dev_count accordingly
        dev_count = len(gpus) if args.is_distributed else gpus
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    print("Device count %d, gpu_id:%d" % (dev_count, gpu_id))

    train_program = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            train_pyreader, fetch_vars = create_model(
                pyreader_name='train_reader', ernie_config=ernie_config, task_group=task_group)
            graph_vars = fetch_vars["graph_vars"]
            checkpoints = fetch_vars["checkpoints"]
            total_loss = graph_vars[-1]
            if args.use_recompute:
                dist_strategy.recompute_checkpoints = checkpoints
            scheduled_lr, loss_scaling = optimization(
                loss=total_loss,
                warmup_steps=args.warmup_steps,
                num_train_steps=args.num_train_steps,
                learning_rate=args.learning_rate,
                train_program=train_program,
                startup_prog=startup_prog,
                weight_decay=args.weight_decay,
                scheduler=args.lr_scheduler,
                use_fp16=args.use_amp,
                use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                init_loss_scaling=args.init_loss_scaling,
                incr_every_n_steps=args.incr_every_n_steps,
                decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                incr_ratio=args.incr_ratio,
                decr_ratio=args.decr_ratio,
                dist_strategy=dist_strategy)    

    origin_train_program = train_program
    if args.is_distributed:
        # wrapped by fleet, need to assign fleet's modified train_program back
        train_program = fleet.main_program
        origin_train_program = fleet._origin_program

    test_prog = fluid.Program()
    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            test_pyreader, fetch_vars = create_model(
                pyreader_name='test_reader', ernie_config=ernie_config, task_group=task_group)
            graph_vars = fetch_vars["graph_vars"]
            total_loss = graph_vars[-1]

    test_prog = test_prog.clone(for_test=True)
    
    exe = fluid.Executor(place)
    exe.run(startup_prog)
    
    if args.init_checkpoint and args.init_checkpoint != "":
        #init_checkpoint(exe, args.init_checkpoint, origin_train_program, args.use_amp)
        init_pretraining_params(exe, args.init_checkpoint, origin_train_program, args.use_amp)

    data_reader = ErnieDataReader(
        task_group,
        False,
        batch_size=args.batch_size,
        vocab_path=args.vocab_path,
        voc_size=ernie_config['vocab_size'],
        epoch=args.epoch,
        max_seq_len=args.max_seq_len,
        generate_neg_sample=args.generate_neg_sample,
        hack_old_trainset=args.hack_old_data)
    
    # with fleet collective training, the plain executor runs fleet.main_program
    # directly, so no separate ParallelExecutor is created here
    train_exe = exe

    predict = predict_wrapper(
        args,
        exe,
        ernie_config,
        task_group,
        test_prog=test_prog,
        pyreader=test_pyreader,
        fetch_list=[var.name for var in graph_vars])

    train_pyreader.set_batch_generator(data_reader.data_generator())
    train_pyreader.start()
    steps = 112000
    time_begin = time.time()
    node_nums = int(os.getenv("PADDLE_NODES_NUM"))
    while True:  # steps < args.num_train_steps:
        try:
            steps += 1  # node_nums
            skip_steps = args.skip_steps  # * node_nums

            fetch_list = []
            if trainer_id == 0 and steps % skip_steps == 0:
                fetch_list = [var.name for var in graph_vars] + [scheduled_lr.name]
                if args.use_amp:
                    fetch_list.append(loss_scaling.name)

            outputs = train_exe.run(fetch_list=fetch_list, program=train_program)
            time_end = time.time()
            used_time = time_end - time_begin
            
            if outputs:
                each_mask_lm_cost, lm_w = outputs[:2]
                if args.use_amp:
                    each_total_constract_loss, each_total_cost, np_lr, l_scaling = outputs[-4:]
                else:
                    each_total_constract_loss, each_total_cost, np_lr = outputs[-3:]
                acc_list =[]
                index = 2
                for task in task_group:
                    each_task_acc = outputs[index]
                    task_w = outputs[index + 1]
                    acc = np.sum(each_task_acc * task_w) / np.sum(task_w)
                    acc_list.append("%s acc: %f" % (task["task_name"], acc))
                    index += 2

                print("feed_queue size", train_pyreader.queue.size())
                epoch, current_file_index, total_file, current_file, mask_type = data_reader.get_progress()
                if args.use_amp:
                    print("current learning_rate:%f, loss scaling:%f" % (np_lr[0], l_scaling[0]))
                else:
                    print("current learning_rate:%f" % np_lr[0])
                print(
                    "epoch: %d, progress: %d/%d, step: %d, constract_loss: %f, loss: %f, "
                    "ppl: %f, %s, speed: %f steps/s, file: %s, mask_type: %s"
                    % (epoch, current_file_index, total_file, steps,
                       np.mean(each_total_constract_loss), np.mean(each_total_cost),
                       np.exp(np.sum(each_mask_lm_cost * lm_w) / np.sum(lm_w)),
                       ", ".join(acc_list), skip_steps / used_time,
                       current_file, mask_type))
                time_begin = time.time()
            elif steps % skip_steps == 0:
                epoch, current_file_index, total_file, current_file, mask_type = data_reader.get_progress(
                )
                print("feed_queue size", train_pyreader.queue.size())
                print("epoch: %d, progress: %d/%d, step: %d, "
                        "speed: %f steps/s, file: %s, mask_type: %s"
                        % (epoch, current_file_index, total_file, steps,
                            skip_steps / used_time, current_file, mask_type))
                time_begin = time.time()

            if trainer_id != 0:
                continue

            if steps % args.save_steps == 0:
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, origin_train_program)

            if steps % args.validation_steps == 0:
                valid_list = predict()
                print("[validation_set] epoch: %d, step: %d, %s" % \
                      (epoch, steps, ", ".join(valid_list)))

        except fluid.core.EOFException:
            train_pyreader.reset()
            break
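Stripped of the ERNIE-specific pieces, the distributed setup in this example reduces to the usual collective-fleet skeleton. The sketch below is only an outline under the same imports; `loss` stands in for whatever training loss the network produces:

# hedged sketch of the collective-fleet boilerplate used above
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)

dist_strategy = DistributedStrategy()
dist_strategy.nccl_comm_num = 2               # tune per cluster
dist_strategy.fuse_all_reduce_ops = True

optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)
optimizer.minimize(loss)                      # `loss` built beforehand

train_program = fleet.main_program            # program the executor actually runs
origin_train_program = fleet._origin_program  # unmodified program, used for saving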
Example #24
def train(args):
    """
    Train main function.
    """
    if args.is_distributed:
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)

        dev_count = fluid.core.get_cuda_device_count()
        gpu_id = int(os.getenv("FLAGS_selected_gpus"))
        trainers_num = fleet.worker_num()
        trainer_id = fleet.worker_index()
    else:
        dev_count = 1
        gpu_id = 0
        trainers_num = 1
        trainer_id = 0
    place = fluid.CUDAPlace(gpu_id)

    task = tasks.create_task(args)
    model = models.create_model(args, place)
    train_generator = task.get_data_loader(model,
                                           input_file=args.train_file,
                                           num_epochs=args.num_epochs,
                                           num_part=trainers_num,
                                           part_id=trainer_id,
                                           phase="train")
    valid_generator = task.get_data_loader(
        model,
        input_file=args.valid_file,
        num_part=dev_count,
        part_id=gpu_id,
        phase="distributed_valid" if args.is_distributed else "valid")

    # run training
    timer = Timer()
    timer.start()
    if args.Model.model == 'NSPModel':
        best_metrics = 0.0
    else:
        best_metrics = 10000
    shuffledatafile()
    for step, data in enumerate(train_generator(), args.start_step + 1):
        outputs = task.train_step(model, data)
        timer.pause()
        if step % args.log_steps == 0:
            time_cost = timer.pass_time
            current_epoch, current_file_index, total_file = task.reader.get_train_progress(
            )
            print(
                f"[train][{current_epoch}] progress: {current_file_index}/{total_file} "
                f"step: {step}, time: {time_cost:.3f}, "
                f"speed: {args.log_steps / time_cost:.3f} steps/s")
            print(f"\tcurrent lr: {outputs.pop('scheduled_lr'):.7f}")
            metrics = task.get_metrics(outputs)
            print("\t" + ", ".join(f"{k}: {v:.4f}"
                                   for k, v in metrics.items()))
            timer.reset()

        if step % args.validation_steps == 0:

            # shuffledatafile()
            metrics = evaluate(task, model, valid_generator, args, dev_count,
                               gpu_id, step)
            if args.Model.model == 'NSPModel' and metrics[
                    'nsp_acc'] > best_metrics:
                best_metrics = metrics['nsp_acc']
                save_path = f"{args.save_path}/step_{step}_{best_metrics}"
                model.save(save_path, is_checkpoint=True)

            elif args.Model.model == 'Plato' and metrics['loss'] < best_metrics:
                best_metrics = metrics['loss']
                save_path = f"{args.save_path}/step_{step}_{best_metrics}"
                model.save(save_path, is_checkpoint=True)
        # if step % args.save_steps == 0 and trainer_id == 0:
        #     save_path = f"{args.save_path}/step_{step}"
        #     model.save(save_path, is_checkpoint=True)
        #     with open(save_path + ".finish", "w") as f:
        #         pass

        timer.start()
Example #25
    def run_gpu_fleet_api_trainer(self, args):
        assert args.update_method == "nccl2"

        self.lr = args.lr

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = 1

        dist_strategy = DistributedStrategy()
        dist_strategy.exec_strategy = exec_strategy
        dist_strategy.fuse_memory_size = 1  # MB
        dist_strategy.fuse_laryer_size = 1
        if args.use_local_sgd:
            dist_strategy.use_local_sgd = True
        if args.ut4grad_allreduce:
            dist_strategy._ut4grad_allreduce = True

        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        print_to_err("gpu_fleet", "fleet.node_num:")
        # "fleet.node_id:", fleet.node_id(),
        # "fleet.trainer_num:", fleet.worker_num())

        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
            self.get_model(batch_size=args.batch_size, dist_strategy=dist_strategy)

        trainer_prog = fleet._origin_program
        dist_prog = fleet.main_program

        device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
        place = fluid.CUDAPlace(device_id)

        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())
        eprint(type(self).__name__, "run worker startup program done.")

        feed_var_list = [
            var for var in trainer_prog.global_block().vars.values()
            if var.is_data
        ]

        feeder = fluid.DataFeeder(feed_var_list, place)
        reader_generator = train_reader()

        def get_data():
            origin_batch = next(reader_generator)
            if args.update_method != "local" and args.use_reader_alloc:
                new_batch = []
                for offset, item in enumerate(origin_batch):
                    if offset % 2 == args.trainer_id:
                        new_batch.append(item)
                return new_batch
            else:
                return origin_batch

        print_to_err(type(self).__name__, "begin to train on trainer")
        out_losses = []
        for i in six.moves.xrange(RUN_STEP):
            loss, = exe.run(dist_prog,
                            fetch_list=[avg_cost.name],
                            feed=feeder.feed(get_data()))
            out_losses.append(loss[0])
            print_to_err(type(self).__name__, "run step %d finished" % i)
        print_to_err(type(self).__name__, "trainer run finished")

        if six.PY2:
            print(pickle.dumps(out_losses))
        else:
            sys.stdout.buffer.write(pickle.dumps(out_losses))
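Example #26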
dataset.set_pipe_command(pipe_command)
dataset.set_batch_size(32)
dataset.set_thread(10)
dataset.set_hdfs_config("hdfs://192.168.48.87:9000", "root,")
optimizer = fluid.optimizer.SGD(0.0001)
#optimizer.minimize(avg_cost)
exe = fluid.Executor(fluid.CPUPlace())

input_folder = "hdfs:"
output = sp.check_output(
    "hdfs dfs -ls /train_data | awk '{if(NR>1) print $8}'", shell=True)
train_filelist = [
    "{}{}".format(input_folder, f)
    for f in output.decode('ascii').strip().split('\n')
]
role = role_maker.PaddleCloudRoleMaker()
fleet.init(role)

config = DistributeTranspilerConfig()
config.sync_mode = False

optimizer = fleet.distributed_optimizer(optimizer, config)
optimizer.minimize(avg_cost)

if fleet.is_server():
    fleet.init_server()
    fleet.run_server()
elif fleet.is_worker():
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    fleet.init_worker()
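Example #27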
    def _test_checkpoint(self, fs, dir_path):
        file_name = "persistables"

        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["PADDLE_TRAINER_ID"] = "0"
        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"

        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)

        image = fluid.data(name='img', shape=[None, 28, 28], dtype='float32')
        label = fluid.data(name='label', shape=[None, 1], dtype='int64')
        feeder = fluid.DataFeeder(feed_list=[image, label],
                                  place=fluid.CPUPlace())
        predict = fluid.layers.fc(input=image, size=10, act='softmax')
        loss = fluid.layers.cross_entropy(input=predict, label=label)
        avg_loss = fluid.layers.mean(loss)
        optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.001)

        dist_optimizer = fleet.distributed_optimizer(optimizer)
        dist_optimizer.minimize(avg_loss)

        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(fluid.default_startup_program())

        status = ExeTrainStatus()
        status.epoch_no = 2
        _, n1 = fleet.save_checkpoint(exe,
                                      dir_path,
                                      trainer_id=0,
                                      train_status=status,
                                      fs=fs)

        status2 = ExeTrainStatus()
        fleet.load_checkpoint(exe,
                              dir_path,
                              trainer_id=0,
                              fs=fs,
                              train_status=status2)
        self.assertEqual(status2, status)

        _, n2 = fleet.save_checkpoint(exe,
                                      dir_path,
                                      trainer_id=0,
                                      train_status=status,
                                      fs=fs,
                                      remain_all_checkpoint=False)
        self.assertEqual(n2, n1 + 1)

        c = CheckpointSaver(fs)
        cp_nos = c.get_checkpoint_no(dir_path)
        assert len(cp_nos) == 1  # cleanup all others

        # abnormal case
        # test remain_all_checkpoint
        fleet.save_checkpoint(exe,
                              dir_path,
                              trainer_id=0,
                              train_status=status,
                              fs=fs,
                              remain_all_checkpoint=False)

        # can't save under a file
        fs = LocalFS()
        cache_path = "./.load_cache"
        fs.touch(cache_path)
        raised = False
        try:
            fleet.save_checkpoint(exe,
                                  dir_path,
                                  trainer_id=0,
                                  train_status=status,
                                  fs=fs,
                                  cache_path=cache_path)
        except Exception:
            raised = True
        self.assertTrue(raised)

        # can't load under a file
        raised = False
        try:
            fleet.load_checkpoint(exe,
                                  dir_path,
                                  trainer_id=0,
                                  train_status=status2,
                                  fs=fs,
                                  cache_path=cache_path)
        except Exception:
            raised = True
        self.assertTrue(raised)
        fs.delete(cache_path)
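Outside of the unit-test harness, the same checkpoint round trip amounts to the calls below (a sketch assuming a collective fleet program is already built and `./checkpoints` is the target directory):

fs = LocalFS()                       # or an HDFS-backed filesystem client
status = ExeTrainStatus()
status.epoch_no = current_epoch      # `current_epoch` assumed from the training loop

# write a numbered checkpoint under ./checkpoints and keep the returned number
_, ckpt_no = fleet.save_checkpoint(exe, "./checkpoints",
                                   trainer_id=0, train_status=status, fs=fs)

# later: restore parameters and the training status in one call
restored = ExeTrainStatus()
fleet.load_checkpoint(exe, "./checkpoints",
                      trainer_id=0, fs=fs, train_status=restored)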
Example #28
def main(args):
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    config = get_config(args.config, overrides=args.override, show=True)
    # assign the place
    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id)

    # startup_prog is used to do some parameter init work,
    # and train prog is used to hold the network
    startup_prog = fluid.Program()
    train_prog = fluid.Program()

    best_top1_acc = 0.0  # best top1 acc record

    if not config.get('use_ema'):
        train_dataloader, train_fetchs = program.build(config,
                                                       train_prog,
                                                       startup_prog,
                                                       is_train=True)
    else:
        train_dataloader, train_fetchs, ema = program.build(config,
                                                            train_prog,
                                                            startup_prog,
                                                            is_train=True)

    if config.validate:
        valid_prog = fluid.Program()
        valid_dataloader, valid_fetchs = program.build(config,
                                                       valid_prog,
                                                       startup_prog,
                                                       is_train=False)
        # clone to prune some content which is irrelevant in valid_prog
        valid_prog = valid_prog.clone(for_test=True)

    # create the "Executor" with the statement of which place
    exe = fluid.Executor(place)
    # Parameter initialization
    exe.run(startup_prog)

    # load model from 1. checkpoint to resume training, 2. pretrained model to finetune
    init_model(config, train_prog, exe)

    train_reader = Reader(config, 'train')()
    train_dataloader.set_sample_list_generator(train_reader, place)

    if config.validate:
        valid_reader = Reader(config, 'valid')()
        valid_dataloader.set_sample_list_generator(valid_reader, place)
        compiled_valid_prog = program.compile(config, valid_prog)

    compiled_train_prog = fleet.main_program
    vdl_writer = LogWriter(args.vdl_dir) if args.vdl_dir else None

    for epoch_id in range(config.epochs):
        # 1. train with train dataset
        program.run(train_dataloader, exe, compiled_train_prog, train_fetchs,
                    epoch_id, 'train', vdl_writer)
        if int(os.getenv("PADDLE_TRAINER_ID", 0)) == 0:
            # 2. validate with validate dataset
            if config.validate and epoch_id % config.valid_interval == 0:
                if config.get('use_ema'):
                    logger.info(logger.coloring("EMA validate start..."))
                    with ema.apply(exe):
                        top1_acc = program.run(valid_dataloader, exe,
                                               compiled_valid_prog,
                                               valid_fetchs, epoch_id, 'valid')
                    logger.info(logger.coloring("EMA validate over!"))

                top1_acc = program.run(valid_dataloader, exe,
                                       compiled_valid_prog, valid_fetchs,
                                       epoch_id, 'valid')
                if top1_acc > best_top1_acc:
                    best_top1_acc = top1_acc
                    message = "The best top1 acc {:.5f}, in epoch: {:d}".format(
                        best_top1_acc, epoch_id)
                    logger.info("{:s}".format(logger.coloring(message, "RED")))
                    if epoch_id % config.save_interval == 0:

                        model_path = os.path.join(config.model_save_dir,
                                                  config.ARCHITECTURE["name"])
                        save_model(train_prog, model_path,
                                   "best_model_in_epoch_" + str(epoch_id))

            # 3. save the persistable model
            if epoch_id % config.save_interval == 0:
                model_path = os.path.join(config.model_save_dir,
                                          config.ARCHITECTURE["name"])
                save_model(train_prog, model_path, epoch_id)
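Example #29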
def main(args):
    with open(args.config, 'r') as f:
        config = json.load(f)

    logging.info('Load data ...')
    if len(args.dataset.split(',')) > 1:
        # for large pretraining dataset, ZINC15 and ChEMBL
        # directly load the processed npz files
        train_data_list = []
        for ds in args.dataset.split(','):
            # use processed data.npz
            train_data_list.extend(
                load_data(os.path.join(args.root, ds, 'processed')))
            # dataset = MoleculeDataset(
            #     args.root, ds,
            #     add_symmetry=False,
            #     add_self_loop=False)
            # data_list = dataset.get_data_list()
            # processed_dir = os.path.join(args.root, ds, 'processed')
            # os.makedirs(processed_dir, exist_ok=True)
            # save_data_list_to_npz(
            #     data_list, os.path.join(processed_dir, 'data.npz'))

            # logging.info('Processed {}'.format(ds))
            # train_data_list.extend(data_list)
    else:
        if args.dataset == 'mutag':
            train_data_list, _ = load_mutag_dataset(
                os.path.join(args.root, args.dataset, 'raw'))
        elif args.dataset == 'ptc_mr':
            train_data_list, _ = load_ptc_mr_dataset(
                os.path.join(args.root, args.dataset, 'raw'))
        else:
            raise ValueError('Unsupported dataset')

    if args.is_fleet:
        train_data_list = [
            x for i, x in enumerate(train_data_list)
            if i % fleet.worker_num() == fleet.worker_index()
        ]
    logging.info("Data loaded.")
    logging.info("Train Examples: %s" % len(train_data_list))
    sys.stdout.flush()

    if args.emb_dir is not None:
        os.makedirs(args.emb_dir, exist_ok=True)

    train_prog = F.Program()
    test_prog = F.Program()
    startup_prog = F.Program()
    with F.program_guard(train_prog, startup_prog):
        with F.unique_name.guard():
            agent = create_model(args, config)
            test_prog = train_prog.clone(for_test=True)

            opt = F.optimizer.Adam(learning_rate=args.lr)
            if args.is_fleet:
                dist_strategy = DistributedStrategy()
                role = role_maker.PaddleCloudRoleMaker(is_collective=True)
                fleet.init(role)
                opt = fleet.distributed_optimizer(opt, strategy=dist_strategy)
            opt.minimize(agent.loss)

    place = F.CUDAPlace(0) if args.use_cuda else F.CPUPlace()
    exe = F.Executor(place)
    exe.run(startup_prog)

    if (not args.dont_save_emb) and \
       (not args.is_fleet or fleet.worker_index() == 0):
        save_embedding(args, exe, test_prog, agent, train_data_list, -1)

    for epoch_id in range(args.max_epoch):
        train(args, exe, train_prog, agent, train_data_list, epoch_id)
        if not args.is_fleet or fleet.worker_index() == 0:
            F.io.save_params(exe, '%s/epoch%s' % (args.model_dir, epoch_id),
                             train_prog)
            if not args.dont_save_emb:
                save_embedding(args, exe, test_prog, agent, train_data_list,
                               epoch_id)
Example #30
def main(args):
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        dev_list = fluid.cuda_places()
        place = dev_list[0]
        dev_count = len(dev_list)
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    reader = reader_ce.ClassifyReader(vocab_path=args.vocab_path,
                                      label_map_config=args.label_map_config,
                                      max_seq_len=args.max_seq_len,
                                      total_num=args.train_data_size,
                                      do_lower_case=args.do_lower_case,
                                      in_tokens=args.in_tokens,
                                      random_seed=args.random_seed,
                                      tokenizer=args.tokenizer,
                                      for_cn=args.for_cn,
                                      task_id=args.task_id)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    if args.do_test:
        assert args.test_save is not None
    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.predict_batch_size is None:
        args.predict_batch_size = args.batch_size

    if args.do_train:
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        dev_count = fleet.worker_num()

        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            dev_count=1,
            trainer_id=fleet.worker_index(),
            trainer_num=fleet.worker_num(),
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count
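        # e.g. with in_tokens=True, 3 epochs x 100k examples, batch_size=8192 tokens,
        # max_seq_len=512 and dev_count=8 (illustrative figures only):
        #     300000 // (8192 // 512) // 8 = 300000 // 16 // 8 = 2343 steps
        # with in_tokens=False the same numbers give 300000 // 8192 // 8 = 4 steps,
        # since batch_size then counts whole examples rather than tokens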

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        log.info("Device count: %d" % dev_count)
        log.info("Num train examples: %d" % num_train_examples)
        log.info("Max train steps: %d" % max_train_steps)
        log.info("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        # use fleet api
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        if args.is_distributed:
            exec_strategy.num_threads = 3

        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        dist_strategy = DistributedStrategy()
        dist_strategy.exec_strategy = exec_strategy
        dist_strategy.nccl_comm_num = 1
        if args.is_distributed:
            dist_strategy.nccl_comm_num = 2
        dist_strategy.use_hierarchical_allreduce = True

        if args.use_mix_precision:
            dist_strategy.use_amp = True

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config)
                scheduled_lr = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio,
                    dist_strategy=dist_strategy)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            log.info("Theoretical memory usage in training: %.3f - %.3f %s" %
                     (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config,
                    is_prediction=True)

        test_prog = test_prog.clone(for_test=True)

    if args.do_train:
        # fleet is only initialized in the do_train branch above
        train_program = fleet.main_program

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            log.warning(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog)

    if args.do_train:
        train_exe = exe
        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    test_exe = exe
    #    if args.do_val or args.do_test:
    #        if args.use_multi_gpu_test:
    #            test_exe = fluid.ParallelExecutor(
    #                use_cuda=args.use_cuda,
    #                main_program=test_prog,
    #                share_vars_from=train_exe)

    current_epoch = 0
    steps = 0
    if args.do_train:
        train_pyreader.start()
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr

        ce_info = []
        time_begin = time.time()
        last_epoch = 0
        while True:
            try:
                steps += 1
                #                log.info("step: %d" % steps)

                if fleet.worker_index() != 0:
                    train_exe.run(fetch_list=[], program=train_program)
                    continue

                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[], program=train_program)

                else:
                    outputs = evaluate(train_exe,
                                       train_program,
                                       train_pyreader,
                                       graph_vars,
                                       "train",
                                       metric=args.metric)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            outputs["learning_rate"]
                            if warmup_steps > 0 else args.learning_rate)
                        log.info(verbose)

                    current_example, current_epoch = reader.get_train_progress(
                    )
                    time_end = time.time()
                    used_time = time_end - time_begin

                    log.info(
                        "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                        "ave acc: %f, speed: %f steps/s" %
                        (current_epoch, current_example * dev_count,
                         num_train_examples, steps, outputs["loss"],
                         outputs["accuracy"], args.skip_steps / used_time))
                    ce_info.append(
                        [outputs["loss"], outputs["accuracy"], used_time])

                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path,
                                               fleet._origin_program)


                # if steps % args.validation_steps == 0 or last_epoch != current_epoch:
                if steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        evaluate_wrapper(args, reader, exe, test_prog,
                                         test_pyreader, graph_vars,
                                         current_epoch, steps)

                    if args.do_test:
                        predict_wrapper(args, reader, exe, test_prog,
                                        test_pyreader, graph_vars,
                                        current_epoch, steps)

                if last_epoch != current_epoch:
                    last_epoch = current_epoch

            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path,
                                           fleet._origin_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if args.do_val:
        evaluate_wrapper(args, reader, exe, test_prog, test_pyreader,
                         graph_vars, current_epoch, steps)

    # final eval on test set
    if args.do_test:
        predict_wrapper(args, reader, exe, test_prog, test_pyreader,
                        graph_vars, current_epoch, steps)

    # final eval on diagnostic set, hack for glue-ax
    if args.diagnostic:
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(args.diagnostic,
                                  batch_size=args.batch_size,
                                  epoch=1,
                                  dev_count=1,
                                  shuffle=False))

        log.info("Final diagnostic")
        qids, preds, probs = predict(test_exe, test_prog, test_pyreader,
                                     graph_vars)
        assert len(qids) == len(preds), '{} v.s. {}'.format(
            len(qids), len(preds))
        with open(args.diagnostic_save, 'w') as f:
            for id, s, p in zip(qids, preds, probs):
                f.write('{}\t{}\t{}\n'.format(id, s, p))

        log.info("Done final diagnostic, saving to {}".format(
            args.diagnostic_save))