예제 #1
0
def train(args):
    """Run the training loop, with optional validation and checkpointing.

    Builds the startup/train programs (plus a test program when
    ``args.validate`` is set), runs the startup program, initialises the
    model through ``init_model``, wires the data source (DALI or
    ``reader.ImageNetReader``) and then iterates ``args.num_epochs`` epochs.
    Validation runs on trainer 0 only; ``save_model`` is called every
    ``args.save_step`` epochs.

    Args:
        args: parsed argument object. Attributes read here: ``use_ema``,
            ``validate``, ``use_gpu``, ``use_dali``, ``num_epochs``,
            ``random_seed``, ``max_iter``, ``print_step``, ``is_profiler``,
            ``profiler_path`` and ``save_step``.

    Returns:
        None. The function returns early once ``args.max_iter`` batches have
        been processed, or right after the profiler has been stopped.
    """
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    train_outputs = build_program(
        is_train=True,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args)

    # The data loader is always the last output. With EMA enabled the EMA
    # object sits right before it; everything ahead of those is fetched.
    train_data_loader = train_outputs[-1]
    if args.use_ema:
        ema = train_outputs[-2]
        train_fetch_vars = train_outputs[:-2]
    else:
        train_fetch_vars = train_outputs[:-1]
    train_fetch_list = [fetch_var.name for fetch_var in train_fetch_vars]

    if args.validate:
        test_prog = fluid.Program()
        test_outputs = build_program(
            is_train=False,
            main_prog=test_prog,
            startup_prog=startup_prog,
            args=args)
        test_data_loader = test_outputs[-1]
        test_fetch_list = [fetch_var.name for fetch_var in test_outputs[:-1]]

        # Clone for testing so the layers' is_test params are set to True.
        test_prog = test_prog.clone(for_test=True)

    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    if args.use_gpu:
        place = fluid.CUDAPlace(gpu_id)
    else:
        place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))

    # Initialise the model from a checkpoint or a pretrained model.
    init_model(exe, args, train_prog)
    num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))

    if args.use_dali:
        import dali
        train_iter = dali.train(settings=args)
        if trainer_id == 0:
            test_iter = dali.val(settings=args)
    else:
        imagenet_reader = reader.ImageNetReader(
            None if num_trainers <= 1 else 0)
        train_reader = imagenet_reader.train(settings=args)

        # Multi-trainer runs feed only this process's own place; a single
        # trainer feeds every visible device of the selected kind.
        if num_trainers > 1:
            places = place
        elif args.use_gpu:
            places = fluid.framework.cuda_places()
        else:
            places = fluid.framework.cpu_places()

        train_data_loader.set_sample_list_generator(train_reader, places)

        if args.validate:
            test_reader = imagenet_reader.val(settings=args)
            test_data_loader.set_sample_list_generator(test_reader, places)

    compiled_train_prog = best_strategy_compiled(
        args, train_prog, train_fetch_vars[0], exe)

    # NOTE: benchmark counter, accumulated across all epochs.
    total_batch_num = 0
    for epoch_id in range(args.num_epochs):
        if not args.use_dali and num_trainers > 1:
            # Re-seed the reader's shuffle every epoch.
            imagenet_reader.set_shuffle_seed(
                epoch_id + (args.random_seed or 0))

        train_batch_id = 0
        batch_times = []
        batch_metrics = []

        if not args.use_dali:
            train_iter = train_data_loader()
            if args.validate:
                test_iter = test_data_loader()

        tick = time.time()
        for feed_batch in train_iter:
            # NOTE: benchmark cap on the total number of batches.
            if args.max_iter and total_batch_num == args.max_iter:
                return

            fetched = exe.run(
                compiled_train_prog,
                feed=feed_batch,
                fetch_list=train_fetch_list)
            elapsed = time.time() - tick
            batch_times.append(elapsed)

            fetched_avg = np.mean(np.array(fetched), axis=1)
            batch_metrics.append(fetched_avg)
            if trainer_id == 0:
                print_info("batch", fetched_avg, elapsed, epoch_id,
                           train_batch_id, args.print_step)
                sys.stdout.flush()

            train_batch_id += 1
            tick = time.time()

            # NOTE: benchmark profiler window, first epoch only: start at
            # batch `print_step`, stop (and exit) five batches later.
            total_batch_num += 1
            if args.is_profiler and epoch_id == 0:
                if train_batch_id == args.print_step:
                    profiler.start_profiler("All")
                elif train_batch_id == args.print_step + 5:
                    profiler.stop_profiler("total", args.profiler_path)
                    return

        if args.use_dali:
            train_iter.reset()

        if trainer_id == 0 and args.validate:
            if args.use_ema:
                logger.info('ExponentialMovingAverage validate start...')
                with ema.apply(exe):
                    # NOTE(review): this call passes one positional argument
                    # fewer than the plain validation call below (the batch
                    # time record is omitted), so compiled_train_prog lands
                    # one slot earlier - confirm against validate().
                    validate(args, test_iter, exe, test_prog, test_fetch_list,
                             epoch_id, batch_metrics, compiled_train_prog)
                logger.info('ExponentialMovingAverage validate over!')

            validate(args, test_iter, exe, test_prog, test_fetch_list,
                     epoch_id, batch_metrics, batch_times,
                     compiled_train_prog)

            if args.use_dali:
                test_iter.reset()

        if epoch_id % args.save_step == 0:
            save_model(args, exe, train_prog, epoch_id)
예제 #2
0
def train(args):
    """Train a model in Fleet collective mode and evaluate on trainer 0.

    Steps, in order: build the train and test programs with
    ``build_program``; run the startup program; optionally restore weights
    from ``args.checkpoint`` and/or ``args.pretrained_model``; set up the
    data source (DALI or ``reader`` + ``paddle.batch``); run
    ``params["num_epochs"]`` epochs; finally, on trainer 0, save the
    persistables and (with ``args.benchmark_test``) write a benchmark log to
    ``./benchmark_logs/``.

    NOTE(review): ``trainer_id`` and ``num_trainers`` are read here but are
    not assigned in this function; presumably they are module-level
    globals - confirm.

    Args:
        args: parsed argument object. Attributes read here: ``model``,
            ``checkpoint``, ``pretrained_model``, ``model_save_dir``,
            ``use_mixup``, ``num_threads``,
            ``num_iteration_per_drop_scope``, ``with_inplace``, ``fuse``,
            ``nccl_comm_num``, ``fuse_elewise_add_act_ops``,
            ``data_format``, ``use_gpu``, ``batch_size``, ``use_dali``,
            ``data_dir``, ``fetch_steps``, ``profile``, ``do_test`` and
            ``benchmark_test``. ``args.profile`` is set to ``False`` once
            the profiler window has been closed.

    Returns:
        None.
    """
    # parameters from arguments
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    model_save_dir = args.model_save_dir
    use_mixup = args.use_mixup

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = args.num_threads
    exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

    dist_strategy = DistributedStrategy()
    dist_strategy.exec_strategy = exec_strategy
    dist_strategy.enable_inplace = args.with_inplace
    if not args.fuse:
        dist_strategy.fuse_all_reduce_ops = False
    dist_strategy.nccl_comm_num = args.nccl_comm_num
    dist_strategy.fuse_elewise_add_act_ops = args.fuse_elewise_add_act_ops

    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    b_out = build_program(
                     is_train=True,
                     main_prog=train_prog,
                     startup_prog=startup_prog,
                     args=args,
                     dist_strategy=dist_strategy,
                     data_layout=args.data_format)

    # b_out[0] is the data loader. With mixup the fetched outputs are
    # (cost, lr); otherwise they are (cost, acc1, acc5, lr).
    train_data_loader = b_out[0]
    if use_mixup:
        train_fetch_vars = [b_out[1], b_out[2]]
    else:
        train_fetch_vars = [b_out[1], b_out[2], b_out[3], b_out[4]]

    # Fetched variables are flagged persistable before being fetched by name.
    train_fetch_list = []
    for var in train_fetch_vars:
        var.persistable = True
        train_fetch_list.append(var.name)

    # From here on the program that is run is the one exposed by fleet.
    train_prog = fleet.main_program

    b_out_test = build_program(
                     is_train=False,
                     main_prog=test_prog,
                     startup_prog=startup_prog,
                     args=args,
                     dist_strategy=dist_strategy,
                     data_layout=args.data_format)
    test_data_loader, test_cost, test_acc1, test_acc5 = b_out_test[0],b_out_test[1],b_out_test[2],b_out_test[3]

    test_prog = test_prog.clone(for_test=True)
    test_prog = compiler.CompiledProgram(test_prog).with_data_parallel(loss_name=test_cost.name, exec_strategy=exec_strategy)

    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if checkpoint is not None:
        fluid.io.load_persistables(exe, checkpoint, main_program=train_prog)

    if pretrained_model:
        def if_exist(var):
            # Load only variables that have a file in the pretrained dir.
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(
            exe, pretrained_model, main_program=train_prog, predicate=if_exist)

    if args.use_gpu:
        device_num = get_device_num()
    else:
        device_num = 1

    train_batch_size = args.batch_size
    print("train_batch_size: %d device_num:%d" % (train_batch_size, device_num))

    test_batch_size = args.batch_size
    # NOTE: the order of batch data generated by batch_reader
    # must be the same in the respective processes.
    shuffle_seed = 1 if num_trainers > 1 else None

    if args.use_dali:
        import dali
        train_iter = dali.train(settings=args, trainer_id=trainer_id, trainers_num=num_trainers,
                                gpu_id=gpu_id, data_layout=args.data_format)
    else:
        train_reader = reader.train(settings=args, data_dir=args.data_dir,
                                    pass_id_as_seed=shuffle_seed, data_layout=args.data_format, threads=10)
        train_batch_reader = paddle.batch(train_reader, batch_size=train_batch_size)

        test_reader = reader.val(settings=args, data_dir=args.data_dir, data_layout=args.data_format, threads=10)
        test_batch_reader = paddle.batch(test_reader, batch_size=test_batch_size)

        places = place
        if num_trainers <= 1 and args.use_gpu:
            places = fluid.framework.cuda_places()

        train_data_loader.set_sample_list_generator(train_batch_reader, places)
        test_data_loader.set_sample_list_generator(test_batch_reader, place)

    test_fetch_vars = [test_cost, test_acc1, test_acc5]
    test_fetch_list = []
    for var in test_fetch_vars:
        var.persistable = True
        test_fetch_list.append(var.name)

    train_exe = exe

    params = models.__dict__[args.model]().params

    train_speed_list = []
    acc1_logs = []
    acc5_logs = []
    for pass_id in range(params["num_epochs"]):
        # Per-epoch accumulators: [loss, acc1, acc5].
        train_info = [[], [], []]
        test_info = [[], [], []]
        train_begin = time.time()
        batch_id = 0
        time_record = []

        if not args.use_dali:
            train_iter = train_data_loader()

        for data in train_iter:
            t1 = time.time()

            # Metrics are fetched only every `fetch_steps` batches.
            if batch_id % args.fetch_steps != 0:
                train_exe.run(train_prog, feed=data)
            else:
                if use_mixup:
                    loss, lr = train_exe.run(train_prog, feed=data, fetch_list=train_fetch_list)
                else:
                    loss, acc1, acc5, lr = train_exe.run(train_prog,  feed=data,  fetch_list=train_fetch_list)
                    acc1 = np.mean(np.array(acc1))
                    acc5 = np.mean(np.array(acc5))
                    train_info[1].append(acc1)
                    train_info[2].append(acc5)

            t2 = time.time()
            period = t2 - t1
            time_record.append(period)

            # Profile batches 100..105 once; only trainer 0 drives the
            # profiler itself.
            if args.profile and batch_id == 100:
                print("begin profiler")
                if trainer_id == 0:
                    profiler.start_profiler("All")
            elif args.profile and batch_id == 105:
                print("begin to end profiler")
                if trainer_id == 0:
                    profiler.stop_profiler("total", "./profile_pass_%d" % (pass_id))
                print("end profiler break!")
                args.profile = False

            if batch_id % args.fetch_steps == 0:
                loss = np.mean(np.array(loss))
                train_info[0].append(loss)
                lr = np.mean(np.array(lr))
                # Average batch time since the previous report.
                period = np.mean(time_record)
                speed = args.batch_size * 1.0 / period
                time_record = []
                if use_mixup:
                    print("Pass {0}, trainbatch {1}, loss {2}, lr {3}, time {4}, speed {5}"
                          .format(pass_id, batch_id, "%.5f"%loss, "%.5f" %lr, "%2.4f sec" % period, "%.2f" % speed))
                else:
                    print("Pass {0}, trainbatch {1}, loss {2}, \
                        acc1 {3}, acc5 {4}, lr {5}, time {6}, speed {7}"
                          .format(pass_id, batch_id, "%.5f"%loss, "%.5f"%acc1, "%.5f"%acc5, "%.5f" %
                                  lr, "%2.4f sec" % period, "%.2f" % speed))
                sys.stdout.flush()
            batch_id += 1

        if args.use_dali:
            train_iter.reset()

        train_loss = np.array(train_info[0]).mean()
        if not use_mixup:
            train_acc1 = np.array(train_info[1]).mean()
            train_acc5 = np.array(train_info[2]).mean()
        train_end = time.time()
        train_speed = (batch_id * train_batch_size) / (train_end - train_begin)
        train_speed_list.append(train_speed)

        # Evaluate on trainer 0 when requested, and always in the last epoch.
        if trainer_id == 0 and (args.do_test or (pass_id + 1) == params["num_epochs"]):
            if args.use_dali:
                test_iter = dali.val(settings=args, trainer_id=trainer_id, trainers_num=num_trainers,
                                 gpu_id=gpu_id, data_layout=args.data_format)
            else:
                test_iter = test_data_loader()

            test_batch_id = 0
            for data in test_iter:
                t1 = time.time()
                loss, acc1, acc5 = exe.run(program=test_prog,
                                           feed=data,
                                           fetch_list=test_fetch_list)
                t2 = time.time()
                period = t2 - t1
                loss = np.mean(loss)
                acc1 = np.mean(acc1)
                acc5 = np.mean(acc5)
                test_info[0].append(loss)
                test_info[1].append(acc1)
                test_info[2].append(acc5)

                if test_batch_id % 10 == 0:
                    test_speed = test_batch_size * 1.0 / period
                    print("Pass {0},testbatch {1},loss {2}, \
                        acc1 {3},acc5 {4},time {5},speed {6}"
                        .format(pass_id, test_batch_id, "%.5f"%loss,"%.5f"%acc1, "%.5f"%acc5,
                                "%2.2f sec" % period, "%.2f" % test_speed))
                    sys.stdout.flush()
                test_batch_id += 1

            if args.use_dali:
                test_iter.reset()
                del test_iter

            # From here on test_acc1 / test_acc5 hold the epoch means rather
            # than the program variables unpacked from b_out_test; the fetch
            # list was already built from those variables' names.
            test_loss = np.array(test_info[0]).mean()
            test_acc1 = np.array(test_info[1]).mean()
            test_acc5 = np.array(test_info[2]).mean()

            acc1_logs.append(test_acc1)
            acc5_logs.append(test_acc5)

            if use_mixup:
                print("End pass {0}, train_loss {1}, test_loss {2}, test_acc1 {3}, test_acc5 {4}, speed {5}".format(
                      pass_id, "%.5f"%train_loss, "%.5f"%test_loss, "%.5f"%test_acc1, "%.5f"%test_acc5,
                      "%.2f" % train_speed))
            else:
                print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, "
                  "test_loss {4}, test_acc1 {5}, test_acc5 {6}, speed {7}".format(
                      pass_id, "%.5f"%train_loss, "%.5f"%train_acc1, "%.5f"%train_acc5, "%.5f"%test_loss,
                      "%.5f"%test_acc1, "%.5f"%test_acc5, "%.2f" % train_speed))
        else:
            if use_mixup:
                print("End pass {0}, train_loss {1}, speed {2}".format(pass_id, "%.5f"%train_loss, "%.2f" % train_speed))
            else:
                print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, ""speed {4}".format(
                    pass_id, "%.5f"%train_loss, "%.5f"%train_acc1, "%.5f"%train_acc5, "%.2f" % train_speed))

        sys.stdout.flush()

    # save in last epoch (pass_id is the index of the final epoch here)
    if trainer_id == 0:
        model_path = os.path.join(model_save_dir + '/' + model_name, str(pass_id))
        if not os.path.isdir(model_path):
            os.makedirs(model_path)

        fluid.io.save_persistables(exe, model_path, main_program=fleet._origin_program)
        if args.benchmark_test:
            if not os.path.isdir("./benchmark_logs/"):
                os.makedirs("./benchmark_logs/")
            with open("./benchmark_logs/log_%d" % trainer_id, 'w') as f:
                result = dict()
                result['0'] = dict()
                result['0']['acc1'] = test_acc1
                result['0']['acc5'] = test_acc5
                result['0']['result_log'] = dict()
                result['0']['result_log']['acc1'] = acc1_logs
                result['0']['result_log']['acc5'] = acc5_logs
                # maximum speed of all epochs
                result['1'] = max(train_speed_list) * num_trainers
                result['14'] = args.batch_size

                print(str(result))
                # A single string is written with write(); writelines()
                # would iterate it one character at a time.
                f.write(str(result))
예제 #3
0
def main(args):
    """Static-graph training entry point driven by a config file.

    Loads the config, selects the device, builds the train program (and a
    validation program when ``config.validate`` is set), initialises
    parameters, creates the data loaders (regular ``Reader`` or DALI) and
    then runs ``config.epochs`` epochs. On rank 0 it also validates every
    ``config.valid_interval`` epochs and saves every
    ``config.save_interval`` epochs.

    Args:
        args: parsed command-line arguments. Attributes read here:
            ``config``, ``override`` and ``vdl_dir``.

    Returns:
        None.
    """
    config = get_config(args.config, overrides=args.override, show=True)

    def _is_distributed():
        # Looked up on each use; distributed mode is the default.
        return config.get("is_distributed", True)

    if _is_distributed():
        fleet.init(is_collective=True)

    # Device and mixed-precision switches.
    use_gpu = config.get("use_gpu", True)
    use_amp = config.get('use_amp', False)
    use_pure_fp16 = config.get('use_pure_fp16', False)
    if use_amp or use_pure_fp16:
        amp_flags = {
            'FLAGS_cudnn_exhaustive_search': 1,
            'FLAGS_conv_workspace_size_limit': 4000,
            'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
            'FLAGS_max_inplace_grad_add': 8,
        }
        os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = '1'
        paddle.fluid.set_flags(amp_flags)

    use_xpu = config.get("use_xpu", False)
    assert (use_gpu and use_xpu) is not True, \
        "gpu and xpu can not be true in the same time in static mode!"

    # GPU wins over XPU; CPU is the fallback.
    if use_gpu:
        device = 'gpu'
    elif use_xpu:
        device = 'xpu'
    else:
        device = 'cpu'
    place = paddle.set_device(device)

    # startup_prog performs parameter initialisation, train_prog holds the
    # network itself.
    startup_prog = paddle.static.Program()
    train_prog = paddle.static.Program()

    best_top1_acc = 0.0  # best top-1 accuracy seen so far

    train_fetchs, lr_scheduler, train_feeds = program.build(
        config,
        train_prog,
        startup_prog,
        is_train=True,
        is_distributed=_is_distributed())

    if config.validate:
        valid_prog = paddle.static.Program()
        valid_fetchs, _, valid_feeds = program.build(
            config,
            valid_prog,
            startup_prog,
            is_train=False,
            is_distributed=_is_distributed())
        # Clone for testing to prune what validation does not need.
        valid_prog = valid_prog.clone(for_test=True)

    # Create the executor on the chosen place and initialise parameters.
    exe = paddle.static.Executor(place)
    exe.run(startup_prog)
    if config.get("use_pure_fp16", False):
        cast_parameters_to_fp16(place, train_prog, fluid.global_scope())
    # Load pretrained weights or a checkpoint.
    init_model(config, train_prog, exe)

    # In distributed mode the train program is run as-is; otherwise it is
    # compiled first.
    if _is_distributed():
        compiled_train_prog = train_prog
    else:
        compiled_train_prog = program.compile(
            config, train_prog, loss_name=train_fetchs["loss"][0].name)

    # The validation loader/program are only created on rank 0.
    if config.get('use_dali', False):
        assert use_gpu is True, "DALI only support gpu, please set use_gpu to True!"
        import dali
        train_dataloader = dali.train(config)
        if config.validate and paddle.distributed.get_rank() == 0:
            valid_dataloader = dali.val(config)
            compiled_valid_prog = program.compile(config, valid_prog)
    else:
        train_dataloader = Reader(config, 'train', places=place)()
        if config.validate and paddle.distributed.get_rank() == 0:
            valid_dataloader = Reader(config, 'valid', places=place)()
            # On XPU the validation program is used uncompiled.
            compiled_valid_prog = (valid_prog if use_xpu else
                                   program.compile(config, valid_prog))

    vdl_writer = None
    if args.vdl_dir:
        if version_info.major != 2:
            from visualdl import LogWriter
            vdl_writer = LogWriter(args.vdl_dir)
        else:
            logger.info(
                "visualdl is just supported for python3, so it is disabled in python2..."
            )

    for epoch in range(config.epochs):
        # 1. One training epoch.
        program.run(train_dataloader, exe, compiled_train_prog, train_feeds,
                    train_fetchs, epoch, 'train', config, vdl_writer,
                    lr_scheduler)

        # Validation and saving happen on rank 0 only.
        if paddle.distributed.get_rank() != 0:
            continue

        # 2. Validation.
        if config.validate and epoch % config.valid_interval == 0:
            top1_acc = program.run(valid_dataloader, exe,
                                   compiled_valid_prog, valid_feeds,
                                   valid_fetchs, epoch, 'valid', config)
            if top1_acc > best_top1_acc:
                best_top1_acc = top1_acc
                message = "The best top1 acc {:.5f}, in epoch: {:d}".format(
                    best_top1_acc, epoch)
                logger.info("{:s}".format(logger.coloring(message, "RED")))
                # The best model is written only on a save-interval epoch.
                if epoch % config.save_interval == 0:
                    save_model(
                        train_prog,
                        os.path.join(config.model_save_dir,
                                     config.ARCHITECTURE["name"]),
                        "best_model")

        # 3. Periodic save of the persistable model.
        if epoch % config.save_interval == 0:
            save_model(
                train_prog,
                os.path.join(config.model_save_dir,
                             config.ARCHITECTURE["name"]),
                epoch)
예제 #4
0
def main(args):
    """Fleet collective training entry point driven by a config file.

    Initialises fleet, loads the config, builds the train program (and a
    validation program when ``config.validate`` is set), initialises
    parameters, wires the data source (regular ``Reader`` or DALI) and runs
    ``config.epochs`` epochs. Trainer 0 (``PADDLE_TRAINER_ID`` equal to 0)
    validates every ``config.valid_interval`` epochs and saves every
    ``config.save_interval`` epochs.

    Args:
        args: parsed command-line arguments. Attributes read here:
            ``config``, ``override`` and ``vdl_dir``.

    Returns:
        None.
    """
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    config = get_config(args.config, overrides=args.override, show=True)
    use_fp16 = config.get('use_fp16', False)
    if use_fp16:
        AMP_RELATED_FLAGS_SETTING = {
            'FLAGS_cudnn_exhaustive_search': 1,
            'FLAGS_conv_workspace_size_limit': 4000,
            'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
            'FLAGS_max_inplace_grad_add': 8,
        }
        os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = '1'
        paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)
    # assign the place
    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id)

    # Read once: only trainer 0 validates and saves.
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))

    # startup_prog is used to do some parameter init work,
    # and train prog is used to hold the network
    startup_prog = fluid.Program()
    train_prog = fluid.Program()

    best_top1_acc = 0.0  # best top1 acc record

    # With EMA enabled program.build returns the EMA object as a third item.
    if not config.get('use_ema'):
        train_dataloader, train_fetchs = program.build(
            config, train_prog, startup_prog, is_train=True)
    else:
        train_dataloader, train_fetchs, ema = program.build(
            config, train_prog, startup_prog, is_train=True)

    if config.validate:
        valid_prog = fluid.Program()
        valid_dataloader, valid_fetchs = program.build(
            config, valid_prog, startup_prog, is_train=False)
        # clone to prune some content which is irrelevant in valid_prog
        valid_prog = valid_prog.clone(for_test=True)

    # create the "Executor" with the statement of which place
    exe = fluid.Executor(place)
    # Parameter initialization
    exe.run(startup_prog)

    # load model from 1. checkpoint to resume training, 2. pretrained model to finetune
    init_model(config, train_prog, exe)
    if not config.get('use_dali', False):
        train_reader = Reader(config, 'train')()
        train_dataloader.set_sample_list_generator(train_reader, place)
        if config.validate:
            valid_reader = Reader(config, 'valid')()
            valid_dataloader.set_sample_list_generator(valid_reader, place)
            compiled_valid_prog = program.compile(config, valid_prog)

    else:
        import dali
        train_dataloader = dali.train(config)
        # The validation program is compiled whenever validation is enabled
        # (as in the non-DALI branch); the DALI validation loader is created
        # on trainer 0 only, since only trainer 0 validates. The previous
        # outer condition was truthy only for non-zero trainer ids, so
        # trainer 0 reached validation without compiled_valid_prog.
        if config.validate:
            if trainer_id == 0:
                valid_dataloader = dali.val(config)
            compiled_valid_prog = program.compile(config, valid_prog)

    compiled_train_prog = fleet.main_program

    vdl_writer = None
    if args.vdl_dir:
        if version_info.major == 2:
            logger.info(
                "visualdl is just supported for python3, so it is disabled in python2..."
            )
        else:
            from visualdl import LogWriter
            vdl_writer = LogWriter(args.vdl_dir)

    for epoch_id in range(config.epochs):
        # 1. train with train dataset
        program.run(train_dataloader, exe, compiled_train_prog, train_fetchs,
                    epoch_id, 'train', config, vdl_writer)
        if trainer_id == 0:
            # 2. validate with validate dataset
            if config.validate and epoch_id % config.valid_interval == 0:
                if config.get('use_ema'):
                    # Extra validation pass under ema.apply; its result is
                    # overwritten by the plain validation run below.
                    logger.info(logger.coloring("EMA validate start..."))
                    with ema.apply(exe):
                        top1_acc = program.run(
                            valid_dataloader, exe, compiled_valid_prog,
                            valid_fetchs, epoch_id, 'valid', config)
                    logger.info(logger.coloring("EMA validate over!"))

                top1_acc = program.run(valid_dataloader, exe,
                                       compiled_valid_prog, valid_fetchs,
                                       epoch_id, 'valid', config)
                if top1_acc > best_top1_acc:
                    best_top1_acc = top1_acc
                    message = "The best top1 acc {:.5f}, in epoch: {:d}".format(
                        best_top1_acc, epoch_id)
                    logger.info("{:s}".format(logger.coloring(message, "RED")))
                    # The best model is written only on a save-interval epoch.
                    if epoch_id % config.save_interval == 0:

                        model_path = os.path.join(config.model_save_dir,
                                                  config.ARCHITECTURE["name"])
                        save_model(train_prog, model_path, "best_model")

            # 3. save the persistable model
            if epoch_id % config.save_interval == 0:
                model_path = os.path.join(config.model_save_dir,
                                          config.ARCHITECTURE["name"])
                save_model(train_prog, model_path, epoch_id)