def train(args): """Train model Args: args: all arguments. """ startup_prog = fluid.Program() train_prog = fluid.Program() train_out = build_program( is_train=True, main_prog=train_prog, startup_prog=startup_prog, args=args) train_data_loader = train_out[-1] if args.use_ema: train_fetch_vars = train_out[:-2] ema = train_out[-2] else: train_fetch_vars = train_out[:-1] train_fetch_list = [var.name for var in train_fetch_vars] if args.validate: test_prog = fluid.Program() test_out = build_program( is_train=False, main_prog=test_prog, startup_prog=startup_prog, args=args) test_data_loader = test_out[-1] test_fetch_vars = test_out[:-1] test_fetch_list = [var.name for var in test_fetch_vars] #Create test_prog and set layers' is_test params to True test_prog = test_prog.clone(for_test=True) gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup_prog) trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0)) #init model by checkpoint or pretrianed model. init_model(exe, args, train_prog) num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) if args.use_dali: import dali train_iter = dali.train(settings=args) if trainer_id == 0: test_iter = dali.val(settings=args) else: imagenet_reader = reader.ImageNetReader(0 if num_trainers > 1 else None) train_reader = imagenet_reader.train(settings=args) if args.use_gpu: if num_trainers <= 1: places = fluid.framework.cuda_places() else: places = place else: if num_trainers <= 1: places = fluid.framework.cpu_places() else: places = place train_data_loader.set_sample_list_generator(train_reader, places) if args.validate: test_reader = imagenet_reader.val(settings=args) test_data_loader.set_sample_list_generator(test_reader, places) compiled_train_prog = best_strategy_compiled(args, train_prog, train_fetch_vars[0], exe) #NOTE: this for benchmark total_batch_num = 0 for pass_id in range(args.num_epochs): if num_trainers > 1 and not args.use_dali: imagenet_reader.set_shuffle_seed(pass_id + ( args.random_seed if args.random_seed else 0)) train_batch_id = 0 train_batch_time_record = [] train_batch_metrics_record = [] if not args.use_dali: train_iter = train_data_loader() if args.validate: test_iter = test_data_loader() t1 = time.time() for batch in train_iter: #NOTE: this is for benchmark if args.max_iter and total_batch_num == args.max_iter: return train_batch_metrics = exe.run(compiled_train_prog, feed=batch, fetch_list=train_fetch_list) t2 = time.time() train_batch_elapse = t2 - t1 train_batch_time_record.append(train_batch_elapse) train_batch_metrics_avg = np.mean( np.array(train_batch_metrics), axis=1) train_batch_metrics_record.append(train_batch_metrics_avg) if trainer_id == 0: print_info("batch", train_batch_metrics_avg, train_batch_elapse, pass_id, train_batch_id, args.print_step) sys.stdout.flush() train_batch_id += 1 t1 = time.time() #NOTE: this for benchmark profiler total_batch_num = total_batch_num + 1 if args.is_profiler and pass_id == 0 and train_batch_id == args.print_step: profiler.start_profiler("All") elif args.is_profiler and pass_id == 0 and train_batch_id == args.print_step + 5: profiler.stop_profiler("total", args.profiler_path) return if args.use_dali: train_iter.reset() if trainer_id == 0 and args.validate: if args.use_ema: logger.info('ExponentialMovingAverage validate start...') with ema.apply(exe): validate(args, test_iter, exe, test_prog, test_fetch_list, pass_id, train_batch_metrics_record, compiled_train_prog) 
logger.info('ExponentialMovingAverage validate over!') validate(args, test_iter, exe, test_prog, test_fetch_list, pass_id, train_batch_metrics_record, train_batch_time_record, compiled_train_prog) if args.use_dali: test_iter.reset() if pass_id % args.save_step == 0: save_model(args, exe, train_prog, pass_id)
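# NOTE: illustrative sketch, not part of the original script. A minimal entry point
# for the train() above, assuming a hypothetical parse_args() helper that defines
# every flag train() reads (use_gpu, use_dali, use_ema, validate, num_epochs,
# print_step, save_step, max_iter, is_profiler, ...).
if __name__ == '__main__':
    args = parse_args()  # hypothetical helper; the real script builds args elsewhere
    train(args)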
def train(args):
    # parameters from arguments
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    model_save_dir = args.model_save_dir
    use_mixup = args.use_mixup
    use_ngraph = os.getenv('FLAGS_use_ngraph')

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = args.num_threads
    exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

    dist_strategy = DistributedStrategy()
    dist_strategy.exec_strategy = exec_strategy
    dist_strategy.enable_inplace = args.with_inplace
    if not args.fuse:
        dist_strategy.fuse_all_reduce_ops = False
    dist_strategy.nccl_comm_num = args.nccl_comm_num
    dist_strategy.fuse_elewise_add_act_ops = args.fuse_elewise_add_act_ops

    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)
    # trainer_id / num_trainers are read from the launcher's environment; they are
    # needed below for the shuffle seed, DALI pipelines, logging and model saving.
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))
    num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))

    b_out = build_program(
        is_train=True,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args,
        dist_strategy=dist_strategy,
        data_layout=args.data_format)
    if use_mixup:
        train_data_loader, train_cost, global_lr = b_out[0], b_out[1], b_out[2]
        train_fetch_vars = [train_cost, global_lr]
        train_fetch_list = []
        for var in train_fetch_vars:
            var.persistable = True
            train_fetch_list.append(var.name)
    else:
        train_data_loader, train_cost, train_acc1, train_acc5, global_lr = \
            b_out[0], b_out[1], b_out[2], b_out[3], b_out[4]
        train_fetch_vars = [train_cost, train_acc1, train_acc5, global_lr]
        train_fetch_list = []
        for var in train_fetch_vars:
            var.persistable = True
            train_fetch_list.append(var.name)

    # fleet rewrites the program for collective training; use its main program.
    train_prog = fleet.main_program

    b_out_test = build_program(
        is_train=False,
        main_prog=test_prog,
        startup_prog=startup_prog,
        args=args,
        dist_strategy=dist_strategy,
        data_layout=args.data_format)
    test_data_loader, test_cost, test_acc1, test_acc5 = \
        b_out_test[0], b_out_test[1], b_out_test[2], b_out_test[3]
    test_prog = test_prog.clone(for_test=True)
    test_prog = compiler.CompiledProgram(test_prog).with_data_parallel(
        loss_name=test_cost.name, exec_strategy=exec_strategy)

    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if checkpoint is not None:
        fluid.io.load_persistables(exe, checkpoint, main_program=train_prog)

    if pretrained_model:

        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(
            exe, pretrained_model, main_program=train_prog, predicate=if_exist)

    if args.use_gpu:
        device_num = get_device_num()
    else:
        device_num = 1
    train_batch_size = args.batch_size
    print("train_batch_size: %d device_num:%d" % (train_batch_size, device_num))

    test_batch_size = args.batch_size

    # NOTE: the order of batch data generated by batch_reader
    # must be the same in the respective processes.
    shuffle_seed = 1 if num_trainers > 1 else None

    if args.use_dali:
        import dali
        train_iter = dali.train(
            settings=args,
            trainer_id=trainer_id,
            trainers_num=num_trainers,
            gpu_id=gpu_id,
            data_layout=args.data_format)
    else:
        train_reader = reader.train(
            settings=args,
            data_dir=args.data_dir,
            pass_id_as_seed=shuffle_seed,
            data_layout=args.data_format,
            threads=10)
        train_batch_reader = paddle.batch(
            train_reader, batch_size=train_batch_size)
        test_reader = reader.val(
            settings=args,
            data_dir=args.data_dir,
            data_layout=args.data_format,
            threads=10)
        test_batch_reader = paddle.batch(test_reader, batch_size=test_batch_size)

        places = place
        if num_trainers <= 1 and args.use_gpu:
            places = fluid.framework.cuda_places()

        train_data_loader.set_sample_list_generator(train_batch_reader, places)
        test_data_loader.set_sample_list_generator(test_batch_reader, place)

    test_fetch_vars = [test_cost, test_acc1, test_acc5]
    test_fetch_list = []
    for var in test_fetch_vars:
        var.persistable = True
        test_fetch_list.append(var.name)

    train_exe = exe

    params = models.__dict__[args.model]().params

    train_speed_list = []
    acc1_logs = []
    acc5_logs = []
    for pass_id in range(params["num_epochs"]):
        train_info = [[], [], []]
        test_info = [[], [], []]
        train_begin = time.time()
        batch_id = 0
        time_record = []

        if not args.use_dali:
            train_iter = train_data_loader()

        for data in train_iter:
            t1 = time.time()
            if batch_id % args.fetch_steps != 0:
                train_exe.run(train_prog, feed=data)
            else:
                if use_mixup:
                    loss, lr = train_exe.run(train_prog,
                                             feed=data,
                                             fetch_list=train_fetch_list)
                else:
                    loss, acc1, acc5, lr = train_exe.run(
                        train_prog, feed=data, fetch_list=train_fetch_list)
                    acc1 = np.mean(np.array(acc1))
                    acc5 = np.mean(np.array(acc5))
                    train_info[1].append(acc1)
                    train_info[2].append(acc5)
            t2 = time.time()
            period = t2 - t1
            time_record.append(period)

            if args.profile and batch_id == 100:
                print("begin profiler")
                if trainer_id == 0:
                    profiler.start_profiler("All")
            elif args.profile and batch_id == 105:
                print("begin to end profiler")
                if trainer_id == 0:
                    profiler.stop_profiler("total", "./profile_pass_%d" % (pass_id))
                print("end profiler break!")
                args.profile = False

            if batch_id % args.fetch_steps == 0:
                loss = np.mean(np.array(loss))
                train_info[0].append(loss)
                lr = np.mean(np.array(lr))
                period = np.mean(time_record)
                speed = args.batch_size * 1.0 / period
                time_record = []
                if use_mixup:
                    print("Pass {0}, trainbatch {1}, loss {2}, lr {3}, time {4}, speed {5}"
                          .format(pass_id, batch_id, "%.5f" % loss, "%.5f" % lr,
                                  "%2.4f sec" % period, "%.2f" % speed))
                else:
                    print("Pass {0}, trainbatch {1}, loss {2}, \
                        acc1 {3}, acc5 {4}, lr {5}, time {6}, speed {7}"
                          .format(pass_id, batch_id, "%.5f" % loss,
                                  "%.5f" % acc1, "%.5f" % acc5, "%.5f" % lr,
                                  "%2.4f sec" % period, "%.2f" % speed))
                sys.stdout.flush()
            batch_id += 1

        if args.use_dali:
            train_iter.reset()

        train_loss = np.array(train_info[0]).mean()
        if not use_mixup:
            train_acc1 = np.array(train_info[1]).mean()
            train_acc5 = np.array(train_info[2]).mean()
        train_end = time.time()
        train_speed = (batch_id * train_batch_size) / (train_end - train_begin)
        train_speed_list.append(train_speed)

        if trainer_id == 0 and (args.do_test or
                                (pass_id + 1) == params["num_epochs"]):
            if args.use_dali:
                test_iter = dali.val(
                    settings=args,
                    trainer_id=trainer_id,
                    trainers_num=num_trainers,
                    gpu_id=gpu_id,
                    data_layout=args.data_format)
            else:
                test_iter = test_data_loader()

            test_batch_id = 0
            for data in test_iter:
                t1 = time.time()
                loss, acc1, acc5 = exe.run(program=test_prog,
                                           feed=data,
                                           fetch_list=test_fetch_list)
                t2 = time.time()
                period = t2 - t1
                loss = np.mean(loss)
                acc1 = np.mean(acc1)
                acc5 = np.mean(acc5)
                test_info[0].append(loss)
                test_info[1].append(acc1)
                test_info[2].append(acc5)
                if test_batch_id % 10 == 0:
                    test_speed = test_batch_size * 1.0 / period
                    print("Pass {0},testbatch {1},loss {2}, \
                        acc1 {3},acc5 {4},time {5},speed {6}"
                          .format(pass_id, test_batch_id, "%.5f" % loss,
                                  "%.5f" % acc1, "%.5f" % acc5,
                                  "%2.2f sec" % period, "%.2f" % test_speed))
                    sys.stdout.flush()
                test_batch_id += 1

            if args.use_dali:
                test_iter.reset()
                del test_iter

            test_loss = np.array(test_info[0]).mean()
            test_acc1 = np.array(test_info[1]).mean()
            test_acc5 = np.array(test_info[2]).mean()

            acc1_logs.append(test_acc1)
            acc5_logs.append(test_acc5)

            if use_mixup:
                print("End pass {0}, train_loss {1}, test_loss {2}, test_acc1 {3}, test_acc5 {4}, speed {5}".format(
                    pass_id, "%.5f" % train_loss, "%.5f" % test_loss,
                    "%.5f" % test_acc1, "%.5f" % test_acc5,
                    "%.2f" % train_speed))
            else:
                print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, "
                      "test_loss {4}, test_acc1 {5}, test_acc5 {6}, speed {7}".format(
                          pass_id, "%.5f" % train_loss, "%.5f" % train_acc1,
                          "%.5f" % train_acc5, "%.5f" % test_loss,
                          "%.5f" % test_acc1, "%.5f" % test_acc5,
                          "%.2f" % train_speed))
        else:
            if use_mixup:
                print("End pass {0}, train_loss {1}, speed {2}".format(
                    pass_id, "%.5f" % train_loss, "%.2f" % train_speed))
            else:
                print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, "
                      "speed {4}".format(pass_id, "%.5f" % train_loss,
                                         "%.5f" % train_acc1,
                                         "%.5f" % train_acc5,
                                         "%.2f" % train_speed))
        sys.stdout.flush()

    # save in last epoch
    if trainer_id == 0:
        model_path = os.path.join(model_save_dir + '/' + model_name,
                                  str(pass_id))
        if not os.path.isdir(model_path):
            os.makedirs(model_path)
        fluid.io.save_persistables(exe, model_path,
                                   main_program=fleet._origin_program)

    if args.benchmark_test:
        if not os.path.isdir("./benchmark_logs/"):
            os.makedirs("./benchmark_logs/")
        with open("./benchmark_logs/log_%d" % trainer_id, 'w') as f:
            result = dict()
            result['0'] = dict()
            result['0']['acc1'] = test_acc1
            result['0']['acc5'] = test_acc5
            result['0']['result_log'] = dict()
            result['0']['result_log']['acc1'] = acc1_logs
            result['0']['result_log']['acc5'] = acc5_logs
            # maximum speed of all epochs
            result['1'] = max(train_speed_list) * num_trainers
            result['14'] = args.batch_size
            print(str(result))
            f.writelines(str(result))
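# NOTE: illustrative only, not part of the original script. fleet.init(role) with
# PaddleCloudRoleMaker(is_collective=True) expects the collective launcher to set
# PADDLE_TRAINER_ID / PADDLE_TRAINERS_NUM for each process. A typical launch looks
# like the following (the file name is an assumption; --selected_gpus is the
# Paddle 1.x launcher flag, newer releases use --gpus):
#
#   python -m paddle.distributed.launch --selected_gpus=0,1,2,3 \
#       train_with_fleet.py --model=ResNet50 --batch_size=32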
def main(args):
    config = get_config(args.config, overrides=args.override, show=True)
    if config.get("is_distributed", True):
        fleet.init(is_collective=True)
    # assign the place
    use_gpu = config.get("use_gpu", True)
    # amp related config
    use_amp = config.get('use_amp', False)
    use_pure_fp16 = config.get('use_pure_fp16', False)
    if use_amp or use_pure_fp16:
        AMP_RELATED_FLAGS_SETTING = {
            'FLAGS_cudnn_exhaustive_search': 1,
            'FLAGS_conv_workspace_size_limit': 4000,
            'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
            'FLAGS_max_inplace_grad_add': 8,
        }
        os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = '1'
        paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)

    use_xpu = config.get("use_xpu", False)
    assert (
        use_gpu and use_xpu
    ) is not True, "gpu and xpu can not be true in the same time in static mode!"
    if use_gpu:
        place = paddle.set_device('gpu')
    elif use_xpu:
        place = paddle.set_device('xpu')
    else:
        place = paddle.set_device('cpu')

    # startup_prog is used to do some parameter init work,
    # and train prog is used to hold the network
    startup_prog = paddle.static.Program()
    train_prog = paddle.static.Program()

    best_top1_acc = 0.0  # best top1 acc record

    train_fetchs, lr_scheduler, train_feeds = program.build(
        config,
        train_prog,
        startup_prog,
        is_train=True,
        is_distributed=config.get("is_distributed", True))

    if config.validate:
        valid_prog = paddle.static.Program()
        valid_fetchs, _, valid_feeds = program.build(
            config,
            valid_prog,
            startup_prog,
            is_train=False,
            is_distributed=config.get("is_distributed", True))
        # clone to prune some content which is irrelevant in valid_prog
        valid_prog = valid_prog.clone(for_test=True)

    # create the "Executor" with the statement of which place
    exe = paddle.static.Executor(place)
    # Parameter initialization
    exe.run(startup_prog)
    if config.get("use_pure_fp16", False):
        cast_parameters_to_fp16(place, train_prog, fluid.global_scope())
    # load pretrained models or checkpoints
    init_model(config, train_prog, exe)

    if not config.get("is_distributed", True):
        compiled_train_prog = program.compile(
            config, train_prog, loss_name=train_fetchs["loss"][0].name)
    else:
        compiled_train_prog = train_prog

    if not config.get('use_dali', False):
        train_dataloader = Reader(config, 'train', places=place)()
        if config.validate and paddle.distributed.get_rank() == 0:
            valid_dataloader = Reader(config, 'valid', places=place)()
            if use_xpu:
                compiled_valid_prog = valid_prog
            else:
                compiled_valid_prog = program.compile(config, valid_prog)
    else:
        assert use_gpu is True, "DALI only support gpu, please set use_gpu to True!"
        import dali
        train_dataloader = dali.train(config)
        if config.validate and paddle.distributed.get_rank() == 0:
            valid_dataloader = dali.val(config)
            compiled_valid_prog = program.compile(config, valid_prog)

    vdl_writer = None
    if args.vdl_dir:
        if version_info.major == 2:
            logger.info(
                "visualdl is just supported for python3, so it is disabled in python2..."
            )
        else:
            from visualdl import LogWriter
            vdl_writer = LogWriter(args.vdl_dir)

    for epoch_id in range(config.epochs):
        # 1. train with train dataset
        program.run(train_dataloader, exe, compiled_train_prog, train_feeds,
                    train_fetchs, epoch_id, 'train', config, vdl_writer,
                    lr_scheduler)

        if paddle.distributed.get_rank() == 0:
            # 2. validate with validate dataset
            if config.validate and epoch_id % config.valid_interval == 0:
                top1_acc = program.run(valid_dataloader, exe,
                                       compiled_valid_prog, valid_feeds,
                                       valid_fetchs, epoch_id, 'valid', config)
                if top1_acc > best_top1_acc:
                    best_top1_acc = top1_acc
                    message = "The best top1 acc {:.5f}, in epoch: {:d}".format(
                        best_top1_acc, epoch_id)
                    logger.info("{:s}".format(logger.coloring(message, "RED")))
                    if epoch_id % config.save_interval == 0:
                        model_path = os.path.join(config.model_save_dir,
                                                  config.ARCHITECTURE["name"])
                        save_model(train_prog, model_path, "best_model")

            # 3. save the persistable model
            if epoch_id % config.save_interval == 0:
                model_path = os.path.join(config.model_save_dir,
                                          config.ARCHITECTURE["name"])
                save_model(train_prog, model_path, epoch_id)
def main(args):
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    config = get_config(args.config, overrides=args.override, show=True)
    use_fp16 = config.get('use_fp16', False)
    if use_fp16:
        AMP_RELATED_FLAGS_SETTING = {
            'FLAGS_cudnn_exhaustive_search': 1,
            'FLAGS_conv_workspace_size_limit': 4000,
            'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
            'FLAGS_max_inplace_grad_add': 8,
        }
        os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = '1'
        paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)

    # assign the place
    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id)

    # startup_prog is used to do some parameter init work,
    # and train prog is used to hold the network
    startup_prog = fluid.Program()
    train_prog = fluid.Program()

    best_top1_acc = 0.0  # best top1 acc record

    if not config.get('use_ema'):
        train_dataloader, train_fetchs = program.build(
            config, train_prog, startup_prog, is_train=True)
    else:
        train_dataloader, train_fetchs, ema = program.build(
            config, train_prog, startup_prog, is_train=True)

    if config.validate:
        valid_prog = fluid.Program()
        valid_dataloader, valid_fetchs = program.build(
            config, valid_prog, startup_prog, is_train=False)
        # clone to prune some content which is irrelevant in valid_prog
        valid_prog = valid_prog.clone(for_test=True)

    # create the "Executor" with the statement of which place
    exe = fluid.Executor(place)
    # Parameter initialization
    exe.run(startup_prog)
    # load model from 1. checkpoint to resume training, 2. pretrained model to finetune
    init_model(config, train_prog, exe)

    if not config.get('use_dali', False):
        train_reader = Reader(config, 'train')()
        train_dataloader.set_sample_list_generator(train_reader, place)

        if config.validate:
            valid_reader = Reader(config, 'valid')()
            valid_dataloader.set_sample_list_generator(valid_reader, place)
            compiled_valid_prog = program.compile(config, valid_prog)
    else:
        import dali
        train_dataloader = dali.train(config)
        # NOTE: only the rank-0 trainer builds the validation pipeline.
        if config.validate and int(os.getenv("PADDLE_TRAINER_ID", 0)) == 0:
            valid_dataloader = dali.val(config)
            compiled_valid_prog = program.compile(config, valid_prog)

    compiled_train_prog = fleet.main_program

    vdl_writer = None
    if args.vdl_dir:
        if version_info.major == 2:
            logger.info(
                "visualdl is just supported for python3, so it is disabled in python2..."
            )
        else:
            from visualdl import LogWriter
            vdl_writer = LogWriter(args.vdl_dir)

    for epoch_id in range(config.epochs):
        # 1. train with train dataset
        program.run(train_dataloader, exe, compiled_train_prog, train_fetchs,
                    epoch_id, 'train', config, vdl_writer)

        if int(os.getenv("PADDLE_TRAINER_ID", 0)) == 0:
            # 2. validate with validate dataset
            if config.validate and epoch_id % config.valid_interval == 0:
                if config.get('use_ema'):
                    logger.info(logger.coloring("EMA validate start..."))
                    with ema.apply(exe):
                        top1_acc = program.run(valid_dataloader, exe,
                                               compiled_valid_prog,
                                               valid_fetchs, epoch_id, 'valid',
                                               config)
                    logger.info(logger.coloring("EMA validate over!"))

                top1_acc = program.run(valid_dataloader, exe,
                                       compiled_valid_prog, valid_fetchs,
                                       epoch_id, 'valid', config)
                if top1_acc > best_top1_acc:
                    best_top1_acc = top1_acc
                    message = "The best top1 acc {:.5f}, in epoch: {:d}".format(
                        best_top1_acc, epoch_id)
                    logger.info("{:s}".format(logger.coloring(message, "RED")))
                    if epoch_id % config.save_interval == 0:
                        model_path = os.path.join(config.model_save_dir,
                                                  config.ARCHITECTURE["name"])
                        save_model(train_prog, model_path, "best_model")

            # 3. save the persistable model
            if epoch_id % config.save_interval == 0:
                model_path = os.path.join(config.model_save_dir,
                                          config.ARCHITECTURE["name"])
                save_model(train_prog, model_path, epoch_id)
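# NOTE: illustrative sketch, not part of the original script. A minimal command-line
# entry point for the main() above; the real parser may differ. Only --config,
# --override and --vdl_dir are read by main().
import argparse


def parse_args():
    parser = argparse.ArgumentParser("static-graph classification training")
    parser.add_argument('-c', '--config', type=str, required=True,
                        help='path to the yaml configuration file')
    parser.add_argument('-o', '--override', action='append', default=[],
                        help='config options to override, e.g. -o validate=False')
    parser.add_argument('--vdl_dir', type=str, default=None,
                        help='VisualDL log directory (optional)')
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    main(args)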