def test_open_sync_batch_norm(self):
    import paddle.fluid as fluid
    import paddle.fluid.incubate.fleet.base.role_maker as role_maker
    from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy

    if not fluid.core.is_compiled_with_cuda():
        # Operator "gen_nccl_id" has not been registered
        return

    data = fluid.layers.data(name='X', shape=[1], dtype='float32')
    hidden = fluid.layers.fc(input=data, size=10)
    loss = fluid.layers.mean(hidden)

    optimizer = fluid.optimizer.AdamOptimizer()

    role = role_maker.UserDefinedCollectiveRoleMaker(0, ['127.0.0.1:6170'])
    fleet.init(role)

    dist_strategy = DistributedStrategy()
    dist_strategy.sync_batch_norm = True

    dist_optimizer = fleet.distributed_optimizer(optimizer,
                                                 strategy=dist_strategy)
    dist_optimizer.minimize(loss)

    self.assertEqual(dist_strategy.exec_strategy.num_threads, 1)
def optimize(self, loss, optimizer_type, lr):
    optimizer = F.optimizer.Adam(learning_rate=lr)

    dist_strategy = DistributedStrategy()
    dist_strategy.enable_sequential_execution = True

    optimizer = cfleet.distributed_optimizer(optimizer, strategy=dist_strategy)
    _, param_grads = optimizer.minimize(loss, F.default_startup_program())
def _build_strategy(self, context):
    from paddle.fluid.incubate.fleet.collective import DistributedStrategy

    exec_strategy = fluid.ExecutionStrategy()
    strategy = DistributedStrategy()
    strategy.exec_strategy = exec_strategy
    context["strategy"] = strategy
    return strategy
def test_amp_strategy(self):
    optimizer = fluid.optimizer.AdamOptimizer()
    optimizer = fluid.contrib.mixed_precision.decorate(
        optimizer, init_loss_scaling=1.0, use_dynamic_loss_scaling=True)

    dist_strategy = DistributedStrategy()
    dist_strategy.use_amp = True

    dist_optimizer = CollectiveOptimizer(optimizer, strategy=dist_strategy)
    # an optimizer already decorated for AMP combined with use_amp=True
    # should be rejected
    self.assertRaises(ValueError, dist_optimizer.minimize, None)
def test_recompute_strategy(self):
    optimizer = fluid.optimizer.AdamOptimizer()
    optimizer = fluid.optimizer.RecomputeOptimizer(optimizer)

    dist_strategy = DistributedStrategy()
    dist_strategy.forward_recompute = True
    dist_strategy.recompute_checkpoints = ["Test"]

    dist_optimizer = CollectiveOptimizer(optimizer, strategy=dist_strategy)
    self.assertRaises(ValueError, dist_optimizer.minimize, None)
def run_pipeline_trainer(self, args):
    self.lr = args.lr

    dist_strategy = DistributedStrategy()
    test_program, avg_cost, train_reader, test_reader, batch_acc, predict, data_loader = \
        self.get_model(batch_size=args.batch_size, dist_strategy=dist_strategy)

    device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
    eprint(type(self).__name__, "device_id: %d." % device_id)
    place = fluid.CUDAPlace(device_id)

    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    eprint(type(self).__name__, "run worker startup program done.")

    data_loader.set_sample_list_generator(train_reader, place)
    data_loader.start()
    print_to_err(type(self).__name__, "begin to train on trainer")
    out_losses = []
    for i in six.moves.xrange(RUN_STEP):
        loss = exe.run(fluid.default_main_program(), fetch_list=[avg_cost])
        loss = loss[0] if loss else None
        out_losses.append(loss)
        print_to_err(type(self).__name__, "run step %d finished" % i)
    print_to_err(type(self).__name__, "trainer run finished")

    if six.PY2:
        print(pickle.dumps(out_losses))
    else:
        sys.stdout.buffer.write(pickle.dumps(out_losses))

    if args.save_model:
        model_save_dir = "/tmp"
        if fleet.worker_index() == 0:
            model_save_dir_fluid = os.path.join(model_save_dir,
                                                "fluid_persistables")
            model_save_dir_fleet = os.path.join(model_save_dir,
                                                "fleet_persistables")
            infer_save_dir_fluid = os.path.join(model_save_dir, "fluid_infer")
            infer_save_dir_fleet = os.path.join(model_save_dir, "fleet_infer")
        else:
            model_save_dir_fluid = os.path.join(model_save_dir,
                                                "fluid_persistables_2")
            model_save_dir_fleet = os.path.join(model_save_dir,
                                                "fleet_persistables_2")
            infer_save_dir_fluid = os.path.join(model_save_dir, "fluid_infer_2")
            infer_save_dir_fleet = os.path.join(model_save_dir, "fleet_infer_2")
        fluid.io.save_persistables(exe, model_save_dir_fluid,
                                   fleet._origin_program)
        fleet.save_persistables(executor=exe, dirname=model_save_dir_fleet)
        # NOTE: feed_var_list was used below but never defined in the original
        # fragment; collecting the data variables from the origin program is
        # an assumption (it mirrors run_gpu_fleet_api_trainer elsewhere in
        # this collection).
        feed_var_list = [
            var for var in fleet._origin_program.global_block().vars.values()
            if var.is_data
        ]
        feeded_var_names = [var.name for var in feed_var_list]
        fluid.io.save_inference_model(infer_save_dir_fluid, feeded_var_names,
                                      [avg_cost], exe, fleet._origin_program)
        fleet.save_inference_model(exe, infer_save_dir_fleet, feeded_var_names,
                                   [avg_cost])
def default_exe_params(is_distributed, use_cuda, thread_num):
    """
    Set the default execute parameters.
    """
    gpu_id = 0
    trainer_num = 1
    trainer_id = 0
    dist_strategy = None
    places = None
    if is_distributed:
        if use_cuda:
            role = role_maker.PaddleCloudRoleMaker(is_collective=True)
            fleet.init(role)
            gpu_id = int(os.getenv("FLAGS_selected_gpus"))
            trainer_num = fleet.worker_num()
            trainer_id = fleet.worker_index()

            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.use_experimental_executor = True
            exec_strategy.num_threads = 4
            exec_strategy.num_iteration_per_drop_scope = 1

            dist_strategy = DistributedStrategy()
            dist_strategy.exec_strategy = exec_strategy
            dist_strategy.nccl_comm_num = 2
            dist_strategy.fuse_all_reduce_ops = True
            dist_strategy.forward_recompute = True
            dist_strategy.use_amp = True
            dist_strategy.amp_loss_scaling = 12800.0

            places = fluid.cuda_places()
        else:
            print('Only gpu is supported for distributed mode at present.')
            exit(-1)
    else:
        if use_cuda:
            places = fluid.cuda_places()
        else:
            places = fluid.cpu_places(thread_num)
            os.environ['CPU_NUM'] = str(thread_num)

    if use_cuda:
        exe = fluid.Executor(fluid.CUDAPlace(gpu_id))
    else:
        exe = fluid.Executor(fluid.CPUPlace())

    return {
        'exe': exe,
        'trainer_num': trainer_num,
        'trainer_id': trainer_id,
        'gpu_id': gpu_id,
        'dist_strategy': dist_strategy,
        'places': places
    }
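# Hedged usage sketch for default_exe_params (not taken from the original
# sources): the argument values below are illustrative only.
params = default_exe_params(is_distributed=False, use_cuda=False, thread_num=4)
exe = params['exe']                      # fluid.Executor on the chosen place
dist_strategy = params['dist_strategy']  # None when not distributed
places = params['places']                # feed places for a data loader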
def setup_optimizer(optimizer, use_cuda, is_distributed):
    """ Setup the optimizer """
    if use_cuda:
        if is_distributed:
            dist_strategy = DistributedStrategy()
            optimizer = fleet.distributed_optimizer(optimizer,
                                                    strategy=dist_strategy)
    # return the (possibly wrapped) optimizer so callers can minimize with it
    # (assumption: the original fragment ended without an explicit return)
    return optimizer
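# Hedged usage sketch for setup_optimizer: wrap a plain optimizer for
# collective training. Assumes fleet.init(...) has already run and that
# `loss` is the model's scalar loss (both hypothetical here).
optimizer = fluid.optimizer.SGD(learning_rate=0.01)
optimizer = setup_optimizer(optimizer, use_cuda=True, is_distributed=True)
optimizer.minimize(loss)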
def get_distributed_optimizer(optimizer):
    """
    Get the default collective distributed optimizer under fleet.
    """
    dist_strategy = DistributedStrategy()
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)
    return optimizer
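# Minimal sketch of get_distributed_optimizer in context. Assumes the script
# is launched via paddle.distributed.launch so PaddleCloudRoleMaker can read
# the cluster environment; `avg_cost` is a hypothetical scalar loss.
optimizer = fluid.optimizer.Adam(learning_rate=1e-3)
optimizer = get_distributed_optimizer(optimizer)
optimizer.minimize(avg_cost)
train_program = fleet.main_program  # fleet's rewritten main program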
def __init__(self,
             exec_config: List[_ExecuteConfig] = [],
             dist_config: List[_DistributeConfig] = []):
    from paddle.fluid import ExecutionStrategy
    from paddle.fluid.incubate.fleet.collective import DistributedStrategy

    self._strategy = DistributedStrategy()
    for conf in dist_config:
        conf.setup(self._strategy)
    for conf in exec_config:
        conf.setup(self._strategy.exec_strategy)
def dist_optimizer(config, optimizer):
    """
    Create a distributed optimizer based on a normal optimizer

    Args:
        config(dict):
        optimizer(): a normal optimizer

    Returns:
        optimizer: a distributed optimizer
    """
    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = 3
    exec_strategy.num_iteration_per_drop_scope = 10

    dist_strategy = DistributedStrategy()
    dist_strategy.nccl_comm_num = 1
    dist_strategy.fuse_all_reduce_ops = True
    dist_strategy.exec_strategy = exec_strategy
    optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)

    return optimizer
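# Hedged example for dist_optimizer: `config` is accepted but unused above,
# so an empty dict is passed; `avg_cost` is a hypothetical mean loss.
optimizer = fluid.optimizer.Adam(learning_rate=0.001)
optimizer = dist_optimizer(config={}, optimizer=optimizer)
optimizer.minimize(avg_cost)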
def distributed_optimize(optimizer):
    '''
    A part of the configuration for distributed training
    '''
    strategy = DistributedStrategy()
    strategy.fuse_all_reduce_ops = True
    strategy.nccl_comm_num = 2
    strategy.fuse_elewise_add_act_ops = True
    strategy.fuse_bn_act_ops = True
    return fleet.distributed_optimizer(optimizer, strategy=strategy)
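# Hedged example for distributed_optimize: the fusion flags only take effect
# once the wrapped optimizer builds the program in minimize(). `loss` is a
# hypothetical scalar loss; fleet.init(...) is assumed to have been called.
momentum = fluid.optimizer.Momentum(learning_rate=0.1, momentum=0.9)
dist_opt = distributed_optimize(momentum)
dist_opt.minimize(loss)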
def set_optimizer(self, FLAGS, net_output):
    """
    set optimizer
    """
    optimizer = net_output['optimizer']
    if self.is_multi_gpu(FLAGS):
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
        num_trainers = int(os.getenv("PADDLE_TRAINERS_NUM"))
        trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        logging.info("train_id:%s, num_trainers:%s, trainer_endpoints:%s" %
                     (trainer_id, num_trainers, trainer_endpoints))

        trainer_endpoints = trainer_endpoints.split(',')
        role = role_maker.UserDefinedCollectiveRoleMaker(
            current_id=trainer_id, worker_endpoints=trainer_endpoints)
        fleet.init(role)

        dist_strategy = DistributedStrategy()
        #num_nodes = len(set([x.split(':')[0] for x in trainer_endpoints]))
        #if num_nodes == 1:
        #    dist_strategy.use_local_sgd = True
        #    dist_strategy.mode = "collective"  # multi node is nccl2
        #    dist_strategy.collective_mode = "local_sgd"  # local_sgd or grad_allreduce
        #    logging.info("use local sgd, not nccl2 for single node.")
        """
        #TODO:
        dist_strategy.enable_inplace = FLAGS.with_inplace
        if FLAGS.fuse_ops:
            dist_strategy.fuse_all_reduce_ops = 1
        dist_strategy.nccl_comm_num = FLAGS.nccl_comm_num
        """
        optimizer = fleet.distributed_optimizer(optimizer,
                                                strategy=dist_strategy)
    return optimizer.minimize(net_output['loss'])
def run_gpu_fleet_api_trainer(self, args):
    assert args.update_method == "nccl2"

    self.lr = args.lr

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = 1

    dist_strategy = DistributedStrategy()
    dist_strategy.exec_strategy = exec_strategy
    dist_strategy.fuse_memory_size = 1  # MB
    dist_strategy.fuse_laryer_size = 1
    if args.use_local_sgd:
        dist_strategy.use_local_sgd = True
    if args.ut4grad_allreduce:
        dist_strategy._ut4grad_allreduce = True

    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)
    print_to_err("gpu_fleet", "fleet.node_num:")
    # "fleet.node_id:", fleet.node_id(),
    # "fleet.trainer_num:", fleet.worker_num())

    test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
        self.get_model(batch_size=args.batch_size, dist_strategy=dist_strategy)

    trainer_prog = fleet._origin_program
    dist_prog = fleet.main_program

    device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
    place = fluid.CUDAPlace(device_id)

    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    eprint(type(self).__name__, "run worker startup program done.")

    feed_var_list = [
        var for var in trainer_prog.global_block().vars.values()
        if var.is_data
    ]

    feeder = fluid.DataFeeder(feed_var_list, place)
    reader_generator = train_reader()

    def get_data():
        origin_batch = next(reader_generator)
        if args.update_method != "local" and args.use_reader_alloc:
            new_batch = []
            for offset, item in enumerate(origin_batch):
                if offset % 2 == args.trainer_id:
                    new_batch.append(item)
            return new_batch
        else:
            return origin_batch

    print_to_err(type(self).__name__, "begin to train on trainer")
    out_losses = []
    for i in six.moves.xrange(RUN_STEP):
        loss, = exe.run(dist_prog,
                        fetch_list=[avg_cost.name],
                        feed=feeder.feed(get_data()))
        out_losses.append(loss[0])
        print_to_err(type(self).__name__, "run step %d finished" % i)
    print_to_err(type(self).__name__, "trainer run finished")

    if six.PY2:
        print(pickle.dumps(out_losses))
    else:
        sys.stdout.buffer.write(pickle.dumps(out_losses))
def main(args):
    with open(args.config, 'r') as f:
        config = json.load(f)

    logging.info('Load data ...')
    if len(args.dataset.split(',')) > 1:
        # for large pretraining dataset, ZINC15 and ChEMBL
        # directly load the processed npz files
        train_data_list = []
        for ds in args.dataset.split(','):
            # use processed data.npz
            train_data_list.extend(
                load_data(os.path.join(args.root, ds, 'processed')))

            # dataset = MoleculeDataset(
            #         args.root, ds,
            #         add_symmetry=False,
            #         add_self_loop=False)
            # data_list = dataset.get_data_list()
            # processed_dir = os.path.join(args.root, ds, 'processed')
            # os.makedirs(processed_dir, exist_ok=True)
            # save_data_list_to_npz(
            #         data_list, os.path.join(processed_dir, 'data.npz'))
            # logging.info('Processed {}'.format(ds))
            # train_data_list.extend(data_list)
    else:
        if args.dataset == 'mutag':
            train_data_list, _ = load_mutag_dataset(
                os.path.join(args.root, args.dataset, 'raw'))
        elif args.dataset == 'ptc_mr':
            train_data_list, _ = load_ptc_mr_dataset(
                os.path.join(args.root, args.dataset, 'raw'))
        else:
            raise ValueError('Unsupported dataset')

    if args.is_fleet:
        train_data_list = [
            x for i, x in enumerate(train_data_list)
            if i % fleet.worker_num() == fleet.worker_index()
        ]
    logging.info("Data loaded.")
    logging.info("Train Examples: %s" % len(train_data_list))
    sys.stdout.flush()

    if args.emb_dir is not None:
        os.makedirs(args.emb_dir, exist_ok=True)

    train_prog = F.Program()
    test_prog = F.Program()
    startup_prog = F.Program()
    with F.program_guard(train_prog, startup_prog):
        with F.unique_name.guard():
            agent = create_model(args, config)
            test_prog = train_prog.clone(for_test=True)

            opt = F.optimizer.Adam(learning_rate=args.lr)
            if args.is_fleet:
                dist_strategy = DistributedStrategy()
                role = role_maker.PaddleCloudRoleMaker(is_collective=True)
                fleet.init(role)
                opt = fleet.distributed_optimizer(opt, strategy=dist_strategy)
            opt.minimize(agent.loss)

    place = F.CUDAPlace(0) if args.use_cuda else F.CPUPlace()
    exe = F.Executor(place)
    exe.run(startup_prog)

    if (not args.dont_save_emb) and \
            (not args.is_fleet or fleet.worker_index() == 0):
        save_embedding(args, exe, test_prog, agent, train_data_list, -1)

    for epoch_id in range(args.max_epoch):
        train(args, exe, train_prog, agent, train_data_list, epoch_id)
        if not args.is_fleet or fleet.worker_index() == 0:
            F.io.save_params(exe, '%s/epoch%s' % (args.model_dir, epoch_id),
                             train_prog)
            if not args.dont_save_emb:
                save_embedding(args, exe, test_prog, agent, train_data_list,
                               epoch_id)
def main(args):
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        dev_list = fluid.cuda_places()
        place = dev_list[0]
        dev_count = len(dev_list)
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    reader = reader_ce.ClassifyReader(vocab_path=args.vocab_path,
                                      label_map_config=args.label_map_config,
                                      max_seq_len=args.max_seq_len,
                                      total_num=args.train_data_size,
                                      do_lower_case=args.do_lower_case,
                                      in_tokens=args.in_tokens,
                                      random_seed=args.random_seed,
                                      tokenizer=args.tokenizer,
                                      for_cn=args.for_cn,
                                      task_id=args.task_id)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    if args.do_test:
        assert args.test_save is not None

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.predict_batch_size is None:
        args.predict_batch_size = args.batch_size

    if args.do_train:
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        dev_count = fleet.worker_num()

        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            dev_count=1,
            trainer_id=fleet.worker_index(),
            trainer_num=fleet.worker_num(),
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        log.info("Device count: %d" % dev_count)
        log.info("Num train examples: %d" % num_train_examples)
        log.info("Max train steps: %d" % max_train_steps)
        log.info("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        # use fleet api
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        if args.is_distributed:
            exec_strategy.num_threads = 3
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        dist_strategy = DistributedStrategy()
        dist_strategy.exec_strategy = exec_strategy
        dist_strategy.nccl_comm_num = 1
        if args.is_distributed:
            dist_strategy.nccl_comm_num = 2
            dist_strategy.use_hierarchical_allreduce = True

        if args.use_mix_precision:
            dist_strategy.use_amp = True

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config)
                scheduled_lr = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio,
                    dist_strategy=dist_strategy)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            log.info("Theoretical memory usage in training: %.3f - %.3f %s" %
                     (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config,
                    is_prediction=True)

        test_prog = test_prog.clone(for_test=True)

    train_program = fleet.main_program

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            log.warning(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe, args.init_checkpoint,
                            main_program=startup_prog)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing validation or testing!")
        init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog)

    if args.do_train:
        train_exe = exe
        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    test_exe = exe
    # if args.do_val or args.do_test:
    #     if args.use_multi_gpu_test:
    #         test_exe = fluid.ParallelExecutor(
    #             use_cuda=args.use_cuda,
    #             main_program=test_prog,
    #             share_vars_from=train_exe)

    current_epoch = 0
    steps = 0
    if args.do_train:
        train_pyreader.start()
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr

        ce_info = []
        time_begin = time.time()
        last_epoch = 0
        while True:
            try:
                steps += 1
                # log.info("step: %d" % steps)

                if fleet.worker_index() != 0:
                    train_exe.run(fetch_list=[], program=train_program)
                    continue

                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[], program=train_program)
                else:
                    outputs = evaluate(train_exe,
                                       train_program,
                                       train_pyreader,
                                       graph_vars,
                                       "train",
                                       metric=args.metric)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % \
                            train_pyreader.queue.size()
                        verbose += "learning rate: %f" % (
                            outputs["learning_rate"]
                            if warmup_steps > 0 else args.learning_rate)
                        log.info(verbose)

                    current_example, current_epoch = reader.get_train_progress()
                    time_end = time.time()
                    used_time = time_end - time_begin

                    log.info(
                        "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                        "ave acc: %f, speed: %f steps/s" %
                        (current_epoch, current_example * dev_count,
                         num_train_examples, steps, outputs["loss"],
                         outputs["accuracy"], args.skip_steps / used_time))
                    ce_info.append(
                        [outputs["loss"], outputs["accuracy"], used_time])
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path,
                                               fleet._origin_program)

                # if steps % args.validation_steps == 0 or last_epoch != current_epoch:
                if steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        evaluate_wrapper(args, reader, exe, test_prog,
                                         test_pyreader, graph_vars,
                                         current_epoch, steps)
                    if args.do_test:
                        predict_wrapper(args, reader, exe, test_prog,
                                        test_pyreader, graph_vars,
                                        current_epoch, steps)

                if last_epoch != current_epoch:
                    last_epoch = current_epoch

            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path,
                                           fleet._origin_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if args.do_val:
        evaluate_wrapper(args, reader, exe, test_prog, test_pyreader,
                         graph_vars, current_epoch, steps)

    # final eval on test set
    if args.do_test:
        predict_wrapper(args, reader, exe, test_prog, test_pyreader,
                        graph_vars, current_epoch, steps)

    # final eval on diagnostic, hack for glue-ax
    if args.diagnostic:
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(args.diagnostic,
                                  batch_size=args.batch_size,
                                  epoch=1,
                                  dev_count=1,
                                  shuffle=False))
        log.info("Final diagnostic")
        qids, preds, probs = predict(test_exe, test_prog, test_pyreader,
                                     graph_vars)
        assert len(qids) == len(preds), '{} v.s. {}'.format(
            len(qids), len(preds))
        with open(args.diagnostic_save, 'w') as f:
            for id, s, p in zip(qids, preds, probs):
                f.write('{}\t{}\t{}\n'.format(id, s, p))
        log.info("Done final diagnostic, saving to {}".format(
            args.diagnostic_save))
def train(args):
    print("pretraining start")
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    with open(args.task_group_json) as f:
        task_group = json.load(f)

    exec_strategy = fluid.ExecutionStrategy()
    if args.use_fast_executor:
        exec_strategy.use_experimental_executor = True
    exec_strategy.num_threads = 4 if args.use_amp else 2
    exec_strategy.num_iteration_per_drop_scope = min(1, args.skip_steps)

    node_nums = int(os.getenv("PADDLE_NODES_NUM"))
    print("args.is_distributed:", args.is_distributed)
    num_trainers = 1
    trainer_id = 0

    if args.is_distributed:
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        trainer_id = fleet.worker_index()
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = fleet.worker_endpoints()
        trainers_num = len(worker_endpoints)
        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"
              .format(worker_endpoints, trainers_num, current_endpoint,
                      trainer_id))

        dist_strategy = DistributedStrategy()
        dist_strategy.exec_strategy = exec_strategy
        dist_strategy.remove_unnecessary_lock = False  # not useful
        dist_strategy.fuse_all_reduce_ops = True if args.use_fuse else False
        dist_strategy.nccl_comm_num = args.nccl_comm_num

        if args.use_hierarchical_allreduce \
                and trainers_num > args.hierarchical_allreduce_inter_nranks:
            dist_strategy.use_hierarchical_allreduce = \
                args.use_hierarchical_allreduce
            dist_strategy.hierarchical_allreduce_inter_nranks = \
                args.hierarchical_allreduce_inter_nranks
            # the inter-node group must contain more than one rank and divide
            # the total rank count evenly (the original asserted on the
            # boolean use_hierarchical_allreduce flag, which always fails)
            assert dist_strategy.hierarchical_allreduce_inter_nranks > 1
            assert trainers_num % dist_strategy.hierarchical_allreduce_inter_nranks == 0
            dist_strategy.hierarchical_allreduce_exter_nranks = \
                trainers_num // dist_strategy.hierarchical_allreduce_inter_nranks

        if args.use_amp:
            dist_strategy.use_amp = True
            dist_strategy.amp_loss_scaling = args.init_loss_scaling
        if args.use_recompute:
            dist_strategy.forward_recompute = True
            dist_strategy.enable_sequential_execution = True

        trainer_id = fleet.worker_index()
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = fleet.worker_endpoints()
        trainers_num = len(worker_endpoints)
        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"
              .format(worker_endpoints, trainers_num, current_endpoint,
                      trainer_id))
    else:
        dist_strategy = None

    gpu_id = 0
    gpus = fluid.core.get_cuda_device_count()
    if args.is_distributed:
        gpus = os.getenv("FLAGS_selected_gpus").split(",")
        gpu_id = int(gpus[0])

    if args.use_cuda:
        place = fluid.CUDAPlace(gpu_id)
        # gpus is an int in the non-distributed branch and a list otherwise;
        # guarding len() here is an assumed fix for the original len(int) bug
        dev_count = len(gpus) if isinstance(gpus, (list, tuple)) else gpus
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    print("Device count %d, gpu_id:%d" % (dev_count, gpu_id))

    train_program = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            train_pyreader, fetch_vars = create_model(
                pyreader_name='train_reader',
                ernie_config=ernie_config,
                task_group=task_group)
            graph_vars = fetch_vars["graph_vars"]
            checkpoints = fetch_vars["checkpoints"]
            total_loss = graph_vars[-1]
            if args.use_recompute:
                dist_strategy.recompute_checkpoints = checkpoints
            scheduled_lr, loss_scaling = optimization(
                loss=total_loss,
                warmup_steps=args.warmup_steps,
                num_train_steps=args.num_train_steps,
                learning_rate=args.learning_rate,
                train_program=train_program,
                startup_prog=startup_prog,
                weight_decay=args.weight_decay,
                scheduler=args.lr_scheduler,
                use_fp16=args.use_amp,
                use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                init_loss_scaling=args.init_loss_scaling,
                incr_every_n_steps=args.incr_every_n_steps,
                decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                incr_ratio=args.incr_ratio,
                decr_ratio=args.decr_ratio,
                dist_strategy=dist_strategy)

    origin_train_program = train_program
    if args.is_distributed:
        # wrapped by fleet, need to assign fleet's modified train_program back
        train_program = fleet.main_program
        origin_train_program = fleet._origin_program

    test_prog = fluid.Program()
    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            test_pyreader, fetch_vars = create_model(
                pyreader_name='test_reader',
                ernie_config=ernie_config,
                task_group=task_group)
            graph_vars = fetch_vars["graph_vars"]
            total_loss = graph_vars[-1]

    test_prog = test_prog.clone(for_test=True)

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.init_checkpoint and args.init_checkpoint != "":
        #init_checkpoint(exe, args.init_checkpoint, origin_train_program, args.use_amp)
        init_pretraining_params(exe, args.init_checkpoint,
                                origin_train_program, args.use_amp)

    data_reader = ErnieDataReader(task_group,
                                  False,
                                  batch_size=args.batch_size,
                                  vocab_path=args.vocab_path,
                                  voc_size=ernie_config['vocab_size'],
                                  epoch=args.epoch,
                                  max_seq_len=args.max_seq_len,
                                  generate_neg_sample=args.generate_neg_sample,
                                  hack_old_trainset=args.hack_old_data)

    # only fleet
    train_exe = exe

    predict = predict_wrapper(args,
                              exe,
                              ernie_config,
                              task_group,
                              test_prog=test_prog,
                              pyreader=test_pyreader,
                              fetch_list=[var.name for var in graph_vars])

    train_pyreader.set_batch_generator(data_reader.data_generator())
    train_pyreader.start()
    steps = 112000
    time_begin = time.time()
    node_nums = int(os.getenv("PADDLE_NODES_NUM"))
    while True:  # steps < args.num_train_steps:
        try:
            steps += 1  # node_nums
            skip_steps = args.skip_steps  # * node_nums

            fetch_list = []
            if trainer_id == 0 and steps % skip_steps == 0:
                fetch_list = [var.name for var in graph_vars] + \
                    [scheduled_lr.name]
                if args.use_amp:
                    fetch_list.append(loss_scaling.name)

            outputs = train_exe.run(fetch_list=fetch_list,
                                    program=train_program)
            time_end = time.time()
            used_time = time_end - time_begin

            if outputs:
                each_mask_lm_cost, lm_w = outputs[:2]
                if args.use_amp:
                    each_total_constract_loss, each_total_cost, np_lr, l_scaling = outputs[-4:]
                else:
                    each_total_constract_loss, each_total_cost, np_lr = outputs[-3:]
                acc_list = []
                index = 2
                for task in task_group:
                    each_task_acc = outputs[index]
                    task_w = outputs[index + 1]
                    acc = np.sum(each_task_acc * task_w) / np.sum(task_w)
                    acc_list.append("%s acc: %f" % (task["task_name"], acc))
                    index += 2

                print("feed_queue size", train_pyreader.queue.size())
                epoch, current_file_index, total_file, current_file, mask_type = \
                    data_reader.get_progress()
                if args.use_amp:
                    print("current learning_rate:%f, loss scaling:%f" %
                          (np_lr[0], l_scaling[0]))
                else:
                    print("current learning_rate:%f" % np_lr[0])
                print(
                    "epoch: %d, progress: %d/%d, step: %d, constract_loss: %f, loss: %f, "
                    "ppl: %f, %s, speed: %f steps/s, file: %s, mask_type: %s"
                    % (epoch, current_file_index, total_file, steps,
                       np.mean(each_total_constract_loss),
                       np.mean(each_total_cost),
                       np.exp(np.sum(each_mask_lm_cost * lm_w) / np.sum(lm_w)),
                       ", ".join(acc_list), skip_steps / used_time,
                       current_file, mask_type))
                time_begin = time.time()
            elif steps % skip_steps == 0:
                epoch, current_file_index, total_file, current_file, mask_type = \
                    data_reader.get_progress()
                print("feed_queue size", train_pyreader.queue.size())
                print("epoch: %d, progress: %d/%d, step: %d, "
                      "speed: %f steps/s, file: %s, mask_type: %s"
                      % (epoch, current_file_index, total_file, steps,
                         skip_steps / used_time, current_file, mask_type))
                time_begin = time.time()

            if not trainer_id == 0:
                continue

            if steps % args.save_steps == 0:
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path,
                                           origin_train_program)

            if steps % args.validation_steps == 0:
                valid_list = predict()
                print("[validation_set] epoch: %d, step: %d, %s" %
                      (epoch, steps, ", ".join(valid_list)))

        except fluid.core.EOFException:
            train_pyreader.reset()
            break
def _make_program(self, mode):
    prog = self._progs.get(mode, None)
    if prog is not None:
        return

    prog = self._orig_prog.clone()
    # NOTE: When defining learning rate scheduling in static-graph, ops to
    # increase the global step var and calculate learning rate would be
    # prepended into _orig_prog. The test program made by `_orig_prog.clone`
    # also would include these ops. Thus must prune these ops in test
    # program, otherwise the global step would be changed in test.
    if mode != 'train':
        for op in list(prog.global_block().ops):
            prog.global_block()._remove_op(0)
    if mode == 'train' and self.model._optimizer \
            and self.model._optimizer._learning_rate_map:
        # HACK workaround learning rate map issue
        lr_var = self.model._optimizer._learning_rate_map[self._orig_prog]
        self.model._optimizer._learning_rate_map[prog] = lr_var

    losses = []
    metrics = []
    with fluid.program_guard(prog, self._startup_prog):
        ins = self.model._inputs
        lbls = self.model._labels if self.model._labels else []
        inputs = [k.forward() for k in to_list(ins)]
        labels = [k.forward() for k in to_list(lbls)]
        self._label_vars[mode] = labels
        outputs = to_list(self.model.forward(*inputs))

        if mode != 'test' and self.model._loss_function:
            losses = self.model._loss_function(outputs, labels)

        if self._nranks > 1 and mode != 'train':
            outputs = [_all_gather(o, self._nranks) for o in outputs]
            if mode != 'test':
                labels = [_all_gather(l, self._nranks) for l in labels]

        if mode != 'test':
            for metric in self.model._metrics:
                metrics.append(to_list(metric.add_metric_op(outputs, labels)))

        if mode == 'train' and self.model._optimizer:
            self._loss_endpoint = fluid.layers.sum(losses)
            if self._nranks > 1:
                role = role_maker.PaddleCloudRoleMaker(is_collective=True)
                fleet.init(role)
                dist_strategy = DistributedStrategy()
                dist_strategy.mode = "collective"
                dist_strategy.collective_mode = "grad_allreduce"
                self.model._optimizer = fleet.distributed_optimizer(
                    self.model._optimizer, strategy=dist_strategy)
            self.model._optimizer.minimize(self._loss_endpoint)

    if mode != 'train':  # clone again to put it in test mode
        prog = prog.clone(for_test=True)

    self._input_vars[mode] = inputs
    self._progs[mode] = prog
    self._endpoints[mode] = {
        "output": outputs,
        "loss": losses,
        "metric": metrics
    }
def _build_programs(self):
    """
    Build programs.

    Build train_program, eval_program and inference_program. Only use in
    static graph mode.
    """
    if self.run_infer:
        self.startup_program = fluid.Program()
        # build infer program
        self.infer_program = fluid.Program()
        with fluid.program_guard(self.infer_program, self.startup_program):
            with fluid.unique_name.guard():
                self.infer_feed_dict = inputs = self._get_feed_dict(
                    is_infer=True)
                outputs = self.forward(inputs, is_infer=True)
                predictions = self.infer(inputs, outputs)
                self.infer_fetch_dict = predictions
        self.infer_program = self.infer_program.clone(for_test=True)

        self.program = self.infer_program
    else:
        if self.is_distributed:
            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.use_experimental_executor = True
            exec_strategy.num_threads = 4
            exec_strategy.num_iteration_per_drop_scope = 1

            dist_strategy = DistributedStrategy()
            dist_strategy.exec_strategy = exec_strategy
            dist_strategy.nccl_comm_num = 1
            dist_strategy.fuse_all_reduce_ops = True
            if self.use_recompute:
                dist_strategy.forward_recompute = True
                dist_strategy.enable_sequential_execution = True
            if self.use_amp:
                dist_strategy.use_amp = True
                dist_strategy.amp_loss_scaling = self.amp_loss_scaling
            self.dist_strategy = dist_strategy

        self.startup_program = fluid.Program()
        # build train program
        self.train_program = fluid.Program()
        with fluid.program_guard(self.train_program, self.startup_program):
            with fluid.unique_name.guard():
                self.feed_dict = inputs = self._get_feed_dict()
                outputs = self.forward(inputs)
                if self.is_distributed and self.use_recompute:
                    self.dist_strategy.recompute_checkpoints = outputs[
                        "checkpoints"]
                metrics, statistics = self.get_metrics_and_statistics(
                    inputs, outputs)

                # build eval program
                self.eval_program = self.train_program.clone(for_test=True)
                self.eval_fetch_dict = {**metrics, **statistics}

                scheduled_lr = self.optimize(metrics)
                metrics["scheduled_lr"] = scheduled_lr
                self.train_fetch_dict = metrics

        self.program = self.train_program
        if self.is_distributed:
            self.train_program = fleet.main_program

    self.exe.run(self.startup_program)
    if self.init_pretraining_params != "":
        init_pretraining_params(self.exe, self.init_pretraining_params,
                                self.program)
    elif self.init_checkpoint != "":
        init_checkpoint(self.exe, self.init_checkpoint, self.program)
    return
def main():
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)  # new line 3
    fleet.init(role)  # new line 4

    env = os.environ
    num_trainers = int(env.get('PADDLE_TRAINERS_NUM', 0))
    assert num_trainers != 0, "multi-machine training process must be started using distributed.launch..."
    trainer_id = int(env.get("PADDLE_TRAINER_ID", 0))

    # set different seeds for different trainers
    random.seed(trainer_id)
    np.random.seed(trainer_id)

    if FLAGS.enable_ce:
        random.seed(0)
        np.random.seed(0)

    cfg = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    check_config(cfg)
    # check if set use_gpu=True in paddlepaddle cpu version
    check_gpu(cfg.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()

    save_only = getattr(cfg, 'save_prediction_only', False)
    if save_only:
        raise NotImplementedError('The config file only support prediction,'
                                  ' training stage is not implemented now')
    main_arch = cfg.architecture

    assert cfg.use_gpu == True, "GPU must be supported for multi-machine training..."
    devices_num = fluid.core.get_cuda_device_count()

    if 'FLAGS_selected_gpus' in env:
        device_id = int(env['FLAGS_selected_gpus'])
    else:
        device_id = 0
    place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    lr_builder = create('LearningRate')
    optim_builder = create('OptimizerBuilder')

    # build program
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    if FLAGS.enable_ce:
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = create(main_arch)
            if FLAGS.fp16:
                assert (getattr(model.backbone, 'norm_type', None) != 'affine_channel'), \
                    '--fp16 currently does not support affine channel, ' \
                    ' please modify backbone settings to use batch norm'

            with mixed_precision_context(FLAGS.loss_scale, FLAGS.fp16) as ctx:
                inputs_def = cfg['TrainReader']['inputs_def']
                feed_vars, train_loader = model.build_inputs(**inputs_def)
                train_fetches = model.train(feed_vars)
                loss = train_fetches['loss']
                if FLAGS.fp16:
                    loss *= ctx.get_loss_scale_var()
                lr = lr_builder()
                optimizer = optim_builder(lr)

                dist_strategy = DistributedStrategy()
                sync_bn = getattr(model.backbone, 'norm_type',
                                  None) == 'sync_bn'
                dist_strategy.sync_batch_norm = sync_bn
                dist_strategy.nccl_comm_num = 1
                exec_strategy = fluid.ExecutionStrategy()
                exec_strategy.num_threads = 3
                exec_strategy.num_iteration_per_drop_scope = 30
                dist_strategy.exec_strategy = exec_strategy
                dist_strategy.fuse_all_reduce_ops = True

                optimizer = fleet.distributed_optimizer(
                    optimizer, strategy=dist_strategy)  # new line 5
                optimizer.minimize(loss)

                if FLAGS.fp16:
                    loss /= ctx.get_loss_scale_var()

            if 'use_ema' in cfg and cfg['use_ema']:
                global_steps = _decay_step_counter()
                ema = ExponentialMovingAverage(
                    cfg['ema_decay'], thres_steps=global_steps)
                ema.update()

    # parse train fetches
    train_keys, train_values, _ = parse_fetches(train_fetches)
    train_values.append(lr)

    if FLAGS.eval:
        eval_prog = fluid.Program()
        with fluid.program_guard(eval_prog, startup_prog):
            with fluid.unique_name.guard():
                model = create(main_arch)
                inputs_def = cfg['EvalReader']['inputs_def']
                feed_vars, eval_loader = model.build_inputs(**inputs_def)
                fetches = model.eval(feed_vars)
        eval_prog = eval_prog.clone(True)

        eval_reader = create_reader(cfg.EvalReader, devices_num=1)
        # When iterable mode, set set_sample_list_generator(eval_reader, place)
        eval_loader.set_sample_list_generator(eval_reader)

        # parse eval fetches
        extra_keys = []
        if cfg.metric == 'COCO':
            extra_keys = ['im_info', 'im_id', 'im_shape']
        if cfg.metric == 'VOC':
            extra_keys = ['gt_bbox', 'gt_class', 'is_difficult']
        if cfg.metric == 'WIDERFACE':
            extra_keys = ['im_id', 'im_shape', 'gt_bbox']
        eval_keys, eval_values, eval_cls = parse_fetches(fetches, eval_prog,
                                                         extra_keys)

    exe.run(startup_prog)
    compiled_train_prog = fleet.main_program

    if FLAGS.eval:
        compiled_eval_prog = fluid.CompiledProgram(eval_prog)

    fuse_bn = getattr(model.backbone, 'norm_type', None) == 'affine_channel'
    ignore_params = cfg.finetune_exclude_pretrained_params \
        if 'finetune_exclude_pretrained_params' in cfg else []

    start_iter = 0
    if FLAGS.resume_checkpoint:
        checkpoint.load_checkpoint(exe, train_prog, FLAGS.resume_checkpoint)
        start_iter = checkpoint.global_step()
    elif cfg.pretrain_weights and fuse_bn and not ignore_params:
        checkpoint.load_and_fusebn(exe, train_prog, cfg.pretrain_weights)
    elif cfg.pretrain_weights:
        checkpoint.load_params(exe,
                               train_prog,
                               cfg.pretrain_weights,
                               ignore_params=ignore_params)

    train_reader = create_reader(cfg.TrainReader,
                                 (cfg.max_iters - start_iter) * devices_num,
                                 cfg,
                                 devices_num=devices_num)
    # When iterable mode, set set_sample_list_generator(train_reader, place)
    train_loader.set_sample_list_generator(train_reader)

    # whether output bbox is normalized in model output layer
    is_bbox_normalized = False
    if hasattr(model, 'is_bbox_normalized') and \
            callable(model.is_bbox_normalized):
        is_bbox_normalized = model.is_bbox_normalized()

    # if map_type not set, use default 11point, only use in VOC eval
    map_type = cfg.map_type if 'map_type' in cfg else '11point'

    train_stats = TrainingStats(cfg.log_iter, train_keys)
    train_loader.start()
    start_time = time.time()
    end_time = time.time()

    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)
    time_stat = deque(maxlen=cfg.log_iter)
    best_box_ap_list = [0.0, 0]  # [map, iter]

    # use VisualDL to log data
    if FLAGS.use_vdl:
        assert six.PY3, "VisualDL requires Python >= 3.5"
        from visualdl import LogWriter
        vdl_writer = LogWriter(FLAGS.vdl_log_dir)
        vdl_loss_step = 0
        vdl_mAP_step = 0

    for it in range(start_iter, cfg.max_iters):
        start_time = end_time
        end_time = time.time()
        time_stat.append(end_time - start_time)
        time_cost = np.mean(time_stat)
        eta_sec = (cfg.max_iters - it) * time_cost
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        outs = exe.run(compiled_train_prog, fetch_list=train_values)
        stats = {k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1])}

        # use vdl-paddle to log loss
        if FLAGS.use_vdl:
            if it % cfg.log_iter == 0:
                for loss_name, loss_value in stats.items():
                    vdl_writer.add_scalar(loss_name, loss_value, vdl_loss_step)
                vdl_loss_step += 1

        train_stats.update(stats)
        logs = train_stats.log()
        if it % cfg.log_iter == 0 and trainer_id == 0:
            strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format(
                it, np.mean(outs[-1]), logs, time_cost, eta)
            logger.info(strs)

        # NOTE : profiler tools, used for benchmark
        if FLAGS.is_profiler and it == 5:
            profiler.start_profiler("All")
        elif FLAGS.is_profiler and it == 10:
            profiler.stop_profiler("total", FLAGS.profiler_path)
            return

        if (it > 0 and it % cfg.snapshot_iter == 0 or it == cfg.max_iters - 1) \
                and trainer_id == 0:
            save_name = str(it) if it != cfg.max_iters - 1 else "model_final"
            if 'use_ema' in cfg and cfg['use_ema']:
                exe.run(ema.apply_program)
            checkpoint.save(exe, train_prog, os.path.join(save_dir, save_name))

            if FLAGS.eval:
                # evaluation
                resolution = None
                if 'Mask' in cfg.architecture:
                    resolution = model.mask_head.resolution
                results = eval_run(exe,
                                   compiled_eval_prog,
                                   eval_loader,
                                   eval_keys,
                                   eval_values,
                                   eval_cls,
                                   cfg,
                                   resolution=resolution)
                box_ap_stats = eval_results(results, cfg.metric,
                                            cfg.num_classes, resolution,
                                            is_bbox_normalized,
                                            FLAGS.output_eval, map_type,
                                            cfg['EvalReader']['dataset'])

                # use vdl_paddle to log mAP
                if FLAGS.use_vdl:
                    vdl_writer.add_scalar("mAP", box_ap_stats[0], vdl_mAP_step)
                    vdl_mAP_step += 1

                if box_ap_stats[0] > best_box_ap_list[0]:
                    best_box_ap_list[0] = box_ap_stats[0]
                    best_box_ap_list[1] = it
                    checkpoint.save(exe, train_prog,
                                    os.path.join(save_dir, "best_model"))
                logger.info("Best test box ap: {}, in iter: {}".format(
                    best_box_ap_list[0], best_box_ap_list[1]))

            if 'use_ema' in cfg and cfg['use_ema']:
                exe.run(ema.restore_program)

    train_loader.reset()
def train(args): """train start""" logging.info(args) gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) place = fluid.CUDAPlace(gpu_id) dev_count = 1 exe = fluid.Executor(place) train_program = fluid.Program() startup_program = fluid.Program() # For Distributed Training. role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) args.num_trainers = fleet.worker_num() args.trainer_id = fleet.worker_index() dist_strategy = DistributedStrategy() with fluid.program_guard(train_program, startup_program): with fluid.unique_name.guard(): sum_cost, avg_cost, predict, token_num, pyreader = transformer( ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size, ModelHyperParams.max_length + 1, ModelHyperParams.n_layer, ModelHyperParams.n_head, ModelHyperParams.d_key, ModelHyperParams.d_value, ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, ModelHyperParams.prepostprocess_dropout, ModelHyperParams.attention_dropout, ModelHyperParams.relu_dropout, ModelHyperParams.preprocess_cmd, ModelHyperParams.postprocess_cmd, ModelHyperParams.weight_sharing, TrainTaskConfig.label_smooth_eps, ModelHyperParams.bos_idx, use_py_reader=args.use_py_reader, is_test=False) optimizer = None if args.sync: lr_decay = fluid.layers.learning_rate_scheduler.noam_decay( ModelHyperParams.d_model, TrainTaskConfig.warmup_steps) with fluid.default_main_program()._lr_schedule_guard(): learning_rate = lr_decay * TrainTaskConfig.learning_rate optimizer = fluid.optimizer.Adam(learning_rate=learning_rate, beta1=TrainTaskConfig.beta1, beta2=TrainTaskConfig.beta2, epsilon=TrainTaskConfig.eps) else: optimizer = fluid.optimizer.SGD(0.003) if args.use_fp16: optimizer = decorate(optimizer, init_loss_scaling=args.loss_scaling) optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy) optimizer.minimize(avg_cost, startup_program) train_program = fleet.main_program orig_train_program = fleet._origin_program train_loop(args, exe, train_program, orig_train_program, startup_program, dev_count, sum_cost, avg_cost, token_num, predict, pyreader)
def compress(args):
    shuffle = True
    if args.ce_test:
        # set seed
        seed = 111
        paddle.seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        args.num_workers = 0
        shuffle = False

    env = os.environ
    num_trainers = int(env.get('PADDLE_TRAINERS_NUM', 1))
    use_data_parallel = num_trainers > 1

    if use_data_parallel:
        # Fleet step 1: initialize the distributed environment
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)

    train_reader = None
    test_reader = None
    if args.data == "mnist":
        transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
        train_dataset = paddle.vision.datasets.MNIST(
            mode='train', backend="cv2", transform=transform)
        val_dataset = paddle.vision.datasets.MNIST(
            mode='test', backend="cv2", transform=transform)
        class_dim = 10
        image_shape = "1,28,28"
        args.pretrained_model = False
    elif args.data == "cifar10":
        transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
        train_dataset = paddle.vision.datasets.Cifar10(
            mode="train", backend="cv2", transform=transform)
        val_dataset = paddle.vision.datasets.Cifar10(
            mode="test", backend="cv2", transform=transform)
        class_dim = 10
        image_shape = "3, 32, 32"
        args.pretrained_model = False
    elif args.data == "imagenet":
        import imagenet_reader as reader
        train_dataset = reader.ImageNetDataset(mode='train')
        val_dataset = reader.ImageNetDataset(mode='val')
        class_dim = 1000
        image_shape = "3,224,224"
    else:
        raise ValueError("{} is not supported.".format(args.data))
    image_shape = [int(m) for m in image_shape.split(",")]

    assert args.model in model_list, "{} is not in lists: {}".format(
        args.model, model_list)

    if args.use_gpu:
        places = paddle.static.cuda_places()
    else:
        places = paddle.static.cpu_places()
    place = places[0]
    exe = paddle.static.Executor(place)

    image = paddle.static.data(
        name='image', shape=[None] + image_shape, dtype='float32')
    label = paddle.static.data(name='label', shape=[None, 1], dtype='int64')

    batch_size_per_card = args.batch_size
    batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset,
        batch_size=batch_size_per_card,
        shuffle=shuffle,
        drop_last=True)
    train_loader = paddle.io.DataLoader(
        train_dataset,
        places=place,
        batch_sampler=batch_sampler,
        feed_list=[image, label],
        return_list=False,
        use_shared_memory=True,
        num_workers=args.num_workers)
    valid_loader = paddle.io.DataLoader(
        val_dataset,
        places=place,
        feed_list=[image, label],
        drop_last=False,
        return_list=False,
        use_shared_memory=True,
        batch_size=args.batch_size_for_validation,
        shuffle=False)

    step_per_epoch = int(
        np.ceil(len(train_dataset) * 1. / args.batch_size / num_trainers))

    # model definition
    model = models.__dict__[args.model]()
    out = model.net(input=image, class_dim=class_dim)
    if args.data == 'cifar10':
        label = paddle.reshape(label, [-1, 1])
    cost = paddle.nn.functional.loss.cross_entropy(input=out, label=label)
    avg_cost = paddle.mean(x=cost)
    acc_top1 = paddle.metric.accuracy(input=out, label=label, k=1)
    acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5)

    val_program = paddle.static.default_main_program().clone(for_test=True)

    opt, learning_rate = create_optimizer(args, step_per_epoch)

    # Fleet step 2: distributed strategy
    if use_data_parallel:
        dist_strategy = DistributedStrategy()
        dist_strategy.sync_batch_norm = False
        dist_strategy.exec_strategy = paddle.static.ExecutionStrategy()
        dist_strategy.fuse_all_reduce_ops = False

    train_program = paddle.static.default_main_program()

    if args.pruning_strategy == 'gmp':
        # GMP pruner step 0: define configs for GMP, no need to define
        # configs for the base training.
        configs = {
            'stable_iterations': args.stable_epochs * step_per_epoch,
            'pruning_iterations': args.pruning_epochs * step_per_epoch,
            'tunning_iterations': args.tunning_epochs * step_per_epoch,
            'resume_iteration': (args.last_epoch + 1) * step_per_epoch,
            'pruning_steps': args.pruning_steps,
            'initial_ratio': args.initial_ratio,
        }
    elif args.pruning_strategy == 'base':
        configs = None

    # GMP pruner step 1: initialize a pruner object by calling entry function.
    pruner = create_unstructured_pruner(
        train_program, args, place, configs=configs)

    if use_data_parallel:
        # Fleet step 3: decorate the original optimizer and minimize it
        opt = fleet.distributed_optimizer(opt, strategy=dist_strategy)
    opt.minimize(avg_cost, no_grad_set=pruner.no_grad_set)

    exe.run(paddle.static.default_startup_program())

    if args.last_epoch > -1:
        assert args.checkpoint is not None and os.path.exists(
            args.checkpoint), "Please specify a valid checkpoint path."
        paddle.fluid.io.load_persistables(
            executor=exe, dirname=args.checkpoint, main_program=train_program)

    elif args.pretrained_model:
        assert os.path.exists(
            args.pretrained_model
        ), "Pretrained model path {} doesn't exist".format(
            args.pretrained_model)

        def if_exist(var):
            return os.path.exists(
                os.path.join(args.pretrained_model, var.name))

        _logger.info("Load pretrained model from {}".format(
            args.pretrained_model))
        # NOTE: We are using fluid.io.load_vars() because the pretrained model
        # is from an older version which requires this API.
        # Please consider using paddle.static.load(program, model_path) when possible
        paddle.fluid.io.load_vars(
            exe, args.pretrained_model, predicate=if_exist)

    def test(epoch, program):
        acc_top1_ns = []
        acc_top5_ns = []

        _logger.info(
            "The current sparsity of the inference model is {}%".format(
                round(100 * UnstructuredPruner.total_sparse(
                    paddle.static.default_main_program()), 2)))
        for batch_id, data in enumerate(valid_loader):
            start_time = time.time()
            acc_top1_n, acc_top5_n = exe.run(
                program, feed=data, fetch_list=[acc_top1.name, acc_top5.name])
            end_time = time.time()
            if batch_id % args.log_period == 0:
                _logger.info(
                    "Eval epoch[{}] batch[{}] - acc_top1: {}; acc_top5: {}; time: {}".
                    format(epoch, batch_id,
                           np.mean(acc_top1_n),
                           np.mean(acc_top5_n), end_time - start_time))
            acc_top1_ns.append(np.mean(acc_top1_n))
            acc_top5_ns.append(np.mean(acc_top5_n))

        _logger.info(
            "Final eval epoch[{}] - acc_top1: {}; acc_top5: {}".format(
                epoch,
                np.mean(np.array(acc_top1_ns)),
                np.mean(np.array(acc_top5_ns))))

    def train(epoch, program):
        train_reader_cost = 0.0
        train_run_cost = 0.0
        total_samples = 0
        reader_start = time.time()
        for batch_id, data in enumerate(train_loader):
            train_reader_cost += time.time() - reader_start
            train_start = time.time()
            loss_n, acc_top1_n, acc_top5_n = exe.run(
                program,
                feed=data,
                fetch_list=[avg_cost.name, acc_top1.name, acc_top5.name])
            # GMP pruner step 2: step() to update ratios and other internal
            # states of the pruner.
            pruner.step()
            train_run_cost += time.time() - train_start
            total_samples += args.batch_size
            loss_n = np.mean(loss_n)
            acc_top1_n = np.mean(acc_top1_n)
            acc_top5_n = np.mean(acc_top5_n)
            if batch_id % args.log_period == 0:
                _logger.info(
                    "epoch[{}]-batch[{}] lr: {:.6f} - loss: {}; acc_top1: {}; acc_top5: {}; avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec".
                    format(epoch, batch_id,
                           learning_rate.get_lr(), loss_n, acc_top1_n,
                           acc_top5_n, train_reader_cost / args.log_period,
                           (train_reader_cost + train_run_cost) /
                           args.log_period, total_samples / args.log_period,
                           total_samples /
                           (train_reader_cost + train_run_cost)))
                train_reader_cost = 0.0
                train_run_cost = 0.0
                total_samples = 0
            learning_rate.step()
            reader_start = time.time()

    if use_data_parallel:
        # Fleet step 4: get the compiled program from fleet
        compiled_train_program = fleet.main_program
    else:
        compiled_train_program = paddle.static.CompiledProgram(
            paddle.static.default_main_program())

    for i in range(args.last_epoch + 1, args.num_epochs):
        train(i, compiled_train_program)
        # GMP pruner step 3: update params before summarizing sparsity,
        # saving model or evaluation.
        pruner.update_params()

        _logger.info("The current sparsity of the pruned model is: {}%".format(
            round(100 * UnstructuredPruner.total_sparse(
                paddle.static.default_main_program()), 2)))

        if (i + 1) % args.test_period == 0:
            test(i, val_program)
        if (i + 1) % args.model_period == 0:
            if use_data_parallel:
                fleet.save_persistables(executor=exe, dirname=args.model_path)
            else:
                paddle.fluid.io.save_persistables(
                    executor=exe, dirname=args.model_path)
# NOTE: `x` is not defined in the original fragment; a 1x2 float input
# matching `x_data` below is an assumption.
x = fluid.data(name='x', shape=[-1, 2], dtype='float32')
label = fluid.data(name='label', shape=[-1, 1], dtype='float32')

y = fluid.layers.fc(x, size=1, param_attr=fluid.initializer.Constant(1.0))
fluid.layers.Print(y)

# testing code
v1 = fluid.layers.collective._c_allgather(
    y, fleet.worker_num(), use_calc_stream=True)
v2 = fluid.layers.collective._c_allreduce(y, use_calc_stream=True)
fluid.layers.Print(v1)
fluid.layers.Print(v2)
# end of testing code

cost = fluid.layers.square_error_cost(y, label)
loss = fluid.layers.reduce_sum(cost)

optimizer = fluid.optimizer.SGD(learning_rate=0.0)
strategy = DistributedStrategy()
strategy.mode = "collective"
strategy.collective_mode = "grad_allreduce"
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)  # new line 5
optimizer.minimize(loss, fluid.default_startup_program())

#place = fluid.CUDAPlace(0)  # to be commented line 1
place = fluid.CUDAPlace(int(os.environ['FLAGS_selected_gpus']))  # uncomment line 1
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

#train_prog = fluid.default_main_program()  # to be commented line 2
train_prog = fleet.main_program  # uncomment line 2

x_data = np.ones(shape=[1, 2], dtype=np.float32)
label_data = np.ones(shape=[1, 1], dtype=np.float32)
# the original fragment is truncated after `out = exe.run(train_prog,`;
# feeding the two arrays above and fetching the loss is an assumed completion
out = exe.run(train_prog,
              feed={'x': x_data, 'label': label_data},
              fetch_list=[loss.name])
def train(args):
    # parameters from arguments
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    model_save_dir = args.model_save_dir
    use_mixup = args.use_mixup
    use_ngraph = os.getenv('FLAGS_use_ngraph')

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = args.num_threads
    exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

    dist_strategy = DistributedStrategy()
    dist_strategy.exec_strategy = exec_strategy
    dist_strategy.enable_inplace = args.with_inplace
    if args.fuse:
        dist_strategy.fuse_all_reduce_ops = 1
    dist_strategy.nccl_comm_num = args.nccl_comm_num

    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)
    # NOTE: num_trainers and trainer_id are used below but were not defined in
    # the original fragment; deriving them from fleet is an assumption.
    num_trainers = fleet.worker_num()
    trainer_id = fleet.worker_index()

    b_out = build_program(
        is_train=True,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args,
        dist_strategy=dist_strategy)
    if use_mixup:
        train_py_reader, train_cost, global_lr = b_out[0], b_out[1], b_out[2]
        train_fetch_vars = [train_cost, global_lr]
        train_fetch_list = []
        for var in train_fetch_vars:
            var.persistable = True
            train_fetch_list.append(var.name)
    else:
        train_py_reader, train_cost, train_acc1, train_acc5, global_lr = \
            b_out[0], b_out[1], b_out[2], b_out[3], b_out[4]
        train_fetch_vars = [train_cost, train_acc1, train_acc5, global_lr]
        train_fetch_list = []
        for var in train_fetch_vars:
            var.persistable = True
            train_fetch_list.append(var.name)

    train_prog = fleet.main_program

    b_out_test = build_program(
        is_train=False,
        main_prog=test_prog,
        startup_prog=startup_prog,
        args=args,
        dist_strategy=dist_strategy)
    test_py_reader, test_cost, test_acc1, test_acc5 = \
        b_out_test[0], b_out_test[1], b_out_test[2], b_out_test[3]
    test_prog = test_prog.clone(for_test=True)
    test_prog = compiler.CompiledProgram(test_prog).with_data_parallel(
        loss_name=test_cost.name, exec_strategy=exec_strategy)

    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if checkpoint is not None:
        fluid.io.load_persistables(exe, checkpoint, main_program=train_prog)

    if pretrained_model:
        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(
            exe, pretrained_model, main_program=train_prog,
            predicate=if_exist)

    if args.use_gpu:
        device_num = get_device_num()
    else:
        device_num = 1

    train_batch_size = args.batch_size
    print("train_batch_size: %d device_num:%d" %
          (train_batch_size, device_num))

    test_batch_size = 16
    # NOTE: the order of batch data generated by batch_reader
    # must be the same in the respective processes.
shuffle_seed = 1 if num_trainers > 1 else None
train_reader = reader.train(settings=args, data_dir=args.data_dir,
                            pass_id_as_seed=shuffle_seed)
test_reader = reader.val(settings=args, data_dir=args.data_dir)
train_py_reader.decorate_paddle_reader(
    paddle.batch(train_reader, batch_size=train_batch_size))
test_py_reader.decorate_paddle_reader(
    paddle.batch(test_reader, batch_size=test_batch_size))

test_fetch_vars = [test_cost, test_acc1, test_acc5]
test_fetch_list = []
for var in test_fetch_vars:
    var.persistable = True
    test_fetch_list.append(var.name)

train_exe = exe
params = models.__dict__[args.model]().params
for pass_id in range(params["num_epochs"]):
    train_py_reader.start()
    train_info = [[], [], []]
    test_info = [[], [], []]
    train_time = []
    train_begin = time.time()
    batch_id = 0
    time_record = []
    try:
        while True:
            t1 = time.time()
            if use_mixup:
                loss, lr = train_exe.run(train_prog,
                                         fetch_list=train_fetch_list)
            else:
                loss, acc1, acc5, lr = train_exe.run(
                    train_prog, fetch_list=train_fetch_list)
                acc1 = np.mean(np.array(acc1))
                acc5 = np.mean(np.array(acc5))
                train_info[1].append(acc1)
                train_info[2].append(acc5)
            t2 = time.time()
            period = t2 - t1
            time_record.append(period)

            loss = np.mean(np.array(loss))
            train_info[0].append(loss)
            lr = np.mean(np.array(lr))
            train_time.append(period)

            if batch_id % 10 == 0:
                period = np.mean(time_record)
                speed = args.batch_size * 1.0 / period
                time_record = []
                if use_mixup:
                    print("Pass {0}, trainbatch {1}, loss {2}, lr {3}, "
                          "time {4}, speed {5}"
                          .format(pass_id, batch_id, "%.5f" % loss,
                                  "%.5f" % lr, "%2.2f sec" % period,
                                  "%.2f" % speed))
                else:
                    print("Pass {0}, trainbatch {1}, loss {2}, "
                          "acc1 {3}, acc5 {4}, lr {5}, time {6}, speed {7}"
                          .format(pass_id, batch_id, "%.5f" % loss,
                                  "%.5f" % acc1, "%.5f" % acc5, "%.5f" % lr,
                                  "%2.2f sec" % period, "%.2f" % speed))
                sys.stdout.flush()
            batch_id += 1
    except fluid.core.EOFException:
        train_py_reader.reset()

    train_loss = np.array(train_info[0]).mean()
    if not use_mixup:
        train_acc1 = np.array(train_info[1]).mean()
        train_acc5 = np.array(train_info[2]).mean()
    train_end = time.time()
    train_speed = (batch_id * train_batch_size) / (train_end - train_begin)

    # test only runs in the last epoch
    if (pass_id + 1) == params["num_epochs"]:
        test_py_reader.start()
        test_batch_id = 0
        try:
            while True:
                t1 = time.time()
                loss, acc1, acc5 = exe.run(program=test_prog,
                                           fetch_list=test_fetch_list)
                t2 = time.time()
                period = t2 - t1
                loss = np.mean(loss)
                acc1 = np.mean(acc1)
                acc5 = np.mean(acc5)
                test_info[0].append(loss)
                test_info[1].append(acc1)
                test_info[2].append(acc5)
                if test_batch_id % 10 == 0:
                    test_speed = test_batch_size * 1.0 / period
                    print("Pass {0}, testbatch {1}, loss {2}, "
                          "acc1 {3}, acc5 {4}, time {5}, speed {6}"
                          .format(pass_id, test_batch_id, "%.5f" % loss,
                                  "%.5f" % acc1, "%.5f" % acc5,
                                  "%2.2f sec" % period, "%.2f" % test_speed))
                    sys.stdout.flush()
                test_batch_id += 1
        except fluid.core.EOFException:
            test_py_reader.reset()
        test_loss = np.array(test_info[0]).mean()
        test_acc1 = np.array(test_info[1]).mean()
        test_acc5 = np.array(test_info[2]).mean()

    if trainer_id == 0:
        model_path = os.path.join(model_save_dir + '/' + model_name,
                                  str(pass_id))
        if not os.path.isdir(model_path):
            os.makedirs(model_path)
        fluid.io.save_persistables(exe, model_path,
                                   main_program=fleet._origin_program)

    if args.benchmark_test:
        # note: test_acc1/test_acc5 are only populated once the eval loop
        # above has run, i.e. in the final epoch
        if not os.path.isdir("./benchmark_logs/"):
            os.makedirs("./benchmark_logs/")
        with open("./benchmark_logs/log_%d" % trainer_id, 'w') as f:
            result = dict()
            result['0'] = dict()
            result['0']['acc1'] = str(test_acc1)
            result['0']['acc5'] = str(test_acc5)
            result['1'] = str(train_speed * num_trainers)
            print(result)
            f.writelines(json.dumps(result) + '\n')

    if use_mixup:
        print("End pass {0}, train_loss {1}, speed {2}".format(
            pass_id, "%.5f" % train_loss, "%.2f" % train_speed))
    else:
        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, "
              "speed {4}".format(pass_id, "%.5f" % train_loss,
                                 "%.5f" % train_acc1, "%.5f" % train_acc5,
                                 "%.2f" % train_speed))
    sys.stdout.flush()
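# A minimal sketch of how the benchmark logs written above could be read back
# for verification. It only assumes the layout produced by the loop above
# (one JSON object per line in ./benchmark_logs/log_<trainer_id>, with key '0'
# holding acc1/acc5 and key '1' the aggregate speed); the helper name
# collect_benchmark_logs is hypothetical, not part of the test suite.
import glob
import json

def collect_benchmark_logs(log_dir="./benchmark_logs"):
    results = []
    for path in sorted(glob.glob(log_dir + "/log_*")):
        with open(path) as f:
            for line in f:
                record = json.loads(line)
                results.append((path,
                                float(record['0']['acc1']),
                                float(record['0']['acc5']),
                                float(record['1'])))
    return results

if __name__ == "__main__":
    for path, acc1, acc5, speed in collect_benchmark_logs():
        print("%s: acc1=%.5f acc5=%.5f speed=%.2f" % (path, acc1, acc5, speed))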
def run_gpu_fleet_api_trainer(self, args):
    assert args.update_method == "nccl2"
    self.lr = args.lr

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = 1

    dist_strategy = DistributedStrategy()
    dist_strategy.exec_strategy = exec_strategy
    dist_strategy.fuse_memory_size = 1  # MB
    dist_strategy.fuse_laryer_size = 1
    if args.use_local_sgd:
        dist_strategy.use_local_sgd = True
    if args.ut4grad_allreduce:
        dist_strategy._ut4grad_allreduce = True
    if args.sync_batch_norm:
        dist_strategy.sync_batch_norm = True

    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)
    print_to_err("gpu_fleet", "fleet.node_num:")
    # "fleet.node_id:", fleet.node_id(),
    # "fleet.trainer_num:", fleet.worker_num())

    test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
        self.get_model(batch_size=args.batch_size, dist_strategy=dist_strategy)

    trainer_prog = fleet._origin_program
    dist_prog = fleet.main_program

    device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
    place = fluid.CUDAPlace(device_id)

    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    eprint(type(self).__name__, "run worker startup program done.")

    feed_var_list = [
        var for var in trainer_prog.global_block().vars.values()
        if var.is_data
    ]
    eprint("feed_var_list:", feed_var_list)

    # tmp add this code to pass python35 gcc8 CI
    # Fixme(gongweibao, wangxi), need fix fleet api program order
    if feed_var_list[0].name == 'label':
        feed_var_list = feed_var_list[::-1]

    feeder = fluid.DataFeeder(feed_var_list, place)
    reader_generator = train_reader()

    def get_data():
        origin_batch = next(reader_generator)
        if args.update_method != "local" and args.use_reader_alloc:
            new_batch = []
            for offset, item in enumerate(origin_batch):
                if offset % 2 == args.trainer_id:
                    new_batch.append(item)
            return new_batch
        else:
            return origin_batch

    print_to_err(type(self).__name__, "begin to train on trainer")
    out_losses = []
    for i in six.moves.xrange(RUN_STEP):
        loss, = exe.run(dist_prog,
                        fetch_list=[avg_cost.name],
                        feed=feeder.feed(get_data()))
        out_losses.append(loss[0])
        print_to_err(type(self).__name__, "run step %d finished" % i)
    print_to_err(type(self).__name__, "trainer run finished")

    if six.PY2:
        print(pickle.dumps(out_losses))
    else:
        sys.stdout.buffer.write(pickle.dumps(out_losses))

    if args.save_model:
        model_save_dir = "/tmp"
        if fleet.worker_index() == 0:
            model_save_dir_fluid = os.path.join(model_save_dir,
                                                "fluid_persistables")
            model_save_dir_fleet = os.path.join(model_save_dir,
                                                "fleet_persistables")
            infer_save_dir_fluid = os.path.join(model_save_dir, "fluid_infer")
            infer_save_dir_fleet = os.path.join(model_save_dir, "fleet_infer")
        else:
            model_save_dir_fluid = os.path.join(model_save_dir,
                                                "fluid_persistables_2")
            model_save_dir_fleet = os.path.join(model_save_dir,
                                                "fleet_persistables_2")
            infer_save_dir_fluid = os.path.join(model_save_dir,
                                                "fluid_infer_2")
            infer_save_dir_fleet = os.path.join(model_save_dir,
                                                "fleet_infer_2")
        fluid.io.save_persistables(exe, model_save_dir_fluid,
                                   fleet._origin_program)
        fleet.save_persistables(executor=exe, dirname=model_save_dir_fleet)
        feeded_var_names = [var.name for var in feed_var_list]
        fluid.io.save_inference_model(infer_save_dir_fluid, feeded_var_names,
                                      [avg_cost], exe, fleet._origin_program)
        fleet.save_inference_model(exe, infer_save_dir_fleet,
                                   feeded_var_names, [avg_cost])
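# For reference, a sketch of the argument namespace run_gpu_fleet_api_trainer
# reads; the field names come from the function body above, the values are
# illustrative assumptions, and a real launch still needs the NCCL2
# environment variables (PADDLE_TRAINER_ID, PADDLE_TRAINER_ENDPOINTS, ...)
# that PaddleCloudRoleMaker consumes.
import argparse

trainer_args = argparse.Namespace(
    update_method="nccl2",    # asserted at the top of the trainer
    lr=0.001,
    batch_size=2,
    use_local_sgd=False,
    ut4grad_allreduce=False,
    sync_batch_norm=False,
    use_reader_alloc=False,
    trainer_id=0,
    save_model=False)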
def net(self):
    args = self.p_args()
    bert_config = BertConfig("uncased_L-24_H-1024_A-16/bert_config.json")
    bert_config.print_config()
    place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
    dev_count = 1
    if args.do_train:
        my_dist_env = dist_env()
        worker_endpoints_env = my_dist_env["trainer_endpoints"]
        worker_endpoints = worker_endpoints_env.split(",")
        current_endpoint = my_dist_env["current_endpoint"]
        trainer_id = worker_endpoints.index(current_endpoint)
        # new rolemaker here
        print("current_id: ", trainer_id)
        print("worker_endpoints: ", worker_endpoints)
        role = role_maker.UserDefinedCollectiveRoleMaker(
            current_id=trainer_id, worker_endpoints=worker_endpoints)
        # Fleet get role of each worker
        fleet.init(role)
    exe = fluid.Executor(place)

    # init program
    train_program = fluid.Program()
    startup_prog = fluid.Program()

    if args.random_seed != 0:
        print("set program random seed as: ", args.random_seed)
        startup_prog.random_seed = args.random_seed
        train_program.random_seed = args.random_seed

    task_name = args.task_name.lower()
    processors = {
        'xnli': reader.XnliProcessor,
        'cola': reader.ColaProcessor,
        'mrpc': reader.MrpcProcessor,
        'mnli': reader.MnliProcessor,
    }
    processor = processors[task_name](data_dir=args.data_dir,
                                      vocab_path=args.vocab_path,
                                      max_seq_len=args.max_seq_len,
                                      do_lower_case=args.do_lower_case,
                                      in_tokens=args.in_tokens,
                                      random_seed=args.random_seed)
    num_labels = len(processor.get_labels())

    dev_count = len(worker_endpoints)
    # we need to keep every trainer of fleet the same shuffle_seed
    print("shuffle_seed: ", args.shuffle_seed)
    self.train_data_generator = processor.data_generator(
        batch_size=args.batch_size,
        phase='train',
        epoch=args.epoch,
        dev_count=dev_count,
        dev_idx=0,
        shuffle=args.shuffle,
        shuffle_seed=args.shuffle_seed)

    num_train_examples = processor.get_num_examples(phase='train')

    max_train_steps = 5
    self.warmup_steps = int(5 * 0.1)

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.use_experimental_executor = args.use_fast_executor
    exec_strategy.num_threads = dev_count
    exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

    dist_strategy = DistributedStrategy()
    dist_strategy.exec_strategy = exec_strategy
    dist_strategy.nccl_comm_num = 3
    dist_strategy.use_hierarchical_allreduce = True
    #dist_strategy.mode = "collective"
    #dist_strategy.collective_mode = "grad_allreduce"

    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            self.train_pyreader, self.loss, probs, accuracy, num_seqs, checkpoints = create_model(
                args, bert_config=bert_config, num_labels=num_labels)
            scheduled_lr = optimization(loss=self.loss,
                                        warmup_steps=self.warmup_steps,
                                        num_train_steps=max_train_steps,
                                        learning_rate=args.learning_rate,
                                        train_program=train_program,
                                        startup_prog=startup_prog,
                                        weight_decay=args.weight_decay,
                                        scheduler=args.lr_scheduler,
                                        use_fp16=False,
                                        loss_scaling=args.loss_scaling,
                                        dist_strategy=dist_strategy)

    exe.run(startup_prog)
    with open("__model__", "wb") as f:
        f.write(fleet._origin_program.desc.serialize_to_string())
    with open("debug_program", "w") as f:
        f.write(str(fleet._origin_program))
    return self.loss
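# The dist_env() helper above is assumed to return a dict with at least
# "trainer_endpoints" (a comma-separated string) and "current_endpoint"; a
# minimal stand-in showing how trainer_id and dev_count are derived from it
# (the endpoint values here are hypothetical):
def fake_dist_env():
    return {
        "trainer_endpoints": "127.0.0.1:6170,127.0.0.1:6171",
        "current_endpoint": "127.0.0.1:6171",
    }

env = fake_dist_env()
endpoints = env["trainer_endpoints"].split(",")
trainer_id = endpoints.index(env["current_endpoint"])  # -> 1
dev_count = len(endpoints)                             # -> 2
print(trainer_id, dev_count)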
def train(self):
    self._check()
    self.has_run_train = True

    trainer_id = self.trainer_id
    num_trainers = self.num_trainers

    strategy = None
    if num_trainers > 1:
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        strategy = DistributedStrategy()
        strategy.mode = "collective"
        strategy.collective_mode = "grad_allreduce"

    emb, loss, acc1, acc5, optimizer = self.build_program(
        True, False, dist_strategy=strategy)

    global_lr = optimizer._global_learning_rate(program=self.train_program)

    if num_trainers > 1:
        origin_prog = fleet._origin_program
        train_prog = fleet.main_program
    else:
        origin_prog = self.train_program
        train_prog = self.train_program

    gpu_id = int(os.getenv("FLAGS_selected_gpus", 0))
    place = fluid.CUDAPlace(gpu_id)
    exe = fluid.Executor(place)
    exe.run(self.startup_program)

    if self.checkpoint_dir:
        load_checkpoint = True
    else:
        load_checkpoint = False
    if load_checkpoint:
        self.load_checkpoint(executor=exe, main_program=origin_prog)

    if self.train_reader is None:
        train_reader = paddle.batch(
            reader.arc_train(self.dataset_dir, self.num_classes),
            batch_size=self.train_batch_size)
    else:
        train_reader = self.train_reader

    feeder = fluid.DataFeeder(place=place,
                              feed_list=['image', 'label'],
                              program=origin_prog)

    if self.calc_train_acc:
        fetch_list = [loss.name, global_lr.name, acc1.name, acc5.name]
    else:
        fetch_list = [loss.name, global_lr.name]

    local_time = 0.0
    nsamples = 0
    inspect_steps = self.log_period
    global_batch_size = self.global_train_batch_size
    for pass_id in range(self.train_epochs):
        self.train_pass_id = pass_id
        train_info = [[], [], [], []]
        local_train_info = [[], [], [], []]
        for batch_id, data in enumerate(train_reader()):
            nsamples += global_batch_size
            t1 = time.time()
            acc1 = None
            acc5 = None
            if self.calc_train_acc:
                loss, lr, acc1, acc5 = exe.run(train_prog,
                                               feed=feeder.feed(data),
                                               fetch_list=fetch_list,
                                               use_program_cache=True)
            else:
                loss, lr = exe.run(train_prog,
                                   feed=feeder.feed(data),
                                   fetch_list=fetch_list,
                                   use_program_cache=True)
            t2 = time.time()
            period = t2 - t1
            local_time += period
            train_info[0].append(np.array(loss)[0])
            train_info[1].append(np.array(lr)[0])
            local_train_info[0].append(np.array(loss)[0])
            local_train_info[1].append(np.array(lr)[0])
            if batch_id % inspect_steps == 0:
                avg_loss = np.mean(local_train_info[0])
                avg_lr = np.mean(local_train_info[1])
                speed = nsamples / local_time
                if self.calc_train_acc:
                    logger.info(
                        "Pass:{} batch:{} lr:{:.8f} loss:{:.6f} "
                        "qps:{:.2f} acc1:{:.6f} acc5:{:.6f}".format(
                            pass_id, batch_id, avg_lr, avg_loss,
                            speed, acc1[0], acc5[0]))
                else:
                    logger.info("Pass:{} batch:{} lr:{:.8f} loss:{:.6f} "
                                "qps:{:.2f}".format(pass_id, batch_id,
                                                    avg_lr, avg_loss, speed))
                local_time = 0
                nsamples = 0
                local_train_info = [[], [], [], []]

        train_loss = np.array(train_info[0]).mean()
        logger.info("End pass {}, train_loss {:.6f}".format(
            pass_id, train_loss))
        sys.stdout.flush()

        if self.with_test:
            self.test()

        # save model
        if self.model_save_dir:
            model_save_dir = os.path.join(self.model_save_dir, str(pass_id))
            if not os.path.exists(model_save_dir):
                # more than one process may be trying
                # to create the directory
                try:
                    os.makedirs(model_save_dir)
                except OSError as exc:
                    if exc.errno != errno.EEXIST:
                        raise
            if trainer_id == 0:
                fluid.io.save_persistables(exe, model_save_dir, origin_prog)
            else:
                def save_var(var):
                    to_save = "dist@" in var.name and '@rank@' in var.name
                    return to_save and var.persistable

                fluid.io.save_vars(exe, model_save_dir, origin_prog,
                                   predicate=save_var)

        # save training info
        if self.model_save_dir and trainer_id == 0:
            config_file = os.path.join(self.model_save_dir, str(pass_id),
                                       'meta.json')
            train_info = dict()
            train_info["pretrain_nranks"] = self.num_trainers
            train_info["emb_dim"] = self.emb_dim
            train_info['num_classes'] = self.num_classes
            with open(config_file, 'w') as f:
                json.dump(train_info, f)

    # upload model
    if self.model_save_dir and self.fs_name and trainer_id == 0:
        self.put_files_to_hdfs(self.model_save_dir)
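# Sketch of how the meta.json written above can be loaded when resuming from
# or fine-tuning on a saved checkpoint; the path layout and keys match the
# save code above, while the helper name load_train_meta is an assumption.
import json
import os

def load_train_meta(model_save_dir, pass_id):
    config_file = os.path.join(model_save_dir, str(pass_id), 'meta.json')
    with open(config_file) as f:
        meta = json.load(f)
    # keys written by train(): pretrain_nranks, emb_dim, num_classes
    return meta["pretrain_nranks"], meta["emb_dim"], meta["num_classes"]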
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)

# PE training
main_prog = fluid.Program()
start_prog = fluid.Program()
with fluid.program_guard(main_prog, start_prog):
    x = fluid.data(name='x', shape=[-1, 2], dtype='float32')
    label = fluid.data(name='label', shape=[-1, 1], dtype='float32')
    y = fluid.layers.fc(x, size=1,
                        param_attr=fluid.initializer.Constant(1.0))
    cost = fluid.layers.square_error_cost(y, label)
    loss = fluid.layers.reduce_sum(cost)

    optimizer = fluid.optimizer.SGD(learning_rate=0.0)
    strategy = DistributedStrategy()
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(loss, start_prog)

place = fluid.CUDAPlace(int(os.environ['FLAGS_selected_gpus']))
exe = fluid.Executor(place)
exe.run(start_prog)

train_prog = fleet.main_program
x_data = np.ones(shape=[1, 2], dtype=np.float32)
label_data = np.ones(shape=[1, 1], dtype=np.float32)
out = exe.run(train_prog,
              feed={'x': x_data, 'label': label_data},
              fetch_list=[loss.name])  # assumed completion; the original snippet is truncated here
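# A quick numpy check of what the run above should fetch: the fc weight is
# initialized to 1.0 by the Constant initializer (the bias is assumed to be
# zero-initialized, fluid's default), the learning rate is 0.0 so nothing
# moves, and with x = [[1, 1]] and label = [[1]]:
import numpy as np

x = np.ones((1, 2), dtype=np.float32)
w = np.ones((2, 1), dtype=np.float32)  # Constant(1.0) applies to the weight
y = x.dot(w)                           # [[2.]] plus a zero bias
loss = np.sum((y - 1.0) ** 2)          # square_error_cost, then reduce_sum
print(loss)                            # -> 1.0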
def net(self, args=None):
    """
    BERT net struct.

    Args:
        args (ArgumentParser): run args to config dist fleet.
    Returns:
        Variable: the training loss (self.loss).
    """
    args = p_args()
    bert_config = BertConfig(DATA_DIR +
                             "uncased_L-24_H-1024_A-16/bert_config.json")
    bert_config.print_config()
    place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
    exe = fluid.Executor(place)

    # init program
    train_program = fluid.Program()
    startup_prog = fluid.Program()

    if args.random_seed != 0:
        print("set program random seed as: ", args.random_seed)
        startup_prog.random_seed = args.random_seed
        train_program.random_seed = args.random_seed

    task_name = args.task_name.lower()
    processors = {
        'xnli': reader.XnliProcessor,
        'cola': reader.ColaProcessor,
        'mrpc': reader.MrpcProcessor,
        'mnli': reader.MnliProcessor,
    }
    processor = processors[task_name](data_dir=args.data_dir,
                                      vocab_path=args.vocab_path,
                                      max_seq_len=args.max_seq_len,
                                      do_lower_case=args.do_lower_case,
                                      in_tokens=args.in_tokens,
                                      random_seed=args.random_seed)
    num_labels = len(processor.get_labels())
    dev_count = 1
    self.train_data_generator = processor.data_generator(
        batch_size=args.batch_size,
        phase='train',
        epoch=args.epoch,
        dev_count=dev_count,
        dev_idx=0,
        shuffle=args.shuffle,
        shuffle_seed=args.shuffle_seed)

    num_train_examples = processor.get_num_examples(phase='train')

    max_train_steps = 5
    self.warmup_steps = 0.5

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.use_experimental_executor = args.use_fast_executor
    exec_strategy.num_threads = dev_count
    exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

    dist_strategy = DistributedStrategy()
    args.run_params = json.loads(args.run_params)
    dist_strategy.enable_inplace = args.run_params['enable_inplace']
    dist_strategy.fuse_all_reduce_ops = args.run_params['fuse_all_reduce_ops']
    dist_strategy.nccl_comm_num = args.run_params['nccl_comm_num']
    dist_strategy.use_local_sgd = args.run_params['use_local_sgd']
    dist_strategy.mode = args.run_params["mode"]
    dist_strategy.collective_mode = args.run_params["collective"]
    dist_strategy.exec_strategy = exec_strategy
    dist_strategy.use_hierarchical_allreduce = False

    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            self.train_pyreader, self.loss, probs, accuracy, num_seqs, checkpoints = create_model(
                args, bert_config=bert_config, num_labels=num_labels)
            scheduled_lr = optimization(loss=self.loss,
                                        warmup_steps=self.warmup_steps,
                                        num_train_steps=max_train_steps,
                                        learning_rate=args.learning_rate,
                                        train_program=train_program,
                                        startup_prog=startup_prog,
                                        weight_decay=args.weight_decay,
                                        scheduler=args.lr_scheduler,
                                        use_fp16=False,
                                        loss_scaling=args.loss_scaling,
                                        dist_strategy=dist_strategy)

    exe.run(startup_prog)
    with open("__model__", "wb") as f:
        f.write(fleet._origin_program.desc.serialize_to_string())
    with open("debug_program", "w") as f:
        f.write(str(fleet._origin_program))
    return self.loss
def do_training(self, fleet, args):
    """
    begin training.

    Args:
        fleet (Collective): Collective inherited base class Fleet
        args (ArgumentParser): run args to config dist fleet.
    Returns:
        list: train losses
    """
    args = parse_args()
    logging.info(args)
    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 4))
    place = fluid.CUDAPlace(gpu_id)
    dev_count = 1
    exe = fluid.Executor(place)

    train_program = fluid.Program()
    startup_program = fluid.Program()
    args.num_trainers = fleet.worker_num()
    args.trainer_id = fleet.worker_index()
    args.run_params = json.loads(args.run_params)

    dist_strategy = DistributedStrategy()
    dist_strategy.enable_inplace = args.run_params['enable_inplace']
    dist_strategy.fuse_all_reduce_ops = args.run_params['fuse_all_reduce_ops']
    dist_strategy.nccl_comm_num = args.run_params['nccl_comm_num']
    dist_strategy.use_local_sgd = args.run_params['use_local_sgd']
    dist_strategy.mode = args.run_params["mode"]
    dist_strategy.collective_mode = args.run_params["collective"]

    with fluid.program_guard(train_program, startup_program):
        with fluid.unique_name.guard():
            sum_cost, avg_cost, predict, token_num, pyreader = transformer(
                ModelHyperParams.src_vocab_size,
                ModelHyperParams.trg_vocab_size,
                ModelHyperParams.max_length + 1,
                ModelHyperParams.n_layer,
                ModelHyperParams.n_head,
                ModelHyperParams.d_key,
                ModelHyperParams.d_value,
                ModelHyperParams.d_model,
                ModelHyperParams.d_inner_hid,
                ModelHyperParams.prepostprocess_dropout,
                ModelHyperParams.attention_dropout,
                ModelHyperParams.relu_dropout,
                ModelHyperParams.preprocess_cmd,
                ModelHyperParams.postprocess_cmd,
                ModelHyperParams.weight_sharing,
                TrainTaskConfig.label_smooth_eps,
                ModelHyperParams.bos_idx,
                use_py_reader=args.use_py_reader,
                is_test=False)
            optimizer = fluid.optimizer.SGD(0.003)
            if args.run_params["fp16"]:
                optimizer = decorate(optimizer, init_loss_scaling=64.0)
            optimizer = fleet.distributed_optimizer(optimizer,
                                                    strategy=dist_strategy)
            optimizer.minimize(avg_cost, startup_program)

    train_program = fleet.main_program
    exe.run(startup_program)
    train_data = prepare_data_generator(
        args,
        is_test=False,
        count=dev_count,
        pyreader=pyreader,
        py_reader_provider_wrapper=py_reader_provider_wrapper)

    loss_normalizer = -(
        (1. - TrainTaskConfig.label_smooth_eps) * np.log(
            (1. - TrainTaskConfig.label_smooth_eps)) +
        TrainTaskConfig.label_smooth_eps * np.log(
            TrainTaskConfig.label_smooth_eps /
            (ModelHyperParams.trg_vocab_size - 1) + 1e-20))

    step_idx = 0
    init_flag = True
    result_loss = []
    result_ppl = []
    train_info = []
    for pass_id in six.moves.xrange(args.num_epochs):
        pass_start_time = time.time()
        if args.use_py_reader:
            pyreader.start()
            data_generator = None
        else:
            data_generator = train_data()
        batch_id = 0
        while True:
            try:
                feed_dict_list = prepare_feed_dict_list(data_generator,
                                                        init_flag, dev_count)
                t1 = time.time()
                outs = exe.run(program=train_program,
                               fetch_list=[sum_cost.name, token_num.name]
                               if step_idx % args.fetch_steps == 0 else [],
                               feed=feed_dict_list)
                if step_idx % args.fetch_steps == 0:
                    sum_cost_val, token_num_val = np.array(outs[0]), np.array(
                        outs[1])
                    total_sum_cost = sum_cost_val.sum()
                    total_token_num = token_num_val.sum()
                    total_avg_cost = total_sum_cost / total_token_num
                    result_loss.append(total_avg_cost - loss_normalizer)
                    result_ppl.append(
                        np.exp([min(total_avg_cost, 100)]).item(0))
                    train_info.append(result_loss)
                init_flag = False
                batch_id += 1
                step_idx += 1
                if batch_id >= 5:
                    break
            except (StopIteration, fluid.core.EOFException):
                if args.use_py_reader:
                    pyreader.reset()
                break

    train_info = [round(i, 6) for i in train_info[0]]
    return train_info
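# The loss_normalizer above is the entropy of the label-smoothed target
# distribution, i.e. the minimum achievable per-token cross entropy, which is
# why it is subtracted from total_avg_cost before reporting. A small numpy
# check with illustrative stand-in values (eps and vocab size are assumptions):
import numpy as np

eps = 0.1           # stand-in for TrainTaskConfig.label_smooth_eps
vocab_size = 10000  # stand-in for ModelHyperParams.trg_vocab_size
loss_normalizer = -((1. - eps) * np.log(1. - eps) +
                    eps * np.log(eps / (vocab_size - 1) + 1e-20))
print(loss_normalizer)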