def net(self, args=None):
    """
    BERT net struct.

    Args:
        args (ArgumentParser): run args to config dist fleet.
    Returns:
        Variable: the training loss.
    """
    args = p_args()
    bert_config = BertConfig(DATA_DIR + "uncased_L-24_H-1024_A-16/bert_config.json")
    bert_config.print_config()
    place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
    exe = fluid.Executor(place)

    # init program
    train_program = fluid.Program()
    startup_prog = fluid.Program()

    if args.random_seed != 0:
        print("set program random seed as: ", args.random_seed)
        startup_prog.random_seed = args.random_seed
        train_program.random_seed = args.random_seed

    task_name = args.task_name.lower()
    processors = {
        'xnli': reader.XnliProcessor,
        'cola': reader.ColaProcessor,
        'mrpc': reader.MrpcProcessor,
        'mnli': reader.MnliProcessor,
    }
    processor = processors[task_name](data_dir=args.data_dir,
                                      vocab_path=args.vocab_path,
                                      max_seq_len=args.max_seq_len,
                                      do_lower_case=args.do_lower_case,
                                      in_tokens=args.in_tokens,
                                      random_seed=args.random_seed)
    num_labels = len(processor.get_labels())

    dev_count = 1
    self.train_data_generator = processor.data_generator(
        batch_size=args.batch_size,
        phase='train',
        epoch=args.epoch,
        dev_count=dev_count,
        dev_idx=0,
        shuffle=args.shuffle,
        shuffle_seed=args.shuffle_seed)

    num_train_examples = processor.get_num_examples(phase='train')
    max_train_steps = 5
    self.warmup_steps = 0.5

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.use_experimental_executor = args.use_fast_executor
    exec_strategy.num_threads = dev_count
    exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

    # configure the collective distributed strategy from the run params
    dist_strategy = DistributedStrategy()
    args.run_params = json.loads(args.run_params)
    dist_strategy.enable_inplace = args.run_params['enable_inplace']
    dist_strategy.fuse_all_reduce_ops = args.run_params['fuse_all_reduce_ops']
    dist_strategy.nccl_comm_num = args.run_params['nccl_comm_num']
    dist_strategy.use_local_sgd = args.run_params['use_local_sgd']
    dist_strategy.mode = args.run_params["mode"]
    dist_strategy.collective_mode = args.run_params["collective"]
    dist_strategy.exec_strategy = exec_strategy
    dist_strategy.use_hierarchical_allreduce = False

    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            self.train_pyreader, self.loss, probs, accuracy, num_seqs, checkpoints = create_model(
                args, bert_config=bert_config, num_labels=num_labels)
            # warmup/decay scheduling is handled inside optimization();
            # a sketch of the usual schedule follows this function
            scheduled_lr = optimization(loss=self.loss,
                                        warmup_steps=self.warmup_steps,
                                        num_train_steps=max_train_steps,
                                        learning_rate=args.learning_rate,
                                        train_program=train_program,
                                        startup_prog=startup_prog,
                                        weight_decay=args.weight_decay,
                                        scheduler=args.lr_scheduler,
                                        use_fp16=False,
                                        loss_scaling=args.loss_scaling,
                                        dist_strategy=dist_strategy)

    exe.run(startup_prog)
    with open("__model__", "wb") as f:
        f.write(fleet._origin_program.desc.serialize_to_string())
    with open("debug_program", "w") as f:
        f.write(str(fleet._origin_program))
    return self.loss
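# A minimal sketch, not the `optimization` helper called above (whose real
# implementation lives elsewhere in this repo), illustrating the BERT-style
# linear warmup followed by linear decay that `warmup_steps` and
# `num_train_steps` usually control. All names and values are illustrative.
def linear_warmup_decay_lr(step, base_lr, warmup_steps, num_train_steps):
    if warmup_steps and step < warmup_steps:
        # ramp up linearly from 0 to base_lr over the warmup phase
        return base_lr * step / warmup_steps
    # then decay linearly towards 0 over the remaining steps
    remaining = max(num_train_steps - step, 0)
    denom = max(num_train_steps - warmup_steps, 1)
    return base_lr * remaining / denom

# e.g. with base_lr=1e-4, warmup_steps=1, num_train_steps=5:
# steps 0..5 -> 0.0, 1e-4, 7.5e-5, 5e-5, 2.5e-5, 0.0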
def do_training(self, fleet, args):
    """
    Begin training.

    Args:
        fleet (Collective): a Collective instance, which inherits from the Fleet base class.
        args (ArgumentParser): run args to config dist fleet.
    Returns:
        list: the train losses.
    """
    args = parse_args()
    logging.info(args)
    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 4))
    place = fluid.CUDAPlace(gpu_id)
    dev_count = 1
    exe = fluid.Executor(place)

    train_program = fluid.Program()
    startup_program = fluid.Program()
    args.num_trainers = fleet.worker_num()
    args.trainer_id = fleet.worker_index()
    args.run_params = json.loads(args.run_params)

    dist_strategy = DistributedStrategy()
    dist_strategy.enable_inplace = args.run_params['enable_inplace']
    dist_strategy.fuse_all_reduce_ops = args.run_params['fuse_all_reduce_ops']
    dist_strategy.nccl_comm_num = args.run_params['nccl_comm_num']
    dist_strategy.use_local_sgd = args.run_params['use_local_sgd']
    dist_strategy.mode = args.run_params["mode"]
    dist_strategy.collective_mode = args.run_params["collective"]

    with fluid.program_guard(train_program, startup_program):
        with fluid.unique_name.guard():
            sum_cost, avg_cost, predict, token_num, pyreader = transformer(
                ModelHyperParams.src_vocab_size,
                ModelHyperParams.trg_vocab_size,
                ModelHyperParams.max_length + 1,
                ModelHyperParams.n_layer,
                ModelHyperParams.n_head,
                ModelHyperParams.d_key,
                ModelHyperParams.d_value,
                ModelHyperParams.d_model,
                ModelHyperParams.d_inner_hid,
                ModelHyperParams.prepostprocess_dropout,
                ModelHyperParams.attention_dropout,
                ModelHyperParams.relu_dropout,
                ModelHyperParams.preprocess_cmd,
                ModelHyperParams.postprocess_cmd,
                ModelHyperParams.weight_sharing,
                TrainTaskConfig.label_smooth_eps,
                ModelHyperParams.bos_idx,
                use_py_reader=args.use_py_reader,
                is_test=False)

            optimizer = fluid.optimizer.SGD(0.003)
            if args.run_params["fp16"]:
                optimizer = decorate(optimizer, init_loss_scaling=64.0)
            optimizer = fleet.distributed_optimizer(optimizer,
                                                    strategy=dist_strategy)
            optimizer.minimize(avg_cost, startup_program)

    train_program = fleet.main_program
    exe.run(startup_program)

    train_data = prepare_data_generator(
        args,
        is_test=False,
        count=dev_count,
        pyreader=pyreader,
        py_reader_provider_wrapper=py_reader_provider_wrapper)

    # loss floor of the label-smoothed target distribution; the reported loss
    # is relative to this value (a numpy sketch follows this function)
    loss_normalizer = -(
        (1. - TrainTaskConfig.label_smooth_eps) * np.log(
            (1. - TrainTaskConfig.label_smooth_eps)) +
        TrainTaskConfig.label_smooth_eps * np.log(
            TrainTaskConfig.label_smooth_eps /
            (ModelHyperParams.trg_vocab_size - 1) + 1e-20))

    step_idx = 0
    init_flag = True
    result_loss = []
    result_ppl = []
    train_info = []
    for pass_id in six.moves.xrange(args.num_epochs):
        pass_start_time = time.time()
        if args.use_py_reader:
            pyreader.start()
            data_generator = None
        else:
            data_generator = train_data()
        batch_id = 0
        while True:
            try:
                feed_dict_list = prepare_feed_dict_list(
                    data_generator, init_flag, dev_count)
                t1 = time.time()
                outs = exe.run(program=train_program,
                               fetch_list=[sum_cost.name, token_num.name]
                               if step_idx % args.fetch_steps == 0 else [],
                               feed=feed_dict_list)
                if step_idx % args.fetch_steps == 0:
                    sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
                    total_sum_cost = sum_cost_val.sum()
                    total_token_num = token_num_val.sum()
                    total_avg_cost = total_sum_cost / total_token_num
                    result_loss.append(total_avg_cost - loss_normalizer)
                    result_ppl.append(np.exp([min(total_avg_cost, 100)]).item(0))
                    train_info.append(result_loss)
                init_flag = False
                batch_id += 1
                step_idx += 1
                if batch_id >= 5:
                    break
            except (StopIteration, fluid.core.EOFException):
                if args.use_py_reader:
                    pyreader.reset()
                break

    train_info = [round(i, 6) for i in train_info[0]]
    return train_info
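# A minimal numpy sketch, not part of the test above, illustrating what
# `loss_normalizer` computes: the entropy of the label-smoothed target
# distribution, i.e. the loss floor a perfect model would still pay. The
# vocabulary size and epsilon below are illustrative values only.
import numpy as np

def smoothed_loss_floor(vocab_size, eps):
    p_true = 1.0 - eps                 # mass kept on the gold token
    p_other = eps / (vocab_size - 1)   # mass spread over every other token
    # entropy of the smoothed target; 1e-20 guards against log(0)
    return -(p_true * np.log(p_true) + eps * np.log(p_other + 1e-20))

print(smoothed_loss_floor(vocab_size=10000, eps=0.1))  # ~1.25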
def _make_program(self, mode):
    prog = self._progs.get(mode, None)
    if prog is not None:
        return

    prog = self._orig_prog.clone()
    # NOTE: When defining a learning rate scheduler in static graph mode, ops
    # that increase the global step var and calculate the learning rate are
    # prepended into _orig_prog. A test program made by `_orig_prog.clone()`
    # would also include these ops, so they must be pruned from the test
    # program, otherwise the global step would be changed during testing.
    if mode != 'train':
        for op in list(prog.global_block().ops):
            prog.global_block()._remove_op(0)
    if mode == 'train' and self.model._optimizer \
            and self.model._optimizer._learning_rate_map:
        # HACK: work around the learning rate map issue
        lr_var = self.model._optimizer._learning_rate_map[self._orig_prog]
        self.model._optimizer._learning_rate_map[prog] = lr_var

    losses = []
    metrics = []
    with fluid.program_guard(prog, self._startup_prog):
        ins = self.model._inputs
        lbls = self.model._labels if self.model._labels else []
        inputs = [k.forward() for k in to_list(ins)]
        labels = [k.forward() for k in to_list(lbls)]
        self._label_vars[mode] = labels
        outputs = to_list(self.model.forward(*inputs))

        if mode != 'test' and self.model._loss_function:
            losses = self.model._loss_function(outputs, labels)

        if self._nranks > 1 and mode != 'train':
            outputs = [_all_gather(o, self._nranks) for o in outputs]
            if mode != 'test':
                labels = [_all_gather(l, self._nranks) for l in labels]

        if mode != 'test':
            for metric in self.model._metrics:
                metrics.append(to_list(metric.add_metric_op(outputs, labels)))

        if mode == 'train' and self.model._optimizer:
            self._loss_endpoint = fluid.layers.sum(losses)
            if self._nranks > 1:
                role = role_maker.PaddleCloudRoleMaker(is_collective=True)
                fleet.init(role)
                dist_strategy = DistributedStrategy()
                dist_strategy.mode = "collective"
                dist_strategy.collective_mode = "grad_allreduce"
                self.model._optimizer = fleet.distributed_optimizer(
                    self.model._optimizer, strategy=dist_strategy)
            self.model._optimizer.minimize(self._loss_endpoint)

    if mode != 'train':
        # clone again to put the program in test mode
        prog = prog.clone(for_test=True)

    self._input_vars[mode] = inputs
    self._progs[mode] = prog
    self._endpoints[mode] = {
        "output": outputs,
        "loss": losses,
        "metric": metrics
    }
y = fluid.layers.fc(x, size=1, param_attr=fluid.initializer.Constant(1.0))
fluid.layers.Print(y)

# testing code
v1 = fluid.layers.collective._c_allgather(y,
                                          fleet.worker_num(),
                                          use_calc_stream=True)
v2 = fluid.layers.collective._c_allreduce(y, use_calc_stream=True)
fluid.layers.Print(v1)
fluid.layers.Print(v2)
# end of testing code

cost = fluid.layers.square_error_cost(y, label)
loss = fluid.layers.reduce_sum(cost)

optimizer = fluid.optimizer.SGD(learning_rate=0.0)
strategy = DistributedStrategy()
strategy.mode = "collective"
strategy.collective_mode = "grad_allreduce"
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)  # new line 5
optimizer.minimize(loss, fluid.default_startup_program())

# place = fluid.CUDAPlace(0)  # to be commented line 1
place = fluid.CUDAPlace(int(os.environ['FLAGS_selected_gpus']))  # uncomment line 1
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

# train_prog = fluid.default_main_program()  # to be commented line 2
train_prog = fleet.main_program  # uncomment line 2

x_data = np.ones(shape=[1, 2], dtype=np.float32)
label_data = np.ones(shape=[1, 1], dtype=np.float32)
out = exe.run(train_prog,
              feed={'x': x_data, 'label': label_data},
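# A minimal numpy sketch (assuming 2 workers, purely illustrative) of what the
# two collectives above produce: _c_allgather concatenates each worker's
# tensor along axis 0, while _c_allreduce sums the tensors element-wise. With
# the constant-initialized fc above and all-ones input, y is 2.0 on every
# worker (assuming a zero-initialized bias).
import numpy as np

worker_outputs = [np.array([[2.0]]), np.array([[2.0]])]  # y on worker 0 and 1

allgather_result = np.concatenate(worker_outputs, axis=0)  # shape [2, 1]
allreduce_result = np.sum(worker_outputs, axis=0)          # shape [1, 1]

print(allgather_result)  # [[2.], [2.]]
print(allreduce_result)  # [[4.]]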
def train(self):
    self._check()
    self.has_run_train = True

    trainer_id = self.trainer_id
    num_trainers = self.num_trainers

    strategy = None
    if num_trainers > 1:
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        strategy = DistributedStrategy()
        strategy.mode = "collective"
        strategy.collective_mode = "grad_allreduce"

    emb, loss, acc1, acc5, optimizer = self.build_program(
        True, False, dist_strategy=strategy)

    global_lr = optimizer._global_learning_rate(program=self.train_program)

    if num_trainers > 1:
        origin_prog = fleet._origin_program
        train_prog = fleet.main_program
    else:
        origin_prog = self.train_program
        train_prog = self.train_program

    gpu_id = int(os.getenv("FLAGS_selected_gpus", 0))
    place = fluid.CUDAPlace(gpu_id)
    exe = fluid.Executor(place)
    exe.run(self.startup_program)

    if self.checkpoint_dir:
        load_checkpoint = True
    else:
        load_checkpoint = False
    if load_checkpoint:
        self.load_checkpoint(executor=exe, main_program=origin_prog)

    if self.train_reader is None:
        train_reader = paddle.batch(
            reader.arc_train(self.dataset_dir, self.num_classes),
            batch_size=self.train_batch_size)
    else:
        train_reader = self.train_reader

    feeder = fluid.DataFeeder(place=place,
                              feed_list=['image', 'label'],
                              program=origin_prog)

    if self.calc_train_acc:
        fetch_list = [loss.name, global_lr.name, acc1.name, acc5.name]
    else:
        fetch_list = [loss.name, global_lr.name]

    local_time = 0.0
    nsamples = 0
    inspect_steps = self.log_period
    global_batch_size = self.global_train_batch_size
    for pass_id in range(self.train_epochs):
        self.train_pass_id = pass_id
        train_info = [[], [], [], []]
        local_train_info = [[], [], [], []]
        for batch_id, data in enumerate(train_reader()):
            nsamples += global_batch_size
            t1 = time.time()
            acc1 = None
            acc5 = None
            if self.calc_train_acc:
                loss, lr, acc1, acc5 = exe.run(train_prog,
                                               feed=feeder.feed(data),
                                               fetch_list=fetch_list,
                                               use_program_cache=True)
            else:
                loss, lr = exe.run(train_prog,
                                   feed=feeder.feed(data),
                                   fetch_list=fetch_list,
                                   use_program_cache=True)
            t2 = time.time()
            period = t2 - t1
            local_time += period
            train_info[0].append(np.array(loss)[0])
            train_info[1].append(np.array(lr)[0])
            local_train_info[0].append(np.array(loss)[0])
            local_train_info[1].append(np.array(lr)[0])
            if batch_id % inspect_steps == 0:
                avg_loss = np.mean(local_train_info[0])
                avg_lr = np.mean(local_train_info[1])
                speed = nsamples / local_time
                if self.calc_train_acc:
                    logger.info(
                        "Pass:{} batch:{} lr:{:.8f} loss:{:.6f} "
                        "qps:{:.2f} acc1:{:.6f} acc5:{:.6f}".format(
                            pass_id, batch_id, avg_lr, avg_loss, speed,
                            acc1[0], acc5[0]))
                else:
                    logger.info("Pass:{} batch:{} lr:{:.8f} loss:{:.6f} "
                                "qps:{:.2f}".format(pass_id, batch_id, avg_lr,
                                                    avg_loss, speed))
                local_time = 0
                nsamples = 0
                local_train_info = [[], [], [], []]

        train_loss = np.array(train_info[0]).mean()
        logger.info("End pass {}, train_loss {:.6f}".format(pass_id, train_loss))
        sys.stdout.flush()

        if self.with_test:
            self.test()

        # save model
        if self.model_save_dir:
            model_save_dir = os.path.join(self.model_save_dir, str(pass_id))
            if not os.path.exists(model_save_dir):
                # more than one process may be trying
                # to create the directory
                try:
                    os.makedirs(model_save_dir)
                except OSError as exc:
                    if exc.errno != errno.EEXIST:
                        raise
            if trainer_id == 0:
                fluid.io.save_persistables(exe, model_save_dir, origin_prog)
            else:
                # non-zero trainers only save their shard of the distributed
                # FC parameters (a sketch of this predicate follows the method)
                def save_var(var):
                    to_save = "dist@" in var.name and '@rank@' in var.name
                    return to_save and var.persistable

                fluid.io.save_vars(exe, model_save_dir, origin_prog,
                                   predicate=save_var)

        # save training info
        if self.model_save_dir and trainer_id == 0:
            config_file = os.path.join(self.model_save_dir, str(pass_id),
                                       'meta.json')
            train_info = dict()
            train_info["pretrain_nranks"] = self.num_trainers
            train_info["emb_dim"] = self.emb_dim
            train_info['num_classes'] = self.num_classes
            with open(config_file, 'w') as f:
                json.dump(train_info, f)

    # upload model
    if self.model_save_dir and self.fs_name and trainer_id == 0:
        self.put_files_to_hdfs(self.model_save_dir)
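# A minimal sketch (variable names are illustrative only, not the real shard
# naming scheme) of what the save_var predicate above selects on non-zero
# trainers: only persistable variables carrying both "dist@" and "@rank@" in
# their names are saved; everything else is left to trainer 0.
class _FakeVar(object):
    def __init__(self, name, persistable=True):
        self.name = name
        self.persistable = persistable

def save_var(var):
    to_save = "dist@" in var.name and '@rank@' in var.name
    return to_save and var.persistable

names = ["dist@fc@rank@00001.w_0", "conv1_weights", "fc0.b_0"]
print([n for n in names if save_var(_FakeVar(n))])
# -> ['dist@fc@rank@00001.w_0']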
def net(self, args=None):
    """
    ResNet struct.

    Args:
        args (ArgumentParser): run args to config dist fleet.
    Returns:
        tuple: the return value contains avg_cost, py_reader.
    """
    from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy
    from thirdparty.image_classfication.models.resnet import ResNet50
    from thirdparty.image_classfication.train import parser
    from thirdparty.image_classfication.train import optimizer_setting

    parser.add_argument('--update_method',
                        type=str,
                        required=True,
                        choices=['pserver', 'nccl'])
    parser.add_argument('--role',
                        type=str,
                        required=True,
                        choices=['pserver', 'trainer'])
    parser.add_argument('--endpoints', type=str, required=False, default="")
    parser.add_argument('--current_id', type=int, required=False, default=0)
    parser.add_argument('--trainers', type=int, required=False, default=1)
    # parser.add_argument('--sync_mode', action='store_true')
    parser.add_argument('--run_params', type=str, required=False, default='{}')
    args = parser.parse_args()
    args.run_params = json.loads(args.run_params)

    image_shape = [3, 224, 224]
    scale_loss = 1.0
    self.py_reader = fluid.layers.py_reader(
        capacity=16,
        shapes=[[-1] + image_shape, [-1, 1]],
        lod_levels=[0, 0],
        dtypes=["float32", "int64"],
        use_double_buffer=True)
    image, label = fluid.layers.read_file(self.py_reader)

    run_model = ResNet50()
    out = run_model.net(image, 4)
    softmax_out = fluid.layers.softmax(out, use_cudnn=False)
    cost, prob = fluid.layers.softmax_with_cross_entropy(
        out, label, return_softmax=True)
    self.avg_cost = fluid.layers.mean(cost)

    params = run_model.params
    params["total_images"] = args.total_images
    params["lr"] = 1e-5
    params["num_epochs"] = args.num_epochs
    params["learning_strategy"]["batch_size"] = args.batch_size
    params["learning_strategy"]["name"] = args.lr_strategy
    params["l2_decay"] = args.l2_decay
    params["momentum_rate"] = args.momentum_rate
    optimizer = optimizer_setting(params)
    global_lr = optimizer._global_learning_rate()
    global_lr.persistable = True

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = 1
    exec_strategy.num_iteration_per_drop_scope = 30

    dist_strategy = DistributedStrategy()
    dist_strategy.exec_strategy = exec_strategy
    dist_strategy.enable_inplace = args.run_params['enable_inplace']
    dist_strategy.fuse_all_reduce_ops = args.run_params['fuse_all_reduce_ops']
    dist_strategy.nccl_comm_num = args.run_params['nccl_comm_num']
    dist_strategy.use_local_sgd = args.run_params['use_local_sgd']
    dist_strategy.mode = args.run_params["mode"]
    dist_strategy.collective_mode = args.run_params["collective"]

    if args.run_params["fp16"]:
        # dynamic loss scaling; a sketch of the technique follows this function
        optimizer = fluid.contrib.mixed_precision.decorate(
            optimizer,
            init_loss_scaling=128.0,
            use_dynamic_loss_scaling=True)

    if "use_dgc" in args.run_params and args.run_params["use_dgc"]:
        # DGC requires fuse_all_reduce_ops to be disabled
        dist_strategy.fuse_all_reduce_ops = False
        optimizer = fluid.optimizer.DGCMomentumOptimizer(
            learning_rate=0.001, momentum=0.9, rampup_begin_step=0)

    dist_optimizer = fleet.distributed_optimizer(optimizer,
                                                 strategy=dist_strategy)
    _, param_grads = dist_optimizer.minimize(self.avg_cost)

    shuffle_seed = 1
    train_reader = reader.train(settings=args,
                                data_dir=DATA_DIR,
                                pass_id_as_seed=shuffle_seed)
    self.py_reader.decorate_paddle_reader(
        paddle.batch(train_reader, batch_size=self.batch_size))

    if scale_loss > 1:
        avg_cost = fluid.layers.mean(x=cost) * scale_loss

    return self.avg_cost, self.py_reader
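# A minimal numpy sketch (illustrative only, not the fluid mixed-precision
# implementation) of the dynamic loss scaling enabled above: the loss is
# multiplied by a scale before backprop, and the scale is halved whenever an
# overflow (inf/nan) appears in the gradients, otherwise grown after a long
# run of clean steps. Parameter names and defaults are assumptions.
import numpy as np

def update_loss_scaling(grads, scale, good_steps,
                        growth_interval=1000, factor=2.0):
    if not all(np.all(np.isfinite(g)) for g in grads):
        # overflow: skip this update and shrink the scale
        return scale / factor, 0, False
    good_steps += 1
    if good_steps >= growth_interval:
        # long run without overflow: it is safe to grow the scale again
        return scale * factor, 0, True
    return scale, good_steps, True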