def check(self, place, use_cuda):
    paddle.manual_seed(1)
    paddle.framework.random._manual_program_seed(1)
    main_program = fluid.Program()
    startup_program = fluid.Program()
    x, y, loss = self.build_program(main_program, startup_program, use_cuda)
    exe = fluid.Executor(place)
    iters = 10
    batch_size = 16
    feeder = fluid.DataFeeder(feed_list=[x, y], place=place)

    # Disable fuse_bn_act_ops.
    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_bn_act_ops = False
    binary = fluid.CompiledProgram(main_program).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)
    train_reader = paddle.batch(
        paddle.dataset.mnist.train(), batch_size=batch_size)
    loss_vals = []
    scope = fluid.Scope()
    with fluid.scope_guard(scope):
        exe.run(startup_program)
        for _ in range(iters):
            data = next(train_reader())
            loss_v = exe.run(binary,
                             feed=feeder.feed(data),
                             fetch_list=[loss])
            loss_vals.append(loss_v[0][0])

    # Enable fuse_bn_act_ops.
    build_strategy_fused = fluid.BuildStrategy()
    build_strategy_fused.fuse_bn_act_ops = True
    binary_fused = fluid.CompiledProgram(main_program).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy_fused)
    train_reader_fused = paddle.batch(
        paddle.dataset.mnist.train(), batch_size=batch_size)
    loss_vals_fused = []
    scope_fused = fluid.Scope()
    with fluid.scope_guard(scope_fused):
        exe.run(startup_program)
        for _ in range(iters):
            data = next(train_reader_fused())
            loss_v = exe.run(binary_fused,
                             feed=feeder.feed(data),
                             fetch_list=[loss])
            loss_vals_fused.append(loss_v[0][0])

    # The losses with and without fusion should match.
    for i in range(iters):
        self.assertAlmostEqual(loss_vals[i], loss_vals_fused[i], delta=1e-5)

def check(self, place, use_cuda):
    paddle.seed(1)
    paddle.framework.random._manual_program_seed(1)
    iters = 5
    batch_size = 16

    # Build the fused program: turn on fuse_bn_add_act_ops.
    main_program = fluid.Program()
    startup_program = fluid.Program()
    loss = self.build_origin_program(main_program, startup_program, use_cuda)
    build_strategy_fused = fluid.BuildStrategy()
    build_strategy_fused.fuse_bn_add_act_ops = True
    binary_fused = fluid.CompiledProgram(main_program).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy_fused)
    exe = fluid.Executor(place)
    loss_vals_fused = []
    x_data = []
    y_data = []
    scope = fluid.Scope()
    with fluid.scope_guard(scope):
        exe.run(startup_program)
        for _ in range(iters):
            x = np.random.random((batch_size, 1, 28, 28)).astype("float32")
            y = np.random.random((batch_size, 1)).astype("int64")
            x_data.append(x)
            y_data.append(y)
            loss_v = exe.run(binary_fused,
                             feed={"x": x, "y": y},
                             fetch_list=[loss])
            loss_vals_fused.append(loss_v[0][0])

    # Build the origin program: turn off fuse_bn_add_act_ops.
    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_bn_add_act_ops = False
    binary = fluid.CompiledProgram(main_program).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)
    loss_vals = []
    scope = fluid.Scope()
    with fluid.scope_guard(scope):
        exe.run(startup_program)
        for i in range(iters):
            loss_v = exe.run(binary,
                             feed={"x": x_data[i], "y": y_data[i]},
                             fetch_list=[loss])
            loss_vals.append(loss_v[0][0])

    # The losses with and without fusion should match.
    for i in range(iters):
        self.assertAlmostEqual(loss_vals[i], loss_vals_fused[i], delta=1e-5)

def _compile_and_initialize(self, prog, mode):
    compiled_prog = self._compiled_progs.get(mode, None)
    if compiled_prog is not None:
        return compiled_prog

    assert self.model._place is not None, \
        "device is not set, please call `model.prepare()` first"

    place = self.model._place

    # XXX *ALL WEIGHTS* should be initialized upon model construction
    # even if `forward()` may run different code path for different mode
    # therefore startup program only needs to run once
    if self._executor is None:
        self._executor = fluid.Executor(place)
        # XXX incremental initialization
        uninitialized = []
        for var_py in self._startup_prog.list_vars():
            var = fluid.global_scope().find_var(var_py.name)
            if not var_py.name.startswith('nccl_id') and var and \
                    var.get_tensor()._is_initialized():
                continue
            uninitialized.append(var_py)
        if uninitialized:
            startup_prog = self._startup_prog._prune(uninitialized)
            self._executor.run(startup_prog)

    if self._nranks < 2:
        compiled_prog = fluid.CompiledProgram(prog)
    else:
        compiled_prog = prog

    self._compiled_progs[mode] = compiled_prog

def best_strategy_compiled(args, program, loss, exe):
    """Wrap the program with a compiled program for best performance."""
    if os.getenv('FLAGS_use_ngraph'):
        return program
    else:
        build_strategy = fluid.compiler.BuildStrategy()
        exec_strategy = fluid.ExecutionStrategy()

        if args.use_gpu:
            exec_strategy.num_threads = fluid.core.get_cuda_device_count()
        exec_strategy.num_iteration_per_drop_scope = 10

        num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
        if num_trainers > 1 and args.use_gpu:
            dist_utils.prepare_for_multi_process(exe, build_strategy, program)
            # NOTE: the process is fast when num_threads is 1
            # for multi-process training.
            exec_strategy.num_threads = 1

        compiled_program = fluid.CompiledProgram(program).with_data_parallel(
            loss_name=loss.name,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)

        return compiled_program

def init_model(self):
    """
    Initialize the predict model from the saved parameter path: compile the
    predict program, assign model parameters, and set the parallel strategy.
    :return: loader, probs, qas_id
    """
    model_path = self.args["load_model_path"]
    self.logger.info("Initializing predict model...")
    self.exe = fluid.Executor(
        TrainEngine.get_executor_run_places(self.args))
    with fluid.program_guard(self.predict_program, self.predict_startup):
        # Define the network and output placeholders based on gzl's model.
        loader, probs, qas_id = classifier.create_model_for_cls_merge(
            args=self.args_model_build, is_prediction=True)
    self.logger.info("Prediction neural network created.")
    self.logger.info("Prediction neural network parameter initialized.")
    # Run the startup program to initialize parameters.
    self.exe.run(self.predict_startup)
    # Load the saved model parameters into the network.
    load_model_params(self.exe, model_path, self.predict_program)
    # If running in parallel, compile the program with data parallelism.
    if self.args["use_parallel"]:
        build_strategy = fluid.BuildStrategy()
        # The parallel strategy is hard-coded for now.
        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
        self.predict_program = fluid.CompiledProgram(self.predict_program). \
            with_data_parallel(places=TrainEngine.get_data_run_places(self.args),
                               build_strategy=build_strategy)
    self.logger.info("Finish initializing predict model!")
    return loader, probs, qas_id

def run_program(enable_addto):
    np.random.seed(10)
    paddle.seed(10)
    paddle.framework.random._manual_program_seed(10)
    if fluid.core.is_compiled_with_cuda():
        fluid.set_flags({"FLAGS_cudnn_deterministic": True})
    fluid.set_flags({"FLAGS_max_inplace_grad_add": 2})
    loss, main, startup, w = create_program(data_format=data_format)
    place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda() \
        else fluid.CPUPlace()
    exe = fluid.Executor(place)

    strategy = fluid.BuildStrategy()
    strategy.enable_addto = enable_addto
    compiled = fluid.CompiledProgram(main).with_data_parallel(
        loss_name=loss.name, build_strategy=strategy)

    exe.run(startup)
    img = np.random.uniform(-128, 128, [8, 3, 224, 224]).astype(np.float32)
    for i in range(10):
        res = exe.run(compiled,
                      feed={'img': img},
                      fetch_list=[loss.name, w.name])
    return res

def _init_pred(self, instance, infer_model_path):
    inst = instance
    if 'pred_output_path' not in inst.config:
        inst.config['pred_output_path'] = os.path.join(
            inst.config.get('save_path', '.'), inst.name)
    if not os.path.exists(inst.config['pred_output_path']):
        os.makedirs(inst.config['pred_output_path'])

    pred_backbone = self.Backbone(self.bb_conf, phase='pred')
    pred_parad = inst.Paradigm(
        inst.config, phase='pred', backbone_config=self.bb_conf)
    inst.task_layer['pred'] = pred_parad
    pred_joint_input_names, pred_joint_shape_and_dtypes, name_to_position = merge_input_attrs(
        pred_backbone.inputs_attr,
        inst.task_layer['pred'].inputs_attrs['reader'],
        insert_taskid=False,
        insert_batchsize=False,
        insert_seqlen=False,
        insert_batchsize_x_seqlen=False)

    pred_prog = inst.load(infer_model_path)
    pred_prog = fluid.CompiledProgram(pred_prog).with_data_parallel()
    if inst.reader['pred'] is None:
        pred_reader = inst.Reader(inst.config, phase='pred')
        inst.reader['pred'] = pred_reader

    return pred_prog

def train(use_cuda):
    # define program
    train_prog = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            # For training:
            # inputs = [src, src_sequence_length, trg, trg_sequence_length, label]
            inputs, loader = data_func(is_train=True)
            logits = model_func(inputs, is_train=True)
            loss = loss_func(logits, inputs[-1], inputs[-2])
            optimizer = optimizer_func()
            optimizer.minimize(loss)

    # define data source
    places = fluid.cuda_places() if use_cuda else fluid.cpu_places()
    loader.set_batch_generator(
        inputs_generator(batch_size, eos_id, is_train=True), places=places)

    exe = fluid.Executor(places[0])
    exe.run(startup_prog)
    prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=loss.name)

    EPOCH_NUM = 20
    for pass_id in six.moves.xrange(EPOCH_NUM):
        batch_id = 0
        for data in loader():
            loss_val = exe.run(prog, feed=data, fetch_list=[loss])[0]
            print('pass_id: %d, batch_id: %d, loss: %f' %
                  (pass_id, batch_id, loss_val))
            batch_id += 1

    fluid.io.save_params(exe, model_save_dir, main_program=train_prog)

def main_impl(self, place):
    image = fluid.layers.data(
        name='image', shape=self.image_shape, dtype='float32')
    relu_image = fluid.layers.relu(image)
    loss = fluid.layers.reduce_mean(relu_image)

    build_strategy = fluid.BuildStrategy()
    build_strategy.enable_inplace = True
    build_strategy.memory_optimize = True

    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    compiled_prog = fluid.CompiledProgram(
        fluid.default_main_program()).with_data_parallel(
            loss_name=loss.name, build_strategy=build_strategy)

    image_tensor = fluid.LoDTensor()
    np_image = np.random.uniform(
        low=-10, high=10, size=self.image_shape).astype('float32')
    image_tensor.set(np_image, place)

    feed_dict = [{image.name: image_tensor}]

    for _ in range(self.iteration):
        exe.run(compiled_prog, feed=feed_dict, fetch_list=[loss.name])
        self.assertTrue(np.array_equal(np.array(image_tensor), np_image))

def run_main_with_place(self, places, use_compiled_program=True):
    with fluid.scope_guard(fluid.Scope()):
        with fluid.program_guard(fluid.Program(), fluid.Program()):
            input_data, loss, loader = self.build_network(places)
            fetch_list = [input_data]

            exe = fluid.Executor(places[0])
            exe.run(fluid.default_startup_program())

            dev_cnt = len(places)
            if dev_cnt > 1:
                self.assertTrue(use_compiled_program)

            main_program = fluid.default_main_program()
            if use_compiled_program:
                main_program = fluid.CompiledProgram(
                    main_program).with_data_parallel(
                        loss_name=loss.name, places=places)

            max_batch_num = min(self.break_num,
                                int(self.batch_num / dev_cnt))

            if loader.iterable:
                early_break = False
                for epoch_id in six.moves.range(self.epoch_num):
                    early_break = False
                    batch_id = 0
                    for data in loader():
                        if batch_id >= self.break_num:
                            early_break = True
                            break
                        self.assertInputData(batch_id, data, dev_cnt)
                        fetch_val, = exe.run(program=main_program,
                                             feed=data,
                                             fetch_list=fetch_list)
                        self.assertInputData(batch_id, fetch_val, dev_cnt)
                        batch_id += 1

                    self.assertEqual(batch_id, max_batch_num)

                if early_break:
                    loader._reset()
            else:
                for epoch_id in six.moves.range(self.epoch_num):
                    batch_id = 0
                    loader.start()
                    try:
                        while True:
                            if batch_id >= self.break_num:
                                loader.reset()
                                break
                            fetch_val, = exe.run(program=main_program,
                                                 fetch_list=fetch_list)
                            self.assertInputData(batch_id, fetch_val,
                                                 dev_cnt)
                            batch_id += 1
                    except fluid.core.EOFException:
                        loader.reset()

                    self.assertEqual(batch_id, max_batch_num)

def abs_max_run(self, reader, exe, step=None, loss_name=None):
    fetch_list = []
    with fluid.program_guard(self.program):
        for act_name in self.real_names:
            act = self.program.global_block().var(act_name)
            act = fluid.layers.reduce_max(
                fluid.layers.abs(act), name=act_name + "_reduced")
            fetch_list.append(act_name + "_reduced.tmp_0")

    if not hasattr(self.program, '_program'):
        # Compile the native program to speed up
        program = fluid.CompiledProgram(self.program).with_data_parallel(
            loss_name=loss_name)
    else:
        # self.program is already a compiled program
        program = self.program

    for idx, data in enumerate(reader):
        vars_np = exe.run(program=program, feed=data, fetch_list=fetch_list)
        vars_np = [np.max(var) for var in vars_np]
        mapped_vars_np = dict(zip(self.real_names, vars_np))
        values = self.update(mapped_vars_np)

        if idx % 10 == 0:
            _logger.info("Collecting..., Step: {}".format(idx))

        if step is not None and idx + 1 >= step:
            break
    return values

def _freeze(self):
    """
    Call before entering the train loop; converts the program to a
    compiled program. Does nothing if loss is None, i.e. not in train mode.
    """
    if self._loss is None:
        log.debug('will not freeze a program without loss')
        return
    if isinstance(self._program.train_program, F.compiler.CompiledProgram):
        log.debug('program has already been built')
        return

    exec_strategy = F.ExecutionStrategy()
    exec_strategy.num_threads = 4  # 2 for fp32, 4 for fp16
    exec_strategy.use_experimental_executor = True
    exec_strategy.num_iteration_per_drop_scope = 10  # important for throughput

    build_strategy = F.BuildStrategy()
    build_strategy.remove_unnecessary_lock = False
    #build_strategy.fuse_broadcast_ops = True
    build_strategy.num_trainers = distribution.status.num_replica
    build_strategy.trainer_id = distribution.status.replica_id
    build_strategy.memory_optimize = True

    log.info('replica id %d of %d' % (distribution.status.replica_id,
                                      distribution.status.num_replica))

    program = F.CompiledProgram(self._program.train_program).with_data_parallel(
        loss_name=self._loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)
    self._program = ProgramPair(
        train_program=program,
        startup_program=self._program.startup_program)

def test(program):
    compiled_eval_prog = fluid.CompiledProgram(program)
    results = eval_run(
        exe,
        compiled_eval_prog,
        eval_loader,
        eval_keys,
        eval_values,
        eval_cls,
        cfg=cfg)
    resolution = None
    if 'mask' in results[0]:
        resolution = model.mask_head.resolution
    dataset = cfg['EvalReader']['dataset']
    box_ap_stats = eval_results(
        results,
        cfg.metric,
        cfg.num_classes,
        resolution,
        is_bbox_normalized,
        FLAGS.output_eval,
        map_type,
        dataset=dataset)
    return box_ap_stats[0]

def main():
    seg_num = 8
    target_size = 224
    video_files = [FLAGS.data + '/' + f for f in os.listdir(FLAGS.data)]
    pipeline = VideoPipe(video_files, seg_num, target_size, FLAGS.stride)

    video_loader = DALIGenericIterator(
        pipeline, ['image'], len(video_files), dynamic_shape=True)

    exe = fluid.Executor(fluid.CUDAPlace(0))
    startup_prog = fluid.Program()
    eval_prog = fluid.Program()

    with fluid.program_guard(eval_prog, startup_prog):
        with fluid.unique_name.guard():
            fetch_list = build(seg_num, target_size)

    exe.run(startup_prog)
    compiled_eval_prog = fluid.CompiledProgram(eval_prog)

    load_weights(exe, eval_prog, PRETRAIN_WEIGHTS)

    labels = json.load(open("kinetics_labels.json"))
    for idx, batch in enumerate(video_loader):
        fetches = exe.run(compiled_eval_prog,
                          feed=batch,
                          fetch_list=fetch_list)
        pred = fetches[0][0]
        topk_indices = pred.argsort()[0 - FLAGS.topk:]
        topk_labels = [labels[i] for i in topk_indices]
        filename = video_files[idx]
        print("prediction for {} is: {}".format(filename, topk_labels))

def compile(config, program, loss_name=None):
    """
    Compile the program

    Args:
        config(dict): config
        program(Program): the program to be compiled
        loss_name(str): loss name

    Returns:
        compiled_program(CompiledProgram): the compiled program
    """
    build_strategy = fluid.compiler.BuildStrategy()
    #build_strategy.fuse_bn_act_ops = config.get("fuse_bn_act_ops")
    #build_strategy.fuse_elewise_add_act_ops = config.get("fuse_elewise_add_act_ops")

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = 1
    exec_strategy.num_iteration_per_drop_scope = 10

    compiled_program = fluid.CompiledProgram(program).with_data_parallel(
        loss_name=loss_name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    return compiled_program

def test_main(self):
    main_prog = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(main_prog, startup_prog):
        pred = fluid.data(
            name='pred', shape=[None, self.class_num], dtype='float32')
        label = fluid.data(name='label', shape=[None, 1], dtype='int64')
        acc = Accuracy(topk=self.topk, name=self.name)
        state = acc.add_metric_op(pred, label)

    exe = fluid.Executor(fluid.CPUPlace())
    compiled_main_prog = fluid.CompiledProgram(main_prog)

    for i in range(10):
        label, pred = self.random_pred_label()
        state_ret = exe.run(compiled_main_prog,
                            feed={'pred': pred, 'label': label},
                            fetch_list=[s.name for s in to_list(state)],
                            return_numpy=True)
        acc.update(*state_ret)
        res_m = acc.accumulate()
        res_f = accuracy(pred, label, self.topk)
        assert np.all(np.isclose(np.array(res_m), np.array(res_f), rtol=1e-3)), \
            "Accuracy precision error: {} != {}".format(res_m, res_f)
        acc.reset()
        assert np.sum(acc.total) == 0
        assert np.sum(acc.count) == 0

def compile(config, program, loss_name=None, share_prog=None):
    """
    Compile the program

    Args:
        config(dict): config
        program(Program): the program to be compiled
        loss_name(str): loss name
        share_prog(Program): the shared program, used for evaluation during training

    Returns:
        compiled_program(CompiledProgram): the compiled program
    """
    build_strategy = fluid.compiler.BuildStrategy()

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = 1
    exec_strategy.num_iteration_per_drop_scope = 10

    compiled_program = fluid.CompiledProgram(program).with_data_parallel(
        share_vars_from=share_prog,
        loss_name=loss_name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    return compiled_program

def test_program_feed_scalar(self):
    main_program = fluid.Program()
    startup_program = fluid.Program()
    scope = fluid.Scope()
    with fluid.program_guard(main_program, startup_program):
        with fluid.scope_guard(scope):
            lr, cost = self.net()
            cpu = fluid.CPUPlace()
            exe = fluid.Executor(cpu)
            exe.run(startup_program)
            print(scope.find_var("fc_0.w_0").get_tensor())
            compiled_prog = fluid.CompiledProgram(
                main_program).with_data_parallel(loss_name=cost.name)
            train_data = numpy.array(
                [[1.0], [2.0], [3.0], [4.0]]).astype('float32')
            y_true = numpy.array(
                [[2.0], [4.0], [6.0], [8.0]]).astype('float32')
            self.assertRaises(AssertionError,
                              exe.run,
                              compiled_prog,
                              feed={
                                  'x': train_data,
                                  'y': y_true,
                                  'lr': 0.01
                              },
                              fetch_list=[lr, cost])

def test_compiled_program_feed_scalar(self):
    main_program = fluid.Program()
    startup_program = fluid.Program()
    scope = fluid.Scope()
    with fluid.program_guard(main_program, startup_program):
        with fluid.scope_guard(scope):
            lr, cost = self.net()
            cpu = fluid.CPUPlace()
            exe = fluid.Executor(cpu)
            exe.run(startup_program)
            compiled_prog = fluid.CompiledProgram(
                main_program).with_data_parallel(loss_name=cost.name)
            train_data = numpy.array(
                [[1.0], [2.0], [3.0], [4.0]]).astype('float32')
            y_true = numpy.array(
                [[2.0], [4.0], [6.0], [8.0]]).astype('float32')
            a = 0.01
            _lr, _ = exe.run(compiled_prog,
                             feed={
                                 'x': train_data,
                                 'y': y_true,
                                 'lr': a
                             },
                             fetch_list=[lr, cost],
                             return_numpy=False)
            self.assertEqual(_lr._dtype(), lr.dtype)
            self.assertEqual(_lr._dtype(), fluid.core.VarDesc.VarType.FP32)
            self.assertEqual(type(a), float)

def _get_gradient(self, input_to_check, place, output_names, no_grad_set,
                  parallel=False):
    prog = Program()
    block = prog.global_block()
    self._append_ops(block)
    loss = append_loss_ops(block, output_names)
    param_grad_list = append_backward(
        loss=loss, parameter_list=input_to_check, no_grad_set=no_grad_set)
    inputs = self._get_inputs(block)
    feed_dict = self.feed_var(inputs, place)

    fetch_list = [g for p, g in param_grad_list]
    if parallel:
        use_cuda = False
        if isinstance(place, fluid.CUDAPlace):
            use_cuda = True
        compiled_prog = fluid.CompiledProgram(prog).with_data_parallel(
            loss_name=loss.name, places=place)
        prog = compiled_prog
    executor = fluid.Executor(place)
    return list(
        map(np.array,
            executor.run(prog, feed_dict, fetch_list, return_numpy=False)))

def train(self, print_steps=5):
    """
    start training.

    Args:
        print_steps: int. Logging frequency of training message, e.g.,
            current step, loss and speed.
    """
    iterator = self._train_iterator
    self._distribute_train_prog = fluid.CompiledProgram(
        self._train_prog).with_data_parallel(loss_name=self._loss_var.name)

    time_begin = time.time()
    for feed in iterator:
        rt_outputs = self.train_one_step(feed)

        task_rt_outputs = {
            k[len(self.name + '.'):]: v
            for k, v in rt_outputs.items() if k.startswith(self.name + '.')
        }
        self._task_head.batch_postprocess(task_rt_outputs)

        if print_steps > 0 and self._cur_train_step % print_steps == 0:
            loss = rt_outputs[self.name + '.loss']
            loss = np.mean(np.squeeze(loss)).tolist()

            time_end = time.time()
            time_cost = time_end - time_begin

            print("step {}/{} (epoch {}), loss: {:.3f}, speed: {:.2f} steps/s".format(
                (self._cur_train_step - 1) % self._steps_pur_epoch + 1,
                self._steps_pur_epoch, self._cur_train_epoch, loss,
                print_steps / time_cost))
            sys.stdout.flush()
            time_begin = time.time()

        if self._num_epochs is None and not self._multi_task and \
                self._cur_train_step == self._steps_pur_epoch:
            break

def test_prune_compiled_program(self):
    program = framework.Program()
    startup_program = framework.Program()
    scope = fluid.Scope()
    with fluid.scope_guard(scope):
        with fluid.program_guard(program, startup_program):
            (x, y, label, loss1, loss2, w_param_attrs) = self.net1()
            sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.5)
            sgd_optimizer.minimize(loss1)
            exe = fluid.Executor(fluid.CPUPlace())
            exe.run(startup_program)
            compiled_prog = fluid.CompiledProgram(
                program).with_data_parallel(
                    loss_name=loss1.name, places=fluid.CPUPlace())
            weight_init = np.array(
                scope.find_var(w_param_attrs.name).get_tensor())
            x_np = np.random.random(size=(10, 2)).astype('float32')
            label_np = np.random.randint(1, size=(10, 1)).astype('int64')
            res = exe.run(compiled_prog,
                          feed={
                              'x': x_np,
                              'label': label_np
                          },
                          fetch_list=[loss1.name],
                          use_prune=True)
            self.assertIsNotNone(scope.find_var(loss1.name))
            self.assertIsNone(scope.find_var(loss2.name))
            weight = np.array(
                scope.find_var(w_param_attrs.name).get_tensor())
            self.assertFalse(np.array_equal(weight_init,
                                            weight))  # weight changed

def _build_env(self):
    """
    Build the program and strategy for the specific running phase.
    """
    if self.env.is_inititalized:
        return

    self._build_env_start_event()
    self.env.is_inititalized = True
    self.env.main_program = clone_program(
        self._base_main_program, for_test=False)

    self.env.startup_program = fluid.Program()
    with fluid.program_guard(self.env.main_program,
                             self._base_startup_program):
        with fluid.unique_name.guard(self.env.UNG):
            self.env.outputs = self._build_net()
            if self.is_train_phase or self.is_test_phase:
                self.env.labels = self._add_label()
                self.env.loss = self._add_loss()
                self.env.metrics = self._add_metrics()

    if self.is_predict_phase or self.is_test_phase:
        self.env.main_program = clone_program(
            self.env.main_program, for_test=True)
        hub.common.paddle_helper.set_op_attr(
            self.env.main_program, is_test=True)

    if self.config.enable_memory_optim:
        for var_name in self.fetch_list:
            var = self.env.main_program.global_block().vars[var_name]
            var.persistable = True

    if self.is_train_phase:
        with fluid.program_guard(self.env.main_program,
                                 self._base_startup_program):
            with fluid.unique_name.guard(self.env.UNG):
                self.scheduled_lr, self.max_train_steps = self.config.strategy.execute(
                    self.loss, self._base_data_reader, self.config,
                    self.device_count)

    if self.is_train_phase:
        loss_name = self.env.loss.name
    else:
        loss_name = None

    share_vars_from = self._base_compiled_program

    if not self.config.use_data_parallel:
        self.env.main_program_compiled = None
    else:
        self.env.main_program_compiled = fluid.CompiledProgram(
            self.env.main_program).with_data_parallel(
                loss_name=loss_name,
                share_vars_from=share_vars_from,
                build_strategy=self.build_strategy,
                places=self.places)

    self.exe.run(self.env.startup_program)
    self._build_env_end_event()

def train_one_step(self, batch):
    if not self._dist_train_init:
        self._distribute_train_prog = fluid.CompiledProgram(
            self._train_prog).with_data_parallel(
                loss_name=self._loss_var.name)
        self._dist_train_init = True

    exe = self._exe
    distribute_train_prog = self._distribute_train_prog
    fetch_list = self._fetch_list

    if gpu_dev_count > 1:
        feed, mask = batch
        rt_outputs = exe.run(distribute_train_prog,
                             feed=feed,
                             fetch_list=fetch_list)
        num_fakes = decode_fake(
            len(rt_outputs[0]), mask, self._train_batch_size)
        if num_fakes:
            rt_outputs = [i[:-num_fakes] for i in rt_outputs]
    else:
        feed = self._feed_batch_process_fn(batch)
        rt_outputs = exe.run(distribute_train_prog,
                             feed=feed,
                             fetch_list=fetch_list)

    rt_outputs = {k: v for k, v in zip(self._fetch_names, rt_outputs)}
    self._cur_train_step += 1
    self._check_save()
    self._cur_train_epoch = (
        self._cur_train_step - 1) // self._steps_pur_epoch
    return rt_outputs

def run_main(self, num_workers, places, persistent_workers, use_pe=True):
    scope = fluid.Scope()
    with fluid.scope_guard(scope):
        startup_prog, main_prog, image, label, loss = simple_fc_net_static()

        dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM)
        dataloader = DataLoader(
            dataset,
            feed_list=[image, label],
            places=places,
            num_workers=num_workers,
            batch_size=BATCH_SIZE,
            return_list=False,
            drop_last=True,
            persistent_workers=persistent_workers)
        assert len(dataloader) == int(SAMPLE_NUM / BATCH_SIZE)

        exe = fluid.Executor(place=places[0])
        exe.run(startup_prog)

        if use_pe:
            prog = fluid.CompiledProgram(main_prog)
            if len(places) > 1:
                prog = prog.with_data_parallel(
                    loss_name=loss.name, places=places)
        else:
            prog = main_prog

        step_list = []
        loss_list = []
        start_t = time.time()
        for _ in six.moves.range(EPOCH_NUM):
            step = 0
            for d in dataloader:
                assert len(d) == len(places), "{} != {}".format(
                    len(d), len(places))
                for i, item in enumerate(d):
                    image = item['image']
                    label = item['label']
                    assert image.shape() == [BATCH_SIZE, IMAGE_SIZE]
                    assert label.shape() == [BATCH_SIZE, 1]
                    assert image._place()._equals(places[i])
                    assert label._place()._equals(places[i])
                L, = exe.run(program=prog,
                             feed=d,
                             fetch_list=[loss],
                             use_program_cache=True)
                loss_list.append(np.mean(L))
                step += 1
            step_list.append(step)

        end_t = time.time()
        ret = {
            "time": end_t - start_t,
            "step": step_list,
            "loss": np.array(loss_list)
        }
        print("time cost", ret['time'], 'step_list', ret['step'])
        return ret

def compile_program_not_compiled(self):
    with fluid.program_guard(fluid.Program()):
        # build model
        self.build_simple_model()
        # compile program
        program = fluid.default_main_program()
        compiled_program = fluid.CompiledProgram(
            program).with_data_parallel()
        return compiled_program

def best_strategy_compiled(args,
                           program,
                           loss,
                           exe,
                           mode="train",
                           share_prog=None):
    """Wrap the program with a compiled program for best performance."""
    if os.getenv('FLAGS_use_ngraph'):
        return program
    else:
        build_strategy = fluid.compiler.BuildStrategy()

        try:
            fluid.require_version(min_version='1.7.0')
            build_strategy.fuse_bn_act_ops = args.fuse_bn_act_ops
        except Exception as e:
            logger.info(
                "PaddlePaddle version 1.7.0 or higher is "
                "required when you want to fuse batch_norm and activation_op.")

        build_strategy.fuse_elewise_add_act_ops = args.fuse_elewise_add_act_ops

        try:
            build_strategy.fuse_bn_add_act_ops = args.fuse_bn_add_act_ops
        except Exception as e:
            logger.info(
                "PaddlePaddle 2.0-rc or higher is "
                "required when you want to enable fuse_bn_add_act_ops strategy."
            )

        try:
            build_strategy.enable_addto = args.enable_addto
        except Exception as e:
            logger.info("PaddlePaddle 2.0-rc or higher is "
                        "required when you want to enable addto strategy.")

        exec_strategy = fluid.ExecutionStrategy()

        if args.use_gpu:
            exec_strategy.num_threads = fluid.core.get_cuda_device_count()
        exec_strategy.num_iteration_per_drop_scope = 10000 if args.use_pure_fp16 else 10

        num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
        if num_trainers > 1 and args.use_gpu:
            dist_utils.prepare_for_multi_process(exe, build_strategy, program)
            # NOTE: the process is fast when num_threads is 1
            # for multi-process training.
            exec_strategy.num_threads = 1

        compiled_program = fluid.CompiledProgram(program).with_data_parallel(
            loss_name=loss.name if mode == "train" else None,
            share_vars_from=share_prog if mode == "val" else None,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)

        return compiled_program

def test_get_valid_program_error(self):
    # case 1: CompiledProgram built without a program
    graph = core.Graph(core.ProgramDesc())
    compiled_program = fluid.CompiledProgram(graph)
    with self.assertRaises(TypeError):
        fluid.io._get_valid_program(compiled_program)

    # case 2: main_program type error
    with self.assertRaises(TypeError):
        fluid.io._get_valid_program("program")

def create_multi_devices_program(program, loss_var_name):
    build_strategy = fluid.BuildStrategy()
    build_strategy.memory_optimize = False
    build_strategy.enable_inplace = True

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_iteration_per_drop_scope = 1

    compile_program = fluid.CompiledProgram(program).with_data_parallel(
        loss_name=loss_var_name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    return compile_program

def calc_sub_out(self, place=None, parallel=None):
    x = fluid.layers.ones(shape=[2, 2], dtype='float32')
    y = fluid.layers.ones(shape=[2, 2], dtype='float32')
    out = fluid.layers.elementwise_sub(x=x, y=y)
    program = fluid.default_main_program()
    if parallel:
        program = fluid.CompiledProgram(program).with_data_parallel(
            places=place)
    exe = fluid.Executor(place)
    out = exe.run(program, fetch_list=[out], return_numpy=False)
    return out