def _get_grad_vartype(self, name):
    assert self.program_desc is not None
    grad_name = name + core.grad_var_suffix()
    # search every block recursively; the grad var may live in a sub-block
    for i in six.moves.range(self.program_desc.num_blocks()):
        block = self.program_desc.block(i)
        var_desc = block.find_var_recursive(cpt.to_bytes(grad_name))
        if var_desc is not None:
            return var_desc.type()
    return None
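# A minimal sketch of the naming convention used above (assumes the Paddle
# convention that core.grad_var_suffix() returns "@GRAD"; the variable name
# "fc_0.w_0" is only an illustration):
#
#   grad_name = "fc_0.w_0" + core.grad_var_suffix()   # -> "fc_0.w_0@GRAD"
#   # _get_grad_vartype("fc_0.w_0") then returns the VarType recorded for
#   # "fc_0.w_0@GRAD" in the program desc, or None if no such grad var exists.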
def _load_persistable_vars_by_program(model_path,
                                      program_holder,
                                      params_filename=None):
    # make sure the path has been checked
    persistable_vars = _get_persistable_vars(program_holder.infer_program)
    load_var_dict = {}
    for each_var in persistable_vars:
        orig_each_name = program_holder._suffix_varname_dict[each_var.name()]
        if _is_parameter(each_var, program_holder.infer_program):
            # create output varbase
            new_var = framework.ParamBase(
                shape=each_var.shape(),
                dtype=each_var.dtype(),
                name=each_var.name(),
                type=each_var.type(),
                persistable=True)
        else:
            new_var = framework._varbase_creator(
                type=each_var.type(),
                name=each_var.name(),
                shape=each_var.shape(),
                dtype=each_var.dtype(),
                persistable=True)
        if params_filename is None:
            framework._dygraph_tracer().trace_op(
                type='load',
                inputs={},
                outputs={'Out': new_var},
                attrs={'file_path': os.path.join(model_path, orig_each_name)})
        new_var.stop_gradient = False
        load_var_dict[each_var.name()] = new_var

    if params_filename is not None:
        load_var_list = []
        for name in sorted(load_var_dict.keys()):
            load_var_list.append(load_var_dict[name])
        framework._dygraph_tracer().trace_op(
            type='load_combine',
            inputs={},
            outputs={'Out': load_var_list},
            attrs={'file_path': os.path.join(model_path, params_filename)})
        for each_var in persistable_vars:
            if not _is_parameter(each_var, program_holder.infer_program):
                continue
            param = load_var_dict[each_var.name()]
            param.stop_gradient = False

    # NOTE: [Recover stop_gradient information from the program]
    # After loading the model, the stop_gradient information of the
    # original variables is lost. However, if a parameter has no
    # corresponding @GRAD variable in the backward program, we can
    # infer that it is stop_gradient as well.
    all_var_names = _get_all_var_names(program_holder.train_program)
    for var_name in load_var_dict:
        grad_var_name = var_name + core.grad_var_suffix()
        if grad_var_name not in all_var_names:
            load_var_dict[var_name].stop_gradient = True

    return load_var_dict
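# A minimal, self-contained sketch of the recovery rule in the NOTE above
# (pure Python; the helper name and arguments are hypothetical): a parameter
# is trainable only if "<name>@GRAD" appears among the backward program's
# variable names.
def _infer_stop_gradient(param_names, backward_var_names, grad_suffix="@GRAD"):
    """Return {param_name: stop_gradient} under the @GRAD naming convention."""
    return {
        name: (name + grad_suffix) not in backward_var_names
        for name in param_names
    }

# e.g. _infer_stop_gradient(["w", "b"], {"w@GRAD"}) -> {"w": False, "b": True}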
def run_dygraph():
    fluid.set_flags({'FLAGS_sort_sum_gradient': True})
    paddle.seed(seed)
    paddle.framework.random._manual_program_seed(seed)
    ocr_attention = OCRAttention()

    if Config.learning_rate_decay == "piecewise_decay":
        learning_rate = fluid.layers.piecewise_decay(
            [50000], [Config.LR, Config.LR * 0.01])
    else:
        learning_rate = Config.LR
    optimizer = fluid.optimizer.SGD(
        learning_rate=0.001, parameter_list=ocr_attention.parameters())

    dy_param_init_value = {}
    for param in ocr_attention.parameters():
        dy_param_init_value[param.name] = param.numpy()
    for epoch in range(epoch_num):
        for batch_id in range(batch_num):
            label_in = to_variable(label_in_np)
            label_out = to_variable(label_out_np)
            label_out.stop_gradient = True
            img = to_variable(image_np)
            dy_prediction = ocr_attention(img, label_in)
            label_out = fluid.layers.reshape(
                label_out, [-1, 1], inplace=False)
            dy_prediction = fluid.layers.reshape(
                dy_prediction, [label_out.shape[0], -1], inplace=False)
            loss = fluid.layers.cross_entropy(
                input=dy_prediction, label=label_out)
            avg_loss = fluid.layers.reduce_sum(loss)

            dy_out = avg_loss.numpy()

            if epoch == 0 and batch_id == 0:
                for param in ocr_attention.parameters():
                    if param.name not in dy_param_init_value:
                        dy_param_init_value[param.name] = param.numpy()
            avg_loss.backward()
            dy_grad_value = {}
            for param in ocr_attention.parameters():
                if param.trainable:
                    np_array = np.array(
                        param._grad_ivar().value().get_tensor())
                    dy_grad_value[param.name +
                                  core.grad_var_suffix()] = np_array

            optimizer.minimize(avg_loss)
            ocr_attention.clear_gradients()

            dy_param_value = {}
            for param in ocr_attention.parameters():
                dy_param_value[param.name] = param.numpy()

    return dy_out, dy_param_init_value, dy_param_value
def _set_grad_type(self, params, train_program):
    # NOTE: if user sets sparse gradient mode, the param's gradient
    # will be SelectedRows, not LoDTensor. But the tracer will just
    # set the param grad VarBase from the forward VarBase (LoDTensor).
    # If we don't change the grad_var type here, RunProgramOp would
    # have to transform SelectedRows to LoDTensor forcibly, which may
    # not be the result the user wants.
    for param in params:
        grad_name = param.name + core.grad_var_suffix()
        grad_var = train_program.desc.block(0).find_var(
            cpt.to_bytes(grad_name))
        # NOTE: a missing var desc may be fine, such as in batch_norm
        if grad_var is None:
            continue
        param._set_grad_type(grad_var.type())
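# A minimal sketch (hypothetical setup) of why the grad type matters: with a
# sparse update, e.g. an embedding layer created with is_sparse=True, the
# backward pass produces a SelectedRows gradient rather than a dense
# LoDTensor, so the grad VarBase created from the dense forward VarBase has
# to be retyped to match the program desc.
#
#   emb = fluid.dygraph.Embedding(size=[10000, 64], is_sparse=True)
#   # emb.weight's gradient is SelectedRows; _set_grad_type aligns the
#   # dygraph grad VarBase with the type recorded in the train program.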
def test_resnet_sort_gradient_float32(self):
    seed = 90

    batch_size = train_parameters["batch_size"]
    batch_num = 10
    with fluid.dygraph.guard():
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed

        backward_strategy = fluid.dygraph.BackwardStrategy()
        backward_strategy.sort_sum_gradient = True
        resnet = ResNet()
        optimizer = optimizer_setting(
            train_parameters, parameter_list=resnet.parameters())
        np.random.seed(seed)
        import random
        random.seed(seed)
        train_reader = paddle.batch(
            paddle.dataset.flowers.train(use_xmap=False),
            batch_size=batch_size)

        dy_param_init_value = {}
        for param in resnet.parameters():
            dy_param_init_value[param.name] = param.numpy()

        for batch_id, data in enumerate(train_reader()):
            if batch_id >= batch_num:
                break

            dy_x_data = np.array(
                [x[0].reshape(3, 224, 224) for x in data]).astype('float32')
            y_data = np.array(
                [x[1] for x in data]).astype('int64').reshape(batch_size, 1)

            img = to_variable(dy_x_data)
            label = to_variable(y_data)
            label.stop_gradient = True

            out = resnet(img)
            loss = fluid.layers.cross_entropy(input=out, label=label)
            avg_loss = fluid.layers.mean(x=loss)

            dy_out = avg_loss.numpy()

            if batch_id == 0:
                for param in resnet.parameters():
                    if param.name not in dy_param_init_value:
                        dy_param_init_value[param.name] = param.numpy()

            avg_loss.backward(backward_strategy)

            dy_grad_value = {}
            for param in resnet.parameters():
                if param.trainable:
                    np_array = np.array(
                        param._grad_ivar().value().get_tensor())
                    dy_grad_value[param.name +
                                  core.grad_var_suffix()] = np_array

            optimizer.minimize(avg_loss)
            resnet.clear_gradients()

            dy_param_value = {}
            for param in resnet.parameters():
                dy_param_value[param.name] = param.numpy()

    with new_program_scope():
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed

        exe = fluid.Executor(fluid.CPUPlace()
                             if not core.is_compiled_with_cuda() else
                             fluid.CUDAPlace(0))

        resnet = ResNet()
        optimizer = optimizer_setting(train_parameters)
        np.random.seed(seed)
        import random
        random.seed(seed)
        train_reader = paddle.batch(
            paddle.dataset.flowers.train(use_xmap=False),
            batch_size=batch_size)

        img = fluid.layers.data(
            name='pixel', shape=[3, 224, 224], dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
        out = resnet(img)
        loss = fluid.layers.cross_entropy(input=out, label=label)
        avg_loss = fluid.layers.mean(x=loss)
        optimizer.minimize(avg_loss)

        # initialize params and fetch them
        static_param_init_value = {}
        static_param_name_list = []
        static_grad_name_list = []
        for param in resnet.parameters():
            static_param_name_list.append(param.name)
        for param in resnet.parameters():
            if param.trainable:
                static_grad_name_list.append(
                    param.name + core.grad_var_suffix())

        out = exe.run(fluid.default_startup_program(),
                      fetch_list=static_param_name_list)

        for i in range(len(static_param_name_list)):
            static_param_init_value[static_param_name_list[i]] = out[i]

        for batch_id, data in enumerate(train_reader()):
            if batch_id >= batch_num:
                break

            static_x_data = np.array(
                [x[0].reshape(3, 224, 224) for x in data]).astype('float32')
            y_data = np.array(
                [x[1] for x in data]).astype('int64').reshape([batch_size, 1])

            fetch_list = [avg_loss.name]
            fetch_list.extend(static_param_name_list)
            fetch_list.extend(static_grad_name_list)
            out = exe.run(fluid.default_main_program(),
                          feed={"pixel": static_x_data,
                                "label": y_data},
                          fetch_list=fetch_list)

            static_param_value = {}
            static_grad_value = {}
            static_out = out[0]
            param_start_pos = 1
            grad_start_pos = len(static_param_name_list) + param_start_pos
            for i in range(param_start_pos,
                           len(static_param_name_list) + param_start_pos):
                static_param_value[static_param_name_list[
                    i - param_start_pos]] = out[i]
            for i in range(grad_start_pos,
                           len(static_grad_name_list) + grad_start_pos):
                static_grad_value[static_grad_name_list[
                    i - grad_start_pos]] = out[i]

    self.assertTrue(np.allclose(static_out, dy_out))

    self.assertEqual(len(dy_param_init_value), len(static_param_init_value))
    for key, value in six.iteritems(static_param_init_value):
        self.assertTrue(np.allclose(value, dy_param_init_value[key]))
        self.assertTrue(np.isfinite(value).all())
        self.assertFalse(np.isnan(value).any())

    self.assertEqual(len(dy_grad_value), len(static_grad_value))
    for key, value in six.iteritems(static_grad_value):
        self.assertTrue(np.allclose(value, dy_grad_value[key]))
        self.assertTrue(np.isfinite(value).all())
        self.assertFalse(np.isnan(value).any())

    self.assertEqual(len(dy_param_value), len(static_param_value))
    for key, value in six.iteritems(static_param_value):
        self.assertTrue(np.allclose(value, dy_param_value[key]))
        self.assertTrue(np.isfinite(value).all())
        self.assertFalse(np.isnan(value).any())
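# A minimal sketch (hypothetical helper) of the fetch-list layout used by the
# static-graph side above: index 0 is the loss, followed by the parameter
# values, then the gradients, in the same order as the name lists.
def _split_fetch_results(out, param_names, grad_names):
    loss = out[0]
    param_start = 1
    grad_start = param_start + len(param_names)
    params = dict(zip(param_names, out[param_start:grad_start]))
    grads = dict(zip(grad_names, out[grad_start:grad_start + len(grad_names)]))
    return loss, params, grads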
def test_while_op(self):
    seed = 90
    epoch_num = 2
    if core.is_compiled_with_cuda():
        batch_num = 20
    else:
        print("in CPU")
        batch_num = 2

    np.random.seed(seed)
    image_np = np.random.randn(Config.batch_size, Config.DATA_SHAPE[0],
                               Config.DATA_SHAPE[1],
                               Config.DATA_SHAPE[2]).astype('float32')
    label_in_np = np.arange(
        0, Config.max_length, dtype='int64').reshape([1, Config.max_length])
    for i in range(2, Config.batch_size + 1):
        label_in_np = np.vstack((label_in_np, np.arange(
            (i - 1) * Config.max_length,
            i * Config.max_length,
            dtype='int64').reshape([1, Config.max_length])))

    label_out_np = np.arange(
        0, Config.max_length, dtype='int64').reshape([1, Config.max_length])
    for i in range(2, Config.batch_size + 1):
        label_out_np = np.vstack((label_out_np, np.arange(
            (i - 1) * Config.max_length,
            i * Config.max_length,
            dtype='int64').reshape([1, Config.max_length])))

    with fluid.dygraph.guard():
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed
        backward_strategy = fluid.dygraph.BackwardStrategy()
        backward_strategy.sort_sum_gradient = True
        ocr_attention = OCRAttention("ocr_attention")

        if Config.learning_rate_decay == "piecewise_decay":
            learning_rate = fluid.layers.piecewise_decay(
                [50000], [Config.LR, Config.LR * 0.01])
        else:
            learning_rate = Config.LR
        optimizer = fluid.optimizer.SGD(learning_rate=0.001)

        dy_param_init_value = {}
        for param in ocr_attention.parameters():
            dy_param_init_value[param.name] = param.numpy()
        for epoch in range(epoch_num):
            for batch_id in range(batch_num):
                label_in = to_variable(label_in_np)
                label_out = to_variable(label_out_np)
                label_out._stop_gradient = True
                label_out.trainable = False
                img = to_variable(image_np)
                dy_prediction = ocr_attention(img, label_in)
                label_out = fluid.layers.reshape(
                    label_out, [-1, 1], inplace=False)
                dy_prediction = fluid.layers.reshape(
                    dy_prediction, [label_out.shape[0], -1], inplace=False)
                loss = fluid.layers.cross_entropy(
                    input=dy_prediction, label=label_out)
                avg_loss = fluid.layers.reduce_sum(loss)

                dy_out = avg_loss.numpy()

                if epoch == 0 and batch_id == 0:
                    for param in ocr_attention.parameters():
                        if param.name not in dy_param_init_value:
                            dy_param_init_value[param.name] = param.numpy()
                avg_loss.backward(backward_strategy)
                dy_grad_value = {}
                for param in ocr_attention.parameters():
                    if param.trainable:
                        np_array = np.array(
                            param._ivar._grad_ivar().value().get_tensor())
                        dy_grad_value[param.name +
                                      core.grad_var_suffix()] = np_array

                optimizer.minimize(avg_loss)
                ocr_attention.clear_gradients()
                dy_param_value = {}
                for param in ocr_attention.parameters():
                    dy_param_value[param.name] = param.numpy()

    with new_program_scope():
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed
        exe = fluid.Executor(fluid.CPUPlace()
                             if not core.is_compiled_with_cuda() else
                             fluid.CUDAPlace(0))
        ocr_attention = OCRAttention("ocr_attention")

        if Config.learning_rate_decay == "piecewise_decay":
            learning_rate = fluid.layers.piecewise_decay(
                [50000], [Config.LR, Config.LR * 0.01])
        else:
            learning_rate = Config.LR
        optimizer = fluid.optimizer.SGD(learning_rate=0.001)

        images = fluid.layers.data(
            name='pixel', shape=Config.DATA_SHAPE, dtype='float32')
        static_label_in = fluid.layers.data(
            name='label_in', shape=[1], dtype='int64', lod_level=0)
        static_label_out = fluid.layers.data(
            name='label_out', shape=[1], dtype='int64', lod_level=0)
        static_label_out._stop_gradient = True
        static_label_out.trainable = False

        static_prediction = ocr_attention(images, static_label_in)
        static_prediction = fluid.layers.reshape(
            static_prediction, shape=[-1, Config.num_classes + 2])

        cost = fluid.layers.cross_entropy(
            input=static_prediction, label=static_label_out)
        static_avg_loss = fluid.layers.reduce_sum(cost)
        optimizer.minimize(static_avg_loss)

        static_param_init_value = {}
        static_param_name_list = []
        static_grad_name_list = []
        for param in ocr_attention.parameters():
            static_param_name_list.append(param.name)
            if param.trainable:
                static_grad_name_list.append(
                    param.name + core.grad_var_suffix())

        out = exe.run(fluid.default_startup_program(),
                      fetch_list=static_param_name_list)

        for i in range(len(static_param_name_list)):
            static_param_init_value[static_param_name_list[i]] = out[i]

        fetch_list = [static_avg_loss.name]
        fetch_list.extend(static_param_name_list)
        fetch_list.extend(static_grad_name_list)

        for epoch in range(epoch_num):
            for batch_id in range(batch_num):
                static_label_in = label_in_np
                static_label_out = label_out_np
                static_label_out = static_label_out.reshape((-1, 1))
                out = exe.run(fluid.default_main_program(),
                              feed={
                                  "pixel": image_np,
                                  "label_in": static_label_in,
                                  "label_out": static_label_out
                              },
                              fetch_list=fetch_list)

                static_param_value = {}
                static_grad_value = {}
                static_out = out[0]
                for i in range(1, len(static_param_name_list) + 1):
                    static_param_value[static_param_name_list[i - 1]] = out[i]
                grad_start_pos = len(static_param_name_list) + 1
                for i in range(grad_start_pos,
                               len(static_grad_name_list) + grad_start_pos):
                    static_grad_value[static_grad_name_list[
                        i - grad_start_pos]] = out[i]

    self.assertTrue(np.array_equal(static_out, dy_out))
    for key, value in six.iteritems(static_param_init_value):
        self.assertTrue(np.array_equal(value, dy_param_init_value[key]))
    for key, value in six.iteritems(static_param_value):
        self.assertTrue(np.allclose(value, dy_param_value[key], atol=1e-20))
def test_se_resnext_float32(self):
    seed = 90

    batch_size = train_parameters["batch_size"]
    batch_num = 1
    epoch_num = 1
    with fluid.dygraph.guard():
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed

        se_resnext = SeResNeXt("se_resnext")
        optimizer = optimizer_setting(train_parameters)
        np.random.seed(seed)
        import random
        random.seed(seed)

        batch_py_reader = fluid.io.PyReader(capacity=1)
        batch_py_reader.decorate_sample_list_generator(
            paddle.batch(
                self.reader_decorator(
                    paddle.dataset.flowers.train(use_xmap=False)),
                batch_size=batch_size,
                drop_last=True),
            places=fluid.CPUPlace())

        dy_param_init_value = {}
        for param in se_resnext.parameters():
            dy_param_init_value[param.name] = param.numpy()
        for epoch_id in range(epoch_num):
            for batch_id, data in enumerate(batch_py_reader()):
                if batch_id >= batch_num and batch_num != -1:
                    break

                img = data[0]
                label = data[1]
                label.stop_gradient = True

                out = se_resnext(img)
                loss = fluid.layers.cross_entropy(input=out, label=label)
                avg_loss = fluid.layers.mean(x=loss)

                dy_out = avg_loss.numpy()

                if batch_id == 0:
                    for param in se_resnext.parameters():
                        if param.name not in dy_param_init_value:
                            dy_param_init_value[param.name] = param.numpy()
                avg_loss.backward()

                #dy_grad_value = {}
                #for param in se_resnext.parameters():
                #    if param.trainable:
                #        np_array = np.array(
                #            param._ivar._grad_ivar().value().get_tensor())
                #        dy_grad_value[param.name +
                #                      core.grad_var_suffix()] = np_array

                optimizer.minimize(avg_loss)
                se_resnext.clear_gradients()

                dy_param_value = {}
                for param in se_resnext.parameters():
                    dy_param_value[param.name] = param.numpy()

    with new_program_scope():
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed

        exe = fluid.Executor(fluid.CPUPlace()
                             if not core.is_compiled_with_cuda() else
                             fluid.CUDAPlace(0))

        se_resnext = SeResNeXt("se_resnext")
        optimizer = optimizer_setting(train_parameters)

        np.random.seed(seed)
        import random
        random.seed(seed)
        train_reader = paddle.batch(
            paddle.dataset.flowers.train(use_xmap=False),
            batch_size=batch_size,
            drop_last=True)

        img = fluid.layers.data(
            name='pixel', shape=[3, 224, 224], dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
        out = se_resnext(img)
        loss = fluid.layers.cross_entropy(input=out, label=label)
        avg_loss = fluid.layers.mean(x=loss)
        optimizer.minimize(avg_loss)

        # initialize params and fetch them
        static_param_init_value = {}
        static_param_name_list = []
        static_grad_name_list = []
        for param in se_resnext.parameters():
            static_param_name_list.append(param.name)
        for param in se_resnext.parameters():
            if param.trainable:
                static_grad_name_list.append(
                    param.name + core.grad_var_suffix())

        out = exe.run(fluid.default_startup_program(),
                      fetch_list=static_param_name_list)

        for i in range(len(static_param_name_list)):
            static_param_init_value[static_param_name_list[i]] = out[i]

        for epoch_id in range(epoch_num):
            for batch_id, data in enumerate(train_reader()):
                if batch_id >= batch_num and batch_num != -1:
                    break

                static_x_data = np.array(
                    [x[0].reshape(3, 224, 224)
                     for x in data]).astype('float32')
                y_data = np.array([x[1] for x in data]).astype(
                    'int64').reshape([batch_size, 1])

                fetch_list = [avg_loss.name]
                fetch_list.extend(static_param_name_list)
                fetch_list.extend(static_grad_name_list)
                out = exe.run(fluid.default_main_program(),
                              feed={"pixel": static_x_data,
                                    "label": y_data},
                              fetch_list=fetch_list)

                static_param_value = {}
                static_grad_value = {}
                static_out = out[0]
                param_start_pos = 1
                grad_start_pos = len(
                    static_param_name_list) + param_start_pos
                for i in range(
                        param_start_pos,
                        len(static_param_name_list) + param_start_pos):
                    static_param_value[static_param_name_list[
                        i - param_start_pos]] = out[i]
                for i in range(grad_start_pos,
                               len(static_grad_name_list) + grad_start_pos):
                    static_grad_value[static_grad_name_list[
                        i - grad_start_pos]] = out[i]

    self.assertTrue(np.allclose(static_out, dy_out))

    self.assertEqual(len(dy_param_init_value), len(static_param_init_value))
    for key, value in six.iteritems(static_param_init_value):
        self.assertTrue(np.allclose(value, dy_param_init_value[key]))
        self.assertTrue(np.isfinite(value).all())
        self.assertFalse(np.isnan(value).any())

    # FIXME(Yancey1989): np.array(_ivar.value().get_tensor())
    # leads to a memory leak
    #self.assertEqual(len(dy_grad_value), len(static_grad_value))
    #for key, value in six.iteritems(static_grad_value):
    #    self.assertTrue(np.allclose(value, dy_grad_value[key]))
    #    self.assertTrue(np.isfinite(value).all())
    #    self.assertFalse(np.isnan(value).any())

    self.assertEqual(len(dy_param_value), len(static_param_value))
    for key, value in six.iteritems(static_param_value):
        self.assertTrue(np.allclose(value, dy_param_value[key]))
        self.assertTrue(np.isfinite(value).all())
        self.assertFalse(np.isnan(value).any())
def get_param_grad_names(self):
    grad_names = []
    for var_name in self.inputs['Params']:
        grad_names.append(var_name + core.grad_var_suffix())
    return grad_names
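# Hypothetical example: with self.inputs['Params'] == ['fc_0.w_0', 'fc_0.b_0']
# and the "@GRAD" suffix convention, get_param_grad_names() returns
# ['fc_0.w_0@GRAD', 'fc_0.b_0@GRAD'].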
def test_resnet_float32(self):
    seed = 90

    batch_size = train_parameters["batch_size"]
    batch_num = 10

    traced_layer = None

    with fluid.dygraph.guard():
        paddle.manual_seed(seed)
        paddle.framework.random._manual_program_seed(seed)

        resnet = ResNet()
        optimizer = optimizer_setting(
            train_parameters, parameter_list=resnet.parameters())
        np.random.seed(seed)

        batch_py_reader = fluid.io.PyReader(capacity=1)
        batch_py_reader.decorate_sample_list_generator(
            paddle.batch(
                self.reader_decorator(
                    paddle.dataset.flowers.train(use_xmap=False)),
                batch_size=batch_size,
                drop_last=True),
            places=fluid.CPUPlace())

        dy_param_init_value = {}
        for param in resnet.parameters():
            dy_param_init_value[param.name] = param.numpy()

        helper = DyGraphProgramDescTracerTestHelper(self)
        program = None

        for batch_id, data in enumerate(batch_py_reader()):
            if batch_id >= batch_num:
                break

            img = data[0]
            label = data[1]
            label.stop_gradient = True

            out = None
            if batch_id % 5 == 0:
                out, traced_layer = TracedLayer.trace(resnet, img)
                if program is not None:
                    self.assertTrue(
                        is_equal_program(program, traced_layer.program))

                traced_layer.save_inference_model('./infer_imperative_resnet')

                program = traced_layer.program
            else:
                out = resnet(img)

            if traced_layer is not None:
                resnet.eval()
                traced_layer._switch(is_test=True)
                out_dygraph = resnet(img)
                out_static = traced_layer([img])
                traced_layer._switch(is_test=False)
                helper.assertEachVar(out_dygraph, out_static)
                resnet.train()

            loss = fluid.layers.cross_entropy(input=out, label=label)
            avg_loss = fluid.layers.mean(x=loss)

            dy_out = avg_loss.numpy()

            if batch_id == 0:
                for param in resnet.parameters():
                    if param.name not in dy_param_init_value:
                        dy_param_init_value[param.name] = param.numpy()

            avg_loss.backward()

            dy_grad_value = {}
            for param in resnet.parameters():
                if param.trainable:
                    np_array = np.array(
                        param._grad_ivar().value().get_tensor())
                    dy_grad_value[param.name +
                                  core.grad_var_suffix()] = np_array

            optimizer.minimize(avg_loss)
            resnet.clear_gradients()

            dy_param_value = {}
            for param in resnet.parameters():
                dy_param_value[param.name] = param.numpy()

    with new_program_scope():
        paddle.manual_seed(seed)
        paddle.framework.random._manual_program_seed(seed)

        exe = fluid.Executor(fluid.CPUPlace()
                             if not core.is_compiled_with_cuda() else
                             fluid.CUDAPlace(0))

        resnet = ResNet()
        optimizer = optimizer_setting(train_parameters)
        np.random.seed(seed)

        train_reader = paddle.batch(
            paddle.dataset.flowers.train(use_xmap=False),
            batch_size=batch_size)

        img = fluid.layers.data(
            name='pixel', shape=[3, 224, 224], dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
        out = resnet(img)
        loss = fluid.layers.cross_entropy(input=out, label=label)
        avg_loss = fluid.layers.mean(x=loss)
        optimizer.minimize(avg_loss)

        # initialize params and fetch them
        static_param_init_value = {}
        static_param_name_list = []
        static_grad_name_list = []
        for param in resnet.parameters():
            static_param_name_list.append(param.name)
        for param in resnet.parameters():
            if param.trainable:
                static_grad_name_list.append(
                    param.name + core.grad_var_suffix())

        out = exe.run(fluid.default_startup_program(),
                      fetch_list=static_param_name_list)

        for i in range(len(static_param_name_list)):
            static_param_init_value[static_param_name_list[i]] = out[i]

        for batch_id, data in enumerate(train_reader()):
            if batch_id >= batch_num:
                break

            static_x_data = np.array(
                [x[0].reshape(3, 224, 224) for x in data]).astype('float32')
            y_data = np.array(
                [x[1] for x in data]).astype('int64').reshape([batch_size, 1])

            if traced_layer is not None:
                traced_layer([static_x_data])

            fetch_list = [avg_loss.name]
            fetch_list.extend(static_param_name_list)
            fetch_list.extend(static_grad_name_list)
            out = exe.run(fluid.default_main_program(),
                          feed={"pixel": static_x_data,
                                "label": y_data},
                          fetch_list=fetch_list)

            static_param_value = {}
            static_grad_value = {}
            static_out = out[0]
            param_start_pos = 1
            grad_start_pos = len(static_param_name_list) + param_start_pos
            for i in range(param_start_pos,
                           len(static_param_name_list) + param_start_pos):
                static_param_value[static_param_name_list[
                    i - param_start_pos]] = out[i]
            for i in range(grad_start_pos,
                           len(static_grad_name_list) + grad_start_pos):
                static_grad_value[static_grad_name_list[
                    i - grad_start_pos]] = out[i]

    print("static", static_out)
    print("dygraph", dy_out)
    self.assertTrue(np.allclose(static_out, dy_out))

    self.assertEqual(len(dy_param_init_value), len(static_param_init_value))
    for key, value in six.iteritems(static_param_init_value):
        self.assertTrue(np.allclose(value, dy_param_init_value[key]))
        self.assertTrue(np.isfinite(value).all())
        self.assertFalse(np.isnan(value).any())

    self.assertEqual(len(dy_grad_value), len(static_grad_value))
    for key, value in six.iteritems(static_grad_value):
        self.assertTrue(np.allclose(value, dy_grad_value[key]))
        self.assertTrue(np.isfinite(value).all())
        self.assertFalse(np.isnan(value).any())

    self.assertEqual(len(dy_param_value), len(static_param_value))
    for key, value in six.iteritems(static_param_value):
        self.assertTrue(np.allclose(value, dy_param_value[key]))
        self.assertTrue(np.isfinite(value).all())
        self.assertFalse(np.isnan(value).any())
def run_dygraph():
    paddle.seed(seed)
    paddle.framework.random._manual_program_seed(seed)

    se_resnext = SeResNeXt()
    optimizer = optimizer_setting(
        train_parameters, parameter_list=se_resnext.parameters())
    np.random.seed(seed)

    batch_py_reader = fluid.io.PyReader(capacity=1)
    batch_py_reader.decorate_sample_list_generator(
        paddle.batch(
            self.reader_decorator(
                paddle.dataset.flowers.train(use_xmap=False)),
            batch_size=batch_size,
            drop_last=True),
        places=fluid.CPUPlace())

    dy_param_init_value = {}
    for param in se_resnext.parameters():
        dy_param_init_value[param.name] = param.numpy()
    for epoch_id in range(epoch_num):
        for batch_id, data in enumerate(batch_py_reader()):
            if batch_id >= batch_num and batch_num != -1:
                break

            img = data[0]
            label = data[1]
            label.stop_gradient = True

            out = se_resnext(img)
            softmax_out = fluid.layers.softmax(out, use_cudnn=False)
            loss = fluid.layers.cross_entropy(
                input=softmax_out, label=label)
            avg_loss = fluid.layers.mean(x=loss)

            dy_out = avg_loss.numpy()

            if batch_id == 0:
                for param in se_resnext.parameters():
                    if param.name not in dy_param_init_value:
                        dy_param_init_value[param.name] = param.numpy()
            avg_loss.backward()

            dy_grad_value = {}
            for param in se_resnext.parameters():
                if param.trainable:
                    np_array = np.array(
                        param._grad_ivar().value().get_tensor())
                    dy_grad_value[param.name +
                                  core.grad_var_suffix()] = np_array

            optimizer.minimize(avg_loss)
            se_resnext.clear_gradients()

            dy_param_value = {}
            for param in se_resnext.parameters():
                dy_param_value[param.name] = param.numpy()

    return dy_out, dy_param_init_value, dy_param_value, dy_grad_value
def train_loop():
    keys = ['loss', 'loss_cls', 'loss_bbox']
    train_stats = TrainingStats(cfg.log_window, keys)
    retinanet.train()
    for iter_id, data in enumerate(train_reader()):
        start_time = time.time()

        # pad ground-truth boxes/labels to the largest count in the batch
        gt_max_num = 0
        batch_size = len(data)
        for x in data:
            if x[1].shape[0] > gt_max_num:
                gt_max_num = x[1].shape[0]

        image_data = np.array([x[0] for x in data]).astype('float32')
        if cfg.enable_ce:
            print('image: {} {}'.format(
                abs(image_data).sum(), image_data.shape))

        gt_box_data = np.zeros([batch_size, gt_max_num, 4])
        gt_label_data = np.zeros([batch_size, gt_max_num])
        is_crowd_data = np.ones([batch_size, gt_max_num])
        for batch_id, x in enumerate(data):
            gt_num = x[1].shape[0]
            gt_box_data[batch_id, 0:gt_num, :] = x[1]
            gt_label_data[batch_id, 0:gt_num] = x[2]
            is_crowd_data[batch_id, 0:gt_num] = x[3]
        gt_box_data = gt_box_data.astype('float32')
        gt_label_data = gt_label_data.astype('int32')
        is_crowd_data = is_crowd_data.astype('int32')

        im_info_data = np.array([x[4] for x in data]).astype('float32')
        im_id_data = np.array([x[5] for x in data]).astype('int32')

        outputs = retinanet('train', image_data, im_info_data,
                            gt_box_data, gt_label_data, is_crowd_data)

        loss_cls = outputs['loss_cls']
        loss_bbox = outputs['loss_bbox']
        loss = outputs['loss']
        score_pred = outputs['score_pred']
        loc_pred = outputs['loc_pred']
        cls_pred_list = outputs['cls_score_list']
        bbox_pred_list = outputs['bbox_pred_list']
        cls_score = outputs['cls_score']
        bbox_pred = outputs['bbox_pred']

        loss_cls_data = loss_cls.numpy()
        loss_bbox_data = loss_bbox.numpy()
        loss_data = loss.numpy()

        if cfg.use_data_parallel:
            loss = retinanet.scale_loss(loss)
            loss.backward()
            retinanet.apply_collective_grads()
        else:
            loss.backward()
        optimizer.minimize(loss)

        if cfg.enable_ce:
            print('score_pred grad: {} {}'.format(
                abs(score_pred.gradient()).sum(),
                score_pred.gradient().shape))
            print('loc_pred grad: {} {}'.format(
                abs(loc_pred.gradient()).sum(), loc_pred.gradient().shape))
            for var in cls_pred_list:
                print('cls grad reshape: {} {}'.format(
                    abs(var.gradient()).sum(), var.gradient().shape))
            for var in bbox_pred_list:
                print('bbox grad reshape: {} {}'.format(
                    abs(var.gradient()).sum(), var.gradient().shape))
            for var in cls_score:
                print('cls grad original: {} {}'.format(
                    abs(var.gradient()).sum(), var.gradient().shape))
            for var in bbox_pred:
                print('bbox grad original: {} {}'.format(
                    abs(var.gradient()).sum(), var.gradient().shape))

            dy_grad_value = {}
            for param in retinanet.parameters():
                if param.name == 'retnet_cls_conv_n3_fpn3/Conv2D_0.retnet_cls_conv_n3_fpn3_w' or \
                   param.name == 'retnet_cls_conv_n2_fpn3/Conv2D_0.retnet_cls_conv_n2_fpn3_w' or \
                   param.name == 'retnet_cls_conv_n1_fpn3/Conv2D_0.retnet_cls_conv_n1_fpn3_w' or \
                   param.name == 'retnet_cls_conv_n0_fpn3/Conv2D_0.retnet_cls_conv_n0_fpn3_w' or \
                   param.name == 'retnet_cls_pred_fpn3/Conv2D_0.retnet_cls_pred_fpn3_w' or \
                   param.name == 'conv1/Conv2D_0.conv1_weights':
                    np_array = np.array(
                        param._ivar._grad_ivar().value().get_tensor())
                    dy_grad_value[param.name + core.grad_var_suffix()] = [
                        abs(np_array).sum(), np_array.shape
                    ]
                    np_array = np.array(param._ivar.value().get_tensor())
                    dy_grad_value[param.name] = [
                        abs(np_array).sum(), np_array.shape
                    ]
            for key, value in dy_grad_value.items():
                print('{key}: {value}'.format(key=key, value=value))

        retinanet.clear_gradients()

        outs = [loss_data, loss_cls_data, loss_bbox_data]
        stats = {k: v.mean() for k, v in zip(keys, outs)}
        train_stats.update(stats)
        logs = train_stats.log()
        lr = optimizer._global_learning_rate().numpy()
        end_time = time.time()
        strs = '{}, iter: {}, lr: {} {}, time: {:.3f}'.format(
            now_time(), iter_id, lr, logs, end_time - start_time)
        print(strs)
        sys.stdout.flush()
        if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
            save_model(retinanet.state_dict(),
                       "model_iter{}".format(iter_id), optimizer)
        if (iter_id + 1) == cfg.max_iter:
            break
label.stop_gradient = True

out = resnet(img)
loss = fluid.layers.cross_entropy(input=out, label=label)
avg_loss = fluid.layers.mean(x=loss)

dy_out = avg_loss.numpy()

if batch_id == 0:
    for param in resnet.parameters():
        if param.name not in dy_param_init_value:
            dy_param_init_value[param.name] = param.numpy()

avg_loss.backward()

dy_grad_value = {}
for param in resnet.parameters():
    if param.trainable:
        np_array = np.array(param._grad_ivar().value().get_tensor())
        dy_grad_value[param.name + core.grad_var_suffix()] = np_array

optimizer.minimize(avg_loss)
resnet.clear_gradients()

dy_param_value = {}
for param in resnet.parameters():
    dy_param_value[param.name] = param.numpy()

print("dygraph", dy_out)
def __impl__(self, *input):
    # 1. prepare inputs, outputs, attrs
    input_vars = []
    for i, value in enumerate(input):
        if not isinstance(value, (np.ndarray, core.VarBase)):
            raise TypeError(
                "The type of input in TranslatedLayer must be numpy array "
                "or Variable(VarBase), but received %s." % type(value))
        # NOTE: In order to unify the API, firstly convert the input to VarBase
        if isinstance(value, np.ndarray):
            var = core.VarBase(
                value=value,
                name=program_holder.input_names[i],
                persistable=False,
                place=framework._current_expected_place(),
                zero_copy=True)
        else:
            var = value
            # NOTE: we changed var name here,
            # but it may be an important name set by user
            var.name = program_holder.input_names[i]
        input_vars.append(var)

    persistable_vars = []
    for var_name in program_holder.persistable_names:
        dy_var_name = self._persistable_var_name_dict[var_name]
        if dy_var_name in self._parameters:
            persistable_vars.append(self._parameters[dy_var_name])
        elif dy_var_name in self._buffers:
            persistable_vars.append(self._buffers[dy_var_name])
        else:
            raise ValueError(
                "The persistable variable %s does not exist in current "
                "TranslatedLayer." % var_name)

    output_vars = []
    for var_desc in program_holder.output_decs:
        var = core.VarBase(var_desc.dtype(), var_desc.shape(),
                           var_desc.name(), var_desc.type(), False)
        output_vars.append(var)

    # hold forward variables
    tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [],
                                 "program_out_scope",
                                 core.VarDesc.VarType.STEP_SCOPES, True)
    tmp_scope_vec.value().set_scope(program_holder.scope)

    # 2. run program by op
    trace_program = program_holder.infer_program if self._is_test \
        else program_holder.train_program
    end_op_index = program_holder.infer_program.block(0).op_size()
    framework._dygraph_tracer().trace_op(
        type='run_program',
        inputs={'X': input_vars,
                'Params': persistable_vars},
        outputs={'Out': output_vars,
                 'OutScope': tmp_scope_vec},
        attrs={
            'global_block': trace_program.block(0),
            'start_op_index': 0,
            'end_op_index': end_op_index,
            'is_test': self._is_test
        })

    # NOTE: [ why need set param's gradient type here ]
    # if user sets sparse gradient mode, the param's gradient
    # will be SelectedRows, not LoDTensor. But the tracer will just
    # set the param grad VarBase from the forward VarBase (LoDTensor).
    # If we don't change the grad_var type here, RunProgramOp would
    # have to transform SelectedRows to LoDTensor forcibly, which may
    # not be the result the user wants.
    for persistable_var in persistable_vars:
        grad_var_name = persistable_var.name + core.grad_var_suffix()
        grad_var = trace_program.block(0).find_var(
            cpt.to_bytes(grad_var_name))
        # NOTE: a missing var desc may not be a problem,
        # such as in batch_norm
        if grad_var is None:
            continue
        persistable_var._set_grad_type(grad_var.type())

    # 3. prepare output, keep same form with inputs
    outs = output_vars
    if len(output_vars) == 1:
        outs = output_vars[0]
    return outs
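# A minimal usage sketch (hypothetical path and input shape): __impl__ backs
# the forward of a TranslatedLayer restored from a saved model, so a call
# site looks roughly like this; numpy inputs are wrapped into VarBase above.
#
#   translated_layer = paddle.jit.load("./saved_model")  # hypothetical path
#   x = np.random.random([1, 784]).astype('float32')
#   out = translated_layer(x)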