def build_model_optimizer(self, Optimizer="adam"): hcg = fleet.get_hybrid_communicate_group() word_size = hcg.get_model_parallel_world_size() sharding_id = hcg.get_sharding_parallel_rank() dp_id = hcg.get_data_parallel_rank() rank_id = dist.get_rank() np_fc1 = np.random.random_sample((hidden_size, inner_size)) np_fc2 = np.random.random_sample((inner_size, hidden_size)) model_a = SimpleDPNet(vocab_size, hidden_size, inner_size, output_size, np_fc1, np_fc2) optimizer_a = self.build_optimizer(model_a, strategy=self.strategy, is_sharding=True, Optimizer=Optimizer) model_a = fleet.distributed_model(model_a) optimizer_a = fleet.distributed_optimizer(optimizer_a) model_b = SimpleDPNet(vocab_size, hidden_size, inner_size, output_size, np_fc1, np_fc2) optimizer_b = self.build_optimizer(model_b, strategy=self.strategy, is_sharding=False, Optimizer=Optimizer) return model_a, optimizer_a, model_b, optimizer_b
def test_single_run_collective_minimize(self):
    paddle.enable_static()

    input_x = paddle.static.data(name="x", shape=[-1, 32], dtype='float32')
    input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')

    fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh')
    prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
    cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
    avg_cost = paddle.mean(x=cost)

    fleet.init(is_collective=True)
    optimizer = fluid.optimizer.SGD(learning_rate=0.001)
    optimizer = fleet.distributed_optimizer(optimizer)
    optimizer.minimize(avg_cost)

    place = fluid.CUDAPlace(
        0) if paddle.fluid.is_compiled_with_cuda() else fluid.CPUPlace()

    exe = fluid.Executor(place)
    exe.run(paddle.static.default_startup_program())
    for i in range(10):
        cost_val = exe.run(feed=self.gen_data(), fetch_list=[avg_cost.name])
        print("cost of step[{}] = {}".format(i, cost_val))
def dist_optimizer(args, optimizer):
    """
    Create a distributed optimizer based on a normal optimizer

    Args:
        args:
        optimizer: a normal optimizer

    Returns:
        optimizer: a distributed optimizer
    """
    build_strategy, exec_strategy = create_strategy()

    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.execution_strategy = exec_strategy
    dist_strategy.build_strategy = build_strategy

    dist_strategy.fuse_grad_size_in_MB = 16
    if args.use_amp:
        dist_strategy.amp = True
        custom_black_list = ['lookup_table',
                             'lookup_table_v2'] if args.use_pure_fp16 else None
        dist_strategy.amp_configs = {
            'custom_white_list': ['softmax', 'layer_norm', 'gelu'],
            'init_loss_scaling': args.scale_loss,
            'custom_black_list': custom_black_list,
            'use_pure_fp16': args.use_pure_fp16
        }
    if args.gradient_merge_steps > 1:
        dist_strategy.gradient_merge = True
        dist_strategy.gradient_merge_configs = {
            'k_steps': args.gradient_merge_steps
        }

    optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)
    return optimizer
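# A minimal, hypothetical usage sketch for dist_optimizer above. The `args`
# fields (use_amp, use_pure_fp16, scale_loss, gradient_merge_steps) are
# assumptions standing in for the script's real argument-parser output, and
# create_strategy() is the helper defined elsewhere in this file.
import paddle
import paddle.distributed.fleet as fleet
from types import SimpleNamespace


def example_dist_optimizer_usage():
    paddle.enable_static()
    fleet.init(is_collective=True)
    args = SimpleNamespace(use_amp=True,
                           use_pure_fp16=False,
                           scale_loss=128.0,
                           gradient_merge_steps=4)
    # Wrap a plain optimizer with the distributed strategy built by
    # dist_optimizer; minimize() would then be called on the result.
    optimizer = paddle.optimizer.Adam(learning_rate=1e-4)
    optimizer = dist_optimizer(args, optimizer)
    return optimizer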
def test_recompute_optimizer(self):
    import paddle.distributed.fleet as fleet
    import paddle.distributed.fleet.base.role_maker as role_maker
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    input_x = paddle.fluid.layers.data(name="x", shape=[32], dtype='float32')
    input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')

    fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
    fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
    prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
    cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y)
    avg_cost = paddle.fluid.layers.mean(x=cost)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.recompute = True
    strategy.recompute_configs = {"checkpoints": ["fc_1.tmp_0"]}

    optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)
def test_communicator_sync(self):
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_PSERVER_NUMS"] = "2"
    os.environ["PADDLE_TRAINERS_NUM"] = "2"
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    os.environ["PADDLE_TRAINERS_NUM"] = "2"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
        "127.0.0.1:36001,127.0.0.2:36001"

    fleet.init(role_maker.PaddleCloudRoleMaker())

    avg_cost = self.net()
    optimizer = fluid.optimizer.SGD(0.01)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = False

    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)

    fleet.init_worker()
    time.sleep(10)
    fleet.stop_worker()
def test_trainer_desc_config(self):
    os.environ["TRAINING_ROLE"] = "TRAINER"
    import paddle.distributed.fleet as fleet
    fleet.init(role_maker.PaddleCloudRoleMaker())

    x = paddle.fluid.layers.data(name='x', shape=[1], dtype='float32')
    y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
    cost = paddle.fluid.layers.square_error_cost(input=x, label=y)
    avg_cost = paddle.fluid.layers.mean(cost)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    config = {
        "dump_fields_path": "dump_data",
        "dump_fields": ["xxx", "yyy"],
        "dump_param": []
    }
    strategy.trainer_desc_configs = config

    optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)

    program = paddle.static.default_main_program()
    self.assertEqual(program._fleet_opt["dump_fields_path"], "dump_data")
    self.assertEqual(len(program._fleet_opt["dump_fields"]), 2)
    self.assertEqual(len(program._fleet_opt["dump_param"]), 0)
    self.assertEqual(program._fleet_opt["mpi_size"],
                     int(os.environ["PADDLE_TRAINERS_NUM"]))
def init_distributed_infer_env(self,
                               exe,
                               loss,
                               role_maker=None,
                               dirname=None):
    import paddle.distributed.fleet as fleet

    if fleet.fleet._runtime_handle is None:
        fleet.init(role_maker=role_maker)

        fake_optimizer = paddle.optimizer.SGD()
        strategy = fleet.DistributedStrategy()
        strategy.a_sync = True
        optimizer = fleet.distributed_optimizer(fake_optimizer,
                                                strategy=strategy)
        optimizer.minimize(loss,
                           startup_program=self.origin_startup_program)

        if fleet.is_server():
            fleet.init_server(dirname=dirname)
            fleet.run_server()
        else:
            exe.run(paddle.static.default_startup_program())
            fleet.init_worker()
            self._init_dense_params(exe, dirname)

        global_startup_program = paddle.static.default_startup_program()
        global_startup_program = self.origin_startup_program
        global_main_program = paddle.static.default_main_program()
        global_main_program = self.origin_main_program
def node_func():
    import paddle.distributed.fleet as fleet
    fleet.init(is_collective=True)

    input_x = paddle.fluid.layers.data(name="x", shape=[32], dtype='float32')
    input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')

    fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
    fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
    prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
    cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y)
    avg_cost = paddle.fluid.layers.mean(x=cost)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)

    exe = paddle.fluid.Executor(place=paddle.fluid.CPUPlace())
    exe.run(paddle.fluid.default_startup_program())
def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
    # Input data
    device_id = 0
    if dist_strategy:
        fleet.init(is_collective=True)
    with fluid.device_guard("gpu:0"):
        images = fluid.layers.data(
            name='pixel', shape=[1, 28, 28], dtype=DTYPE)
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')

        if dist_strategy:
            data_loader = fluid.io.DataLoader.from_generator(
                feed_list=[images, label],
                capacity=64,
                use_double_buffer=False,
                iterable=False)
        # Train program
        predict = cnn_model(images)
    with fluid.device_guard("gpu:0"):
        cost = fluid.layers.cross_entropy(input=predict, label=label)
        avg_cost = fluid.layers.mean(x=cost)

    # Evaluator
    with fluid.device_guard("gpu:0"):
        batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
        batch_acc = fluid.layers.accuracy(
            input=predict, label=label, total=batch_size_tensor)

    inference_program = fluid.default_main_program().clone()

    base_lr = self.lr
    passes = [30, 60, 80, 90]
    steps_per_pass = 10
    bd = [steps_per_pass * p for p in passes]
    lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
    lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
    opt = fluid.optimizer.Momentum(learning_rate=lr_val, momentum=0.9)

    # Reader
    train_reader = paddle.batch(
        paddle.dataset.mnist.test(), batch_size=batch_size)
    test_reader = paddle.batch(
        paddle.dataset.mnist.test(), batch_size=batch_size)

    if dist_strategy:
        strategy = fleet.DistributedStrategy()
        strategy.pipeline = True
        strategy.pipeline_configs = {
            'schedule_mode': 'F-then-B',
            'micro_batch_size': batch_size
        }
        dist_opt = fleet.distributed_optimizer(
            optimizer=opt, strategy=strategy)
        dist_opt.minimize(avg_cost)
    else:
        opt.minimize(avg_cost)

    if dist_strategy:
        return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict, data_loader
    else:
        return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
def test_with_asp_sharding(self):
    fleet.init(is_collective=True)
    train_prog, startup_prog = fluid.Program(), fluid.Program()
    avg_cost, strategy, input_x, input_y = self.net(train_prog, startup_prog)

    with fluid.program_guard(train_prog, startup_prog):
        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
        optimizer.minimize(avg_cost)

    if paddle.fluid.is_compiled_with_cuda():
        place = fluid.CUDAPlace(
            int(os.environ.get('FLAGS_selected_gpus', 0)))
    else:
        place = fluid.CPUPlace()

    exe = fluid.Executor(place)
    feeder = fluid.DataFeeder(feed_list=[input_x, input_y], place=place)
    exe.run(startup_prog)

    sparsity.prune_model(train_prog)

    data = (np.random.randn(64, 32), np.random.randint(2, size=(64, 1)))
    exe.run(train_prog, feed=feeder.feed([data]))

    for param in train_prog.global_block().all_parameters():
        if ASPHelper._is_supported_layer(train_prog, param.name):
            mat = np.array(fluid.global_scope().find_var(
                param.name).get_tensor())
            self.assertTrue(
                paddle.fluid.contrib.sparsity.check_sparsity(mat.T, n=2, m=4))
def test_pp_model(self):
    hcg = fleet.get_hybrid_communicate_group()
    world_size = hcg.get_model_parallel_world_size()
    dp_id = hcg.get_data_parallel_rank()
    pp_id = hcg.get_stage_id()
    rank_id = dist.get_rank()
    topology = hcg.topology()
    set_random_seed(1024, dp_id, rank_id)

    model = ModelPipe(topology)
    scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[2],
                                                   values=[0.001, 0.002],
                                                   verbose=True)
    optimizer = paddle.optimizer.SGD(learning_rate=scheduler,
                                     parameters=model.parameters())

    model = fleet.distributed_model(model)
    optimizer = fleet.distributed_optimizer(optimizer)

    for step_id in range(5):
        x_data = np.random.randint(0, vocab_size, size=[batch_size, length])
        x = paddle.to_tensor(x_data)
        x.stop_gradient = True

        e_loss = model.eval_batch([x, x], True)
        loss = model.train_batch([x, x], optimizer, scheduler)

        # TODO(shenliang03) add utest for loss
        if pp_id != 0:
            np.testing.assert_allclose(loss.numpy(), e_loss.numpy())
def test_pipeline_amp_optimizer(self):
    """test pipeline & amp with device:all"""
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.amp = True
    strategy.pipeline = True
    strategy.pipeline_configs = {
        'micro_batch_size': 1,
        'accumulate_steps': 2
    }

    train_prog, startup_prog = static.Program(), static.Program()
    with static.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            avg_cost = self.net()

            optimizer = paddle.fluid.optimizer.Adam(0.01)
            optimizer = fleet.distributed_optimizer(optimizer,
                                                    strategy=strategy)
            optimizer.minimize(avg_cost)

    ops = train_prog._pipeline_opt['section_program'].global_block().ops
    ops = [op.type for op in ops]
    self.assertEqual(ops.count('send_v2'), 1)
    self.assertEqual(ops.count('recv_v2'), 1)
def test_a_sync_optimizer2(self):
    os.environ["TRAINING_ROLE"] = "TRAINER"
    import paddle.distributed.fleet as fleet

    main_program = paddle.fluid.Program()
    startup_program = paddle.fluid.Program()

    paddle.fluid.framework.switch_main_program(main_program)
    paddle.fluid.framework.switch_startup_program(startup_program)

    fleet.init(role_maker.PaddleCloudRoleMaker())
    input_x = paddle.fluid.layers.data(name="x", shape=[32], dtype='float32')
    input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')

    fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
    fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
    prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
    cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y)
    avg_cost = paddle.fluid.layers.mean(x=cost)

    os.environ["FLAGS_LAUNCH_BARRIER"] = "0"
    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.auto = True
    optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)

    self.assertTrue(fleet._final_strategy().a_sync)
    a_sync_configs = fleet._final_strategy().a_sync_configs
    self.assertTrue(a_sync_configs['k_steps'] == 800)
def test_with_asp(self):
    fleet.init(is_collective=True)

    self.optimizer = paddle.incubate.asp.decorate(self.optimizer)
    paddle.incubate.asp.prune_model(self.layer)

    self.optimizer = fleet.distributed_optimizer(self.optimizer)
    self.layer = fleet.distributed_model(self.layer)

    imgs = paddle.to_tensor(np.random.randn(64, 32),
                            dtype='float32',
                            place=self.place,
                            stop_gradient=False)
    labels = paddle.to_tensor(np.random.randint(10, size=(64, 1)),
                              dtype='float32',
                              place=self.place,
                              stop_gradient=False)

    loss_fn = paddle.nn.MSELoss(reduction='mean')

    output = self.layer(imgs)
    loss = loss_fn(output, labels)
    loss.backward()
    self.optimizer.step()
    self.optimizer.clear_grad()

    for param in self.layer.parameters():
        if ASPHelper._is_supported_layer(
                paddle.static.default_main_program(), param.name):
            mat = param.numpy()
            self.assertTrue(
                paddle.fluid.contrib.sparsity.check_sparsity(mat.T, n=2, m=4))
def test_pipeline_optimizer(self):
    import paddle.distributed.fleet as fleet
    import paddle.distributed.fleet.base.role_maker as role_maker
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    with paddle.fluid.device_guard("gpu:0"):
        input_x = paddle.fluid.layers.data(name="x",
                                           shape=[32],
                                           dtype='float32')
        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')

    with paddle.fluid.device_guard("gpu:1"):
        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
        prediction = paddle.fluid.layers.fc(input=[fc_2],
                                            size=2,
                                            act='softmax')
        cost = paddle.fluid.layers.cross_entropy(input=prediction,
                                                 label=input_y)
        avg_cost = paddle.fluid.layers.mean(x=cost)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.pipeline = True
    strategy.pipeline_configs = {'micro_batch': 2}

    optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)
def test_gradient_merge_optimizer(self):
    fleet.init(role_maker.PaddleCloudRoleMaker())

    input_x = paddle.fluid.layers.data(name="x", shape=[32], dtype='float32')
    input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')

    fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
    fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
    prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
    cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y)
    avg_cost = paddle.fluid.layers.mean(x=cost)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = False
    optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)

    prog = paddle.fluid.default_main_program()
    self.assertEqual(prog.global_block().ops[-1].type, "send_barrier")

    sends = 0
    sgds = 0
    for op in prog.global_block().ops:
        if op.type == "send":
            sends += 1
        if op.type == "sgd":
            sgds += 1
    self.assertEqual(sends, 6)
    self.assertEqual(sgds, 0)

    fleet.init_worker()
    time.sleep(8)
    fleet.stop_worker()
def test_fleet_get_applied_optimizer(self):
    input_x = paddle.fluid.layers.data(name="x", shape=[32], dtype='float32')
    input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')

    fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
    fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
    prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
    cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y)
    avg_cost = paddle.fluid.layers.mean(x=cost)

    fleet.init(is_collective=True)

    meta_list = fleet._get_applied_meta_list()
    graph_list = fleet._get_applied_graph_list()
    # minimize() has not been called yet, so both lists are empty
    self.assertEqual(len(meta_list), 0)
    self.assertEqual(len(graph_list), 0)

    strategy = fleet.DistributedStrategy()
    optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.001)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)

    meta_list = fleet._get_applied_meta_list()
    graph_list = fleet._get_applied_graph_list()
    self.assertEqual(len(meta_list), 0)
    self.assertEqual(len(graph_list), 1)
def get_dist_prog_with_parallelizer(train_program, startup_program,
                                    dist_context):
    global _global_process_mesh

    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.amp = False
    dist_strategy.pipeline = False
    dist_strategy.recompute = False

    # init parallel optimizer
    dist_strategy.semi_auto = True
    fleet.init(is_collective=True, strategy=dist_strategy)

    loss, train_program, startup_program = mlp_forward(train_program,
                                                       startup_program)

    optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001,
                                                     beta1=0.9,
                                                     beta2=0.999,
                                                     epsilon=1e-08,
                                                     grad_clip=None)
    optimizer = fleet.distributed_optimizer(optimizer)
    _, _, distributed_startup_program, distributed_main_program = optimizer.minimize(
        loss, startup_program)

    return distributed_main_program, distributed_startup_program
def optimizer(self,
              loss,
              strategy,
              train_prog,
              startup_prog,
              name='momentum',
              regularization=None,
              grad_clip=None):
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            if name == 'momentum':
                optimizer = paddle.fluid.optimizer.Momentum(
                    learning_rate=0.01,
                    momentum=0.9,
                    regularization=regularization,
                    grad_clip=grad_clip)
            elif name == 'adam':
                optimizer = paddle.fluid.optimizer.Adam(
                    learning_rate=0.01,
                    regularization=regularization,
                    grad_clip=grad_clip)
            elif name == 'adamw':
                optimizer = paddle.optimizer.AdamW(learning_rate=0.01,
                                                   weight_decay=0.01,
                                                   grad_clip=grad_clip)
            optimizer = fleet.distributed_optimizer(optimizer,
                                                    strategy=strategy)
            optimizer.minimize(loss)
def test_a_sync_optimizer_pserver(self):
    os.environ["TRAINING_ROLE"] = "PSERVER"
    import paddle.distributed.fleet as fleet

    main_program = paddle.fluid.Program()
    startup_program = paddle.fluid.Program()

    paddle.fluid.framework.switch_main_program(main_program)
    paddle.fluid.framework.switch_startup_program(startup_program)

    fleet.init(role_maker.PaddleCloudRoleMaker())

    x = paddle.fluid.layers.data(name='x', shape=[1], dtype='float32')
    y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
    cost = paddle.fluid.layers.square_error_cost(input=x, label=y)
    avg_cost = paddle.fluid.layers.mean(cost)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = True
    strategy.a_sync_configs = {"launch_barrier": False}

    optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)

    fleet.init_server()
def build_optimizer(self, avg_cost, strategy):
    use_grad_clip = int(os.getenv('GRAD_CLIP', 0))
    grad_clip = None
    if use_grad_clip:
        # 1: clip_by_value; 2: clip_by_norm; 3: clip_by_global_norm
        if use_grad_clip == 1:
            grad_clip = paddle.nn.ClipGradByValue(min=-5.0, max=5.0)
        elif use_grad_clip == 2:
            grad_clip = paddle.nn.ClipGradByNorm(2.0)
        elif use_grad_clip == 3:
            grad_clip = paddle.nn.ClipGradByGlobalNorm(2.0)

    use_decay = int(os.getenv("USE_DECAY", "0"))
    if use_decay:
        scheduler = paddle.optimizer.lr.ExponentialDecay(
            learning_rate=LEARNING_RATE, gamma=0.999, verbose=True)
        optimizer = fluid.optimizer.SGD(scheduler, grad_clip=grad_clip)
        """
        # learning rate decay method before 2.0
        optimizer = fluid.optimizer.SGD(
            learning_rate=fluid.layers.exponential_decay(
                learning_rate=LEARNING_RATE,
                decay_steps=500,
                decay_rate=0.969,
                staircase=True))
        """
    else:
        optimizer = fluid.optimizer.SGD(LEARNING_RATE, grad_clip=grad_clip)

    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)
def test_a_sync_optimizer_pserver(self):
    os.environ["TRAINING_ROLE"] = "PSERVER"
    import paddle.distributed.fleet as fleet

    main_program = paddle.fluid.Program()
    startup_program = paddle.fluid.Program()

    paddle.fluid.framework.switch_main_program(main_program)
    paddle.fluid.framework.switch_startup_program(startup_program)

    fleet.init(role_maker.PaddleCloudRoleMaker())

    input_x = paddle.fluid.layers.data(name="x", shape=[32], dtype='float32')
    input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')

    fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
    fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
    prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
    cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y)
    avg_cost = paddle.fluid.layers.mean(x=cost)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = True
    strategy.a_sync_configs = {"k_steps": 100, "launch_barrier": False}

    optimizer = paddle.optimizer.SGD(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)

    prog = paddle.fluid.default_main_program()
    self.assertEqual(prog.global_block().ops[0].type, "listen_and_serv")
def test_gradient_merge_optimizer(self):
    fleet.init(role_maker.PaddleCloudRoleMaker())

    x = paddle.fluid.layers.data(name='x', shape=[1], dtype='float32')
    y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
    cost = paddle.fluid.layers.square_error_cost(input=x, label=y)
    avg_cost = paddle.fluid.layers.mean(cost)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = False
    strategy.a_sync_configs = {"launch_barrier": False}

    optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)

    prog = paddle.fluid.default_main_program()
    self.assertEqual(prog.global_block().ops[-1].type, "send_barrier")

    sends = 0
    sgds = 0
    for op in prog.global_block().ops:
        if op.type == "send":
            sends += 1
        if op.type == "sgd":
            sgds += 1
    self.assertEqual(sends, 0)
    self.assertEqual(sgds, 0)

    fleet.init_worker()
    time.sleep(8)
    fleet.stop_worker()
def dist_optimizer(args, optimizer):
    """
    Create a distributed optimizer based on a normal optimizer

    Args:
        args:
        optimizer: a normal optimizer

    Returns:
        optimizer: a distributed optimizer
    """
    build_strategy, exec_strategy = create_strategy()

    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.execution_strategy = exec_strategy
    dist_strategy.build_strategy = build_strategy

    dist_strategy.fuse_grad_size_in_MB = 16
    if args.use_amp:
        dist_strategy.amp = True
        dist_strategy.amp_configs = {
            'custom_white_list': ['softmax', 'layer_norm', 'gelu'],
            'init_loss_scaling': args.scale_loss,
        }

    optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)
    return optimizer
def test_single_run_ps_minimize(self):
    paddle.enable_static()

    input_x = paddle.static.data(name="x", shape=[-1, 32], dtype='float32')
    input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')

    fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh')
    prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
    cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
    avg_cost = paddle.mean(x=cost)

    fleet.init()
    strategy = paddle.distributed.fleet.DistributedStrategy()
    optimizer = fluid.optimizer.SGD(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)

    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()
    elif fleet.is_worker():
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(paddle.static.default_startup_program())
        step = 10
        for i in range(step):
            cost_val = exe.run(program=fluid.default_main_program(),
                               feed=self.gen_data(),
                               fetch_list=[avg_cost.name])
            print("worker_index: %d, step%d cost = %f" %
                  (fleet.worker_index(), i, cost_val[0]))
def get_model(self, place, batch_size=32, image_shape=[224, 224, 3]):
    image = paddle.static.data(
        shape=[batch_size] + image_shape, dtype='float32', name='image')

    model = BatchNormActNet()
    pred_out = model(image)
    loss = paddle.mean(pred_out)

    optimizer = paddle.optimizer.Adam(learning_rate=1e-3)

    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.fuse_all_reduce_ops = False
    dist_strategy.without_graph_optimization = True
    dist_strategy.amp = True
    dist_strategy.amp_configs = {
        "init_loss_scaling": 32768,
        "use_dynamic_loss_scaling": True,
    }
    fleet.init(is_collective=True, strategy=dist_strategy)
    optimizer = fleet.distributed_optimizer(optimizer)
    optimizer.minimize(loss)

    rank = paddle.distributed.get_rank()

    def reader():
        seed = int(os.environ.get("SEED", 0))
        np.random.seed(seed + rank)
        for _ in range(10):
            image_np = np.random.random(size=image.shape).astype('float32')
            yield image_np,

    main_program = paddle.static.default_main_program()
    startup_program = paddle.static.default_startup_program()
    return main_program, startup_program, [image], [loss], reader
def test(self):
    os.environ["PADDLE_PSERVER_NUMS"] = "2"
    os.environ["PADDLE_TRAINERS_NUM"] = "2"
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    os.environ["PADDLE_TRAINERS_NUM"] = "2"
    os.environ[
        "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001,127.0.0.2:36001"
    os.environ["TRAINING_ROLE"] = "PSERVER"

    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)
    loss, acc, _ = self.net()

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = True

    configs = {}
    configs['__emb__'] = {
        "table_parameters.__emb__.accessor.embed_sgd_param.name":
        "SparseNaiveSGDRule",
        "table_parameters.__emb__.accessor.embedx_sgd_param.name":
        "SparseAdamSGDRule",
    }
    strategy.sparse_table_configs = configs

    optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(loss)

    fleet.init_server()
def train():
    from auto_parallel_relaunch_model import mlp_pretrain_forward
    from auto_parallel_relaunch_model import batch_generator_creator

    dist_strategy = fleet.DistributedStrategy()
    # init parallel optimizer
    dist_strategy.auto_search = True
    fleet.init(is_collective=True, strategy=dist_strategy)

    train_program = static.Program()
    start_program = static.Program()
    loss, train_program, start_program, loader = mlp_pretrain_forward(
        train_program, start_program)

    optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001,
                                                     beta1=0.9,
                                                     beta2=0.999,
                                                     epsilon=1e-08,
                                                     grad_clip=None)
    optimizer = fleet.distributed_optimizer(optimizer)
    _, _, distributed_startup_program, distributed_main_program = optimizer.minimize(
        loss, start_program)

    places = static.cuda_places()
    loader.set_batch_generator(batch_generator_creator(), places=places)
    exe = paddle.static.Executor(places[0])
    exe.run(distributed_startup_program)

    for data in loader():
        exe.run(distributed_main_program, feed=data)
def create_optimizer(self, strategy=None):
    optimizer = paddle.optimizer.Adam(learning_rate=self.learning_rate,
                                      lazy_mode=True)
    if strategy is not None:
        import paddle.distributed.fleet as fleet
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(self._cost)
def train():
    global _global_process_mesh
    _global_process_mesh = auto.ProcessMesh(mesh=[0, 1])

    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.amp = False
    dist_strategy.pipeline = False
    dist_strategy.recompute = False

    # init parallel optimizer
    dist_strategy.semi_auto = True
    fleet.init(is_collective=True, strategy=dist_strategy)

    train_program = static.Program()
    start_program = static.Program()
    loss, train_program, start_program, loader = mlp_pretrain_forward(
        train_program, start_program)

    optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001,
                                                     beta1=0.9,
                                                     beta2=0.999,
                                                     epsilon=1e-08,
                                                     grad_clip=None)
    optimizer = fleet.distributed_optimizer(optimizer)
    _, _, distributed_startup_program, distributed_main_program = optimizer.minimize(
        loss, start_program)

    places = static.cuda_places()
    loader.set_batch_generator(batch_generator_creator(), places=places)
    exe = paddle.static.Executor(places[0])
    exe.run(distributed_startup_program)

    for data in loader():
        exe.run(distributed_main_program, feed=data, fetch_list=[loss])