def get_model(self, place, batch_size=32, image_shape=[224, 224, 3]):
    image = paddle.static.data(
        shape=[batch_size] + image_shape, dtype='float32', name='image')
    model = BatchNormActNet()
    pred_out = model(image)
    loss = paddle.mean(pred_out)
    optimizer = paddle.optimizer.Adam(learning_rate=1e-3)

    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.fuse_all_reduce_ops = False
    dist_strategy.without_graph_optimization = True
    dist_strategy.amp = True
    dist_strategy.amp_configs = {
        "init_loss_scaling": 32768,
        "use_dynamic_loss_scaling": True,
    }
    fleet.init(is_collective=True, strategy=dist_strategy)
    optimizer = fleet.distributed_optimizer(optimizer)
    optimizer.minimize(loss)

    rank = paddle.distributed.get_rank()

    def reader():
        seed = int(os.environ.get("SEED", 0))
        np.random.seed(seed + rank)
        for _ in range(10):
            image_np = np.random.random(size=image.shape).astype('float32')
            yield image_np,

    main_program = paddle.static.default_main_program()
    startup_program = paddle.static.default_startup_program()
    return main_program, startup_program, [image], [loss], reader
def init_distributed_infer_env(self,
                               exe,
                               loss,
                               role_maker=None,
                               dirname=None):
    import paddle.distributed.fleet as fleet
    if fleet.fleet._runtime_handle is None:
        fleet.init(role_maker=role_maker)

        fake_optimizer = paddle.optimizer.SGD()
        strategy = fleet.DistributedStrategy()
        strategy.a_sync = True
        optimizer = fleet.distributed_optimizer(
            fake_optimizer, strategy=strategy)
        optimizer.minimize(
            loss, startup_program=self.origin_startup_program)

        if fleet.is_server():
            fleet.init_server(dirname=dirname)
            fleet.run_server()
        else:
            exe.run(paddle.static.default_startup_program())
            fleet.init_worker()
            self._init_dense_params(exe, dirname)

        global_startup_program = paddle.static.default_startup_program()
        global_startup_program = self.origin_startup_program
        global_main_program = paddle.static.default_main_program()
        global_main_program = self.origin_main_program
def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
    # Input data
    device_id = 0
    if dist_strategy:
        fleet.init(is_collective=True)
    with fluid.device_guard("gpu:0"):
        images = fluid.layers.data(
            name='pixel', shape=[1, 28, 28], dtype=DTYPE)
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
        if dist_strategy:
            data_loader = fluid.io.DataLoader.from_generator(
                feed_list=[images, label],
                capacity=64,
                use_double_buffer=False,
                iterable=False)
        # Train program
        predict = cnn_model(images)
    with fluid.device_guard("gpu:0"):
        cost = fluid.layers.cross_entropy(input=predict, label=label)
        avg_cost = fluid.layers.mean(x=cost)

    # Evaluator
    with fluid.device_guard("gpu:0"):
        batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
        batch_acc = fluid.layers.accuracy(
            input=predict, label=label, total=batch_size_tensor)

    inference_program = fluid.default_main_program().clone()

    base_lr = self.lr
    passes = [30, 60, 80, 90]
    steps_per_pass = 10
    bd = [steps_per_pass * p for p in passes]
    lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
    lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
    opt = fluid.optimizer.Momentum(learning_rate=lr_val, momentum=0.9)

    # Reader
    train_reader = paddle.batch(
        paddle.dataset.mnist.test(), batch_size=batch_size)
    test_reader = paddle.batch(
        paddle.dataset.mnist.test(), batch_size=batch_size)

    if dist_strategy:
        strategy = fleet.DistributedStrategy()
        strategy.pipeline = True
        strategy.pipeline_configs = {
            'schedule_mode': 'F-then-B',
            'micro_batch_size': batch_size
        }
        dist_opt = fleet.distributed_optimizer(
            optimizer=opt, strategy=strategy)
        dist_opt.minimize(avg_cost)
    else:
        opt.minimize(avg_cost)

    if dist_strategy:
        return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict, data_loader
    else:
        return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
def test_fleet_get_applied_optimizer(self):
    input_x = paddle.fluid.layers.data(name="x", shape=[32], dtype='float32')
    input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')

    fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
    fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
    prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
    cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y)
    avg_cost = paddle.fluid.layers.mean(x=cost)

    fleet.init(is_collective=True)

    meta_list = fleet._get_applied_meta_list()
    graph_list = fleet._get_applied_graph_list()
    # minimize has not been called yet, so no passes are applied
    self.assertEqual(len(meta_list), 0)
    self.assertEqual(len(graph_list), 0)

    strategy = fleet.DistributedStrategy()
    optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.001)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)

    meta_list = fleet._get_applied_meta_list()
    graph_list = fleet._get_applied_graph_list()
    self.assertEqual(len(meta_list), 0)
    self.assertEqual(len(graph_list), 1)
def _init_distributed_strategy(self):
    """Initialize distributed strategy."""
    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.use_experimental_executor = True
    exec_strategy.num_threads = 4
    exec_strategy.num_iteration_per_drop_scope = 1

    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.execution_strategy = exec_strategy
    dist_strategy.nccl_comm_num = 1
    dist_strategy.fuse_all_reduce_ops = True
    if self.use_recompute:
        dist_strategy.recompute = True
    if self.use_amp:
        dist_strategy.amp = True
        dist_strategy.amp_configs = {
            "custom_white_list": ["softmax", "layer_norm", "gelu"],
            "init_loss_scaling": self.amp_loss_scaling
        }
    if self.use_sharding:
        dist_strategy.sharding = True
        dist_strategy.sharding_configs = {
            "segment_broadcast_MB": 32,
            "dp_degree": self.dp_degree,
            "sharding_degree": self.sharding_degree,
            "mp_degree": self.mp_degree,
            "pp_degree": self.pp_degree
        }

    self.dist_strategy = dist_strategy
    self._init_build_strategy()
    print(self.dist_strategy)
    return
def __init__(self,
             model=None,
             inputs_spec=None,
             labels_spec=None,
             cluster=None,
             strategy=None):
    self.model = model
    self.inputs_spec = self._validate_spec(inputs_spec)
    self.labels_spec = self._validate_spec(labels_spec)
    self.cluster = cluster
    # if self.cluster is None:
    #     self.cluster = get_default_cluster()
    self.strategy = strategy
    if self.strategy is None:
        self.strategy = fleet.DistributedStrategy()

    self._executor = None
    self._cur_rank = paddle.distributed.get_rank()
    self._nranks = paddle.distributed.get_world_size()
    self._saver = DistributedSaver()
    self._logger = get_logger(logging.INFO)

    self._default_strategy = None
    self._orig_main_prog = static.default_main_program()
    self._orig_startup_prog = static.default_startup_program()
    self._orig_dist_context = get_default_distributed_context()
    self._dist_contexts = {}
    self._serial_main_progs = {}
    self._serial_startup_progs = {}
    self._dist_main_progs = defaultdict(dict)  # dist main programs
    self._dist_startup_progs = defaultdict(dict)  # dist startup programs
    self._feed_vars = {}
    self._fetch_vars = {}
def get_dist_prog(train_program, startup_program, dist_context, rank_id):
    loss, train_program, startup_program = mlp_forward(train_program,
                                                       startup_program)

    fleet._user_defined_strategy = fleet.DistributedStrategy()
    fleet.user_defined_optimizer = paddle.fluid.optimizer.AdamOptimizer()
    parallelizer = AutoParallelizer(fleet)
    parallelizer._dist_context = dist_context

    # serial forward & backward completion
    complete_train_program = auto.complete_annotation(train_program,
                                                      dist_context)
    params_grads = parallelizer._generate_backward(complete_train_program,
                                                   startup_program,
                                                   loss,
                                                   parameter_list=None,
                                                   no_grad_set=None,
                                                   callbacks=None)

    # logical partition
    partitioner = Partitioner(dist_context, rank_id)
    auto_parallel_main_prog, auto_parallel_startup_prog, dist_params_grads = partitioner.partition(
        complete_train_program, startup_program, params_grads)

    partitioned_optimize_ops = parallelizer._apply_optimize(
        auto_parallel_main_prog, auto_parallel_startup_prog, dist_params_grads)

    return auto_parallel_main_prog, auto_parallel_startup_prog
def train():
    global _global_process_mesh
    _global_process_mesh = auto.ProcessMesh(mesh=[0, 1])

    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.amp = False
    dist_strategy.pipeline = False
    dist_strategy.recompute = False
    # init parallel optimizer
    dist_strategy.semi_auto = True
    fleet.init(is_collective=True, strategy=dist_strategy)

    train_program = static.Program()
    start_program = static.Program()
    loss, train_program, start_program, loader = mlp_pretrain_forward(
        train_program, start_program)

    optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001,
                                                     beta1=0.9,
                                                     beta2=0.999,
                                                     epsilon=1e-08,
                                                     grad_clip=None)
    optimizer = fleet.distributed_optimizer(optimizer)
    _, _, distributed_startup_program, distributed_main_program = optimizer.minimize(
        loss, start_program)

    places = static.cuda_places()
    loader.set_batch_generator(batch_generator_creator(), places=places)
    exe = paddle.static.Executor(places[0])
    exe.run(distributed_startup_program)

    for data in loader():
        exe.run(distributed_main_program, feed=data, fetch_list=[loss])
def get_dist_prog(train_program, startup_program, dist_context, rank_id):
    loss, train_program, startup_program = mlp_forward(train_program,
                                                       startup_program)

    fleet._user_defined_strategy = fleet.DistributedStrategy()
    fleet.user_defined_optimizer = paddle.fluid.optimizer.AdamOptimizer()
    parallelizer = AutoParallelizer(fleet)
    parallelizer._dist_context = dist_context

    # auto completion
    completer = Completer(dist_context)
    complete_train_program = completer.complete_forward_annotation(
        train_program)
    dist_context.block_state.parse_forward_blocks(complete_train_program)

    params_grads = parallelizer._generate_backward(complete_train_program,
                                                   startup_program,
                                                   loss,
                                                   parameter_list=None,
                                                   no_grad_set=None,
                                                   callbacks=None)

    partitioner = Partitioner(dist_context, rank_id)
    dist_train_program, dist_startup_prog, dist_params_grads = partitioner.partition(
        complete_train_program, startup_program, params_grads)

    partitioned_optimize_ops = parallelizer._apply_optimize(
        dist_train_program, dist_startup_prog, dist_params_grads)

    resharder = Resharder(dist_train_program, dist_startup_prog, rank_id,
                          dist_context, dist_params_grads)
    resharder.reshard()

    return dist_train_program, dist_startup_prog
def dist_optimizer(args, optimizer):
    """
    Create a distributed optimizer based on a normal optimizer.

    Args:
        args: the argument namespace for the run.
        optimizer: a normal optimizer.

    Returns:
        optimizer: a distributed optimizer.
    """
    build_strategy, exec_strategy = create_strategy()

    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.execution_strategy = exec_strategy
    dist_strategy.build_strategy = build_strategy

    dist_strategy.fuse_grad_size_in_MB = 16
    if args.use_amp:
        dist_strategy.amp = True
        dist_strategy.amp_configs = {
            'custom_white_list': ['softmax', 'layer_norm', 'gelu'],
            'init_loss_scaling': args.scale_loss,
        }

    optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)
    return optimizer
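# A hedged usage sketch for dist_optimizer above, not part of the original
# script: the `args` namespace fields (use_amp, scale_loss) mirror what the
# function reads, the values are illustrative, and `create_strategy` is
# assumed to be defined alongside dist_optimizer, as its body requires.
def example_dist_optimizer_usage():
    import argparse

    import paddle
    import paddle.distributed.fleet as fleet

    paddle.enable_static()
    fleet.init(is_collective=True)

    args = argparse.Namespace(use_amp=True, scale_loss=32768.0)
    base_optimizer = paddle.optimizer.Adam(learning_rate=1e-4)
    # Wrap the plain optimizer with the AMP-enabled strategy configured above.
    return dist_optimizer(args, base_optimizer)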
def test_single_gpu(self):
    paddle.enable_static()
    fleet.init(is_collective=True)
    sharding_program = paddle.static.Program()
    sharding_startup_program = paddle.static.Program()
    strategy = fleet.DistributedStrategy()
    strategy.without_graph_optimization = True
    with fluid.program_guard(sharding_program, sharding_startup_program):
        with fluid.unique_name.guard():
            input_x = paddle.static.data(name="x",
                                         shape=[None, 32],
                                         dtype='float32')
            input_y = paddle.static.data(name="y",
                                         shape=[None, 1],
                                         dtype='int64')
            cost = self.mlp(input_x=input_x, input_y=input_y)
            output_name = cost.name
            optimizer = fleet.distributed_optimizer(fluid.optimizer.Adam(),
                                                    strategy)
            optimizer.minimize(cost)

    trainer_id = fleet.worker_index()
    exe = paddle.static.Executor(paddle.CUDAPlace(trainer_id))
    rank = fleet.worker_index()
    exe.run(sharding_startup_program)
    exe.run(program=sharding_program, feed=self.gen_data())
def test_util_base(self):
    import paddle.distributed.fleet as fleet
    util = fleet.UtilBase()
    strategy = fleet.DistributedStrategy()
    util._set_strategy(strategy)
    role_maker = None  # should be fleet.PaddleCloudRoleMaker()
    util._set_role_maker(role_maker)
def get_dist_prog_with_parallelizer(train_program, startup_program,
                                    dist_context):
    global _global_process_mesh

    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.amp = False
    dist_strategy.pipeline = False
    dist_strategy.recompute = False
    # init parallel optimizer
    dist_strategy.semi_auto = True
    fleet.init(is_collective=True, strategy=dist_strategy)

    loss, train_program, startup_program = mlp_forward(train_program,
                                                       startup_program)

    optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001,
                                                     beta1=0.9,
                                                     beta2=0.999,
                                                     epsilon=1e-08,
                                                     grad_clip=None)
    optimizer = fleet.distributed_optimizer(optimizer)

    _, _, distributed_startup_program, distributed_main_program = optimizer.minimize(
        loss, startup_program)

    return distributed_main_program, distributed_startup_program
def dist_optimizer(args, optimizer):
    """
    Create a distributed optimizer based on a normal optimizer.
    """
    build_strategy, exec_strategy = create_strategy(args)

    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.execution_strategy = exec_strategy
    dist_strategy.build_strategy = build_strategy

    dist_strategy.fuse_grad_size_in_MB = 16
    if args.use_amp:
        dist_strategy.amp = True
        custom_black_list = ['lookup_table',
                             'lookup_table_v2'] if args.use_pure_fp16 else None
        dist_strategy.amp_configs = {
            'custom_white_list': ['softmax', 'layer_norm', 'gelu'],
            'init_loss_scaling': args.scale_loss,
            'custom_black_list': custom_black_list,
            'use_pure_fp16': args.use_pure_fp16
        }
    if args.gradient_merge_steps > 1:
        dist_strategy.gradient_merge = True
        dist_strategy.gradient_merge_configs = {
            'k_steps': args.gradient_merge_steps
        }

    optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)
    return optimizer
def apply_passes(self):
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.semi_auto = True
    dist_strategy.sharding = True
    dist_strategy.sharding_configs = {
        "sharding_degree": 2,
        "stage": 2,
    }
    fleet.init(is_collective=True, strategy=dist_strategy)
def setUp(self):
    strategy = fleet.DistributedStrategy()
    self.model_parallel_size = 2
    strategy.hybrid_configs = {
        "dp_degree": 1,
        "mp_degree": self.model_parallel_size,
        "pp_degree": 1
    }
    fleet.init(is_collective=True, strategy=strategy)
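# Hedged follow-up to the setUp above, not in the original test: with
# mp_degree=2 on two ranks, the hybrid communicate group should report a
# model-parallel world size of 2. The accessors are the standard fleet
# hybrid-parallel API; the assertions themselves are illustrative.
def check_model_parallel_group():
    hcg = fleet.get_hybrid_communicate_group()
    assert hcg.get_model_parallel_world_size() == 2
    assert hcg.get_data_parallel_world_size() == 1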
def dist_optimizer(args, topo):
    default_global_batch_size = topo.data_info.size * args.micro_batch_size
    if args.global_batch_size is None:
        args.global_batch_size = default_global_batch_size

    bsz_per_dp = args.global_batch_size // topo.data_info.size
    micro_batch_size = args.micro_batch_size
    assert args.global_batch_size % micro_batch_size == 0, \
        "cannot do gradient accumulate, global_batch_size: {} micro_batch_size: {}".format(
            args.global_batch_size, micro_batch_size)
    acc_steps = bsz_per_dp // micro_batch_size

    exec_strategy = paddle.fluid.ExecutionStrategy()
    exec_strategy.num_threads = 2
    exec_strategy.num_iteration_per_drop_scope = 1

    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.execution_strategy = exec_strategy
    dist_strategy.nccl_comm_num = 3

    dist_strategy.recompute = args.use_recompute
    dist_strategy.pipeline = args.pp_degree > 1

    if args.use_amp:
        dist_strategy.amp = True
        dist_strategy.amp_configs = {
            "custom_white_list": [
                'softmax', 'layer_norm', 'gelu',
                "fused_softmax_mask_upper_triangle", "elementwise_add"
            ],
            "custom_black_list":
            ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"],
            "init_loss_scaling": 32768,
            "use_dynamic_loss_scaling": True,
            "use_pure_fp16": args.amp_level == "O2",
            "use_fp16_guard": False
        }
    if args.use_sharding:
        dist_strategy.sharding = True
        dist_strategy.sharding_configs = {
            "segment_broadcast_MB": 32,
            "sharding_degree": args.sharding_degree,
            "mp_degree": args.mp_degree,
            "pp_degree": args.pp_degree,
            "dp_degree": args.dp_degree,
            "optimize_offload": False,
        }
    if args.pp_degree > 1:
        dist_strategy.pipeline_configs = {
            "schedule_mode": "1F1B",
            "micro_batch_size": micro_batch_size,
            "accumulate_steps": acc_steps,
        }
    else:
        assert acc_steps == 1, \
            "Only support accumulate steps in pipeline mode. Please set your global_batch_size={}".format(
                default_global_batch_size)

    return dist_strategy
def setUp(self):
    strategy = fleet.DistributedStrategy()
    self.pipeline_parallel_size = 2
    strategy.hybrid_configs = {
        "dp_degree": 1,
        "mp_degree": 1,
        "pp_degree": self.pipeline_parallel_size
    }
    fleet.init(is_collective=True, strategy=strategy)
    self.hcg = fleet.get_hybrid_communicate_group()
def get_model(self, place, gradient_merge, batch_size, max_step):
    paddle.seed(2021)
    random.seed(2021)
    np.random.seed(2021)

    hidden_size = 128

    global _global_parallel_strategy
    global _global_process_mesh

    world_size = paddle.distributed.get_world_size()
    if world_size == 1:
        _global_parallel_strategy = "dp"
        _global_process_mesh = auto.ProcessMesh([0])
    elif world_size == 2:
        _global_parallel_strategy = "dp"
        _global_process_mesh = auto.ProcessMesh([0, 1])

    train_program = static.Program()
    startup_program = static.Program()
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.semi_auto = True
    # if gradient_merge:
    #     dist_strategy.gradient_merge = True
    #     dist_strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}
    fleet.init(is_collective=True, strategy=dist_strategy)

    with static.program_guard(train_program, startup_program), \
            utils.unique_name.guard():
        input = static.data(name="input",
                            shape=[batch_size, hidden_size],
                            dtype='float32')
        label = static.data(name="label",
                            shape=[batch_size, 1],
                            dtype='float32')
        input.stop_gradient = False
        loss = mlp_forward(input, label, hidden_size)

    optimizer = paddle.fluid.optimizer.SGDOptimizer(learning_rate=0.01)
    # optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer)
    _, self._params_grads, dist_startup_prog, dist_main_prog = optimizer.minimize(
        loss, startup_program)

    input_data = np.random.random(size=(128, hidden_size)).astype('float32')
    label_data = np.random.random(size=(128, 1)).astype('float32')

    def reader():
        for i in range(max_step):
            x_data = input_data[i * batch_size:(i + 1) * batch_size, :]
            y_data = label_data[i * batch_size:(i + 1) * batch_size, :]
            yield x_data, y_data

    return dist_main_prog, dist_startup_prog, [input, label], [loss], reader
def test_util_factory(self):
    import paddle.distributed.fleet as fleet
    factory = fleet.base.util_factory.UtilFactory()
    strategy = fleet.DistributedStrategy()
    role_maker = None  # should be fleet.PaddleCloudRoleMaker()
    optimize_ops = []
    params_grads = []
    context = {}
    context["role_maker"] = role_maker
    context["valid_strategy"] = strategy
    util = factory._create_util(context)
    self.assertEqual(util.role_maker, None)
def apply_passes(self, main_prog, startup_prog):
    # self._config["params_grads"] = self._params_grads
    # pass_context = PassContext()
    # auto_parallel_gradient_merge_pass = new_pass(
    #     "auto_parallel_gradient_merge_pass", self._config)
    # auto_parallel_gradient_merge_pass.apply([main_prog], [startup_prog],
    #                                         pass_context)
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.gradient_merge = True
    dist_strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}
    dist_strategy.semi_auto = True
    fleet.init(is_collective=True, strategy=dist_strategy)
def train():
    dist_strategy = fleet.DistributedStrategy()
    # init parallel optimizer
    dist_strategy.auto_search = True
    fleet.init(is_collective=True, strategy=dist_strategy)

    train_program = static.Program()
    start_program = static.Program()
    place = paddle.set_device("gpu")
    gpus = [0, 1]
    batch_size = 8
    sequence_len = 512
    vocab_size = 1000
    train_program, start_program, loss, gen_data = get_gpt_model(
        train_program, start_program, place, batch_size, sequence_len,
        vocab_size)

    optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001,
                                                     beta1=0.9,
                                                     beta2=0.999,
                                                     epsilon=1e-08,
                                                     grad_clip=None)
    optimizer = fleet.distributed_optimizer(optimizer)
    _, _, distributed_startup_program, distributed_main_program = optimizer.minimize(
        loss, start_program)

    places = static.cuda_places()
    exe = paddle.static.Executor(places[0])
    exe.run(distributed_startup_program)

    for step in range(10):
        tokens, position_ids, attention_mask, labels, loss_mask = gen_data()
        if loss.name in distributed_main_program.global_block().vars:
            loss_print, = exe.run(distributed_main_program,
                                  feed={
                                      "tokens": tokens,
                                      "position_ids": position_ids,
                                      "attention_mask": attention_mask,
                                      "labels": labels,
                                      "loss_mask": loss_mask
                                  },
                                  fetch_list=[loss])
            print("step: %s, loss: %f" % (step, loss_print[0]))
        else:
            exe.run(distributed_main_program,
                    feed={
                        "tokens": tokens,
                        "position_ids": position_ids,
                        "attention_mask": attention_mask,
                        "labels": labels,
                        "loss_mask": loss_mask
                    })
            print("step: %s, loss: %s" % (step, "None"))
def _set_strategy(self, args):
    """Configure the distributed_strategy for the run; build_strategy is configured in do_training."""
    self.dist_strategy = fleet.DistributedStrategy()
    if args.run_params["mode"] == "sync":
        self.dist_strategy.a_sync = False
    elif args.run_params["mode"] == "async":
        self.dist_strategy.a_sync = True
    elif args.run_params["mode"] == "geo_async":
        self.dist_strategy.a_sync = True
        self.dist_strategy.a_sync_configs = {"k_steps": 2}
    elif args.run_params["mode"] == "auto":
        self.dist_strategy.auto = True
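# Illustrative driver for _set_strategy above, not part of the original
# class: the shape of `args` (an object carrying a run_params dict) is
# inferred from how the method indexes it, and `runner` stands in for an
# instance of the enclosing class.
def example_set_strategy(runner):
    import argparse

    args = argparse.Namespace(run_params={"mode": "geo_async"})
    runner._set_strategy(args)
    # geo_async should enable async mode with a geo step window of 2
    assert runner.dist_strategy.a_sync
    assert runner.dist_strategy.a_sync_configs["k_steps"] == 2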
def boundary_net(self, main_prog, startup_prog):
    with fluid.program_guard(main_prog, startup_prog):
        fleet.init(is_collective=True)
        x = paddle.static.data(name='x', shape=[-1, 4], dtype='float32')
        with paddle.static.device_guard('gpu:0'):
            linear = fluid.Linear(4, 8, bias_attr=False)
            out = linear(x)
        with paddle.static.device_guard('gpu:1'):
            linear = fluid.Linear(8, 5, bias_attr=False)
            out = linear(out)
            avg_cost = paddle.mean(out)
        strategy = fleet.DistributedStrategy()
    return avg_cost, strategy
def test_dygraph_fleet_api(self):
    import paddle.distributed.fleet as fleet
    import paddle.distributed as dist
    strategy = fleet.DistributedStrategy()
    strategy.amp = True
    strategy.recompute = True
    fleet.init(is_collective=True, strategy=strategy)
    net = paddle.nn.Sequential(paddle.nn.Linear(10, 1),
                               paddle.nn.Linear(1, 2))
    net = dist.fleet.distributed_model(net)
    data = np.random.uniform(-1, 1, [30, 10]).astype('float32')
    data = paddle.to_tensor(data)
    net(data)
def test_allgather(self):
    train_program = paddle.static.Program()
    startup_program = paddle.static.Program()
    process_mesh = auto.ProcessMesh(mesh=[0, 3])
    with static.program_guard(train_program, startup_program):
        x = paddle.static.data(name="x", shape=[4, 4], dtype='float32')
        x = auto.shard_tensor(x,
                              dist_attr={
                                  "process_mesh": process_mesh,
                                  "dims_mapping": [0, -1]
                              })

        w = paddle.static.data(name="w", shape=[4, 4], dtype='float32')
        w = auto.shard_tensor(w,
                              dist_attr={
                                  "process_mesh": process_mesh,
                                  "dims_mapping": [-1, -1]
                              })

        # y = paddle.distributed.shard_op(paddle.matmul, process_mesh, {
        #     x.name: [-1, -1],
        #     w.name: [-1, -1]
        # }, **{"x": x,
        #       "y": w})[0]

        y = paddle.distributed.shard_op(paddle.matmul,
                                        dist_attr={
                                            "process_mesh": process_mesh,
                                            x: {
                                                "dims_mapping": [-1, -1]
                                            },
                                            w: {
                                                "dims_mapping": [-1, -1]
                                            }
                                        })(x, w)[0]

    rank_id = 0
    dist_context = DistributedContext()
    dist_strategy = fleet.DistributedStrategy()
    partitioner = Partitioner(dist_context, rank_id)
    completer = Completer(dist_context)
    complete_train_program = completer.complete_forward_annotation(
        train_program)
    dist_context.block_state.parse_forward_blocks(complete_train_program)
    partitioned_main_prog, partitioned_startup_prog, partitioned_params_grads = partitioner.partition(
        complete_train_program, startup_program, [])

    resharder = Resharder(partitioned_main_prog, partitioned_startup_prog,
                          rank_id, dist_context, partitioned_params_grads)
    resharder.reshard()
    # x should not be sliced after resharding
    self.assertTrue(check_allgather(partitioned_main_prog))
def main(args):
    paddle.set_device("cpu")
    paddle.enable_static()

    fleet.init()

    fake_num_nodes = 1
    py_reader, loss = StaticSkipGramModel(
        fake_num_nodes,
        args.neg_num,
        args.embed_size,
        sparse_embedding=True,
        shared_embedding=args.shared_embedding)

    optimizer = F.optimizer.Adam(args.learning_rate, lazy_mode=True)
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.a_sync = True
    optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
    optimizer.minimize(loss)

    # init and run server or worker
    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()

    if fleet.is_worker():
        place = paddle.CPUPlace()
        exe = paddle.static.Executor(place)
        exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        graph = build_graph(args)
        # bind gen
        train_ds = ShardedDataset(graph.nodes, args.epoch)
        collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                                   args.neg_num, args.neg_sample_type)
        data_loader = Dataloader(train_ds,
                                 batch_size=args.cpu_batch_size,
                                 shuffle=True,
                                 num_workers=args.sample_workers,
                                 collate_fn=collate_fn)
        py_reader.set_batch_generator(lambda: data_loader)

        train_loss = train(exe, paddle.static.default_main_program(),
                           py_reader, loss)
        fleet.stop_worker()

        if fleet.is_first_worker():
            fleet.save_persistables(exe, "./model",
                                    paddle.static.default_main_program())
def get_distributed_program():
    train_program = static.Program()
    startup_program = static.Program()
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.semi_auto = True
    fleet.init(is_collective=True, strategy=dist_strategy)

    loss, train_program, startup_program = mlp_forward(train_program,
                                                       startup_program)

    optimizer = paddle.fluid.optimizer.SGDOptimizer(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer)
    _, _, dist_startup_prog, dist_main_prog = optimizer.minimize(
        loss, startup_program)

    return dist_main_prog, dist_startup_prog, loss
def main(args):
    paddle.set_device("cpu")
    paddle.enable_static()
    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)

    if args.num_nodes is None:
        num_nodes = load(args.dataset).num_nodes
    else:
        num_nodes = args.num_nodes

    loss = StaticSkipGramModel(num_nodes,
                               args.neg_num,
                               args.embed_size,
                               sparse=True)

    optimizer = F.optimizer.Adam(args.learning_rate, lazy_mode=True)
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.a_sync = True
    optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
    optimizer.minimize(loss)

    # init and run server or worker
    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()

    if fleet.is_worker():
        place = paddle.CPUPlace()
        exe = paddle.static.Executor(place)
        exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        graph = load(args.dataset)

        # bind gen
        train_ds = ShardedDataset(graph.nodes)
        collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                                   args.neg_num, args.neg_sample_type)
        data_loader = Dataloader(train_ds,
                                 batch_size=args.batch_size,
                                 shuffle=True,
                                 num_workers=args.sample_workers,
                                 collate_fn=collate_fn)

        for epoch in range(args.epoch):
            train_loss = train(exe, paddle.static.default_main_program(),
                               data_loader, loss)
            log.info("Running epoch:%s\t train_loss:%.6f", epoch, train_loss)
        fleet.stop_worker()
def main(args):
    paddle.enable_static()
    paddle.set_device('gpu:%d' % paddle.distributed.ParallelEnv().dev_id)
    fleet.init(is_collective=True)

    graph = load(args.dataset)

    loss = StaticSkipGramModel(graph.num_nodes,
                               args.neg_num,
                               args.embed_size,
                               num_emb_part=args.num_emb_part,
                               shared_embedding=args.shared_embedding)

    optimizer = F.optimizer.Adam(args.learning_rate)
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.sharding = True
    dist_strategy.sharding_configs = {
        "segment_anchors": None,
        "sharding_segment_strategy": "segment_broadcast_MB",
        "segment_broadcast_MB": 32,
        "sharding_degree": int(paddle.distributed.get_world_size()),
    }
    optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
    optimizer.minimize(loss)

    place = paddle.CUDAPlace(paddle.distributed.ParallelEnv().dev_id)
    exe = paddle.static.Executor(place)
    exe.run(paddle.static.default_startup_program())

    # bind gen
    train_ds = ShardedDataset(graph.nodes)
    collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                               args.neg_num, args.neg_sample_type)
    data_loader = Dataloader(train_ds,
                             batch_size=args.batch_size,
                             shuffle=True,
                             num_workers=args.sample_workers,
                             collate_fn=collate_fn)

    for epoch in range(args.epoch):
        train_loss = train(exe, paddle.static.default_main_program(),
                           data_loader, loss)
        log.info("Running epoch:%s\t train_loss:%.6f", epoch, train_loss)

    fleet.stop_worker()
    if fleet.is_first_worker():
        fleet.save_persistables(exe, "./model",
                                paddle.static.default_main_program())