def _build_trainer_programs(self):
    """Build the GPU-PS trainer main/startup programs by applying the pass stack in order."""
    # Alias the shared pass attributes/context; every pass mutates state
    # through these, so application order is significant.
    attrs, ctx = self.attrs, self.pass_ctx

    new_pass("add_lr_decay_table_pass", attrs).apply([], [], ctx)
    new_pass("distributed_ops_pass", attrs).apply([self.cloned_main], [None],
                                                  ctx)
    new_pass("fake_init_ops_pass", attrs).apply([None], [self.cloned_startup],
                                                ctx)
    new_pass("ps_gpu_pass", attrs).apply([self.cloned_main], [None], ctx)
    new_pass("ps_transpile_pass", attrs).apply([self.cloned_main],
                                               [self.cloned_startup], ctx)

    # Publish the transformed programs as the new origin programs.
    attrs['origin_main_program'] = self.cloned_main
    attrs['origin_startup_program'] = self.cloned_startup

    if self.launch_barrier and self.launch_barrier_flag:
        # Block until every pserver endpoint is reachable.
        wait_server_ready(self.server_endpoints)
    return
def apply_passes(self, main_prog, startup_prog):
    """Run the CINN-build and elementwise-add/act fusion passes over the programs."""
    passes = [new_pass("build_cinn"), new_pass("fuse_elewise_add_act")]
    manager = PassManager(passes)
    manager.apply([main_prog], [startup_prog])
    print(manager.names)
def _apply_pre_optimization(self, main_program, startup_program, loss,
                            optimizer, params_grads):
    """Apply strategy-driven pre-optimization passes (AMP/FP16, recompute).

    Args:
        main_program: serial main program to be transformed in place.
        startup_program: matching startup program.
        loss: loss variable the passes anchor on.
        optimizer: base optimizer, forwarded to the pure-fp16 pass.
        params_grads: list of (param, grad) pairs produced by backward.

    No-op when no distributed strategy is configured.
    """
    if self._strategy is None:
        return

    # apply amp pass
    if self._strategy.amp:
        config = copy.deepcopy(self._strategy.amp_configs)
        config["dist_context"] = self._dist_context
        config["params_grads"] = params_grads
        config["loss"] = loss
        # Feed vars: model inputs followed by labels.
        config["input_data"] = self._dist_context.serial_feed_vars["inputs"] \
            + self._dist_context.serial_feed_vars["labels"]
        if config["use_pure_fp16"]:
            config["base_opt"] = optimizer
            auto_parallel_fp16_pass = new_pass("auto_parallel_fp16", config)
            auto_parallel_fp16_pass.apply([main_program], [startup_program],
                                          self._pass_context)
        else:
            auto_parallel_amp_pass = new_pass("auto_parallel_amp", config)
            auto_parallel_amp_pass.apply([main_program], [startup_program],
                                         self._pass_context)

    # apply recompute pass
    if self._strategy.recompute:
        config = copy.deepcopy(self._strategy.recompute_configs)
        config["dist_context"] = self._dist_context
        config["no_grad_set"] = None
        config["loss"] = loss
        auto_parallel_recompute_pass = new_pass("auto_parallel_recompute",
                                                config)
        # FIX: pass the pass context to apply(), not the dist context —
        # every other pass application here uses self._pass_context.
        auto_parallel_recompute_pass.apply([main_program], [startup_program],
                                           self._pass_context)
def _apply_pre_optimization_passes(self, main_program, startup_program,
                                   loss, params_grads, no_grad_set):
    """Apply AMP/FP16 and recompute passes ahead of the optimizer rewrite."""
    strategy = self._dist_strategy

    # apply amp pass
    if strategy.amp:
        amp_cfg = copy.deepcopy(strategy.amp_configs)
        amp_cfg["dist_context"] = self._dist_context
        amp_cfg["params_grads"] = params_grads
        amp_cfg["loss"] = loss
        if amp_cfg["use_pure_fp16"]:
            # Pure fp16 additionally needs the base optimizer.
            amp_cfg["base_opt"] = self._optimizer
            pass_name = "auto_parallel_fp16"
        else:
            pass_name = "auto_parallel_amp"
        new_pass(pass_name, amp_cfg).apply([main_program], [startup_program],
                                           self._pass_context)

    # apply recompute pass
    if strategy.recompute:
        rc_cfg = copy.deepcopy(strategy.recompute_configs)
        rc_cfg["dist_context"] = self._dist_context
        rc_cfg["no_grad_set"] = copy.deepcopy(no_grad_set)
        rc_cfg["loss"] = loss
        new_pass("auto_parallel_recompute", rc_cfg).apply(
            [main_program], [startup_program], self._pass_context)
def apply_passes(self, main_prog, startup_prog):
    """Run elementwise-add/act fusion followed by all-reduce fusion."""
    fuse_passes = [
        new_pass("fuse_elewise_add_act"),
        new_pass("fuse_all_reduce", {"max_memory_size": 1024 * 1024}),
    ]
    manager = PassManager(fuse_passes)
    manager.apply([main_prog], [startup_prog])
    print(manager.names)
def apply_passes(self, main_prog, startup_prog):
    """Apply CINN build + fusion passes and check a cinn_launch op was inserted."""
    manager = PassManager([
        new_pass("build_cinn"),
        new_pass("fuse_elewise_add_act"),
    ])
    manager.apply([main_prog], [startup_prog])
    # The build_cinn pass should have replaced a subgraph with cinn_launch.
    self.assertTrue(
        any(op.type == 'cinn_launch'
            for op in main_prog.global_block().ops))
def apply_passes(self, main_prog, startup_prog):
    """Apply the gradient-merge pass with this test's config in a fresh context."""
    self._config["params_grads"] = self._params_grads
    ctx = PassContext()
    grad_merge_pass = new_pass("auto_parallel_gradient_merge_pass",
                               self._config)
    grad_merge_pass.apply([main_prog], [startup_prog], ctx)
def _build_trainer_programs(self):
    """Append send ops to the trainer main program and publish it via attrs."""
    send_pass = new_pass("append_send_ops_pass", self.attrs)
    send_pass.apply([self.cloned_main], [None], self.pass_ctx)

    self.attrs['origin_main_program'] = self.cloned_main

    if self.launch_barrier and self.launch_barrier_flag:
        # Wait for every pserver endpoint to come up before training starts.
        wait_server_ready(self.server_endpoints)
def apply_passes(self, main_prog, startup_prog):
    """Run the BN+act fusion pass and verify the fused fwd/bwd ops appear."""
    manager = PassManager([new_pass("fuse_bn_act")])
    manager.apply([main_prog], [startup_prog])
    print(manager.names)
    op_types = {op.type for op in main_prog.global_block().ops}
    self.assertTrue("fused_batch_norm_act" in op_types)
    self.assertTrue("fused_batch_norm_act_grad" in op_types)
def apply_passes(self, main_prog, startup_prog):
    """Run relu+depthwise-conv fusion; every depthwise_conv2d must carry the fuse attr."""
    manager = PassManager([new_pass("fuse_relu_depthwise_conv")])
    manager.apply([main_prog], [startup_prog])
    print(manager.names)
    seen_types = []
    for op in main_prog.global_block().ops:
        if op.type == "depthwise_conv2d":
            self.assertTrue(op.desc.attr("fuse_relu_before_depthwise_conv"))
        seen_types.append(op.type)
    self.assertTrue("depthwise_conv2d" in seen_types)
def apply_passes(self, main_prog, startup_prog):
    """Apply the inplace-addto pass; at least one conv2d_grad should use addto."""
    manager = PassManager(
        [new_pass("inplace_addto_op", {"use_cuda": True})])
    manager.apply([main_prog], [startup_prog])
    print(manager.names)
    addto_flags = [
        op.desc.attr("use_addto")
        for op in main_prog.global_block().ops if op.type == "conv2d_grad"
    ]
    self.assertTrue(True in addto_flags)
def _apply_post_optimization_passes(self, main_program, startup_program,
                                    rank, params_grads):
    """Apply sharding and gradient-merge passes after the optimizer rewrite."""
    if self._dist_strategy.sharding:
        sharding_cfg = copy.deepcopy(self._dist_strategy.sharding_configs)
        sharding_cfg["dist_context"] = self._dist_context
        sharding_cfg["params_grads"] = params_grads
        sharding_cfg["global_rank"] = rank
        new_pass("auto_parallel_sharding", sharding_cfg).apply(
            [main_program], [startup_program], self._pass_context)

    if self._dist_strategy.gradient_merge:
        gm_cfg = copy.deepcopy(self._dist_strategy.gradient_merge_configs)
        gm_cfg["dist_context"] = self._dist_context
        gm_cfg["params_grads"] = params_grads
        new_pass("auto_parallel_gradient_merge_pass", gm_cfg).apply(
            [main_program], [startup_program], self._pass_context)
def _build_trainer_programs(self):
    """Build the CPU trainer main/startup programs by applying the PS pass stack.

    Mutates self.cloned_main / self.cloned_startup in place and records them
    back into self.attrs as the new origin programs.
    """
    # Dead commented-out debug prints removed; each pass below communicates
    # through self.attrs / self.pass_ctx, so order is significant.
    add_lr_decay_table_pass = new_pass("add_lr_decay_table_pass", self.attrs)
    add_lr_decay_table_pass.apply([], [], self.pass_ctx)

    distributed_ops_pass = new_pass("distributed_ops_pass", self.attrs)
    distributed_ops_pass.apply([self.cloned_main], [None], self.pass_ctx)

    delete_optimizer_pass = new_pass("delete_optimizer_pass", self.attrs)
    delete_optimizer_pass.apply([self.cloned_main], [None], self.pass_ctx)

    append_send_ops_pass = new_pass("append_send_ops_pass", self.attrs)
    append_send_ops_pass.apply([self.cloned_main], [None], self.pass_ctx)

    delete_extra_optimizer_pass = new_pass("delete_extra_optimizer_pass",
                                           self.attrs)
    delete_extra_optimizer_pass.apply([self.attrs['origin_main_program']],
                                      [self.cloned_startup], self.pass_ctx)

    fake_init_ops_pass = new_pass("fake_init_ops_pass", self.attrs)
    fake_init_ops_pass.apply([None], [self.cloned_startup], self.pass_ctx)

    # Publish the transformed programs as the new origin programs.
    self.attrs['origin_main_program'] = self.cloned_main
    self.attrs['origin_startup_program'] = self.cloned_startup

    if self.launch_barrier and self.launch_barrier_flag:
        # Block until all pserver endpoints are ready to serve.
        wait_server_ready(self.server_endpoints)

    return
def apply_passes(self, main_prog, startup_prog):
    """Run the optimizer-fusion pass and verify adam now consumes fused vars."""
    manager = PassManager([new_pass("fuse_optimizer")])
    manager.apply([main_prog], [startup_prog])
    print(manager.names)
    seen_types = []
    for op in main_prog.global_block().ops:
        seen_types.append(op.type)
        if op.type == "adam":
            # Fused params/grads are renamed with the @FUSEDVAR@ prefix.
            self.assertTrue("@FUSEDVAR@_adam_Param_batch_norm2d_0.b_0" in
                            op.input("Param"))
            self.assertTrue("@FUSEDVAR@_adam_Grad_batch_norm2d_0.b_0@GRAD" in
                            op.input("Grad"))
    self.assertTrue("coalesce_tensor" in seen_types)
def _build_pserver_programs(self):
    """Add the LR-decay table on the pserver side; sgd/adam need no extra work."""
    ops = get_optimize_ops(self.attrs['origin_main_program'])
    if len(ops) == 0:
        # No optimizer ops at all — nothing to do.
        return
    add_lr_decay_table_pass = new_pass('add_lr_decay_table_pass',
                                       self.attrs)
    add_lr_decay_table_pass.apply([], [], self.pass_ctx)
    if any(op.type in ("sgd", "adam") for op in ops):
        return
def _build_programs(self):
    """Dispatch program building to the worker path or the server path."""
    if self.attrs['is_worker'] or self.attrs['is_heter_worker']:
        self._build_trainer_programs()
        heter_opt_pass = new_pass("set_heter_pipeline_opt_pass", self.attrs)
        heter_opt_pass.apply([self.cloned_main], [self.cloned_startup],
                             self.pass_ctx)
    elif self.attrs['is_server']:
        self._build_pserver_programs()
        # Swap in the server-side main/startup programs built above.
        self.loss.block.program = self.attrs['_main_server']
        fluid.framework.switch_startup_program(
            self.attrs['_startup_server'])
def _build_trainer_programs(self):
    """Build heter-pipeline trainer main/startup programs via the pass stack."""
    # Passes communicate through attrs/ctx, so keep application order intact.
    attrs, ctx = self.attrs, self.pass_ctx

    new_pass("add_lr_decay_table_pass", attrs).apply([], [], ctx)
    new_pass("distributed_ops_pass", attrs).apply([self.cloned_main], [None],
                                                  ctx)
    new_pass("delete_optimizer_pass", attrs).apply([self.cloned_main], [None],
                                                   ctx)
    new_pass("append_send_ops_pass", attrs).apply([self.cloned_main], [None],
                                                  ctx)
    new_pass("delete_extra_optimizer_pass", attrs).apply(
        [attrs['origin_main_program']], [self.cloned_startup], ctx)
    new_pass("fake_init_ops_pass", attrs).apply([None],
                                                [self.cloned_startup], ctx)

    # Carve the program into the half this role executes.
    split_name = ("split_heter_worker_ops_pass"
                  if self.is_heter_worker else "split_trainer_ops_pass")
    new_pass(split_name, attrs).apply([self.cloned_main], [None], ctx)

    new_pass('set_heter_pipeline_opt_pass', attrs).apply(
        [self.cloned_main], [self.cloned_startup], ctx)

    if self.launch_barrier and self.launch_barrier_flag:
        # Make sure pservers are reachable before proceeding.
        wait_server_ready(self.server_endpoints)
    return
def pass_config(self):
    """Return the pass list: elementwise-add/act fusion, then all-reduce fusion."""
    fuse_all_reduce_config = {"max_memory_size": 1024 * 1024}
    return [
        new_pass("fuse_elewise_add_act"),
        new_pass("fuse_all_reduce", fuse_all_reduce_config),
    ]
def _build_pserver_programs(self):
    """Insert the listen_and_serv op into the server main program."""
    listen_pass = new_pass('add_listen_and_serv_pass', self.attrs)
    listen_pass.apply([self.attrs['_main_server']], [None], self.pass_ctx)
    return
def _build_trainer_programs(self):
    """Build federated-learning trainer programs for party A / party B.

    Applies the FL pass pipeline to self.cloned_main / self.cloned_startup,
    splits the resulting program via the split_fl_ops_pass, and keeps the
    part-A half (plain trainer) or part-B half (heter worker) depending on
    this role.
    """
    # Intermediate dump paths; the commented-out debug_program calls can be
    # re-enabled to snapshot the program after each pass.
    _main_file = ps_log_root_dir + '0_fl_worker_main_program.prototxt'
    #debug_program(_main_file, self.cloned_main)

    distributed_ops_pass = new_pass("distributed_ops_pass", self.attrs)
    distributed_ops_pass.apply([self.cloned_main], [None], self.pass_ctx)
    _main_file = ps_log_root_dir + '1_fl_worker_main_program.prototxt'
    #debug_program(_main_file, self.cloned_main)

    delete_optimizer_pass = new_pass("delete_optimizer_pass", self.attrs)
    delete_optimizer_pass.apply([self.cloned_main], [None], self.pass_ctx)
    _main_file = ps_log_root_dir + '2_fl_worker_main_program.prototxt'
    #debug_program(_main_file, self.cloned_main)

    append_send_ops_pass = new_pass("append_send_ops_pass", self.attrs)
    append_send_ops_pass.apply([self.cloned_main], [None], self.pass_ctx)
    _main_file = ps_log_root_dir + '3_fl_worker_main_program.prototxt'
    #debug_program(_main_file, self.cloned_main)

    delete_extra_optimizer_pass = new_pass("delete_extra_optimizer_pass",
                                           self.attrs)
    delete_extra_optimizer_pass.apply([self.attrs['origin_main_program']],
                                      [self.cloned_startup], self.pass_ctx)
    _main_file = ps_log_root_dir + '4_fl_worker_main_program.prototxt'
    #debug_program(_main_file, self.cloned_main)

    fake_init_ops_pass = new_pass("fake_init_ops_pass", self.attrs)
    fake_init_ops_pass.apply([None], [self.cloned_startup], self.pass_ctx)
    _main_file = ps_log_root_dir + '5_fl_worker_main_program.prototxt'
    #debug_program(_main_file, self.cloned_main)

    # NOTE(review): the local is named split_trainer_ops_pass but the pass
    # applied is "split_fl_ops_pass"; it stores the part-A / part-B
    # sub-programs in the pass context attrs read below.
    split_trainer_ops_pass = new_pass("split_fl_ops_pass", self.attrs)
    split_trainer_ops_pass.apply([self.cloned_main], [None], self.pass_ctx)

    if not self.is_heter_worker:
        # Plain trainer keeps the part-A program.
        self.part_a_program = self.pass_ctx._attrs['part_a_main_program']
        self.cloned_main = self.part_a_program
        _main_file = ps_log_root_dir + '8_fl_A_main_program.prototxt'
        debug_program(_main_file, self.cloned_main)
    else:
        # Heter worker keeps the part-B program.
        self.part_b_program = self.pass_ctx._attrs['part_b_main_program']
        self.cloned_main = self.part_b_program
        _main_file = ps_log_root_dir + '8_fl_B_main_program.prototxt'
        debug_program(_main_file, self.cloned_main)

    set_heter_pipeline_opt_pass = new_pass('set_heter_pipeline_opt_pass',
                                           self.attrs)
    set_heter_pipeline_opt_pass.apply([self.cloned_main],
                                      [self.cloned_startup], self.pass_ctx)

    # Publish the selected half as the new origin programs.
    self.attrs['origin_startup_program'] = self.cloned_startup
    self.attrs['origin_main_program'] = self.cloned_main

    if not self.is_heter_worker:
        _main_file = ps_log_root_dir + 'final_fl_A_main_program.prototxt'
        debug_program(
            _main_file, self.attrs['origin_main_program'].
            _heter_pipeline_opt['section_program'])
    else:
        _main_file = ps_log_root_dir + 'final_fl_B_main_program.prototxt'
        debug_program(
            _main_file, self.attrs['origin_main_program'].
            _heter_pipeline_opt['section_program'])
    return