def build_role(self, args):
    """Build a UserDefinedRoleMaker from parsed command-line args.

    Args:
        args: parsed args providing ``role`` ("PSERVER" or trainer),
            ``current_id``, ``trainers`` and comma-separated ``endpoints``.

    Returns:
        role_maker.UserDefinedRoleMaker configured as SERVER when
        ``args.role`` is "PSERVER" (case-insensitive), otherwise WORKER.
    """
    # The two original branches differed only in the role enum; build the
    # enum once and call the constructor a single time.
    if args.role.upper() == "PSERVER":
        node_role = role_maker.Role.SERVER
    else:
        node_role = role_maker.Role.WORKER
    role = role_maker.UserDefinedRoleMaker(
        current_id=args.current_id,
        role=node_role,
        worker_num=args.trainers,
        server_endpoints=args.endpoints.split(","))
    return role
def test(self):
    """Async strategy with a decayed-LR Adagrad optimizer on a server role."""
    server_eps = [
        "127.0.0.1:36004", "127.0.0.1:36005", "127.0.0.1:36006",
        "127.0.0.1:36007"
    ]
    fleet.init(
        role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.SERVER,
            worker_num=2,
            server_endpoints=server_eps))
    loss, acc, _ = self.net()
    decayed_lr = fluid.layers.exponential_decay(
        learning_rate=base_lr,
        decay_steps=500,
        decay_rate=0.969,
        staircase=True)
    adagrad = fluid.optimizer.Adagrad(learning_rate=decayed_lr)
    dist_opt = fleet.distributed_optimizer(
        adagrad, StrategyFactory.create_async_strategy())
    dist_opt.minimize(loss)
def test_fleet_barrier(self):
    """Init fleet as a single worker, then check the trainers-ready path."""
    worker_role = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.WORKER,
        worker_num=1,
        server_endpoints=['127.0.0.1'])
    fleet.init(worker_role)
    check_all_trainers_ready("/ready_path/", 0)
def test_pserver(self):
    """Transpile a geo-SGD network as a pserver and fetch fleet programs.

    Builds a SERVER role, configures a geo-SGD DistributeTranspilerConfig,
    minimizes the network cost, and reads the resulting startup/main
    programs from fleet.
    """
    role = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.SERVER,
        worker_num=2,
        server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])
    fleet.init(role)
    batch_size = 128
    is_sparse = True
    is_distribute = False
    strategy = DistributeTranspilerConfig()
    strategy.sync_mode = False
    strategy.geo_sgd_mode = True
    strategy.geo_sgd_need_push_nums = 5
    avg_cost, _, _ = train_network(batch_size, is_distribute, is_sparse)
    optimizer = fluid.optimizer.SGD(0.1)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)
    pserver_startup_program = fleet.startup_program
    # Fixed typo: local variable was named "pserver_mian_program".
    pserver_main_program = fleet.main_program
def run_trainer(self, args):
    """Run the trainer process; you don't need to implement it.

    Args:
        args (ArgumentParser): run args to config dist fleet.
    """
    from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
    if args.role.upper() != "TRAINER":
        raise ValueError("args role must be TRAINER")
    fleet.init(
        role_maker.UserDefinedRoleMaker(
            current_id=args.current_id,
            role=role_maker.Role.WORKER,
            worker_num=args.trainers,
            server_endpoints=args.endpoints.split(",")))
    self._set_strategy(args)
    cost = self.net(args)
    sgd = fluid.optimizer.SGD(LEARNING_RATE)
    dist_opt = fleet.distributed_optimizer(sgd, self.strategy)
    dist_opt.minimize(cost)
    # Choose the training entry point based on run_params.
    if args.run_params.get("run_from_dataset", False):
        losses = self.do_training_from_dataset(fleet, args)
    else:
        losses = self.do_training(fleet, args)
    print(losses if losses else "")
def init_role():
    """Initialize the fleet role from TRAINING_ROLE / PADDLE_* env vars."""
    # Reset the place according to the role of parameter server.
    training_role = os.getenv("TRAINING_ROLE", "TRAINER")
    if training_role == "PSERVER":
        paddle_role = role_maker.Role.SERVER
    else:
        paddle_role = role_maker.Role.WORKER
    place = F.CPUPlace()
    # Set the fleet runtime environment according to configuration.
    ports = os.getenv("PADDLE_PORT", "6174").split(",")
    pserver_ips = os.getenv("PADDLE_PSERVERS").split(",")  # ip,ip...
    if len(ports) > 1:
        # Local debug mode: a single ip exposing multiple ports.
        eplist = ["%s:%s" % (pserver_ips[0], p) for p in ports]
    else:
        # Distributed mode: multiple ips sharing one port.
        eplist = ["%s:%s" % (ip, ports[0]) for ip in pserver_ips]
    pserver_endpoints = eplist  # ip:port,ip:port...
    worker_num = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    fleet.init(
        role_maker.UserDefinedRoleMaker(
            current_id=trainer_id,
            role=paddle_role,
            worker_num=worker_num,
            server_endpoints=pserver_endpoints))
def run_pserver(self, args):
    """Start a parameter server configured from ``args`` and serve forever."""
    if args.role.upper() != "PSERVER":
        raise ValueError("args role must be PSERVER")
    server_role = role_maker.UserDefinedRoleMaker(
        current_id=args.current_id,
        role=role_maker.Role.SERVER,
        worker_num=args.trainers,
        server_endpoints=args.endpoints.split(","))
    fleet.init(server_role)
    # Map the geo-SGD related run args onto the transpiler config.
    cfg = DistributeTranspilerConfig()
    cfg.sync_mode = args.sync_mode
    cfg.geo_sgd_mode = args.geo_sgd_mode
    cfg.geo_sgd_need_push_nums = args.geo_sgd_need_push_nums
    cost = self.net()
    dist_opt = fleet.distributed_optimizer(
        fluid.optimizer.SGD(LEARNING_RATE), cfg)
    dist_opt.minimize(cost)
    fleet.init_server()
    fleet.run_server()
def test_default_strategy(self):
    """distributed_optimizer should accept an optimizer with no strategy."""
    worker = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.WORKER,
        worker_num=2,
        server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
    fleet.init(worker)
    sgd = fluid.optimizer.SGD(0.0001)
    sgd = fleet.distributed_optimizer(sgd)
def test_dist_geo_server_transpiler(self):
    """Geo-SGD transpile of a pyramid-hash embedding network (server side).

    Builds a search_pyramid_hash embedding, reduces it to a scalar cost,
    transpiles under a geo-SGD config, and fetches the fleet programs.
    """
    num_voc = 128
    embed_dim = 64
    x_shape, x_lod = [16, 10], [[3, 5, 2, 6]]
    x = fluid.data(name='x', shape=x_shape, dtype='int32', lod_level=1)
    hash_embd = fluid.contrib.layers.search_pyramid_hash(
        input=x,
        num_emb=embed_dim,
        space_len=num_voc * embed_dim,
        pyramid_layer=4,
        rand_len=16,
        drop_out_percent=0.5,
        is_training=True,
        use_filter=False,
        white_list_len=6400,
        black_list_len=2800,
        seed=3,
        lr=0.002,
        param_attr=fluid.ParamAttr(
            name="PyramidHash_emb_0",
            learning_rate=0, ),
        param_attr_wl=fluid.ParamAttr(
            name="Filter",
            learning_rate=0, ),
        param_attr_bl=None,
        distribute_update_vars=["PyramidHash_emb_0"],
        name=None)
    cost = fluid.layers.reduce_sum(hash_embd)
    role = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.SERVER,
        worker_num=2,
        server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])
    fleet.init(role)
    strategy = DistributeTranspilerConfig()
    strategy.sync_mode = False
    strategy.geo_sgd_mode = True
    strategy.geo_sgd_need_push_nums = 5
    optimizer = fluid.optimizer.SGD(0.1)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(cost)
    pserver_startup_program = fleet.startup_program
    # Fixed typo: local variable was named "pserver_mian_program".
    pserver_main_program = fleet.main_program
def run_ut(self):
    """Dispatch to the trainer or pserver run depending on TRAINING_ROLE."""
    strategy = StrategyFactory.create_half_async_strategy()
    is_trainer = os.getenv("TRAINING_ROLE", "TRAINER") == "TRAINER"
    role = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.WORKER if is_trainer else role_maker.Role.SERVER,
        worker_num=2,
        server_endpoints=["127.0.0.1:6002"])
    if is_trainer:
        self.run_trainer(role, strategy)
    else:
        self.run_pserver(role, strategy)
def test_half_async_strategy(self):
    """A manually-built half-async config should be accepted as a strategy."""
    fleet.init(
        role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.WORKER,
            worker_num=2,
            server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"]))
    # All three flags off yields the half-async behavior.
    half_async_config = DistributeTranspilerConfig()
    half_async_config.sync_mode = False
    half_async_config.geo_sgd_mode = False
    half_async_config.runtime_split_send_recv = False
    dist_opt = fleet.distributed_optimizer(
        fluid.optimizer.SGD(0.0001), half_async_config)
def test(self):
    """Geo strategy (push every 20 steps) with plain SGD on a server role."""
    server_eps = [
        "127.0.0.1:36004", "127.0.0.1:36005", "127.0.0.1:36006",
        "127.0.0.1:36007"
    ]
    server = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.SERVER,
        worker_num=2,
        server_endpoints=server_eps)
    fleet.init(server)
    loss, acc, _ = self.net()
    geo = StrategyFactory.create_geo_strategy(20)
    dist_opt = fleet.distributed_optimizer(fluid.optimizer.SGD(base_lr), geo)
    dist_opt.minimize(loss)
def test_communicator_async(self):
    """Async strategy: the worker communicator starts and stops cleanly."""
    fleet.init(
        role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.WORKER,
            worker_num=2,
            server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"]))
    cost = self.net()
    dist_opt = fleet.distributed_optimizer(
        fluid.optimizer.SGD(0.01), StrategyFactory.create_async_strategy())
    dist_opt.minimize(cost)
    fleet.init_worker()
    # Let the communicator run briefly before shutting down.
    time.sleep(10)
    fleet.stop_worker()
def test_init_role(self):
    """set_program must raise when fleet.init(role) was never called."""
    role = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.SERVER,
        worker_num=2,
        server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])
    # Deliberately skip fleet.init(role) to exercise the error path.
    batch_size = 128
    is_sparse = True
    is_distribute = False
    cfg = DistributeTranspilerConfig()
    cfg.sync_mode = False
    cfg.geo_sgd_mode = True
    cfg.geo_sgd_need_push_nums = 5
    avg_cost, _, _ = train_network(batch_size, is_distribute, is_sparse)
    self.assertRaises(Exception, self.set_program, avg_cost, cfg)
def test_transpile(self):
    """Transpile succeeds with runtime_split_send_recv both on and off."""
    server = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.SERVER,
        worker_num=2,
        server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])
    fleet.init(server)
    batch_size = 128
    is_sparse = True
    is_distribute = False
    cfg = DistributeTranspilerConfig()
    cfg.sync_mode = False
    cfg.runtime_split_send_recv = True
    avg_cost, _, _ = train_network(batch_size, is_distribute, is_sparse)
    self.set_program(avg_cost, cfg)
    # Flip the flag and transpile again with the same cost.
    cfg.runtime_split_send_recv = False
    self.set_program(avg_cost, cfg)
def run_trainer(self, args):
    """Run a trainer for the transpiler tests.

    Builds the trainer role from ``args``, maps ``args.run_params`` onto a
    DistributeTranspilerConfig, builds the net and optimizer, trains, and
    prints the losses.

    Args:
        args: parsed run args with ``role``, ``current_id``, ``trainers``,
            comma-separated ``endpoints`` and the ``run_params`` dict.

    Raises:
        ValueError: if ``args.role`` is not "TRAINER".
    """
    from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
    import paddle.fluid as fluid
    from paddle.fluid.transpiler.ps_dispatcher import RoundRobin
    from paddle.fluid.transpiler.ps_dispatcher import HashName

    # Fixed seeds so runs are reproducible across trainers.
    fluid.default_startup_program().random_seed = 1
    fluid.default_main_program().random_seed = 1

    if args.role.upper() != "TRAINER":
        raise ValueError("args role must be TRAINER")
    role = role_maker.UserDefinedRoleMaker(
        current_id=args.current_id,
        role=role_maker.Role.WORKER,
        worker_num=args.trainers,
        server_endpoints=args.endpoints.split(","))
    fleet.init(role)

    strategy = DistributeTranspilerConfig()
    strategy.sync_mode = args.run_params["sync_mode"]
    strategy.async_mode = args.run_params["async_mode"]
    strategy.mode = "pserver"
    strategy.slice_var_up = args.run_params['slice_var_up']
    strategy.enable_dc_asgd = args.run_params['enable_dc_asgd']
    # BUG FIX: RoundRobin was previously assigned unconditionally after the
    # HashName branch, so 'split_method' could never actually select
    # HashName. Use an if/else so the run param takes effect.
    if args.run_params['split_method']:
        strategy.split_method = HashName
    else:
        strategy.split_method = RoundRobin
    strategy.wait_port = args.run_params['wait_port']
    strategy.runtime_split_send_recv = args.run_params[
        'runtime_split_send_recv']
    strategy.use_hierarchical_allreduce = args.run_params[
        'use_hierarchical_allreduce']
    # strategy.hierarchical_allreduce_exter_nranks = args.run_params['hierarchical_allreduce_exter_nranks']
    # strategy.hierarchical_allreduce_inter_nranks = args.run_params['hierarchical_allreduce_inter_nranks']
    strategy.geo_sgd_mode = args.run_params['geo_sgd']
    strategy.geo_sgd_need_push_nums = args.run_params['push_nums']

    avg_cost = self.net()
    optimizer = fluid.optimizer.SGD(LEARNING_RATE)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)

    losses = self.do_training(fleet, args)
    losses = "" if not losses else losses
    print(losses)
def test_communicator_async(self):
    """Hand-built async transpiler config: worker starts and stops."""
    worker = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.WORKER,
        worker_num=2,
        server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
    fleet.init(worker)
    cost = self.net()
    cfg = DistributeTranspilerConfig()
    cfg.sync_mode = False
    cfg.runtime_split_send_recv = True
    cfg.wait_port = False
    dist_opt = fleet.distributed_optimizer(fluid.optimizer.SGD(0.01), cfg)
    dist_opt.minimize(cost)
    fleet.init_worker()
    # Let the communicator run briefly before shutting down.
    time.sleep(10)
    fleet.stop_worker()
def run_pserver(self, args):
    """Run the pserver process; you don't need to implement it.

    Args:
        args (ArgumentParser): run args to config dist fleet.
    """
    from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
    if args.role.upper() != "PSERVER":
        raise ValueError("args role must be PSERVER")
    fleet.init(
        role_maker.UserDefinedRoleMaker(
            current_id=args.current_id,
            role=role_maker.Role.SERVER,
            worker_num=args.trainers,
            server_endpoints=args.endpoints.split(",")))
    self._set_strategy(args)
    cost = self.net(args)
    dist_opt = fleet.distributed_optimizer(
        fluid.optimizer.SGD(LEARNING_RATE), self.strategy)
    dist_opt.minimize(cost)
    # Optionally warm-start the server from a saved model directory.
    fleet.init_server(model_dir=args.run_params.get("model_dir", ""))
    fleet.run_server()
def test_debug_info(self):
    """Sync strategy accepts dump/debug options via set_debug_opt."""
    x = fluid.layers.data(name='x', shape=[1], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    y_predict = fluid.layers.fc(input=x, size=1, act=None)
    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
    avg_cost = fluid.layers.mean(cost)
    worker = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.WORKER,
        worker_num=2,
        server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
    fleet.init(worker)
    sync_strategy = StrategyFactory.create_sync_strategy()
    # Request parameter/gradient dumps for the fc layer.
    sync_strategy.set_debug_opt({
        "dump_param": ["fc_0.tmp_0"],
        "dump_fields": ["fc_0.tmp_0", "fc_0.tmp_0@GRAD"],
        "dump_fields_path": "dump_text/"
    })
    dist_opt = fleet.distributed_optimizer(
        fluid.optimizer.SGD(0.0001), sync_strategy)
def test_communicator_init_and_start(self):
    """A Communicator built from fleet.main_program starts and stops."""
    worker = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.WORKER,
        worker_num=2,
        server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
    fleet.init(worker)
    cost = self.net()
    cfg = DistributeTranspilerConfig()
    cfg.sync_mode = True
    cfg.wait_port = False
    dist_opt = fleet.distributed_optimizer(fluid.optimizer.SGD(0.01), cfg)
    dist_opt.minimize(cost)
    comm = Communicator(fleet.main_program)
    comm.start()
    # Let the communicator run briefly before stopping it.
    time.sleep(10)
    comm.stop()
def test_pserver(self):
    """Geo strategy (push every 5 steps) transpiles a pserver program.

    Initializes a SERVER role, minimizes the network cost under a geo
    strategy, and reads the resulting startup/main programs from fleet.
    """
    role = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.SERVER,
        worker_num=2,
        server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])
    fleet.init(role)
    batch_size = 128
    is_sparse = True
    is_distribute = False
    strategy = StrategyFactory.create_geo_strategy(5)
    avg_cost, _, _, _ = train_network(batch_size, is_distribute, is_sparse)
    optimizer = fluid.optimizer.SGD(0.1)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)
    pserver_startup_program = fleet.startup_program
    # Fixed typo: local variable was named "pserver_mian_program".
    pserver_main_program = fleet.main_program
def run_trainer(self, args):
    """Run a trainer: init fleet from args, build net/optimizer, train.

    Args:
        args: run args with ``role``, ``current_id``, ``trainers``,
            comma-separated ``endpoints`` and ``sync_mode``.

    Raises:
        ValueError: if ``args.role`` is not "TRAINER".
    """
    if args.role.upper() != "TRAINER":
        raise ValueError("args role must be TRAINER")
    role = role_maker.UserDefinedRoleMaker(
        current_id=args.current_id,
        role=role_maker.Role.WORKER,
        worker_num=args.trainers,
        server_endpoints=args.endpoints.split(","))
    fleet.init(role)
    strategy = DistributeTranspilerConfig()
    strategy.sync_mode = args.sync_mode
    avg_cost = self.net()
    optimizer = fluid.optimizer.SGD(LEARNING_RATE)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)
    # BUG FIX: do_training was invoked twice back-to-back with the first
    # result discarded; train once and keep the output.
    out = self.do_training(fleet)
def __init__(self):
    """Configure the fleet role from env vars and enable pserver saving."""
    training_role = os.getenv("TRAINING_ROLE", "TRAINER")
    paddle_role = (role_maker.Role.SERVER
                   if training_role == "PSERVER" else role_maker.Role.WORKER)
    place = F.CPUPlace()
    # Set the fleet runtime environment according to configuration.
    port = os.getenv("PADDLE_PORT", "6174")
    pserver_ips = os.getenv("PADDLE_PSERVERS")  # ip,ip...
    pserver_endpoints = [
        "%s:%s" % (ip, port) for ip in pserver_ips.split(",")
    ]  # ip:port,ip:port...
    worker_num = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    tfleet.init(
        role_maker.UserDefinedRoleMaker(
            current_id=trainer_id,
            role=paddle_role,
            worker_num=worker_num,
            server_endpoints=pserver_endpoints))
    tfleet.save_on_pserver = True
def train(args):
    """Train the CTR model in distributed mode.

    Acts as either pserver or trainer depending on ``args.role``: servers
    block in ``run_server``; workers train from a dataset pipeline for 10
    epochs and then stop.

    Args:
        args: run args with ``role``, ``current_endpoint``, ``trainers``
            and comma-separated ``endpoints``.
    """
    datas, avg_cost, predict, train_file_path = model()
    endpoints = args.endpoints.split(",")
    # Servers derive their id from their position in the endpoint list.
    # NOTE(review): trainers always get id 0 here — confirm this test only
    # ever runs a single trainer.
    if args.role.upper() == "PSERVER":
        current_id = endpoints.index(args.current_endpoint)
    else:
        current_id = 0
    role = role_maker.UserDefinedRoleMaker(
        current_id=current_id,
        role=role_maker.Role.WORKER
        if args.role.upper() == "TRAINER" else role_maker.Role.SERVER,
        worker_num=args.trainers,
        server_endpoints=endpoints)
    exe = fluid.Executor(fluid.CPUPlace())
    fleet.init(role)
    strategy = DistributeTranspilerConfig()
    strategy.sync_mode = False  # fully async updates
    optimizer = fluid.optimizer.SGD(learning_rate=0.0001)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)
    if fleet.is_server():
        logger.info("run pserver")
        fleet.init_server()
        fleet.run_server()
    elif fleet.is_worker():
        logger.info("run trainer")
        fleet.init_worker()
        exe.run(fleet.startup_program)
        thread_num = 2
        filelist = []
        for _ in range(thread_num):
            filelist.append(train_file_path)
        # Config dataset: training data is streamed through a pipe command.
        dataset = fluid.DatasetFactory().create_dataset()
        dataset.set_batch_size(128)
        dataset.set_use_var(datas)
        pipe_command = 'python ctr_dataset_reader.py'
        dataset.set_pipe_command(pipe_command)
        dataset.set_filelist(filelist)
        dataset.set_thread(thread_num)
        for epoch_id in range(10):
            logger.info("epoch {} start".format(epoch_id))
            pass_start = time.time()
            dataset.set_filelist(filelist)
            exe.train_from_dataset(
                program=fleet.main_program,
                dataset=dataset,
                fetch_list=[avg_cost],
                fetch_info=["cost"],
                print_period=100,
                debug=False)
            pass_time = time.time() - pass_start
            logger.info("epoch {} finished, pass_time {}".format(epoch_id,
                                                                 pass_time))
        fleet.stop_worker()
# ---- distributed CTR-DNN training setup (script level) ----
embedding_size = 9
epochs = 1
dataset_prefix = os.environ['DATASET_PATH']
avg_cost, auc_var, _ = ctr_dnn_model(embedding_size, sparse_input_ids, label,
                                     sparse_feature_dim)
# Training time window, parsed with the '%Y%m%d/%H' format.
start_date_hr = datetime.strptime(os.environ['START_DATE_HR'], '%Y%m%d/%H')
end_date_hr = datetime.strptime(os.environ['END_DATE_HR'], '%Y%m%d/%H')
current_date_hr = start_date_hr
hdfs_address = os.environ['HDFS_ADDRESS']
hdfs_ugi = os.environ['HDFS_UGI']
start_run_flag = True
role = role_maker.UserDefinedRoleMaker(
    current_id=int(os.getenv("CURRENT_ID")),
    role=role_maker.Role.WORKER
    if os.getenv("TRAINING_ROLE") == "TRAINER" else role_maker.Role.SERVER,
    worker_num=int(os.getenv("PADDLE_TRAINERS_NUM")),
    server_endpoints=os.getenv("ENDPOINTS").split(","))
# NOTE(review): the UserDefinedRoleMaker built above is dead code — it is
# immediately overwritten by PaddleCloudRoleMaker on the next line. Confirm
# which role maker is actually intended before removing either.
role = role_maker.PaddleCloudRoleMaker()
fleet.init(role)
config = DistributeTranspilerConfig()
config.sync_mode = False  # fully async parameter updates
optimizer = fluid.optimizer.SGD(0.0001)
optimizer = fleet.distributed_optimizer(optimizer, config)
optimizer.minimize(avg_cost)
DATE_TIME_STRING_FORMAT = '%Y%m%d/%H'
# Servers block here serving parameters; workers continue below.
if fleet.is_server():
    fleet.init_server()
    fleet.run_server()
def fit():
    """Train and evaluate the image classifier under fleet distributed mode.

    Servers block in ``run_server``; workers train for EPOCH_NUM passes,
    running a test pass every 20 batches, then save the model and stop.

    NOTE(review): relies on module-level names (``current_id``, ``roles``,
    ``in_file_path``, ``EPOCH_NUM``, ``model_file_path``) — confirm they are
    defined before this function is called.
    """
    role = role_maker.UserDefinedRoleMaker(
        current_id=current_id,
        # roles == 1 means this process is a worker, otherwise a server.
        role=role_maker.Role.WORKER
        if bool(1 == int(roles)) else role_maker.Role.SERVER,
        worker_num=2,
        server_endpoints=["127.0.0.1:36011"])
    fleet.init(role)
    BATCH_SIZE = 128
    type_size = createDataList(in_file_path, in_file_path + '.data' + "/")
    # Data providers for training and testing (shuffled, batched).
    train_reader = paddle.batch(
        reader=paddle.reader.shuffle(
            reader=dataReader(in_file_path + ".data/trainer.list"),
            buf_size=BATCH_SIZE * 100),
        batch_size=BATCH_SIZE)
    test_reader = paddle.batch(
        reader=paddle.reader.shuffle(
            reader=dataReader(in_file_path + ".data/test.list"),
            buf_size=BATCH_SIZE * 100),
        batch_size=BATCH_SIZE)
    data_shape = [3, 32, 32]
    images = fluid.layers.data(
        name='images', shape=data_shape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    # Get the classifier network.
    predict = networkConfiguration(images, type_size)
    # Define the loss (cross entropy) and accuracy metrics.
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(cost)  # mean over all elements of cost
    acc = fluid.layers.accuracy(input=predict, label=label)
    # Clone the test program before the optimizer ops are appended.
    test_program = fluid.default_main_program().clone(for_test=True)
    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
    strategy = DistributeTranspilerConfig()
    strategy.sync_mode = True
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)
    if fleet.is_server():
        print("启动server")
        fleet.init_server()
        fleet.run_server()
    elif fleet.is_worker():
        print("启动worker")
        fleet.init_worker()
        print(fleet.worker_endpoints())
        ########## model training & evaluation ##########
        # Create the executor; CPU when use_cuda is False.
        use_cuda = False
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        print("cpu")
        # Data feeder mapping (images, label) batches onto the place.
        feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
        print("数据映射")
        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())
        for pass_id in range(EPOCH_NUM):
            print(pass_id)
            # Training: iterate over train_reader batches.
            for batch_id, data in enumerate(train_reader()):
                train_cost, train_acc = exe.run(
                    program=fluid.default_main_program(),
                    feed=feeder.feed(data),
                    fetch_list=[avg_cost, acc])  # fetch loss and accuracy
                # Print training stats and run a test pass periodically.
                # NOTE(review): comment in the original said "every 100
                # batches" but the code tests every 20 — confirm intent.
                if batch_id % 20 == 0:
                    print('Pass:%d, Batch:%d, Cost:%0.5f, Accuracy:%0.5f' %
                          (pass_id, batch_id, train_cost[0], train_acc[0]))
                    # Evaluation over the whole test set.
                    test_costs = []  # per-batch test losses
                    test_accs = []  # per-batch test accuracies
                    for batch_id, data in enumerate(test_reader()):
                        test_cost, test_acc = exe.run(
                            program=test_program,
                            feed=feeder.feed(data),
                            fetch_list=[avg_cost, acc])
                        test_costs.append(test_cost[0])
                        test_accs.append(test_acc[0])
                    test_cost = (sum(test_costs) / len(test_costs))
                    test_acc = (sum(test_accs) / len(test_accs))
                    print('Test:%d, Cost:%0.5f, ACC:%0.5f' %
                          (pass_id, test_cost, test_acc))
        save(predict, model_file_path, exe)
        fleet.stop_worker()