Example #1
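    # Sets up a parameter-server environment for distributed inference: a throwaway SGD
    # optimizer is wrapped by fleet.distributed_optimizer only so that minimize() builds
    # the PS communication program; servers block in run_server(), workers run the startup
    # program and pull dense parameters (self.origin_startup_program and
    # self._init_dense_params are assumed to exist on the enclosing class).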
    def init_distributed_infer_env(self,
                                   exe,
                                   loss,
                                   role_maker=None,
                                   dirname=None):
        import paddle.distributed.fleet as fleet

        if fleet.fleet._runtime_handle is None:
            fleet.init(role_maker=role_maker)

            fake_optimizer = paddle.optimizer.SGD()
            strategy = fleet.DistributedStrategy()
            strategy.a_sync = True
            optimizer = fleet.distributed_optimizer(fake_optimizer,
                                                    strategy=strategy)
            optimizer.minimize(loss,
                               startup_program=self.origin_startup_program)

            if fleet.is_server():
                fleet.init_server(dirname=dirname)
                fleet.run_server()
            else:
                exe.run(paddle.static.default_startup_program())
                fleet.init_worker()
                self._init_dense_params(exe, dirname)
            global_startup_program = paddle.static.default_startup_program()
            global_startup_program = self.origin_startup_program
            global_main_program = paddle.static.default_main_program()
            global_main_program = self.origin_main_program
Example #2
    def test_gradient_merge_optimizer(self):
        fleet.init(role_maker.PaddleCloudRoleMaker())
        input_x = paddle.fluid.layers.data(
            name="x", shape=[32], dtype='float32')
        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')

        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
        cost = paddle.fluid.layers.cross_entropy(
            input=prediction, label=input_y)
        avg_cost = paddle.fluid.layers.mean(x=cost)

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.a_sync = False
        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
        optimizer.minimize(avg_cost)

        prog = paddle.fluid.default_main_program()
        self.assertEqual(prog.global_block().ops[-1].type, "send_barrier")

        sends = 0
        sgds = 0
        for op in prog.global_block().ops:
            if op.type == "send":
                sends += 1
            if op.type == "sgd":
                sgds += 1
        self.assertEqual(sends, 6)
        self.assertEqual(sgds, 0)

        fleet.init_worker()
        time.sleep(8)
        fleet.stop_worker()
Example #3
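# Distributed DeepWalk training with PGL: servers block in fleet.run_server(), workers
# build the graph, bind a batch generator to the py_reader and run the training loop
# (F is assumed to be an alias for paddle.fluid in the original script).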
def train(args):
    import logging
    log.setLevel(logging.DEBUG)
    log.info("start")

    worker_num = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
    num_devices = int(os.getenv("CPU_NUM", 10))

    data = load_raw_edges_fn(args.edge_path, args.undirected)
    edges = data[0]
    weights = data[1]
    node2idx = data[2]
    num_nodes = len(node2idx)

    model = DeepwalkModel(num_nodes, args.hidden_size, args.neg_num,
                          args.is_sparse, args.is_distributed, 1.)
    pyreader = model.pyreader
    loss = model.forward()

    # init fleet
    log.info("init_role")
    init_role()

    train_steps = math.ceil(1. * num_nodes * args.epoch /
                            args.batch_size / num_devices / worker_num)
    log.info("Train step: %s" % train_steps)

    if args.optimizer == "sgd":
        args.lr *= args.batch_size * args.walk_len * args.win_size
    optimization(args.lr, loss, train_steps, args.optimizer)

    # init and run server or worker
    if fleet.is_server():
        log.info("PS server mode")
        fleet.init_server()
        fleet.run_server()

    if fleet.is_worker():
        log.info("start init worker done")
        exe = F.Executor(F.CPUPlace())
        exe.run(F.default_startup_program())
        log.info("Startup done")
        fleet.init_worker()
        # worker only: load the sample graph data
        log.info("init worker done")

        print("num_nodes:", num_nodes, "num_edges:", len(edges))
        edges_feat = {}
        edges_feat["weight"] = np.array(weights)
        graph = pgl.graph.Graph(num_nodes, edges, edge_feat=edges_feat)
        # bind gen
        gen_func = build_gen_func(args, graph)

        pyreader.decorate_tensor_provider(gen_func)

        train_prog(exe, F.default_main_program(), loss, pyreader, args, train_steps)
        print("fleet try to stop worker\r\n")
        fleet.stop_worker()
        print("Game over\r\n")
Example #4
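    # Offline inference with a trained PS model: run the startup program, start the
    # worker, let only the first worker load the saved model, then barrier so every
    # worker sees the parameters before running dataset inference.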
    def run_offline_infer(self):
        logger.info("Run Offline Infer Begin")
        place = paddle.CPUPlace()
        self.exe = paddle.static.Executor(place)

        self.exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        init_model_path = config.get("runner.init_model_path")
        model_mode = config.get("runner.model_mode", 0)
        if fleet.is_first_worker():
            fleet.load_model(init_model_path, mode=model_mode)
        fleet.barrier_worker()

        logger.info("Prepare Dataset Begin.")
        prepare_data_start_time = time.time()
        dataset = self.wait_and_prepare_dataset()
        prepare_data_end_time = time.time()
        logger.info("Prepare Dataset Done, using time {} second.".format(
            prepare_data_end_time - prepare_data_start_time))

        infer_start_time = time.time()
        self.dataset_offline_infer(dataset)
        infer_end_time = time.time()
        logger.info("Infer Dataset Done, using time {} second.".format(
            infer_end_time - infer_start_time))
Example #5
    def test_gradient_merge_optimizer(self):
        fleet.init(role_maker.PaddleCloudRoleMaker())

        x = paddle.fluid.layers.data(name='x', shape=[1], dtype='float32')
        y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
        cost = paddle.fluid.layers.square_error_cost(input=x, label=y)
        avg_cost = paddle.fluid.layers.mean(cost)

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.a_sync = False
        strategy.a_sync_configs = {"launch_barrier": False}
        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
        optimizer.minimize(avg_cost)

        prog = paddle.fluid.default_main_program()
        self.assertEqual(prog.global_block().ops[-1].type, "send_barrier")

        sends = 0
        sgds = 0
        for op in prog.global_block().ops:
            if op.type == "send":
                sends += 1
            if op.type == "sgd":
                sgds += 1
        self.assertEqual(sends, 0)
        self.assertEqual(sgds, 0)

        fleet.init_worker()
        time.sleep(8)
        fleet.stop_worker()
Example #6
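    # Single-process communicator test: the environment variables below mimic what a
    # launcher such as fleetrun / paddle.distributed.launch would set for a job with
    # two trainers and two parameter servers.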
    def test_communicator_sync(self):
        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["PADDLE_PSERVER_NUMS"] = "2"
        os.environ["PADDLE_TRAINERS_NUM"] = "2"
        os.environ["POD_IP"] = "127.0.0.1"
        os.environ["PADDLE_PORT"] = "36001"
        os.environ["PADDLE_TRAINER_ID"] = "0"
        os.environ["PADDLE_TRAINERS_NUM"] = "2"
        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
            "127.0.0.1:36001,127.0.0.2:36001"

        fleet.init(role_maker.PaddleCloudRoleMaker())
        avg_cost = self.net()

        optimizer = fluid.optimizer.SGD(0.01)

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.a_sync = False

        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(avg_cost)

        fleet.init_worker()
        time.sleep(10)
        fleet.stop_worker()
Example #7
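    # GPU parameter-server ("gpubox" / PSGPU) worker: dumps the worker programs for
    # debugging, binds the sparse slots to GPU, then trains epoch by epoch with the
    # reader type selected from the runner config.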
    def run_worker(self):
        logger.info("Run Worker Begin")
        use_cuda = int(config.get("runner.use_gpu"))
        place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
        self.exe = paddle.static.Executor(place)

        with open("./{}_worker_main_program.prototxt".format(
                fleet.worker_index()), 'w+') as f:
            f.write(str(paddle.static.default_main_program()))
        with open("./{}_worker_startup_program.prototxt".format(
                fleet.worker_index()), 'w+') as f:
            f.write(str(paddle.static.default_startup_program()))

        self.exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        save_model_path = self.config.get("runner.model_save_path")
        if save_model_path and (not os.path.exists(save_model_path)):
            os.makedirs(save_model_path)

        reader_type = self.config.get("runner.reader_type", None)
        epochs = int(self.config.get("runner.epochs"))
        sync_mode = self.config.get("runner.sync_mode")

        gpus_env = os.getenv("FLAGS_selected_gpus")
        self.PSGPU = paddle.fluid.core.PSGPU()
        gpuslot = [int(i) for i in range(1, self.model.sparse_inputs_slots)]
        print("gpuslot: {}".format(gpuslot))
        self.PSGPU.set_slot_vector(gpuslot)
        self.PSGPU.init_gpu_ps([int(s) for s in gpus_env.split(",")])
        opt_info = paddle.fluid.default_main_program()._fleet_opt
        opt_info['stat_var_names'] = []
        for epoch in range(epochs):
            epoch_start_time = time.time()

            if sync_mode == "heter":
                self.heter_train_loop(epoch)
            elif sync_mode == "gpubox":
                self.dataset_train_loop(epoch)
            elif reader_type == "QueueDataset":
                self.dataset_train_loop(epoch)
            elif reader_type == "DataLoader":
                self.dataloader_train_loop(epoch)
            elif reader_type is None or reader_type == "RecDataset":
                self.recdataset_train_loop(epoch)

            epoch_time = time.time() - epoch_start_time
            epoch_speed = self.example_nums / epoch_time
            logger.info(
                "Epoch: {}, using time {} second, ips {} {}/sec.".format(
                    epoch, epoch_time, epoch_speed, self.count_method))
            self.train_result_dict["speed"].append(epoch_speed)

            model_dir = "{}/{}".format(save_model_path, epoch)
            if fleet.is_first_worker(
            ) and save_model_path and is_distributed_env():
                fleet.save_inference_model(
                    self.exe, model_dir,
                    [feed.name for feed in self.input_data],
                    self.inference_target_var)
Example #8
    def run_online_worker(self):
        logger.info("Run Online Worker Begin")
        use_cuda = int(config.get("runner.use_gpu"))
        place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
        self.exe = paddle.static.Executor(place)

        with open("./{}_worker_main_program.prototxt".format(
                fleet.worker_index()), 'w+') as f:
            f.write(str(paddle.static.default_main_program()))
        with open("./{}_worker_startup_program.prototxt".format(
                fleet.worker_index()), 'w+') as f:
            f.write(str(paddle.static.default_startup_program()))

        self.exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        save_model_path = self.config.get("runner.model_save_path")
        if save_model_path and (not os.path.exists(save_model_path)):
            os.makedirs(save_model_path)

        days = os.popen("echo -n " + self.config.get("runner.days")).read().split(" ")
        pass_per_day = int(self.config.get("runner.pass_per_day"))

        for day_index in range(len(days)):
            day = days[day_index]
            for pass_index in range(1, pass_per_day + 1):
                logger.info("Day: {} Pass: {} Begin.".format(day, pass_index))
                
                prepare_data_start_time = time.time()
                dataset = self.wait_and_prepare_dataset(day, pass_index)
                prepare_data_end_time = time.time()
                logger.info(
                    "Prepare Dataset Done, using time {} second.".format(prepare_data_end_time - prepare_data_start_time))
                
                train_start_time = time.time()
                self.dataset_train_loop(dataset, day, pass_index)
                train_end_time = time.time()
                logger.info(
                    "Train Dataset Done, using time {} second.".format(train_end_time - train_start_time))
            
                model_dir = "{}/{}/{}".format(save_model_path, day, pass_index)

                if fleet.is_first_worker() and save_model_path and is_distributed_env():
                    fleet.save_inference_model(
                        self.exe, model_dir,
                        [feed.name for feed in self.input_data],
                        self.inference_target_var,
                        mode=2)

            if fleet.is_first_worker() and save_model_path and is_distributed_env():
                fleet.save_inference_model(
                    self.exe, model_dir,
                    [feed.name for feed in self.input_data],
                    self.inference_target_var,
                    mode=0)
Example #9
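    # InMemoryDataset flow: load samples into memory, globally shuffle them across the
    # fleet, train with exe.train_from_dataset, then optionally save the inference
    # model, persistables and cache model.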
    def do_dataset_training(self, fleet):
        train_file_list = ctr_dataset_reader.prepare_fake_data()

        exe = self.get_executor()
        exe.run(fluid.default_startup_program())
        fleet.init_worker()

        thread_num = 2
        batch_size = 128
        filelist = train_file_list

        # config dataset
        dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
        dataset.set_use_var(self.feeds)
        dataset.set_batch_size(batch_size)
        dataset.set_thread(thread_num)
        dataset.set_filelist(filelist)
        dataset.set_pipe_command('python ctr_dataset_reader.py')
        dataset.load_into_memory()

        dataset.global_shuffle(fleet, 12)  # TODO: make the shuffle thread count configurable
        shuffle_data_size = dataset.get_shuffle_data_size(fleet)
        local_data_size = dataset.get_shuffle_data_size()
        data_size_list = fleet.util.all_gather(local_data_size)
        print('after global_shuffle data_size_list: ', data_size_list)
        print('after global_shuffle data_size: ', shuffle_data_size)

        for epoch_id in range(1):
            pass_start = time.time()
            exe.train_from_dataset(program=fluid.default_main_program(),
                                   dataset=dataset,
                                   fetch_list=[self.avg_cost],
                                   fetch_info=["cost"],
                                   print_period=2,
                                   debug=int(os.getenv("Debug", "0")))
            pass_time = time.time() - pass_start
        dataset.release_memory()

        if os.getenv("SAVE_MODEL") == "1":
            model_dir = tempfile.mkdtemp()
            fleet.save_inference_model(exe, model_dir,
                                       [feed.name for feed in self.feeds],
                                       self.avg_cost)
            self.check_model_right(model_dir)
            shutil.rmtree(model_dir)

        dirname = os.getenv("SAVE_DIRNAME", None)
        if dirname:
            fleet.save_persistables(exe, dirname=dirname)

        cache_dirname = os.getenv("SAVE_CACHE_DIRNAME", None)
        if cache_dirname:
            fleet.save_cache_model(cache_dirname)
Example #10
    def run_worker(self):
        logger.info("Run Worker Begin")
        use_cuda = int(config.get("runner.use_gpu"))
        place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
        self.exe = paddle.static.Executor(place)

        with open(
                "./{}_worker_main_program.prototxt".format(
                    fleet.worker_index()), 'w+') as f:
            f.write(str(paddle.static.default_main_program()))
        with open(
                "./{}_worker_startup_program.prototxt".format(
                    fleet.worker_index()), 'w+') as f:
            f.write(str(paddle.static.default_startup_program()))

        self.exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        save_model_path = self.config.get("runner.model_save_path")
        if save_model_path and not os.path.exists(save_model_path):
            os.makedirs(save_model_path)

        reader_type = self.config.get("runner.reader_type", None)
        epochs = int(self.config.get("runner.epochs"))
        sync_mode = self.config.get("runner.sync_mode")

        for epoch in range(epochs):
            epoch_start_time = time.time()

            if sync_mode == "heter":
                self.heter_train_loop(epoch)
            elif reader_type == "QueueDataset":
                self.dataset_train_loop(epoch)
            elif reader_type == "DataLoader":
                self.dataloader_train_loop(epoch)
            elif reader_type is None or reader_type == "RecDataset":
                self.recdataset_train_loop(epoch)

            epoch_time = time.time() - epoch_start_time
            epoch_speed = self.example_nums / epoch_time
            logger.info(
                "Epoch: {}, using time {} second, ips {} {}/sec.".format(
                    epoch, epoch_time, epoch_speed, self.count_method))
            self.train_result_dict["speed"].append(epoch_speed)

            model_dir = "{}/{}".format(save_model_path, epoch)
            if fleet.is_first_worker(
            ) and save_model_path and is_distributed_env():
                fleet.save_inference_model(
                    self.exe, model_dir,
                    [feed.name
                     for feed in self.input_data], self.inference_target_var)
Example #11
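# Distributed skip-gram training with a sparse embedding table on the parameter servers
# (a_sync=True); only the first worker saves persistables at the end
# (F is assumed to be an alias for paddle.fluid).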
def main(args):
    paddle.set_device("cpu")
    paddle.enable_static()

    fleet.init()

    fake_num_nodes = 1
    py_reader, loss = StaticSkipGramModel(
        fake_num_nodes,
        args.neg_num,
        args.embed_size,
        sparse_embedding=True,
        shared_embedding=args.shared_embedding)

    optimizer = F.optimizer.Adam(args.learning_rate, lazy_mode=True)
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.a_sync = True
    optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
    optimizer.minimize(loss)

    # init and run server or worker
    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()

    if fleet.is_worker():
        place = paddle.CPUPlace()
        exe = paddle.static.Executor(place)
        exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        graph = build_graph(args)
        # bind gen
        train_ds = ShardedDataset(graph.nodes, args.epoch)
        collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                                   args.neg_num, args.neg_sample_type)
        data_loader = Dataloader(train_ds,
                                 batch_size=args.cpu_batch_size,
                                 shuffle=True,
                                 num_workers=args.sample_workers,
                                 collate_fn=collate_fn)
        py_reader.set_batch_generator(lambda: data_loader)

        train_loss = train(exe, paddle.static.default_main_program(),
                           py_reader, loss)
        fleet.stop_worker()

        if fleet.is_first_worker():
            fleet.save_persistables(exe, "./model",
                                    paddle.static.default_main_program())
Example #12
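    # Builds a CTR-style network with a sparse embedding in PS mode, then exercises
    # fleet.fleet.save() with string feeds, Variable feeds and no feed/fetch at all,
    # and reloads the result with fleet.load_model().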
    def test_ps_minimize(self):
        import paddle
        import paddle.distributed.fleet as fleet

        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["PADDLE_TRAINER_ID"] = "1"

        input_x = paddle.fluid.layers.data(
            name="x", shape=[32], dtype='float32')
        input_slot = paddle.fluid.layers.data(
            name="slot", shape=[1], dtype='int64')
        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')

        emb = paddle.fluid.layers.embedding(
            input=input_slot, size=[10, 9], is_sparse=True)
        input_x = paddle.concat(x=[input_x, emb], axis=1)
        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
        cost = paddle.fluid.layers.cross_entropy(
            input=prediction, label=input_y)
        avg_cost = paddle.fluid.layers.mean(x=cost)

        role = fleet.PaddleCloudRoleMaker(is_collective=False)
        fleet.init(role)

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.a_sync = False
        strategy.a_sync_configs = {"launch_barrier": False}

        optimizer = paddle.optimizer.SGD(learning_rate=0.001)
        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
        optimizer.minimize(avg_cost)

        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(paddle.static.default_startup_program())
        pe = fluid.ParallelExecutor(use_cuda=False, loss_name=avg_cost.name)
        compiled_prog = fluid.compiler.CompiledProgram(
            fluid.default_main_program())

        fleet.init_worker()
        fleet.fleet.save(dirname="/tmp", feed=['x', 'y'], fetch=[avg_cost])
        fleet.fleet.save(
            dirname="/tmp", feed=[input_x, input_y], fetch=[avg_cost])
        fleet.fleet.save(dirname="/tmp")

        fleet.load_model(path="/tmp", mode=0)
        fleet.load_model(path="/tmp", mode=1)
Example #13
def main(args):
    paddle.set_device("cpu")
    paddle.enable_static()
    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)

    if args.num_nodes is None:
        num_nodes = load(args.dataset).num_nodes
    else:
        num_nodes = args.num_nodes

    loss = StaticSkipGramModel(
        num_nodes, args.neg_num, args.embed_size, sparse=True)

    optimizer = F.optimizer.Adam(args.learning_rate, lazy_mode=True)
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.a_sync = True
    optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
    optimizer.minimize(loss)

    # init and run server or worker
    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()

    if fleet.is_worker():
        place = paddle.CPUPlace()
        exe = paddle.static.Executor(place)
        exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        graph = load(args.dataset)
        # bind gen
        train_ds = ShardedDataset(graph.nodes)
        collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                                   args.neg_num, args.neg_sample_type)
        data_loader = Dataloader(
            train_ds,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.sample_workers,
            collate_fn=collate_fn)

        for epoch in range(args.epoch):
            train_loss = train(exe,
                               paddle.static.default_main_program(),
                               data_loader, loss)
            log.info("Runing epoch:%s\t train_loss:%.6f", epoch, train_loss)
        fleet.stop_worker()
Example #14
    def test_a_sync_optimizer_trainer(self):
        os.environ["TRAINING_ROLE"] = "TRAINER"
        import paddle.distributed.fleet as fleet

        main_program = paddle.fluid.Program()
        startup_program = paddle.fluid.Program()

        paddle.fluid.framework.switch_main_program(main_program)
        paddle.fluid.framework.switch_startup_program(startup_program)

        fleet.init(role_maker.PaddleCloudRoleMaker())
        input_x = paddle.fluid.layers.data(name="x",
                                           shape=[32],
                                           dtype='float32')
        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')

        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
        prediction = paddle.fluid.layers.fc(input=[fc_2],
                                            size=2,
                                            act='softmax')
        cost = paddle.fluid.layers.cross_entropy(input=prediction,
                                                 label=input_y)
        avg_cost = paddle.fluid.layers.mean(x=cost)

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.a_sync = True
        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
        optimizer.minimize(avg_cost)

        prog = paddle.fluid.default_main_program()
        self.assertNotEqual(prog.global_block().ops[-1].type, "send_barrier")

        sends = 0
        sgds = 0
        for op in prog.global_block().ops:
            if op.type == "send":
                sends += 1
            if op.type == "sgd":
                sgds += 1
        self.assertEqual(sends, 7)
        self.assertEqual(sgds, 0)

        fleet.init_worker()
        time.sleep(8)
        fleet.stop_worker()
Example #15
    def do_pyreader_training(self, fleet):
        """
        do training using dataset, using fetch handler to catch variable
        Args:
            fleet(Fleet api): the fleet object of Parameter Server, define distribute training role
        """
        exe = self.get_executor()
        exe.run(fluid.default_startup_program())
        fleet.init_worker()

        batch_size = 4
        train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size)
        self.reader.decorate_sample_list_generator(train_reader)

        for epoch_id in range(1):
            self.reader.start()
            try:
                pass_start = time.time()
                while True:
                    loss_val = exe.run(program=fluid.default_main_program(),
                                       fetch_list=[self.avg_cost.name])
                    loss_val = np.mean(loss_val)
                    # TODO(randomly fail)
                    #   reduce_output = fleet.util.all_reduce(
                    #       np.array(loss_val), mode="sum")
                    #   loss_all_trainer = fleet.util.all_gather(float(loss_val))
                    #   loss_val = float(reduce_output) / len(loss_all_trainer)
                    message = "TRAIN ---> pass: {} loss: {}\n".format(
                        epoch_id, loss_val)
                    fleet.util.print_on_rank(message, 0)

                pass_time = time.time() - pass_start
            except fluid.core.EOFException:
                self.reader.reset()

        dirname = os.getenv("SAVE_DIRNAME", None)
        if dirname:
            fleet.save_persistables(exe, dirname=dirname)

        model_dir = tempfile.mkdtemp()
        fleet.save_inference_model(exe, model_dir,
                                   [feed.name for feed in self.feeds],
                                   self.avg_cost)
        self.check_model_right(model_dir)
        shutil.rmtree(model_dir)
Example #16
    def do_dataset_training_queuedataset(self, fleet):
        train_file_list = ctr_dataset_reader.prepare_fake_data()

        exe = self.get_executor()
        exe.run(fluid.default_startup_program())
        fleet.init_worker()

        thread_num = 2
        batch_size = 128
        filelist = train_file_list

        # config dataset
        dataset = paddle.distributed.QueueDataset()
        pipe_command = 'python ctr_dataset_reader.py'

        dataset.init(batch_size=batch_size,
                     use_var=self.feeds,
                     pipe_command=pipe_command,
                     thread_num=thread_num)

        dataset.set_filelist(filelist)

        for epoch_id in range(1):
            pass_start = time.time()
            dataset.set_filelist(filelist)
            exe.train_from_dataset(program=fluid.default_main_program(),
                                   dataset=dataset,
                                   fetch_list=[self.avg_cost],
                                   fetch_info=["cost"],
                                   print_period=2,
                                   debug=int(os.getenv("Debug", "0")))
            pass_time = time.time() - pass_start

        if os.getenv("SAVE_MODEL") == "1":
            model_dir = tempfile.mkdtemp()
            fleet.save_inference_model(exe, model_dir,
                                       [feed.name for feed in self.feeds],
                                       self.avg_cost)
            self.check_model_right(model_dir)
            shutil.rmtree(model_dir)

        dirname = os.getenv("SAVE_DIRNAME", None)
        if dirname:
            fleet.save_persistables(exe, dirname=dirname)
Example #17
    def run_trainer(self, role, strategy):
        place = fluid.core.CPUPlace()
        exe = fluid.Executor(place)

        fleet.init(role)
        avg_cost, x, z, y = self.net()
        optimizer = fluid.optimizer.SGD(0.01)
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(avg_cost)

        exe.run(fluid.default_startup_program())
        fleet.init_worker()

        train_reader = paddle.batch(self.fake_reader(), batch_size=24)
        feeder = fluid.DataFeeder(place=place, feed_list=[x, z, y])

        for batch_id, data in enumerate(train_reader()):
            exe.run(fluid.default_main_program(),
                    feed=feeder.feed(data),
                    fetch_list=[])

        fleet.stop_worker()
Example #18
    def test_communicator_async(self):
        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.WORKER,
            worker_num=2,
            server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])

        fleet.init(role)
        avg_cost = self.net()

        optimizer = fluid.optimizer.SGD(0.01)

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.a_sync = True
        strategy.a_sync_configs = {"launch_barrier": False}

        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(avg_cost)

        os.environ["TEST_MODE"] = "1"
        fleet.init_worker()
        time.sleep(10)
        fleet.stop_worker()
Example #19
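    # Online-learning driver: trains day by day and pass by pass, restores the latest
    # saved model on startup, and lets the first worker write checkpoint, delta ("xbox")
    # and base models together with their donefiles after each pass and day.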
    def run_online_worker(self):
        logger.info("Run Online Worker Begin")
        use_cuda = int(config.get("runner.use_gpu"))
        place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
        self.exe = paddle.static.Executor(place)

        with open("./{}_worker_main_program.prototxt".format(
                fleet.worker_index()), 'w+') as f:
            f.write(str(paddle.static.default_main_program()))
        with open("./{}_worker_startup_program.prototxt".format(
                fleet.worker_index()), 'w+') as f:
            f.write(str(paddle.static.default_startup_program()))

        self.exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        self.online_intervals = get_online_pass_interval(
            self.split_interval, self.split_per_pass, False)
        if is_local(self.save_model_path) and self.save_model_path and (
                not os.path.exists(self.save_model_path)):
            os.makedirs(self.save_model_path)

        last_day, last_pass, last_path, xbox_base_key = get_last_save_model(
            self.save_model_path, self.hadoop_client)
        logger.info(
            "get_last_save_model last_day = {}, last_pass = {}, last_path = {}, xbox_base_key = {}".
            format(last_day, last_pass, last_path, xbox_base_key))
        if last_day != -1 and fleet.is_first_worker():
            load_model(last_path, 0, self.hadoop_client)
        fleet.barrier_worker()

        day = self.start_day
        infer_first = True
        while int(day) <= int(self.end_day):
            logger.info("training a new day {}, end_day = {}".format(
                day, self.end_day))
            if last_day != -1 and int(day) < last_day:
                day = get_next_day(day)
                continue
            # base_model_saved = False
            for pass_id in range(1, 1 + len(self.online_intervals)):
                print(last_day, day, last_pass, pass_id)
                if (last_day != -1 and int(day) == last_day) and (
                        last_pass != -1 and int(pass_id) <= last_pass):
                    continue
                if self.save_first_base and fleet.is_first_worker():
                    self.save_first_base = False
                    last_base_day, last_base_path, tmp_xbox_base_key = \
                        get_last_save_xbox_base(self.save_model_path, self.hadoop_client)
                    logger.info(
                        "get_last_save_xbox_base, last_base_day = {}, last_base_path = {}, tmp_xbox_base_key = {}".
                        format(last_base_day, last_base_path,
                               tmp_xbox_base_key))
                    if int(day) > last_base_day:
                        xbox_base_key = int(time.time())
                        save_xbox_model(self.save_model_path, day, -1,
                                        self.exe, self.inference_feed_vars,
                                        self.inference_target_var,
                                        self.hadoop_client)
                        write_xbox_donefile(
                            output_path=self.save_model_path,
                            day=day,
                            pass_id=-1,
                            xbox_base_key=xbox_base_key,
                            client=self.hadoop_client)
                    elif int(day) == last_base_day:
                        xbox_base_key = tmp_xbox_base_key
                fleet.barrier_worker()

                logger.info("training a new day = {} new pass = {}".format(
                    day, pass_id))
                logger.info("Day:{}, Pass: {}, Prepare Dataset Begin.".format(
                    day, pass_id))
                begin_train = time.time()
                begin = time.time()
                dataset = self.wait_and_prepare_dataset(day, pass_id)
                end = time.time()
                read_data_cost = (end - begin) / 60.0
                logger.info("Prepare Dataset Done, using time {} mins.".format(
                    read_data_cost))

                infer_cost = 0
                infer_metric_cost = 0
                if infer_first:
                    infer_first = False
                else:
                    logger.info("Day:{}, Pass: {}, Infering Dataset Begin.".
                                format(day, pass_id))
                    begin = time.time()
                    self.dataset_infer_loop(dataset, day, pass_id)
                    end = time.time()
                    infer_cost = (end - begin) / 60.0
                    logger.info("Infering Dataset Done, using time {} mins.".
                                format(infer_cost))
                    begin = time.time()
                    metric_str = get_global_metrics_str(fluid.global_scope(),
                                                        self.metric_list, "")
                    logger.info("Day:{}, Pass: {}, Infer Global Metric: {}".
                                format(day, pass_id, metric_str))
                    clear_metrics(fluid.global_scope(), self.metric_list,
                                  self.metric_types)
                    end = time.time()
                    infer_metric_cost = (end - begin) / 60.0

                logger.info("Day:{}, Pass: {}, Training Dataset Begin.".format(
                    day, pass_id))
                begin = time.time()
                self.dataset_train_loop(dataset, day, pass_id,
                                        self.need_train_dump)
                end = time.time()
                avg_cost = get_avg_cost_mins(end - begin)
                get_max_cost_mins(end - begin)
                get_min_cost_mins(end - begin)
                train_cost = avg_cost
                logger.info("Training Dataset Done, using time {} mins.".
                            format(train_cost))

                begin = time.time()
                dataset.release_memory()
                end = time.time()
                release_cost = (end - begin) / 60.0

                begin = time.time()
                metric_str = get_global_metrics_str(fluid.global_scope(),
                                                    self.metric_list, "")
                logger.info("Day:{}, Pass: {}, Train Global Metric: {}".format(
                    day, pass_id, metric_str))
                clear_metrics(fluid.global_scope(), self.metric_list,
                              self.metric_types)
                end = time.time()
                metric_cost = (end - begin) / 60
                end_train = time.time()
                total_cost = (end_train - begin_train) / 60
                other_cost = total_cost - read_data_cost - train_cost - release_cost - metric_cost - infer_cost - infer_metric_cost
                log_str = "finished train epoch %d time cost:%s min job time cost" \
                            ":[read_data:%s min][train: %s min][metric: %s min][release: %s min]" \
                            "[infer:%s min][infer_metric: %s min][other:%s min]" \
                              % (pass_id, total_cost, read_data_cost, train_cost, metric_cost, release_cost, infer_cost, infer_metric_cost, other_cost)
                logger.info(log_str)

                if self.need_infer_dump:
                    prepare_data_start_time = time.time()
                    dump_dataset = self.wait_and_prepare_infer_dataset(day,
                                                                       pass_id)
                    prepare_data_end_time = time.time()
                    logger.info(
                        "Prepare Infer Dump Dataset Done, using time {} second.".
                        format(prepare_data_end_time -
                               prepare_data_start_time))

                    dump_start_time = time.time()
                    self.dataset_infer_loop(dump_dataset, day, pass_id, True)
                    dump_end_time = time.time()
                    logger.info(
                        "Infer Dump Dataset Done, using time {} second.".
                        format(dump_end_time - dump_start_time))

                    dump_dataset.release_memory()

                if fleet.is_first_worker():
                    if pass_id % self.checkpoint_per_pass == 0:
                        save_model(self.exe, self.save_model_path, day,
                                   pass_id)
                        write_model_donefile(
                            output_path=self.save_model_path,
                            day=day,
                            pass_id=pass_id,
                            xbox_base_key=xbox_base_key,
                            client=self.hadoop_client)
                    if pass_id % self.save_delta_frequency == 0:
                        last_xbox_day, last_xbox_pass, last_xbox_path, _ = get_last_save_xbox(
                            self.save_model_path, self.hadoop_client)
                        if int(day) < last_xbox_day or int(
                                day) == last_xbox_day and int(
                                    pass_id) <= last_xbox_pass:
                            log_str = "delta model exists"
                            logger.info(log_str)
                        else:
                            save_xbox_model(self.save_model_path, day, pass_id,
                                            self.exe, self.inference_feed_vars,
                                            self.inference_target_var,
                                            self.hadoop_client)  # 1 delta
                            write_xbox_donefile(
                                output_path=self.save_model_path,
                                day=day,
                                pass_id=pass_id,
                                xbox_base_key=xbox_base_key,
                                client=self.hadoop_client,
                                hadoop_fs_name=self.hadoop_fs_name,
                                monitor_data=metric_str)
                fleet.barrier_worker()

            logger.info("shrink table")
            begin = time.time()
            fleet.shrink()
            end = time.time()
            logger.info("shrink table done, cost %s min" % (
                (end - begin) / 60.0))

            if fleet.is_first_worker():
                last_base_day, last_base_path, last_base_key = get_last_save_xbox_base(
                    self.save_model_path, self.hadoop_client)
                logger.info(
                    "one epoch finishes, get_last_save_xbox, last_base_day = {}, last_base_path = {}, last_base_key = {}".
                    format(last_base_day, last_base_path, last_base_key))
                next_day = get_next_day(day)
                if int(next_day) <= last_base_day:
                    logger.info("batch model/base xbox model exists")
                else:
                    xbox_base_key = int(time.time())
                    save_xbox_model(self.save_model_path, next_day, -1,
                                    self.exe, self.inference_feed_vars,
                                    self.inference_target_var,
                                    self.hadoop_client)
                    write_xbox_donefile(
                        output_path=self.save_model_path,
                        day=next_day,
                        pass_id=-1,
                        xbox_base_key=xbox_base_key,
                        client=self.hadoop_client,
                        hadoop_fs_name=self.hadoop_fs_name,
                        monitor_data=metric_str)
                    save_batch_model(self.exe, self.save_model_path, next_day)
                    write_model_donefile(
                        output_path=self.save_model_path,
                        day=next_day,
                        pass_id=-1,
                        xbox_base_key=xbox_base_key,
                        client=self.hadoop_client)
            fleet.barrier_worker()
            day = get_next_day(day)
Example #20
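# Standalone CNN training script using a UserDefinedRoleMaker with hard-coded local
# endpoints; the Windows-style data paths come from the original author. Servers block
# in run_server(), workers run the train/test loop below.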
def fit():
    EPOCH_NUM = 3
    BATCH_SIZE = 128
    type_size = 10
    role = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.SERVER,
        worker_num=2,
        server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])

    fleet.init(role)
    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = True

    type_size = createDataList('F:/机器学习/CNN/train',
                               'D:/cnn/cnn.model.data' + "/")
    # data provider used for training
    train_reader = dataReader("D:/cnn/cnn.model.data/trainer.list")
    train_reader = paddle.batch(paddle.reader.shuffle(reader=train_reader,
                                                      buf_size=BATCH_SIZE *
                                                      100),
                                batch_size=BATCH_SIZE)
    test_reader = dataReader("D:/cnn/cnn.model.data/test.list")
    test_reader = paddle.batch(paddle.reader.shuffle(reader=test_reader,
                                                     buf_size=BATCH_SIZE *
                                                     100),
                               batch_size=BATCH_SIZE)
    data_shape = [3, 32, 32]
    paddle.enable_static()
    images = fluid.layers.data(name='images',
                               shape=data_shape,
                               dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    # get the classifier network
    predict = networkConfiguration(images, type_size)

    # define the loss function and accuracy
    cost = fluid.layers.cross_entropy(input=predict, label=label)  # cross entropy
    avg_cost = fluid.layers.mean(cost)  # mean over all elements of cost
    acc = fluid.layers.accuracy(input=predict, label=label)  # accuracy from predictions and labels
    # clone the test program before adding the optimizer ops
    test_program = fluid.default_main_program().clone(for_test=True)  # test program
    optimizer = fluid.optimizer.Adam(learning_rate=0.001)  # define the optimizer
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)
    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()

    elif fleet.is_worker():
        fleet.init_worker()
        ########## model training & evaluation ##########
        # create the Executor
        use_cuda = False  # whether to use CPU or GPU; use_cuda=False means CPU
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())
    print("完成")

    # define the data feeder
    feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
    for pass_id in range(EPOCH_NUM):
        # start training
        for batch_id, data in enumerate(train_reader()):  # iterate over train_reader
            train_cost, train_acc = exe.run(
                program=fluid.default_main_program(),  # run the main program
                feed=feeder.feed(data),  # feed one batch of data
                fetch_list=[avg_cost, acc])  # fetch loss and accuracy
            # print training progress every 20 batches
            if batch_id % 20 == 0:
                print('Pass:%d, Batch:%d, Cost:%0.5f, Accuracy:%0.5f' %
                      (pass_id, batch_id, train_cost[0], train_acc[0]))
        # start testing
        test_costs = []  # per-batch test losses
        test_accs = []  # per-batch test accuracies
        for batch_id, data in enumerate(test_reader()):
            test_cost, test_acc = exe.run(
                program=test_program,  # run the test program
                feed=feeder.feed(data),  # feed data
                fetch_list=[avg_cost, acc])  # fetch loss and accuracy
            test_costs.append(test_cost[0])  # record the loss of each batch
            test_accs.append(test_acc[0])  # record the accuracy of each batch

        test_cost = (sum(test_costs) / len(test_costs))  # average test loss
        test_acc = (sum(test_accs) / len(test_accs))  # average test accuracy
        print('Test:%d, Cost:%0.5f, ACC:%0.5f' %
              (pass_id, test_cost, test_acc))
    save(predict, "D:/cnn/cnn.model", exe)
Example #21

import os
import fleetx as X
import paddle.fluid as fluid
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker

configs = X.parse_train_configs()
role = role_maker.PaddleCloudRoleMaker()
fleet.init(role)
model = X.applications.MultiSlotCTR()
loader = model.load_multislot_from_file('./ctr_data/train_data')

dist_strategy = fleet.DistributedStrategy()
dist_strategy.a_sync = True

optimizer = fluid.optimizer.SGD(learning_rate=0.0001)

optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
optimizer.minimize(model.loss)

if fleet.is_server():
    fleet.init_server()
    fleet.run_server()
else:
    fleet.init_worker()
    trainer = X.Trainer(fluid.CPUPlace())
    trainer.fit(model, loader, epoch=10)
Example #22
def test_init_worker():
    """test_barrier_worker"""
    assert fleet.init_worker() is None
    print("{} ... ok".format(sys._getframe().f_code.co_name))
Example #23
    def test_init_worker(self):
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        if fleet.is_worker():
            fleet.init_worker()
Example #24
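# GraphSAGE-style node classification on the Reddit dataset with PGL: asynchronous
# parameter-server training (a_sync=True); workers build sharded dataloaders and run
# train / valid / test phases, the test pass reusing a clone of the main program.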
def main(args):
    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)
    data = pgl.dataset.RedditDataset(args.normalize, args.symmetry)
    log.info("Preprocess finish")
    log.info("Train Examples: %s" % len(data.train_index))
    log.info("Val Examples: %s" % len(data.val_index))
    log.info("Test Examples: %s" % len(data.test_index))
    log.info("Num nodes %s" % data.graph.num_nodes)
    log.info("Num edges %s" % data.graph.num_edges)
    log.info("Average Degree %s" % np.mean(data.graph.indegree()))

    graph = data.graph
    train_index = data.train_index
    val_index = data.val_index
    test_index = data.test_index

    train_label = data.train_label
    val_label = data.val_label
    test_label = data.test_label

    loss, acc = build_net(
        input_size=data.feature.shape[-1],
        num_class=data.num_classes,
        hidden_size=args.hidden_size,
        num_layers=len(args.samples))
    test_program = paddle.static.default_main_program().clone(for_test=True)

    strategy = fleet.DistributedStrategy()
    strategy.a_sync = True
    optimizer = paddle.fluid.optimizer.Adam(learning_rate=args.lr)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(loss)

    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()
    else:
        place = paddle.CPUPlace()
        exe = paddle.static.Executor(place)
        exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        train_ds = ShardedDataset(train_index, train_label)
        valid_ds = ShardedDataset(val_index, val_label)
        test_ds = ShardedDataset(test_index, test_label)

        collate_fn = partial(batch_fn, graph=graph, samples=args.samples)

        train_loader = Dataloader(
            train_ds,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.sample_workers,
            collate_fn=collate_fn)

        valid_loader = Dataloader(
            valid_ds,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.sample_workers,
            collate_fn=collate_fn)

        test_loader = Dataloader(
            test_ds,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.sample_workers,
            collate_fn=collate_fn)

        compiled_prog, cpu_num = setup_compiled_prog(loss)

        for epoch in tqdm.tqdm(range(args.epoch)):
            train_loss, train_acc = run(train_loader,
                                        data.feature,
                                        exe,
                                        compiled_prog,
                                        loss,
                                        acc,
                                        phase="train",
                                        cpu_num=cpu_num)

            valid_loss, valid_acc = run(valid_loader,
                                        data.feature,
                                        exe,
                                        test_program,
                                        loss,
                                        acc,
                                        phase="valid",
                                        cpu_num=1)

            log.info("Epoch %s Valid-Loss %s Valid-Acc %s" %
                     (epoch, valid_loss, valid_acc))
        test_loss, test_acc = run(test_loader,
                                  data.feature,
                                  exe,
                                  test_program,
                                  loss,
                                  acc,
                                  phase="test",
                                  cpu_num=1)
        log.info("Epoch %s Test-Loss %s Test-Acc %s" %
                 (epoch, test_loss, test_acc))

        fleet.stop_worker()
Example #25
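    # Communicator test for the GPU parameter server: a_sync_configs enables use_ps_gpu
    # and a one-line InMemoryDataset is fed through exe.train_from_dataset.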
    def test_communicator_ps_gpu(self):
        with open("test_communicator_ps_gpu.txt", "w") as f:
            data = "1 0.6 1 0.7\n"
            f.write(data)

        os.environ["PADDLE_PSERVER_NUMS"] = "2"
        os.environ["PADDLE_TRAINERS_NUM"] = "2"
        os.environ["POD_IP"] = "127.0.0.1"
        os.environ["PADDLE_PORT"] = "36001"
        os.environ["PADDLE_TRAINER_ID"] = "0"
        os.environ["PADDLE_TRAINERS_NUM"] = "2"
        os.environ[
            "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.2:36001"
        os.environ[
            "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002,127.0.0.2:36002"
        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["FLAGS_selected_gpus"] = "0"
        role = role_maker.PaddleCloudRoleMaker()

        fleet.init(role)
        x = fluid.layers.data(name='x', shape=[1], dtype='float32')
        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
        slots_vars = [x, y]

        cost = fluid.layers.square_error_cost(input=x, label=y)
        avg_cost = fluid.layers.mean(cost)

        optimizer = fluid.optimizer.Adam(0.01)

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.a_sync = True
        strategy.a_sync_configs = {
            "launch_barrier": False,
            "use_ps_gpu": 1,
        }
        startup_program = paddle.static.Program()
        main_program = paddle.static.Program()
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(avg_cost)

        dataset = paddle.distributed.InMemoryDataset()
        dataset.init(batch_size=32,
                     thread_num=1,
                     pipe_command="cat",
                     use_var=slots_vars)
        dataset.set_filelist(["test_communicator_ps_gpu.txt"])
        dataset.set_date("20211111")
        dataset.load_into_memory(is_shuffle=True)

        os.environ["TEST_MODE"] = "1"
        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(startup_program)
        main_program._fleet_opt = {"stat_var_names": [x.name]}
        fleet.init_worker()

        try:
            exe.train_from_dataset(main_program, dataset)
        except ImportError as e:
            pass
        except Exception as e:
            self.assertTrue(False)
        time.sleep(10)
        fleet.stop_worker()
        os.remove("./test_communicator_ps_gpu.txt")
Example #26
def main(args):
    paddle.set_device("cpu")
    paddle.enable_static()

    fleet.init()

    if args.num_nodes is None:
        num_nodes = load(args.dataset).num_nodes
    else:
        num_nodes = args.num_nodes

    loss = StaticSkipGramModel(num_nodes,
                               args.neg_num,
                               args.embed_size,
                               sparse=True)

    optimizer = F.optimizer.Adam(args.learning_rate, lazy_mode=True)
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.a_sync = True
    optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
    optimizer.minimize(loss)

    # init and run server or worker
    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()

    if fleet.is_worker():
        place = paddle.CPUPlace()
        exe = paddle.static.Executor(place)
        exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        graph = load(args.dataset)
        # bind gen
        train_ds = ShardedDataset(graph.nodes)
        collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                                   args.neg_num, args.neg_sample_type)
        data_loader = Dataloader(train_ds,
                                 batch_size=args.batch_size,
                                 shuffle=True,
                                 num_workers=args.sample_workers,
                                 collate_fn=collate_fn)

        cpu_num = int(os.environ.get('CPU_NUM', 1))
        if int(cpu_num) > 1:
            parallel_places = [paddle.CPUPlace()] * cpu_num
            exec_strategy = paddle.static.ExecutionStrategy()
            exec_strategy.num_threads = int(cpu_num)
            build_strategy = paddle.static.BuildStrategy()
            build_strategy.reduce_strategy = paddle.static.BuildStrategy.ReduceStrategy.Reduce
            compiled_prog = paddle.static.CompiledProgram(
                paddle.static.default_main_program()).with_data_parallel(
                    loss_name=loss.name,
                    places=parallel_places,
                    build_strategy=build_strategy,
                    exec_strategy=exec_strategy)
        else:
            compiled_prog = paddle.static.default_main_program()

        for epoch in range(args.epoch):
            train_loss = train(exe, compiled_prog, data_loader, loss)
            log.info("Runing epoch:%s\t train_loss:%.6f", epoch, train_loss)
        fleet.stop_worker()

        if fleet.is_first_worker():
            fleet.save_persistables(exe, "./model",
                                    paddle.static.default_main_program())
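Common skeleton (a minimal sketch distilled from the examples above rather than taken from any single one): build_network() is a placeholder for any static-graph model that returns a loss, and the script is assumed to be started by a parameter-server launcher such as fleetrun, which supplies the role environment variables read by fleet.init().

import paddle
import paddle.distributed.fleet as fleet

paddle.enable_static()
fleet.init()  # role is read from the launcher-provided environment variables

loss = build_network()  # placeholder: any static-graph network returning a loss Variable

strategy = fleet.DistributedStrategy()
strategy.a_sync = True  # asynchronous parameter-server training
optimizer = fleet.distributed_optimizer(
    paddle.optimizer.SGD(learning_rate=0.01), strategy=strategy)
optimizer.minimize(loss)

if fleet.is_server():
    fleet.init_server()  # optionally pass a directory to warm-start parameters
    fleet.run_server()   # blocks until the job is shut down
else:
    exe = paddle.static.Executor(paddle.CPUPlace())
    exe.run(paddle.static.default_startup_program())
    fleet.init_worker()  # start the communicator before any training step
    # ... training loop: exe.run(...) or exe.train_from_dataset(...) ...
    fleet.stop_worker()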