Example #1
    def test_pslib_1(self):
        """Test cases for pslib."""
        import paddle.fluid as fluid
        from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
        from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
        from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
        try:
            import netifaces
        except:
            print("warning: no netifaces, skip test_pslib_1")
            return
        os.environ["POD_IP"] = "127.0.0.1"
        os.environ["PADDLE_PORT"] = "36001"
        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
        os.environ["PADDLE_TRAINER_ID"] = "0"
        role_maker = GeneralRoleMaker()
        role_maker.generate_role()
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        fleet.init(role_maker)
        train_program = fluid.Program()
        startup_program = fluid.Program()
        scope = fluid.Scope()
        with fluid.program_guard(train_program, startup_program):
            show = fluid.layers.data(name="show", shape=[-1, 1], \
                dtype="int64", lod_level=1, append_batch_size=False)
            emb = fluid.layers.embedding(input=show, size=[1, 1], \
                is_sparse=True, is_distributed=True, \
                param_attr=fluid.ParamAttr(name="embedding"))
            fc = fluid.layers.fc(input=emb, size=1, act=None)
            label = fluid.layers.data(name="click", shape=[-1, 1], \
                dtype="int64", lod_level=1, append_batch_size=False)
            label_cast = fluid.layers.cast(label, dtype='float32')
            cost = fluid.layers.log_loss(fc, label_cast)

        strategy = {}
        strategy["embedding"] = {}
        strategy["embedding"]["sparse_accessor_class"] = "DownpourUnitAccessor"
        strategy["embedding"]["embed_sparse_optimizer"] = "naive"
        try:
            adam1 = fluid.optimizer.Adam(learning_rate=0.000005)
            adam1 = fleet.distributed_optimizer(adam1, strategy=strategy)
            adam1.minimize([cost], [scope])

            strategy["embedding"]["embed_sparse_optimizer"] = "adagrad"
            adam2 = fluid.optimizer.Adam(learning_rate=0.000005)
            adam2 = fleet.distributed_optimizer(adam2, strategy=strategy)
            adam2.minimize([cost], [scope])

            strategy["embedding"]["embed_sparse_optimizer"] = "adam"
            adam3 = fluid.optimizer.Adam(learning_rate=0.000005)
            adam3 = fleet.distributed_optimizer(adam3, strategy=strategy)
            adam3.minimize([cost], [scope])
        except:
            print("do not support pslib test, skip")
            return
Example #2
    def test_pslib_1(self):
        """Test cases for pslib."""
        import paddle.fluid as fluid
        from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
        from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
        from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker

        os.environ["POD_IP"] = "127.0.0.1"
        os.environ["PADDLE_PORT"] = "36001"
        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
        os.environ["PADDLE_TRAINER_ID"] = "0"
        role_maker = GeneralRoleMaker(
            init_timeout_seconds=100,
            run_timeout_seconds=100,
            http_ip_port="127.0.0.1:36003")
        #role_maker.generate_role()
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        #fleet.init(role_maker)
        train_program = fluid.Program()
        startup_program = fluid.Program()
        scope = fluid.Scope()
        with fluid.program_guard(train_program, startup_program):
            show = fluid.layers.data(name="show", shape=[-1, 1], \
                dtype="float32", lod_level=1, append_batch_size=False)
            fc = fluid.layers.fc(input=show, size=1, act=None)
            label = fluid.layers.data(name="click", shape=[-1, 1], \
                dtype="int64", lod_level=1, append_batch_size=False)
            label_cast = fluid.layers.cast(label, dtype='float32')
            cost = fluid.layers.log_loss(fc, label_cast)
        try:
            adam = fluid.optimizer.Adam(learning_rate=0.000005)
            adam = fleet.distributed_optimizer(adam)
            adam.minimize([cost], [scope])
            fleet.run_server()
            http_server_d = {}
            http_server_d["running"] = False
            size_d = {}
            role_maker._GeneralRoleMaker__start_kv_server(http_server_d, size_d)
        except:
            print("do not support pslib test, skip")
            return

        from paddle.fluid.incubate.fleet.base.role_maker import MockBarrier
        mb = MockBarrier()
        mb.barrier()
        mb.barrier_all()
        mb.all_reduce(1)
        mb.all_gather(1)
        os.environ["POD_IP"] = "127.0.0.1"
        os.environ["PADDLE_PORT"] = "36005"
        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36005"
        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36006"
        os.environ["PADDLE_IS_BARRIER_ALL_ROLE"] = "0"
        role_maker = GeneralRoleMaker(path="test_mock1")
        role_maker.generate_role()
Example #3
 def build_optimizer(self, params):
     """R
     """
     optimizer_conf = params['optimizer_conf']
     strategy = None
     if 'strategy' in optimizer_conf:
         strategy = optimizer_conf['strategy']
         stat_var_names = []
         metrics = params['metrics']
         for name in metrics:
             model_metrics = metrics[name]
             stat_var_names += [
                 model_metrics[metric]['var'].name
                 for metric in model_metrics
             ]
         strategy['stat_var_names'] = list(set(stat_var_names))
     # Build the optimizer by class name via getattr(); exec() cannot rebind a
     # local variable named `optimizer` in Python 3.
     optimizer_class = getattr(fluid.optimizer, optimizer_conf['class'])
     optimizer = optimizer_class(
         learning_rate=float(optimizer_conf['learning_rate']))
     optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
     return optimizer
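The layout of params is defined elsewhere in the project; the following is a purely hypothetical sketch, inferred only from the keys build_optimizer reads, with illustrative names and values (auc_var stands for a fluid Variable built by the model, trainer for the object that defines build_optimizer):

    # Hypothetical input for build_optimizer(); only the keys read above are shown.
    params = {
        "optimizer_conf": {
            "class": "Adam",                # resolved as fluid.optimizer.Adam
            "learning_rate": 5e-6,
            "strategy": {"use_cvm": True},  # optional; stat_var_names is filled in
        },
        "metrics": {
            "join": {                       # model name -> its metric dict
                "auc": {"var": auc_var},    # auc_var: a fluid Variable
            },
        },
    }
    optimizer = trainer.build_optimizer(params)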
Example #4
    def init(self, context):
        """R
        """
        self.model.train_net()
        optimizer = self.model.optimizer()

        optimizer = fleet.distributed_optimizer(optimizer,
                                                strategy={"use_cvm": False})
        optimizer.minimize(self.model.get_avg_cost())

        if fleet.is_server():
            context['status'] = 'server_pass'
        else:
            self.fetch_vars = []
            self.fetch_alias = []
            self.fetch_period = self.model.get_fetch_period()

            metrics = self.model.get_metrics()
            if metrics:
                self.fetch_vars = list(metrics.values())
                self.fetch_alias = list(metrics.keys())
            context['status'] = 'train_pass'
Example #5
    def test_dataset_fleet2(self):
        """
        Testcase for InMemoryDataset from create to run.
        """
        with open("test_in_memory_dataset2_run2_a.txt", "w") as f:
            data = "1 1 2 3 3 4 5 5 5 5 1 1\n"
            data += "1 2 2 3 4 4 6 6 6 6 1 2\n"
            data += "1 3 2 3 5 4 7 7 7 7 1 3\n"
            f.write(data)
        with open("test_in_memory_dataset2_run2_b.txt", "w") as f:
            data = "1 4 2 3 3 4 5 5 5 5 1 4\n"
            data += "1 5 2 3 4 4 6 6 6 6 1 5\n"
            data += "1 6 2 3 5 4 7 7 7 7 1 6\n"
            data += "1 7 2 3 6 4 8 8 8 8 1 7\n"
            f.write(data)

        train_program = fluid.Program()
        startup_program = fluid.Program()
        scope = fluid.Scope()
        from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
        with fluid.program_guard(train_program, startup_program):
            slots = ["slot1_ff", "slot2_ff", "slot3_ff", "slot4_ff"]
            slots_vars = []
            for slot in slots:
                var = fluid.layers.data(\
                    name=slot, shape=[1], dtype="float32", lod_level=1)
                slots_vars.append(var)
            fake_cost = \
                fluid.layers.elementwise_sub(slots_vars[0], slots_vars[-1])
            fake_cost = fluid.layers.mean(fake_cost)
        with fluid.scope_guard(scope):
            place = fluid.CPUPlace()
            exe = fluid.Executor(place)
            try:
                fleet.init()
            except ImportError as e:
                print("warning: no mpi4py")
            adam = fluid.optimizer.Adam(learning_rate=0.000005)
            try:
                adam = fleet.distributed_optimizer(
                    adam,
                    strategy={
                        "fs_uri": "fs_uri_xxx",
                        "fs_user": "******",
                        "fs_passwd": "fs_passwd_xxx",
                        "fs_hadoop_bin": "fs_hadoop_bin_xxx"
                    })
                adam.minimize([fake_cost], [scope])
            except AttributeError as e:
                print("warning: no mpi")
            except ImportError as e:
                print("warning: no mpi4py")
            exe.run(startup_program)
            dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
            dataset.set_batch_size(32)
            dataset.set_thread(3)
            dataset.set_filelist([
                "test_in_memory_dataset2_run2_a.txt",
                "test_in_memory_dataset2_run2_b.txt"
            ])
            dataset.set_pipe_command("cat")
            dataset.set_use_var(slots_vars)
            dataset.load_into_memory()
            try:
                dataset.global_shuffle(fleet)
            except:
                print("warning: catch expected error")
            fleet._opt_info = None
            fleet._fleet_ptr = None

        os.remove("./test_in_memory_dataset2_run2_a.txt")
        os.remove("./test_in_memory_dataset2_run2_b.txt")
Example #6
    def test_pslib_1(self):
        """Test cases for pslib."""
        import paddle.fluid as fluid
        from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
        from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
        from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker

        os.environ["POD_IP"] = "127.0.0.1"
        os.environ["PADDLE_PORT"] = "36001"
        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
        os.environ["PADDLE_TRAINER_ID"] = "0"
        role_maker = GeneralRoleMaker()
        #role_maker.generate_role()
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        #fleet.init(role_maker)
        train_program = fluid.Program()
        startup_program = fluid.Program()
        scope = fluid.Scope()
        with fluid.program_guard(train_program, startup_program):
            show = fluid.layers.data(name="show", shape=[-1, 1], \
                dtype="int64", lod_level=1, append_batch_size=False)
            emb = fluid.layers.embedding(input=show, size=[1, 1], \
                is_sparse=True, is_distributed=True, \
                param_attr=fluid.ParamAttr(name="embedding"))
            bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
            bow = fluid.layers.data_norm(input=bow, epsilon=1e-4, name="norm")
            fc = fluid.layers.fc(input=bow, size=1, act=None)
            label = fluid.layers.data(name="click", shape=[-1, 1], \
                dtype="int64", lod_level=1, append_batch_size=False)
            label_cast = fluid.layers.cast(label, dtype='float32')
            cost = fluid.layers.log_loss(fc, label_cast)
        try:
            adam = fluid.optimizer.Adam(learning_rate=0.000005)
            adam = fleet.distributed_optimizer(
                adam,
                strategy={
                    "embedding": {
                        "sparse_accessor_class": "DownpourSparseValueAccessor"
                    }
                })
            adam.minimize([cost], [scope])
            fleet.run_server()
        except:
            print("do not support pslib test, skip")
            return
        try:
            # worker should call these methods instead of server
            # the following is only for test when with_pslib=off
            def test_func():
                """
                it is only a test function
                """
                return True

            fleet._role_maker.is_first_worker = test_func
            fleet._role_maker._barrier_worker = test_func
            fleet.save_model("./model_000")
            fleet.save_one_table(0, "./model_001")
            fleet.save_one_table(0, "./model_002", prefix="hahaha")
            fleet.load_model("./model_0003")
            fleet.load_one_table(0, "./model_004")
        except:
            print("do not support pslib test, skip")
            return
Example #7
 def test_pslib_1(self):
     """Test cases for pslib."""
     import paddle.fluid as fluid
     from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
     from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
     from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
     try:
         import netifaces
     except:
         print("warning: no netifaces, skip test_pslib_1")
         return
     os.environ["POD_IP"] = "127.0.0.1"
     os.environ["PADDLE_PORT"] = "36001"
     os.environ["TRAINING_ROLE"] = "TRAINER"
     os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
     os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
     os.environ["PADDLE_TRAINER_ID"] = "0"
     role_maker = GeneralRoleMaker()
     role_maker.generate_role()
     place = fluid.CPUPlace()
     exe = fluid.Executor(place)
     fleet.init(role_maker)
     train_program = fluid.Program()
     startup_program = fluid.Program()
     scope = fluid.Scope()
     with fluid.program_guard(train_program, startup_program):
         show = fluid.layers.data(name="show", shape=[-1, 1], \
             dtype="float32", lod_level=1, append_batch_size=False)
         fc = fluid.layers.fc(input=show, size=1, act=None)
         label = fluid.layers.data(name="click", shape=[-1, 1], \
             dtype="int64", lod_level=1, append_batch_size=False)
         label_cast = fluid.layers.cast(label, dtype='float32')
         cost = fluid.layers.log_loss(fc, label_cast)
     try:
         adam = fluid.optimizer.Adam(learning_rate=0.000005)
         adam = fleet.distributed_optimizer(adam)
         adam.minimize([cost], [scope])
         fleet.run_server()
     except:
         print("do not support pslib test, skip")
         return
     fleet.clear_one_table(0)
     from paddle.fluid.incubate.fleet.base.role_maker import \
         MPISymetricRoleMaker
     try:
         role = MPISymetricRoleMaker()
         role._all_reduce([1], [2])
     except:
         print("catch expected error of not inited")
     try:
         role = MPISymetricRoleMaker()
         role._all_reduce([1], [2], "min")
     except:
         print("catch expected error of not inited")
     try:
         role = MPISymetricRoleMaker()
         role._all_reduce([1], [2], "max")
     except:
         print("catch expected error of not inited")
     try:
         role = MPISymetricRoleMaker()
         role._all_reduce([1], [2], "unknown")
     except:
         print("catch expected error of unknown type")
Example #8
    def test_pslib_2(self):
        """Test cases for pslib."""
        import paddle.fluid as fluid
        from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
        from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
        from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
        from paddle.fluid.incubate.fleet.base.role_maker import RoleMakerBase
        try:
            import netifaces
        except:
            print("warning: no netifaces, skip test_pslib_2")
            return
        os.environ["POD_IP"] = "127.0.0.1"
        os.environ["PADDLE_PORT"] = "36001"
        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
        os.environ["PADDLE_TRAINER_ID"] = "0"
        os.environ["PADDLE_TRAINERS_NUM"] = "1"
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        try:
            fleet.init(None)
        except:
            print("no mpi4py, skip test_pslib_2")
            return
        train_program = fluid.Program()
        startup_program = fluid.Program()
        scope = fluid.Scope()
        with fluid.program_guard(train_program, startup_program):
            show = fluid.layers.data(name="show", shape=[-1, 1], \
                dtype="float32", lod_level=1, append_batch_size=False)
            fc = fluid.layers.fc(input=show, size=1, act=None)
            label = fluid.layers.data(name="click", shape=[-1, 1], \
                dtype="int64", lod_level=1, append_batch_size=False)
            label_cast = fluid.layers.cast(label, dtype='float32')
            cost = fluid.layers.log_loss(fc, label_cast)
        try:
            adam = fluid.optimizer.Adam(learning_rate=0.000005)
            adam = fleet.distributed_optimizer(adam)
            adam.minimize([cost], [scope])
            fleet.run_server()
        except:
            print("do not support pslib test, skip")
            return
        os.environ["TRAINING_ROLE"] = "wrong"
        try:
            role1 = GeneralRoleMaker(path="./test_gloo_1")
            role1.generate_role()
        except:
            print("catch expected error of wrong TRAINING_ROLE")
        os.environ["TRAINING_ROLE"] = "PSERVER"
        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
        role2 = GeneralRoleMaker(path="./test_gloo_2")
        role2._finalize()
        role2._all_gather(1)
        role2._all_gather(1)
        role2._barrier_server()
        role2.all_gather(1)
        role3 = GeneralRoleMaker(path="./test_gloo_3")
        role3._worker_gather(1)
        role3._worker_gather(1)
        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
        role4 = GeneralRoleMaker(path="./test_gloo_4")
        role4._worker_gather(1)
        role4._get_rank()
        role4._get_size()
        role4._all_comm.init()
        role5 = GeneralRoleMaker(path="./test_gloo_5")
        role5.get_local_endpoint()
        role5.get_local_endpoint()
        role6 = GeneralRoleMaker(path="./test_gloo_6")
        role6.get_trainer_endpoints()
        role6.get_trainer_endpoints()
        role7 = GeneralRoleMaker(path="./test_gloo_7")
        role7.get_pserver_endpoints()
        role7.get_pserver_endpoints()
        role8 = GeneralRoleMaker(path="./test_gloo_8")
        role8.is_worker()
        role8.is_worker()
        role9 = GeneralRoleMaker(path="./test_gloo_9")
        role9.is_server()
        role9.is_server()
        role10 = GeneralRoleMaker(path="./test_gloo_10")
        role10.is_first_worker()
        role10.is_first_worker()
        role11 = GeneralRoleMaker(path="./test_gloo_11")
        role11.worker_index()
        role11.worker_index()
        role12 = GeneralRoleMaker(path="./test_gloo_12")
        role12.server_index()
        role12.server_index()
        role13 = GeneralRoleMaker(path="./test_gloo_13")
        role13.worker_num()
        role13.worker_num()
        role14 = GeneralRoleMaker(path="./test_gloo_14")
        role14.server_num()
        role14.server_num()
        role15 = GeneralRoleMaker(path="./test_gloo_15")
        role15._barrier_worker()
        role15._barrier_worker()
        role16 = GeneralRoleMaker(path="./test_gloo_16")
        role16._barrier_all()
        role16._barrier_all()
        role17 = GeneralRoleMaker(path="./test_gloo_17")
        role17._barrier_server()
        role17._barrier_server()
        role18 = GeneralRoleMaker(path="./test_gloo_18")
        role18._worker_num()
        role18._worker_num()
        role19 = GeneralRoleMaker(path="./test_gloo_19")
        role19._server_num()
        role19._server_num()
        role20 = GeneralRoleMaker(path="./test_gloo_20")
        a = [1]
        b = [0]
        role20._all_reduce(a, b)
        role21 = GeneralRoleMaker(path="./test_gloo_21")
        role21.all_reduce_worker([], [])
        role21.all_reduce_worker([], [])
        role21.barrier_worker()
        role21.barrier_all()
        role22 = GeneralRoleMaker(path="./test_gloo_22")
        role22._get_rank()
        role22._get_rank()
        os.environ["PADDLE_PSERVER_ID"] = "0"
        role23 = GeneralRoleMaker(path="./test_gloo_23")
        role23._get_size()
        role23._get_size()
        with open("test_fleet_gloo_role_maker_1.txt", "w") as f:
            data = "1 1 1 1\n"
            f.write(data)

        dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
        dataset.set_filelist(["test_fleet_gloo_role_maker_1.txt"])
        dataset.set_use_var([show, label])
        dataset.load_into_memory()
        dataset.get_memory_data_size(fleet)
        dataset.get_shuffle_data_size(fleet)
        os.remove("./test_fleet_gloo_role_maker_1.txt")

        class TmpClass():
            """
            dummy tmp class
            """

            def __init__(self):
                pass

            def all_reduce_worker(self, input, output):
                """
                dummy all reduce worker

                Args:
                    input(None): fake input
                    output(None): fake output
                """
                pass

            def barrier_worker(self):
                """
                dummy barrier worker
                """
                pass

        from paddle.fluid.incubate.fleet.base.fleet_base import Fleet

        class TmpFleet(Fleet):
            """
            dummy tmp fleet
            """

            def __init__(self):
                super(Fleet, self).__init__()
                self._role_maker = None

            def init_worker(self):
                """
                dummy init worker
                """
                pass

            def init_server(self, model_dir=None):
                """
                dummy init server

                Args:
                    model_dir(None): fake model_dir
                """
                pass

            def run_server(self):
                """
                dummy run server
                """
                pass

            def stop_worker(self):
                """
                dummy stop worker
                """
                pass

            def distributed_optimizer(self, optimizer, strategy=None):
                """
                dummy distributed optimizer
                
                Args:
                    optimizer(None): fake optimizer
                    strategy(None): fake strategy
                """
                pass

            def save_inference_model(self):
                """
                dummy save inference model
                """
                pass

            def save_persistables(self):
                """
                dummy save persistables
                """
                pass

        os.environ["TRAINING_ROLE"] = "TRAINER"
        tmp = TmpFleet()
        tmp._role_maker = TmpClass()
        tmp.all_reduce_worker([], [])
        tmp.barrier_worker()
        from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
        tmp = RoleMakerBase()
        tmp.all_gather(1)
        tmp.all_reduce_worker([], [])
        tmp.barrier_worker()
        tmp.barrier_all()
        from paddle.fluid.incubate.fleet.base.role_maker import \
            MPISymetricRoleMaker
        tmp1 = MPISymetricRoleMaker()
        tmp1.all_gather(1)
        tmp1.all_gather(1)
        tmp2 = MPISymetricRoleMaker()
        tmp2.all_reduce_worker([], [])
        tmp3 = MPISymetricRoleMaker()
        tmp3.barrier_worker()
        tmp3.barrier_worker()
        tmp4 = MPISymetricRoleMaker()
        tmp4.barrier_all()
        tmp4.barrier_all()
Example #9
    def test_pslib_1(self):
        """Test cases for pslib."""
        import paddle.fluid as fluid
        from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
        from paddle.fluid.incubate.fleet.parameter_server.pslib import \
            fleet_embedding, _prepare_params, _fleet_embedding, \
            _fleet_embedding_v2, FLEET_GLOBAL_DICT
        from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
        try:
            import netifaces
        except:
            print("warning: no netifaces, skip test_pslib_1")
            return
        os.environ["POD_IP"] = "127.0.0.1"
        os.environ["PADDLE_PORT"] = "36001"
        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
        os.environ["PADDLE_TRAINER_ID"] = "0"
        role_maker = GeneralRoleMaker()
        role_maker.generate_role()
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        fleet.init(role_maker)
        train_program = fluid.Program()
        startup_program = fluid.Program()
        scope = fluid.Scope()
        global FLEET_GLOBAL_DICT
        with fluid.program_guard(train_program, startup_program):
            show = fluid.layers.data(name="show", shape=[-1, 1], \
                dtype="int64", lod_level=1, append_batch_size=False)
            click = fluid.layers.data(name="click", shape=[-1, 1], \
                dtype="int64", lod_level=1, append_batch_size=False)
            with fleet_embedding(click_name=click.name):
                emb = fluid.layers.embedding(input=show, size=[1, 1], \
                    is_sparse=True, is_distributed=True, \
                    param_attr=fluid.ParamAttr(name="embedding"))
            emb = fluid.layers.data_norm(input=emb,
                                         name="a",
                                         epsilon=1e-4,
                                         param_attr={
                                             "batch_size": 1e4,
                                             "batch_sum_default": 0.0,
                                             "batch_square": 1e4
                                         })
            fc = fluid.layers.fc(input=emb, size=1, act=None)
            label = fluid.layers.data(name="click", shape=[-1, 1], \
                dtype="int64", lod_level=1, append_batch_size=False)
            label_cast = fluid.layers.cast(label, dtype='float32')
            cost = fluid.layers.log_loss(fc, label_cast)
        try:
            adam = fluid.optimizer.Adam(learning_rate=0.000005)
            adam = fleet.distributed_optimizer(
                adam,
                strategy={
                    "embedding": {
                        "sparse_accessor_class": "DownpourSparseValueAccessor"
                    }
                })
            adam.minimize([cost], [scope])
        except:
            print("do not support pslib test, skip")
            return
        FLEET_GLOBAL_DICT["cur_accessor"] = "DownpourCtrAccessor"
        try:
            _prepare_params(input=show, size=[1, 1])
        except:
            print("catch expected exception of param_attr=None")
        try:
            _prepare_params(input=show,
                            size=[1, 1],
                            param_attr=fluid.ParamAttr())
        except:
            print("catch expected exception of name=None")
        try:
            tmp = fluid.ParamAttr(name="embedding")
            _prepare_params(input=show, size=1, param_attr=tmp)
        except:
            print("catch expected exception of size not list")
        try:
            tmp = fluid.ParamAttr(name="embedding")
            _prepare_params(input=show, size=[-1, 12], param_attr=tmp)
        except:
            print("catch expected exception of size not equal")
        try:
            tmp = fluid.ParamAttr(name="embedding")
            _prepare_params(input=show,
                            size=[-1, 1],
                            param_attr=tmp,
                            is_sparse=False)
        except:
            print("catch expected exception of is_sparse=False")
        try:
            tmp = fluid.ParamAttr(name="embedding")
            _prepare_params(input=show, size=[-1, 1], param_attr=tmp, \
                            is_sparse=True, is_distributed=False)
        except:
            print("catch expected exception of is_distributed=False")
        try:
            _prepare_params(input=show, size=[-1, 1], \
                            param_attr=fluid.ParamAttr(name="embedding"), \
                            is_sparse=True, is_distributed=True, dtype="abc")
        except:
            print("catch expected exception of unknown dtype")
        try:
            FLEET_GLOBAL_DICT["emb_to_accessor"]["embedding"] = "unknown"
            tmp = fluid.ParamAttr(name="embedding")
            _prepare_params(input=show, size=[-1, 1], param_attr=tmp)
        except:
            print("catch expected exception of unknown accessor")
        FLEET_GLOBAL_DICT["cur_accessor"] = "DownpourCtrAccessor"
        try:
            _fleet_embedding(input=show, size=[-1, 1], is_sparse=True, \
                             is_distributed=True, dtype="float32", \
                             param_attr=fluid.ParamAttr(name="embedding"))
        except:
            print("catch expected exception of unknown accessor")
        try:
            _fleet_embedding_v2(input=show, size=[-1, 1], is_sparse=True, \
                                is_distributed=True, dtype="float32", \
                                param_attr=fluid.ParamAttr(name="embedding"))
        except:
            print("catch expected exception of unknown accessor")

        adam1 = fluid.optimizer.Adam(learning_rate=0.000005)
        adam1 = fleet.distributed_optimizer(
            adam1,
            strategy={
                "embedding": {
                    "sparse_accessor_class": "DownpourSparseValueAccessor"
                }
            })
        try:
            pre = FLEET_GLOBAL_DICT["emb_to_table"]
            FLEET_GLOBAL_DICT["emb_to_table"] = {}
            adam1.minimize([cost], [scope])
        except:
            FLEET_GLOBAL_DICT["emb_to_table"] = pre
            print("catch expected exception of empty emb_to_table")
        try:
            pre = FLEET_GLOBAL_DICT["emb_to_table"]
            FLEET_GLOBAL_DICT["emb_to_table"] = {}
            FLEET_GLOBAL_DICT["emb_to_table"]["emb1"] = 0
            adam1.minimize([cost], [scope])
        except:
            FLEET_GLOBAL_DICT["emb_to_table"] = pre
            print("catch expected exception of error emb_to_table")
        try:
            adam2 = fluid.optimizer.Adam(learning_rate=0.000005)
            adam2 = fleet.distributed_optimizer(adam2)
            adam2.supported_embedding_types = []
            adam2.minimize([cost], [scope])
        except:
            print("catch expected exception of embedding_types")
        try:
            adam3 = fluid.optimizer.Adam(learning_rate=0.000005)
            adam3 = fleet.distributed_optimizer(
                adam3,
                strategy={
                    "embedding": {
                        "sparse_accessor_class": "DownpourSparseValueAccessor",
                        "sparse_embedx_dim": 999
                    }
                })
            adam3.minimize([cost], [scope])
        except:
            print("catch expected exception of embedx_dim error")

        try:
            adam4 = fluid.optimizer.Adam(learning_rate=0.000005)
            adam4 = fleet.distributed_optimizer(
                adam4,
                strategy={
                    "embedding": {
                        "sparse_accessor_class": "DownpourCtrAccessor",
                        "sparse_embedx_dim": 999
                    }
                })
            adam4.minimize([cost], [scope])
        except:
            print("catch expected exception of embedx_dim error")
        train_program1 = fluid.Program()
        startup_program1 = fluid.Program()
        FLEET_GLOBAL_DICT["emb_to_accessor"] = {}
        with fluid.program_guard(train_program1, startup_program1):
            show = fluid.layers.data(name="show", shape=[-1, 1], \
                dtype="int64", lod_level=1, append_batch_size=False)
            with fleet_embedding(click_name=click.name):
                emb = fluid.layers.embedding(input=show, size=[1, 1], \
                    is_sparse=True, is_distributed=True, \
                    param_attr=fluid.ParamAttr(name="embedding"))
            with fleet_embedding(click_name=click.name):
                emb1 = fluid.embedding(input=show, size=[1, 1], \
                    is_sparse=True, is_distributed=True, \
                    param_attr=fluid.ParamAttr(name="embedding"))
Example #10
    thread_stat_var_names += [
        join_common_model.common_auc_stat_list[2].name,
        join_common_model.common_auc_stat_list[3].name
    ]
    thread_stat_var_names += [
        update_model.auc_stat_list[2].name, update_model.auc_stat_list[3].name
    ]
    thread_stat_var_names += [
        i.name for i in join_common_model.join_metric_list +
        join_common_model.common_metric_list + update_model.metric_list
    ]
    thread_stat_var_names = list(set(thread_stat_var_names))
    config_fleet.config['stat_var_names'] = thread_stat_var_names

    adam = fluid.optimizer.Adam(learning_rate=0.000005)
    adam = fleet.distributed_optimizer(adam, strategy=config_fleet.config)
    # adam = fleet.distributed_optimizer(adam, strategy={"use_cvm" : True, "adjust_ins_weight" : adjust_ins_weight, "scale_datanorm" : 1e-4, "dump_slot": True, "stat_var_names": thread_stat_var_names})
    # adam = fleet.distributed_optimizer(adam, strategy={"use_cvm" : True, "adjust_ins_weight" : adjust_ins_weight, "scale_datanorm" : 1e-4, "dump_slot": True, "stat_var_names": thread_stat_var_names, "fleet_desc_file": "reqi_fleet_desc"})
    adam.minimize([join_common_model.joint_cost, update_model.avg_cost],
                  [scope2, scope3])

    join_common_model._train_program._fleet_opt["program_configs"][str(
        id(join_common_model.joint_cost.block.program))]["push_sparse"] = []

    join_save_params = [
        "join.batch_size", "join.batch_sum", "join.batch_square_sum",
        "join_0.w_0", "join_0.b_0", "join_1.w_0", "join_1.b_0", "join_2.w_0",
        "join_2.b_0", "join_3.w_0", "join_3.b_0", "join_4.w_0", "join_4.b_0",
        "join_5.w_0", "join_5.b_0", "join_6.w_0", "join_6.b_0", "join_7.w_0",
        "join_7.b_0"
    ]
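The commented-out calls above show which strategy keys config_fleet.config is expected to carry. A hypothetical sketch of such a dict, with keys taken from those comments and values purely illustrative (adjust_ins_weight is built elsewhere in the original script):

    config = {
        "use_cvm": True,
        "scale_datanorm": 1e-4,
        "dump_slot": True,
        "stat_var_names": thread_stat_var_names,
        # "adjust_ins_weight": adjust_ins_weight,  # dict built elsewhere
        # "fleet_desc_file": "reqi_fleet_desc",    # optional, per the second comment
    }
    adam = fleet.distributed_optimizer(adam, strategy=config)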
Example #11
    def do_training(self, args=None):
        """do training"""
        avg_cost = self.net()
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        fleet.init()
        # optimizer
        optimizer = fluid.optimizer.Adam(learning_rate=0.001)
        # wrap with fleet.distributed_optimizer to add the distributed strategy and multi-node optimization
        optimizer = fleet.distributed_optimizer(optimizer, strategy={"fleet_desc_file": "./thirdparty/pslib/fleet_desc.prototxt"})
        optimizer.minimize(avg_cost)
        train_info = []
        # start the server
        if fleet.is_server():
            fleet.init_server()
            fleet.run_server()
        # start the worker
        if fleet.is_worker():
            train_data_path = 'thirdparty/data/dist_data/pslib/train_data'
            train_data_files = []
            for filename in os.listdir(train_data_path):
                train_data_files.append(os.path.join(train_data_path, filename))
            # fleet dataset
            label = fluid.layers.data(name="click", shape=[-1, 1], dtype="int64", lod_level=1, append_batch_size=False)
            data = fluid.layers.data(name="1", shape=[1], dtype="int64", lod_level=1)
            dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
            dataset.set_use_var([label, data])
            dataset.set_pipe_command("./python/bin/python ./thirdparty/pslib/dataset_generator.py")
            dataset.set_batch_size(32)
            dataset.set_thread(3)
            dataset.set_filelist(train_data_files)
            # load the data into memory
            dataset.load_into_memory()
            # local shuffle
            dataset.local_shuffle()
            # initialize the worker config
            fleet.init_worker()
            exe.run(fluid.default_startup_program())
            PASS_NUM = 1
            for pass_id in range(PASS_NUM):
                global var_dict
                var_dict = {"loss": avg_cost}

                class FetchVars(fluid.executor.FetchHandler):
                    def __init__(self, var_dict=None, period_secs=2):
                        super(FetchVars, self).__init__(var_dict, period_secs=period_secs)

                    def handler(self, res_dict):
                        train_info.extend(res_dict["loss"])
                        print(train_info)

                exe.train_from_dataset(
                    program=fluid.default_main_program(),
                    dataset=dataset,
                    fetch_handler=FetchVars(var_dict))
            dataset.release_memory()
            fleet.shrink_sparse_table()
            fleet.shrink_dense_table(0.01, 11)
            fleet.print_table_stat(0)
            fleet.clear_one_table(0)
            fleet.clear_model()
        fleet.stop_worker()
        return train_info
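The pipe command above refers to ./thirdparty/pslib/dataset_generator.py, which is not shown. Below is a minimal sketch of what such a generator script might look like, assuming the fluid MultiSlotDataGenerator API and the two slots ("click" and "1") declared in do_training(); the raw line format is illustrative only:

    # Hypothetical dataset_generator.py; the real file is part of the original project.
    import paddle.fluid.incubate.data_generator as dg

    class PslibGenerator(dg.MultiSlotDataGenerator):
        def generate_sample(self, line):
            def reader():
                fields = line.strip().split(' ')
                label = [int(fields[0])]                  # first column: click label
                feasigns = [int(x) for x in fields[1:]]   # remaining columns: slot "1"
                yield [("click", label), ("1", feasigns)]
            return reader

    if __name__ == "__main__":
        PslibGenerator().run_from_stdin()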
Example #12
    def test_in_memory_dataset_run_fleet(self):
        """
        Testcase for InMemoryDataset from create to run.
        """
        with open("test_in_memory_dataset_run_fleet_a.txt", "w") as f:
            data = "1 1 1 2 2 3 3 4 5 5 5 5 1 1\n"
            data += "1 0 1 3 2 3 4 4 6 6 6 6 1 2\n"
            data += "1 1 1 4 2 3 5 4 7 7 7 7 1 3\n"
            f.write(data)
        with open("test_in_memory_dataset_run_fleet_b.txt", "w") as f:
            data = "1 0 1 5 2 3 3 4 5 5 5 5 1 4\n"
            data += "1 1 1 6 2 3 4 4 6 6 6 6 1 5\n"
            data += "1 0 1 7 2 3 5 4 7 7 7 7 1 6\n"
            data += "1 1 1 8 2 3 6 4 8 8 8 8 1 7\n"
            f.write(data)

        slots = ["click", "slot1", "slot2", "slot3", "slot4"]
        slots_vars = []
        for slot in slots:
            var = fluid.layers.data(name=slot,
                                    shape=[1],
                                    dtype="int64",
                                    lod_level=1)
            slots_vars.append(var)
        click = slots_vars[0]
        embs = []
        for slot in slots_vars[1:3]:
            with fleet_embedding(click_name=click.name):
                emb = fluid.layers.embedding(input=slot, size=[-1, 11], \
                    is_sparse=True, is_distributed=True, \
                    param_attr=fluid.ParamAttr(name="embedding"))
                embs.append(emb)
        for slot in slots_vars[3:5]:
            with fleet_embedding(click_name=click.name):
                emb = fluid.embedding(input=slot, size=[-1, 11], \
                    is_sparse=True, is_distributed=True, \
                    param_attr=fluid.ParamAttr(name="embedding"))
                emb = fluid.layers.reshape(emb, [-1, 11])
                embs.append(emb)
        concat = fluid.layers.concat([embs[0], embs[3]], axis=1)
        fc = fluid.layers.fc(input=concat, size=1, act=None)
        label_cast = fluid.layers.cast(slots_vars[1], dtype='float32')
        cost = fluid.layers.log_loss(fc, label_cast)
        cost = fluid.layers.mean(cost)

        try:
            fleet.init()
            adam = fluid.optimizer.Adam(learning_rate=0.000005)
            adam = fleet.distributed_optimizer(adam)
            scope = fluid.Scope()
            adam.minimize([cost], [scope])
        except:
            print("do not support pslib test, skip")
            return

        dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
        dataset.set_batch_size(1)
        dataset.set_thread(2)
        dataset.set_filelist([
            "test_in_memory_dataset_run_fleet_a.txt",
            "test_in_memory_dataset_run_fleet_b.txt"
        ])
        dataset.set_pipe_command("cat")
        dataset.set_use_var(slots_vars)
        dataset.load_into_memory()

        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(fluid.default_startup_program())
        exe.train_from_dataset(fluid.default_main_program(), dataset)
        fleet._opt_info["stat_var_names"] = ["233"]
        exe.infer_from_dataset(fluid.default_main_program(), dataset)
        fleet._opt_info = None
        fleet._fleet_ptr = None
        os.remove("./test_in_memory_dataset_run_fleet_a.txt")
        os.remove("./test_in_memory_dataset_run_fleet_b.txt")