Example #1
    def __init__(self, program_func, optimizer, param_path=None, place=None):
        # 1. we need to generate a framework.Program by calling
        # program_func. Reference: fluid.program_guard in
        # test_word2vec.py
        self.scope = core.Scope()

        self.startup_program = framework.Program()
        self.train_program = framework.Program()

        with framework.program_guard(self.train_program, self.startup_program):
            loss = program_func()
            if not isinstance(optimizer, opt_module.Optimizer):
                raise TypeError(
                    "The optimizer should be an instance of Optimizer")

            optimize_ops, params_grads = optimizer.minimize(loss)

        self.place = Trainer._check_and_get_place(place)

        self.dist_transpile_if_necessary(optimize_ops, params_grads)

        # 2. move the default_main_program to self.program and run the
        # default_startup program on an empty core.Scope()
        # Run startup program
        with self._prog_and_scope_guard():
            exe = executor.Executor(self.place)
            exe.run(self.startup_program)

        if param_path:
            # load params from param_path into scope
            io.load_persistables(exe, dirname=param_path)
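
For context, here is a minimal usage sketch of this constructor. It assumes the high-level Trainer API shown above; linear_net and its layer sizes are hypothetical stand-ins for a real model.

import paddle.fluid as fluid

# Hypothetical program_func: builds the network inside the Trainer's
# program_guard and returns the loss Variable.
def linear_net():
    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    y_predict = fluid.layers.fc(input=x, size=1)
    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
    return fluid.layers.mean(cost)

trainer = Trainer(
    program_func=linear_net,
    optimizer=fluid.optimizer.SGD(learning_rate=0.01),
    place=fluid.CPUPlace())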
Example #2
    def __init__(self, infer_func, param_path, place=None, parallel=False):
        """
        :param infer_func: a function that will return predict Variable
        :param param_path: the path where the inference model is saved by fluid.io.save_params
        :param place: place to do the inference
        :param parallel: use parallel_executor to run the inference, it will use multi CPU/GPU.
        """
        self.param_path = param_path
        self.scope = core.Scope()
        self.parallel = parallel
        self.place = check_and_get_place(place)

        self.inference_program = framework.Program()
        with framework.program_guard(self.inference_program):
            with unique_name.guard():
                self.predict_var = infer_func()

        with self._prog_and_scope_guard():
            # load params from param_path into scope
            io.load_params(executor.Executor(self.place), param_path)

        if parallel:
            with self._prog_and_scope_guard():
                self.exe = parallel_executor.ParallelExecutor(
                    use_cuda=isinstance(self.place, core.CUDAPlace),
                    loss_name=self.predict_var.name)
        else:
            self.exe = executor.Executor(self.place)
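
A minimal usage sketch for this constructor, assuming the same high-level API; inference_net, the model directory, and the input shape are hypothetical.

import numpy as np
import paddle.fluid as fluid

# Hypothetical infer_func: rebuilds the prediction network and returns
# the predict Variable; parameters are loaded from param_path.
def inference_net():
    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    return fluid.layers.fc(input=x, size=1)

inferencer = Inferencer(
    infer_func=inference_net,
    param_path="./my_paddle_model",
    place=fluid.CPUPlace())

# Feed a batch keyed by the data layer's name.
results = inferencer.infer({'x': np.random.rand(1, 13).astype('float32')})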
Example #3
    def __init__(self,
                 train_func,
                 optimizer,
                 param_path=None,
                 place=None,
                 parallel=False):
        self.__stop = False
        self.parallel = parallel
        # 1. we need to generate a framework.Program by calling
        # train_func. Reference: fluid.program_guard in
        # test_word2vec.py
        if not isinstance(optimizer, opt_module.Optimizer):
            raise TypeError("The optimizer should be an instance of Optimizer")

        self.scope = core.Scope()

        self.startup_program = framework.Program()
        self.train_program = framework.Program()

        with framework.program_guard(self.train_program, self.startup_program):
            program_func_outs = train_func()
            self.train_func_outputs = program_func_outs if isinstance(
                program_func_outs, list) else [program_func_outs]
            self.test_program = self.train_program.clone()
            # The optimizer was already validated above.
            # The first element of program_func_outs is loss.
            loss = self.train_func_outputs[0]
            optimize_ops, params_grads = optimizer.minimize(loss)

        self.place = check_and_get_place(place)

        self._dist_transpile_if_necessary(optimize_ops, params_grads)

        # 2. move the default_main_program to self.program and run the
        # default_startup program on an empty core.Scope()
        # Run startup program
        with self._prog_and_scope_guard():
            exe = executor.Executor(self.place)
            exe.run(self.startup_program)

        if param_path:
            # load params from param_path into scope
            io.load_persistables(exe, dirname=param_path)
Example #4
def _load_lookup_table_vars(executor, dirname, program, pserver_id,
                            table_name):
    """
    The parameter server loads its shard of the lookup table from a
    local file into a SelectedRows variable.

    Args:
        executor(Executor): The executor to run for loading persistable variables
        dirname(str): The directory path
        program(Program): Find the variable named table_name in this program
        pserver_id(int): the serial number in pserver_endpoints list
        table_name(str): lookup table name

    Returns:
        None

    Examples:
        .. code-block:: python

            exe = fluid.Executor(fluid.CPUPlace())
            dirname = "./checkpoints/checkpoint_9/"
            prog = fluid.default_main_program()
            pserver_id = 1
            table_name = "share_w"
            _load_lookup_table_vars(executor=exe,
                    dirname=dirname, program=prog, pserver_id=pserver_id,
                    table_name=table_name)
    """

    lookup_table_var = None
    for var in program.list_vars():
        if var.name == table_name:
            lookup_table_var = var
            break

    assert lookup_table_var is not None

    lookup_table_dir = os.path.join(dirname, LOOKUP_TABLE_DIR)
    table_file = table_name + CHECKPOINT_SEPARATOR + str(pserver_id)

    load_prog = framework.Program()
    load_block = load_prog.global_block()

    load_block.append_op(
        type='load',
        inputs={},
        outputs={'Out': [lookup_table_var]},
        attrs={'file_path': os.path.join(lookup_table_dir, table_file)})

    executor.run(load_prog)
Example #5
def _save_pserver_vars_by_notify(executor, dirname, lookup_table,
                                 ps_endpoint_list):
    """
    This function will send checkpoint notify message from Trainer 0
    to all the pservers.
    The checkpoint notify message contains lookup table name, 
    the absolute path on pserver to save lookup_table.

    Args:
        executor(Executor): The executor to run for send checkpoint notify.
        dirname(str): The folder where to save checkpoints.
        lookup_table(string): the lookup table name, when use distribute
            lookup table, we can get lookup table name by DistributeTranspiler.
            table_name 
        ps_endpoint_list(list): the parameter server ip:port list.  
            when use distribute lookup table, we can get ps_endpoint_list by 
            distribute arguments.
    Return:
        None
    
    Examples:
        .. code-block:: python

            exe = fluid.Executor(fluid.CPUPlace())
            param_path = "./my_paddle_model"
            prog = fluid.default_main_program()
            table_name = "share_w"
            ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"]

            _save_pserver_vars_by_notify(executor=exe,
                    dirname=param_path, lookup_table=table_name, 
                    ps_endpoint_list=ps_endpoints)
    """
    cur_dir = _get_lookuptable_dir(dirname)

    checkpoint_notify_program = framework.Program()
    checkpoint_notify_block = checkpoint_notify_program.global_block()

    attrs = {}
    attrs['epmap'] = ps_endpoint_list
    attrs['dir'] = cur_dir
    attrs['lookup_table'] = lookup_table

    checkpoint_notify_block.append_op(type='checkpoint_notify',
                                      inputs={},
                                      outputs={},
                                      attrs=attrs)
    executor.run(checkpoint_notify_program)
Example #6
    def __init__(self, network_func, param_path=None, place=None):
        # 1. we need to generate a framework.Program by calling
        # network_func. Reference: fluid.program_guard in test_word2vec.py

        # 2. move the default_main_program to self.program.

        # 3. run the default_startup program.

        # 4. load params from param_path into scope
        self.scope = core.Scope()
        self.place = place
        self.startup_program = framework.Program()
        # TODO: generate the startup_program with network_func

        exe = executor.Executor(place)
        exe.run(self.startup_program, scope=self.scope)

        if param_path:
            # load params from param_path into scope
            io.load_persistables(exe, dirname=param_path)
Example #7
    def __init__(self, infer_func, param_path, place=None, parallel=False):
        self.param_path = param_path
        self.scope = core.Scope()
        self.parallel = parallel
        self.place = check_and_get_place(place)

        self.inference_program = framework.Program()
        with framework.program_guard(self.inference_program):
            with unique_name.guard():
                self.predict_var = infer_func()

        with self._prog_and_scope_guard():
            # load params from param_path into scope
            io.load_params(executor.Executor(self.place), param_path)

        if parallel:
            with self._prog_and_scope_guard():
                self.exe = parallel_executor.ParallelExecutor(
                    use_cuda=isinstance(self.place, core.CUDAPlace),
                    loss_name=self.predict_var.name)
        else:
            self.exe = executor.Executor(self.place)

        self.inference_program = self.inference_program.clone(for_test=True)
Example #8
    def __init__(self,
                 train_func,
                 optimizer_func,
                 param_path=None,
                 place=None,
                 parallel=False,
                 checkpoint_config=None):
        self.__stop = False
        self.parallel = parallel

        # config for checkpoint
        # only chief worker will save variables
        self.trainer_id = 0
        self.checkpoint_cfg = checkpoint_config
        if self.checkpoint_cfg:
            assert isinstance(self.checkpoint_cfg, CheckpointConfig)
            serial = io.get_latest_checkpoint_serial(
                self.checkpoint_cfg.checkpoint_dir)
            self.checkpoint_cfg.load_serial = serial if serial >= 0 else None

        self.scope = core.Scope()

        # 1. we need to generate a framework.Program by calling
        # train_func. Reference: fluid.program_guard in
        # test_word2vec.py

        self.startup_program = framework.Program()
        self.train_program = framework.Program()

        with framework.program_guard(self.train_program, self.startup_program):
            program_func_outs = train_func()
            self.train_func_outputs = program_func_outs if isinstance(
                program_func_outs, list) else [program_func_outs]
            self.test_program = self.train_program.clone(for_test=True)

            # The first element of program_func_outs is loss.
            loss = self.train_func_outputs[0]

            optimizer = optimizer_func()
            if not isinstance(optimizer, opt_module.Optimizer):
                raise TypeError(
                    "The optimizer should be an instance of Optimizer")
            optimize_ops, params_grads = optimizer.minimize(loss)

        self.place = check_and_get_place(place)

        self._dist_transpile_if_necessary(optimize_ops, params_grads)

        # 2. move the default_main_program to self.program and run the
        # default_startup program on an empty core.Scope()
        # Run startup program
        with self._prog_and_scope_guard():
            exe = executor.Executor(self.place)
            exe.run(self.startup_program)

        if self.checkpoint_cfg and self.checkpoint_cfg.load_serial is not None:
            with self._prog_and_scope_guard():
                exe = executor.Executor(self.place)
                io.load_checkpoint(exe, self.checkpoint_cfg.checkpoint_dir,
                                   self.checkpoint_cfg.load_serial,
                                   self.startup_program)

            if not self.checkpoint_cfg.is_pserver:
                epoch_id, step_id = io.load_trainer_args(
                    self.checkpoint_cfg.checkpoint_dir,
                    self.checkpoint_cfg.load_serial, self.trainer_id,
                    self._get_checkpoint_load_args())
                self.checkpoint_cfg.epoch_id = int(epoch_id)
                self.checkpoint_cfg.step_id = int(step_id)

        if param_path and os.path.isdir(param_path):
            # load params from param_path into scope
            io.load_persist_vars_without_grad(
                exe, dirname=param_path, program=self.startup_program)
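
Putting Example #8 together, a minimal sketch of constructing a checkpointing Trainer; it reuses the hypothetical linear_net from the sketch after Example #1 and assumes the CheckpointConfig class named in the code above.

import paddle.fluid as fluid

def optimizer_func():
    # Called inside the Trainer's program_guard; must return an Optimizer,
    # or the constructor raises TypeError.
    return fluid.optimizer.SGD(learning_rate=0.01)

# With checkpoint_config set, the constructor restores the latest
# checkpoint serial from checkpoint_dir, if one exists.
trainer = Trainer(
    train_func=linear_net,
    optimizer_func=optimizer_func,
    place=fluid.CPUPlace(),
    checkpoint_config=CheckpointConfig(checkpoint_dir="./checkpoints"))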