예제 #1
0
 def minimize(self,
              loss,
              startup_program=None,
              parameter_list=None,
              no_grad_set=None):
     """
     Build the Downpour parameter-server description for a single loss.

     Appends the backward operators for ``loss``, registers one sparse
     table (index 0) and one dense table (index 1, holding every
     parameter/gradient pair) on both the server and the worker
     descriptions, and packs both descriptions into a
     ``pslib.PSParameter``.

     Args:
         loss(Variable): loss variable defined by user
         startup_program(Program): startup program defined by user
             (accepted but not read by this method)
         parameter_list(str list): parameter names defined by users,
             forwarded to append_backward
         no_grad_set(set): a set of variables defined by users that do
             not need gradient computation, forwarded to append_backward
     Returns:
         [ps_param, worker_skipped_ops]
         ps_param: parameter server protobuf desc
         worker_skipped_ops: operator names that need
         to be skipped during execution
     """
     # (param, grad) pairs ordered by parameter name.
     param_grad_pairs = list(
         append_backward(loss, parameter_list, no_grad_set))
     param_grad_pairs.sort(key=lambda pair: pair[0].name)

     main_program = loss.block.program
     table_name = find_distributed_lookup_table(main_program)
     slot_inputs = find_distributed_lookup_table_inputs(
         main_program, table_name)
     slot_outputs = find_distributed_lookup_table_outputs(
         main_program, table_name)

     server = DownpourServer()
     # window is communication strategy
     worker = DownpourWorker(self.window_)

     # Todo(guru4elephant): support multiple tables definitions
     # currently support one big sparse table ...
     sparse_table_index = 0
     # ... and all dense parameters merged into one dense table
     dense_table_index = 1

     dense_params = [pair[0] for pair in param_grad_pairs]
     dense_grads = [pair[1] for pair in param_grad_pairs]

     # Server first, then worker; each gets the sparse table followed by
     # the dense table.
     for node in (server, worker):
         node.add_sparse_table(sparse_table_index, self.learning_rate_,
                               slot_inputs, slot_outputs)
         node.add_dense_table(dense_table_index, self.learning_rate_,
                              dense_params, dense_grads)

     ps_param = pslib.PSParameter()
     ps_param.server_param.CopyFrom(server.get_desc())
     ps_param.trainer_param.CopyFrom(worker.get_desc())

     # Todo(guru4elephant): figure out how to support more sparse parameters
     # currently only support lookup_table
     worker_skipped_ops = ["lookup_table", "lookup_table_grad"]
     ps_param.trainer_param.skip_op.extend(worker_skipped_ops)
     return [ps_param, worker_skipped_ops]
예제 #2
0
파일: downpour.py 프로젝트: neuzxy/Paddle
    def minimize(self,
                 losses,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None):
        """
        Build the Downpour fleet description for a list of losses.

        For every loss this appends the backward operators, registers a
        dense table for its ordinary parameters and, when any parameter
        name ends with one of ``self.data_norm_name``, an additional
        data-norm table. One sparse table (index 0) is shared by all
        losses. The resulting description is attached to every loss'
        program as ``_fleet_opt``.

        Args:
            losses(list of Variable): loss variables defined by user;
                must be a list, e.g. ``[model.cost]``
            startup_program(Program): startup program defined by user
                (accepted but not read by this method)
            parameter_list(str list): parameter names defined by users,
                forwarded to append_backward
            no_grad_set(set): a set of variables defined by users that do
                not need gradient computation, forwarded to
                append_backward
        Returns:
            (None, param_grads_list)
            param_grads_list: one name-sorted list of (param, grad) pairs
            per loss, in the order of ``losses``
        Raises:
            ValueError: if ``losses`` is not a list
        """
        if not isinstance(losses, list):
            raise ValueError('losses is a list, just like [model.cost]')

        # The distributed lookup table is located in the first loss' program.
        first_program = losses[0].block.program
        table_name = find_distributed_lookup_table(first_program)
        prefetch_slots = find_distributed_lookup_table_inputs(
            first_program, table_name)
        prefetch_slots_emb = find_distributed_lookup_table_outputs(
            first_program, table_name)

        ps_param = pslib.PSParameter()
        server = DownpourServer()
        worker = DownpourWorker(self.window_)
        sparse_table_index = 0
        server.add_sparse_table(sparse_table_index, self.learning_rate_,
                                prefetch_slots, prefetch_slots_emb)
        worker.add_sparse_table(sparse_table_index, self.learning_rate_,
                                prefetch_slots, prefetch_slots_emb)
        dense_table_index = 1
        program_configs = []
        param_grads_list = []
        for loss in losses:
            program_config = ps_param.trainer_param.program_config.add()
            program_config.program_id = str(id(loss.block.program))
            program_config.pull_sparse_table_id.extend([sparse_table_index])
            program_config.push_sparse_table_id.extend([sparse_table_index])
            params_grads = sorted(
                append_backward(loss, parameter_list, no_grad_set),
                key=lambda x: x[0].name)
            param_grads_list.append(params_grads)

            # Split the pairs into data-norm statistics and ordinary dense
            # parameters in a single pass. Each pair lands in exactly one
            # bucket, even if its name matches several suffixes.
            params = []
            grads = []
            data_norm_params = []
            data_norm_grads = []
            for param, grad in params_grads:
                if any(
                        param.name.endswith(suffix)
                        for suffix in self.data_norm_name):
                    data_norm_params.append(param)
                    data_norm_grads.append(grad)
                else:
                    params.append(param)
                    grads.append(grad)

            server.add_dense_table(dense_table_index, self.learning_rate_,
                                   params, grads)
            worker.add_dense_table(dense_table_index, self.learning_rate_,
                                   params, grads)
            program_config.pull_dense_table_id.extend([dense_table_index])
            program_config.push_dense_table_id.extend([dense_table_index])
            if data_norm_params:
                # Data-norm statistics get their own table right after the
                # ordinary dense table of this loss.
                dense_table_index += 1
                server.add_data_norm_table(dense_table_index,
                                           self.learning_rate_,
                                           data_norm_params, data_norm_grads)
                worker.add_dense_table(dense_table_index, self.learning_rate_,
                                       data_norm_params, data_norm_grads)
                program_config.pull_dense_table_id.extend([dense_table_index])
                program_config.push_dense_table_id.extend([dense_table_index])
            dense_table_index += 1
            program_configs.append(program_config)

        ps_param.server_param.CopyFrom(server.get_desc())
        ps_param.trainer_param.CopyFrom(worker.get_desc())
        # NOTE(review): CopyFrom presumably replaces the whole trainer_param
        # message, dropping the program_config entries added in the loop --
        # hence they are appended again here. Confirm against protobuf docs.
        for program_config in program_configs:
            ps_param.trainer_param.program_config.extend([program_config])
        # Todo(guru4elephant): figure out how to support more sparse parameters
        # currently only support lookup_table
        worker_skipped_ops = ["lookup_table", "lookup_table_grad"]
        ps_param.trainer_param.skip_op.extend(worker_skipped_ops)

        # all fleet operations should be defined in operators in the future
        # we want to return an object here containing:
        # 1) worker execution strategy
        # 2) pserver execution strategy
        # 3) fleet configurations
        # 4) skipped operators in runtime
        # 5) distributed optimization
        opt_info = {
            "trainer": "DistMultiTrainer",
            "device_worker": "DownpourSGD",
            "optimizer": "DownpourSGD",
            "fleet_desc": ps_param,
            "worker_skipped_ops": worker_skipped_ops,
        }

        for loss in losses:
            loss.block.program._fleet_opt = opt_info

        return None, param_grads_list
예제 #3
0
    def _minimize(self,
                  losses,
                  startup_program=None,
                  parameter_list=None,
                  no_grad_set=None):
        """
        Build the Downpour fleet description for one or more losses.

        For every loss this appends the backward operators, registers a
        dense table for its ordinary parameters and, when any parameter
        name ends with one of ``self.data_norm_name``, an additional
        data-norm table. One sparse table (index 0) is shared by all
        losses. The resulting ``opt_info`` dict is attached to every loss'
        program as ``_fleet_opt``.

        Args:
            losses(Variable or list of Variable): loss variable(s) defined
                by user; a single loss is wrapped into a one-element list
            startup_program(Program): startup program defined by user
                (accepted but not read by this method)
            parameter_list(str list): parameter names defined by users,
                forwarded to append_backward
            no_grad_set(set): a set of variables defined by users that do
                not need gradient computation, forwarded to
                append_backward
        Returns:
            (None, params_grads, opt_info)
            params_grads: name-sorted (param, grad) pairs of the FIRST loss
            opt_info: dict with the keys "program_configs", "trainer",
            "device_worker", "optimizer", "fleet_desc" and
            "worker_skipped_ops"
        """
        if not isinstance(losses, list):
            losses = [losses]

        # The distributed lookup table is located in the first loss' program.
        first_program = losses[0].block.program
        table_name = find_distributed_lookup_table(first_program)
        prefetch_slots = find_distributed_lookup_table_inputs(
            first_program, table_name)
        prefetch_slots_emb = find_distributed_lookup_table_outputs(
            first_program, table_name)

        ps_param = pslib.PSParameter()
        server = DownpourServer()
        worker = DownpourWorker(self.window_)
        sparse_table_index = 0
        server.add_sparse_table(sparse_table_index, self.learning_rate_,
                                prefetch_slots, prefetch_slots_emb)
        worker.add_sparse_table(sparse_table_index, self.learning_rate_,
                                prefetch_slots, prefetch_slots_emb)
        dense_table_index = 1
        # program id (as string) -> table ids that program pulls / pushes
        program_configs = {}
        param_grads_list = []

        for loss in losses:
            program_id = str(id(loss.block.program))
            program_configs[program_id] = {
                "pull_sparse": [sparse_table_index],
                "push_sparse": [sparse_table_index]
            }

            params_grads = sorted(
                fluid.backward.append_backward(loss, parameter_list,
                                               no_grad_set),
                key=lambda x: x[0].name)
            param_grads_list.append(params_grads)

            # Split the pairs into data-norm statistics and ordinary dense
            # parameters in a single pass. Each pair lands in exactly one
            # bucket, even if its name matches several suffixes.
            params = []
            grads = []
            data_norm_params = []
            data_norm_grads = []
            for param, grad in params_grads:
                if any(
                        param.name.endswith(suffix)
                        for suffix in self.data_norm_name):
                    data_norm_params.append(param)
                    data_norm_grads.append(grad)
                else:
                    params.append(param)
                    grads.append(grad)

            server.add_dense_table(dense_table_index, self.learning_rate_,
                                   params, grads)
            worker.add_dense_table(dense_table_index, self.learning_rate_,
                                   params, grads)
            program_configs[program_id]["pull_dense"] = [dense_table_index]
            program_configs[program_id]["push_dense"] = [dense_table_index]
            if data_norm_params:
                # Data-norm statistics get their own table right after the
                # ordinary dense table of this loss.
                dense_table_index += 1
                server.add_data_norm_table(dense_table_index,
                                           self.learning_rate_,
                                           data_norm_params, data_norm_grads)
                worker.add_dense_table(dense_table_index, self.learning_rate_,
                                       data_norm_params, data_norm_grads)
                program_configs[program_id]["pull_dense"].extend(
                    [dense_table_index])
                program_configs[program_id]["push_dense"].extend(
                    [dense_table_index])
            dense_table_index += 1

        ps_param.server_param.CopyFrom(server.get_desc())
        ps_param.trainer_param.CopyFrom(worker.get_desc())
        # Todo(guru4elephant): figure out how to support more sparse parameters
        # currently only support lookup_table
        worker_skipped_ops = ["lookup_table", "lookup_table_grad"]
        ps_param.trainer_param.skip_op.extend(worker_skipped_ops)

        opt_info = {
            "program_configs": program_configs,
            "trainer": "DistMultiTrainer",
            "device_worker": "DownpourSGD",
            "optimizer": "DownpourSGD",
            "fleet_desc": ps_param,
            "worker_skipped_ops": worker_skipped_ops,
        }

        for loss in losses:
            loss.block.program._fleet_opt = opt_info

        return None, param_grads_list[0], opt_info
예제 #4
0
    def transpile(self,
                  trainer_id,
                  program=None,
                  pservers="127.0.0.1:6174",
                  trainers=1,
                  sync_mode=True,
                  startup_program=None,
                  current_endpoint="127.0.0.1:6174"):
        """
        Run the transpiler. Transpile the input program.

        Appends one ``send`` op per entry of ``self._opti_var_list`` to the
        main program (plus a ``send_barrier`` in sync mode) and builds
        ``self.recv_program``, an op-less clone of the main program that
        receives one ``recv`` op per parameter (plus a ``fetch_barrier`` in
        sync mode).

        Args:
            trainer_id (int): id for current trainer worker, if you have
                n workers, the id may range from 0 ~ n-1. Trainer 0 is
                marked as chief on the program.
            program (Program|None): program to transpile,
                default is default_main_program().
            pservers (str): comma separated ip:port string for the pserver
                list.
            trainers (int|str): in pserver mode this is the number of
                trainers.
            sync_mode (bool): Do sync training or not, default is True.
            startup_program (Program|None): startup_program to transpile,
                default is default_startup_program().
            current_endpoint (str): stored on the program as
                ``_ps_endpoint``.

        Examples:
            .. code-block:: python

                transpiler = fluid.DistributeTranspiler()
                transpiler.transpile(
                    trainer_id=0,
                    pservers="127.0.0.1:7000,127.0.0.1:7001",
                    trainers=2,
                    sync_mode=False,
                    current_endpoint="127.0.0.1:7000")
        """
        if program is None:
            program = default_main_program()
        if startup_program is None:
            startup_program = default_startup_program()
        self.origin_program = program
        self.startup_program = startup_program
        self.origin_startup_program = self.startup_program.clone()

        self.trainer_num = trainers
        self.sync_mode = sync_mode
        self.trainer_id = trainer_id
        pserver_endpoints = pservers.split(",")
        self.pserver_endpoints = pserver_endpoints
        self.vars_overview = VarsDistributed()
        self.optimize_ops, self.params_grads = self._get_optimize_pass()

        ps_dispatcher = self.config.split_method(self.pserver_endpoints)
        self.table_name = find_distributed_lookup_table(self.origin_program)
        self.has_distributed_lookup_table = self.table_name is not None
        self.param_name_to_grad_name = dict()
        self.grad_name_to_param_name = dict()
        for param_var, grad_var in self.params_grads:
            self.param_name_to_grad_name[param_var.name] = grad_var.name
            self.grad_name_to_param_name[grad_var.name] = param_var.name

        # get all sparse update ops
        self.sparse_update_ops = self._get_all_remote_sparse_update_op(
            self.origin_program)
        # use_sparse_update_param_name -> split_height_section
        self.sparse_param_to_height_sections = dict()

        # add distributed attrs to program
        self.origin_program._is_distributed = True
        self.origin_program._endpoints = self.pserver_endpoints
        self.origin_program._ps_endpoint = current_endpoint
        self.origin_program._is_chief = self.trainer_id == 0
        self.origin_program._distributed_lookup_table = self.table_name if self.table_name else None

        # step 1: split and create vars, then put splited vars in dicts for later use.
        self._init_splited_vars()

        # step 2: insert send op to send gradient vars to parameter servers
        ps_dispatcher.reset()
        send_vars = []

        # in general cases, the number of pservers is times of 2, and this
        # will lead to uneven distribution among weights and bias:
        #       fc_w@GRAD_trainer_0, fc_w@GRAD_trainer_1 --> pserver1
        #       fc_b@GRAD_trainer_0, fc_b@GRAD_trainer_1 --> pserver2
        # shuffle the map will avoid the uneven distribution above

        self.opti_name_to_send_dummy_out = dict()
        # recv_program starts as a clone of the main program with every op
        # of its global block deleted; recv ops are appended to it below.
        self.recv_program = self.origin_program.clone()
        all_ops = list(self.recv_program.global_block().ops)
        delete_ops(self.recv_program.global_block(), all_ops)

        # Number of ops in the main program before any send op is appended.
        self.split_num = len(program.global_block().ops)
        for opti_varname in self._opti_var_list:
            opti_var = program.global_block().var(opti_varname)
            eplist = ps_dispatcher.dispatch([opti_var])

            dummy_output = program.global_block().create_var(
                name=framework.generate_control_dev_var_name())
            self.opti_name_to_send_dummy_out[opti_varname] = dummy_output

            program.global_block().append_op(
                type="send",
                inputs={"X": [opti_var]},
                outputs={"Out": dummy_output},
                attrs={
                    "epmap": eplist,
                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE,
                    OP_ROLE_VAR_ATTR_NAME:
                    [self._opti_to_param[opti_varname], opti_varname],
                    "sync_mode": not self.sync_mode,
                })
            send_vars.append(opti_var)

        if self.sync_mode:
            send_barrier_out = program.global_block().create_var(
                name=framework.generate_control_dev_var_name())
            # The barrier takes the dummy output of every send op as input.
            input_deps = list(self.opti_name_to_send_dummy_out.values())

            program.global_block().append_op(
                type="send_barrier",
                inputs={"X": input_deps},
                outputs={"Out": send_barrier_out},
                attrs={
                    "endpoints": pserver_endpoints,
                    "sync_mode": self.sync_mode,
                    "trainer_id": self.trainer_id,
                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
                })

        # step 3: insert recv op to receive parameters from parameter server
        recv_vars = [
            program.global_block().var(self._opti_to_param[send_var.name])
            for send_var in send_vars
        ]
        ps_dispatcher.reset()
        eplist = ps_dispatcher.dispatch(recv_vars)
        for i, ep in enumerate(eplist):
            self.param_grad_ep_mapping[ep]["params"].append(recv_vars[i])
            self.param_grad_ep_mapping[ep]["opti"].append(send_vars[i])

            distributed_var = self.vars_overview.get_distributed_var_by_slice(
                recv_vars[i].name)
            distributed_var.endpoint = ep

        # step4: Concat the parameters splits together after recv.
        # Names are collected once so the endpoint lookup below does not
        # rebuild the list for every parameter.
        recv_var_names = [v.name for v in recv_vars]
        all_recv_outputs = []
        for opti_varname in self._opti_var_list:
            param_varname = self._opti_to_param[opti_varname]
            param_var = program.global_block().var(param_varname)
            # Endpoint the dispatcher assigned to this parameter in step 3.
            eps = [eplist[recv_var_names.index(param_varname)]]
            # get recv op_role_var, if not splited, the grad should have .trainer suffix
            # if splited, grad should be the original grad var name. ParallelExecutor
            # will use op_role_var to get expected device place to run this op.

            all_recv_outputs.append(param_var)
            self.recv_program.global_block().append_op(
                type="recv",
                inputs={"X": []},
                outputs={"Out": [param_var]},
                attrs={
                    "epmap": eps,
                    "trainer_id": self.trainer_id,
                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE,
                    OP_ROLE_VAR_ATTR_NAME: [param_varname, opti_varname],
                    "sync_mode": not self.sync_mode
                })

        if self.sync_mode:
            # form a WAW dependency
            self.recv_program.global_block()._insert_op(
                index=len(self._opti_var_list),
                type="fetch_barrier",
                inputs={},
                outputs={"Out": all_recv_outputs},
                attrs={
                    "endpoints": pserver_endpoints,
                    "trainer_id": self.trainer_id,
                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
                })

        self._get_trainer_startup_program(recv_vars=recv_vars, eplist=eplist)

        self._get_distributed_optimizer_vars()
        self.origin_program._parameters_on_pservers = self.vars_overview
    def _minimize(self,
                  losses,
                  startup_program=None,
                  parameter_list=None,
                  no_grad_set=None,
                  strategy=None):
        """
        Build the Downpour fleet description for a list of losses.

        For every loss this appends the backward operators, registers a
        dense table for its ordinary parameters and, when any parameter
        name ends with one of ``self.data_norm_name``, an additional
        data-norm table. One sparse table (index 0) is shared by all
        losses. The resulting ``opt_info`` dict is attached to every loss'
        program as ``_fleet_opt``.

        Args:
            losses(list of Variable): loss variables defined by user
            startup_program(Program): startup program defined by user
                (accepted but not read by this method)
            parameter_list(str list): parameter names defined by users,
                forwarded to append_backward
            no_grad_set(set): a set of variables defined by users that do
                not need gradient computation, forwarded to
                append_backward
            strategy(dict): user-defined properties; the keys read here are
                "fleet_desc_file" (path of a text-format fleet description
                to start from) and "use_cvm" (default False). ``None`` is
                treated as an empty dict.
        Returns:
            (None, params_grads, opt_info)
            params_grads: name-sorted (param, grad) pairs of the FIRST loss
            opt_info: dict with the keys "program_configs", "trainer",
            "device_worker", "optimizer", "fleet_desc",
            "worker_skipped_ops" and "use_cvm"
        """
        # Avoid a shared mutable default argument.
        if strategy is None:
            strategy = {}

        # The distributed lookup table is located in the first loss' program.
        first_program = losses[0].block.program
        table_name = find_distributed_lookup_table(first_program)
        prefetch_slots = find_distributed_lookup_table_inputs(
            first_program, table_name)
        prefetch_slots_emb = find_distributed_lookup_table_outputs(
            first_program, table_name)

        ps_param = pslib.PSParameter()
        server = DownpourServer()
        worker = DownpourWorker(self._window)
        # if user specify a fleet_desc.prototxt file, then load the file
        # instead of creating default fleet_desc.prototxt.
        # user can specify server_param or trainer_param or fs_client_param.
        if strategy.get("fleet_desc_file") is not None:
            fleet_desc_file = strategy["fleet_desc_file"]
            with open(fleet_desc_file) as f:
                text_format.Merge(f.read(), ps_param)
            server.get_desc().CopyFrom(ps_param.server_param)
            worker.get_desc().CopyFrom(ps_param.trainer_param)
        sparse_table_index = 0
        server.add_sparse_table(sparse_table_index, self._learning_rate,
                                prefetch_slots, prefetch_slots_emb)
        worker.add_sparse_table(sparse_table_index, self._learning_rate,
                                prefetch_slots, prefetch_slots_emb)
        dense_table_index = 1
        # program id (as string) -> table ids that program pulls / pushes
        program_configs = {}
        param_grads_list = []

        for loss in losses:
            program_id = str(id(loss.block.program))
            program_configs[program_id] = {
                "pull_sparse": [sparse_table_index],
                "push_sparse": [sparse_table_index]
            }

            params_grads = sorted(
                fluid.backward.append_backward(loss, parameter_list,
                                               no_grad_set),
                key=lambda x: x[0].name)
            param_grads_list.append(params_grads)

            # Split the pairs into data-norm statistics and ordinary dense
            # parameters in a single pass. Each pair lands in exactly one
            # bucket, even if its name matches several suffixes.
            params = []
            grads = []
            data_norm_params = []
            data_norm_grads = []
            for param, grad in params_grads:
                if any(
                        param.name.endswith(suffix)
                        for suffix in self.data_norm_name):
                    data_norm_params.append(param)
                    data_norm_grads.append(grad)
                else:
                    params.append(param)
                    grads.append(grad)

            server.add_dense_table(dense_table_index, self._learning_rate,
                                   params, grads)
            worker.add_dense_table(dense_table_index, self._learning_rate,
                                   params, grads)
            program_configs[program_id]["pull_dense"] = [dense_table_index]
            program_configs[program_id]["push_dense"] = [dense_table_index]
            if data_norm_params:
                # Data-norm statistics get their own table right after the
                # ordinary dense table of this loss.
                dense_table_index += 1
                server.add_data_norm_table(dense_table_index,
                                           self._learning_rate,
                                           data_norm_params, data_norm_grads)
                worker.add_dense_table(dense_table_index, self._learning_rate,
                                       data_norm_params, data_norm_grads)
                program_configs[program_id]["pull_dense"].extend(
                    [dense_table_index])
                program_configs[program_id]["push_dense"].extend(
                    [dense_table_index])
            dense_table_index += 1

        ps_param.server_param.CopyFrom(server.get_desc())
        ps_param.trainer_param.CopyFrom(worker.get_desc())
        # Todo(guru4elephant): figure out how to support more sparse parameters
        # currently only support lookup_table
        worker_skipped_ops = ["lookup_table", "lookup_table_grad"]
        # Only fill in the default skip list when the trainer description
        # carries none after the copy above.
        if len(ps_param.trainer_param.skip_op) == 0:
            ps_param.trainer_param.skip_op.extend(worker_skipped_ops)

        opt_info = {
            "program_configs": program_configs,
            "trainer": "DistMultiTrainer",
            "device_worker": "DownpourSGD",
            "optimizer": "DownpourSGD",
            "fleet_desc": ps_param,
            "worker_skipped_ops": worker_skipped_ops,
            "use_cvm": strategy.get("use_cvm", False),
        }

        for loss in losses:
            loss.block.program._fleet_opt = opt_info

        return None, param_grads_list[0], opt_info