Example #1
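Collects sparse parameter names from the `listen_and_serv` op: each `grad:param` pair in the op's `sparse_grad_to_param` attribute is split, and the parameter is kept if its original variable name is one of the distributed sparse table names returned by `get_sparse_tablenames(origin_program, True)`.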
def get_distributed_from_listen_and_serv(program, origin_program):
    op = get_op_by_type(program.global_block(), "listen_and_serv")
    sparse_varnames = get_sparse_tablenames(origin_program, True)
    sparse_params = []
    grad_to_params = op.attr('sparse_grad_to_param')
    for grad_to_param in grad_to_params:
        _, param = grad_to_param.split(":")
        if _orig_varname(param) in sparse_varnames:
            sparse_params.append(param)
    return sparse_params
Example #2
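Uses `get_sparse_tablenames(..., True)` to obtain the distributed sparse table names; variables not in that set are first pulled to the worker and saved locally via `recv_and_save_model`, and every table is also saved on the server side through `save_one_model`.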
 def _save_sparse_params(self, executor, dirname, context, main_program,
                         mode):
     from paddle.fluid.incubate.fleet.parameter_server.ir.public import get_sparse_tablenames
     distributed_varnames = get_sparse_tablenames(
         self.compiled_strategy.origin_main_program, True)
     values = []
     for id, names in context.items():
         if names not in distributed_varnames:
             # only save sparse param to local
             self._worker.recv_and_save_model(id, dirname)
         # save sparse & distributed param on server
         self._worker.save_one_model(id, dirname, mode)
         values.extend(names)
     return values
Example #3
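Builds a "#"-joined string of initializer attributes for the non-distributed sparse tables (name, shape, and the initializer op's type and attributes taken from the startup program). Distributed sparse tables are rejected because GeoStrategy does not support large-scale embedding.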
            def get_sparse_attrs():
                opt_init_map = {}
                opt_init_map["gaussian_random"] = ["seed", "mean", "std"]
                opt_init_map["fill_constant"] = ["value"]
                opt_init_map["uniform_random"] = ["seed", "min", "max"]
                opt_init_map["truncated_gaussian_random"] = [
                    "seed", "mean", "std"
                ]

                dist_varnames = get_sparse_tablenames(self.origin_main_program,
                                                      True)
                sparse_varnames = get_sparse_tablenames(
                    self.origin_main_program, False)

                if len(dist_varnames) != 0:
                    raise ValueError(
                        "GeoStrategy can not support large scale embeding now, please use fluid.layers.embedding"
                    )

                init_attrs = []
                for value_name in sparse_varnames:
                    value_var = self.origin_main_program.global_block(
                    ).vars[value_name]
                    value_attr = [
                        value_name,
                        ",".join([str(dim) for dim in value_var.shape])
                    ]
                    for op in self.origin_startup_program.global_block().ops:
                        if op.type in opt_init_map.keys(
                        ) and value_name == op.output("Out")[0]:
                            init_attr = [op.type]
                            for attr in opt_init_map[op.type]:
                                init_attr.append(str(op.attr(attr)))
                            value_attr.append("&".join(init_attr))
                            init_attrs.append(":".join(value_attr))
                            break
                return "#".join(init_attrs)
Example #4
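For each parameter assigned to this pserver endpoint, creates an optimize block containing a `sum` op that adds the received `*.delta` variable into the parameter, records the block id, and registers `delta:param` pairs for sparse tables in the `listen_and_serv` op's `sparse_grad_to_param` attribute.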
def add_geo_optimizer_pass(program, config):
    endpoint = config.get_ps_endpoint()
    params = [p for p in config.param_grad_ep_mapping[endpoint]["params"]]

    sparse_tablenames = get_sparse_tablenames(config.get_origin_main_program(),
                                              False)

    for param in params:
        _clone_var(program.global_block(), param)

    optimize_block = []
    sparse_grad_to_param = []
    param_to_block_id = []
    pre_block_idx = program.num_blocks - 1

    for param in params:
        per_opt_block = program._create_block(pre_block_idx)
        optimize_block.append(per_opt_block)
        var_name = param.name
        pserver_block = per_opt_block.program.global_block()
        param = pserver_block.vars[var_name]

        delta_var_name = "%s.delta" % (param.name)
        origin_varname = _orig_varname(param.name)

        if origin_varname in sparse_tablenames:
            sparse_grad_to_param.append(":".join([delta_var_name, param.name]))

        delta_var = pserver_block.create_var(
            name=delta_var_name,
            persistable=False,
            type=param.type,
            dtype=param.dtype,
            shape=param.shape)

        per_opt_block.append_op(
            type="sum",
            inputs={"X": [param, delta_var]},
            outputs={"Out": param})

        param_to_block_id.append(delta_var_name + ":" + str(per_opt_block.idx))

    op = get_op_by_type(program.global_block(), "listen_and_serv")
    op._set_attr("optimize_blocks", optimize_block)
    op._set_attr("grad_to_block_id", param_to_block_id)
    op._set_attr("sparse_grad_to_param", sparse_grad_to_param)

    return program
Example #5
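Initializes the parameter server and, when a checkpoint directory is given, loads sparse tables from it. Both distributed and non-distributed sparse table names are collected to validate `var_names`; each requested table is then loaded with `load_sparse` from the matching `.txt` and `.meta` files under `{var_name}{PSERVER_SAVE_SUFFIX}`.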
    def _init_server(self, dirname=None, var_names=None, **kwargs):
        if self.role_maker._is_heter_worker():
            self._init_heter_worker()
            return
        role_id = self.compiled_strategy.get_role_id()
        endpoints = self.compiled_strategy.get_ps_endpoints()
        is_sync = self.compiled_strategy.is_sync_mode()
        trainers = self.compiled_strategy.get_trainers()

        server = self._get_fleet_proto(is_server=True, is_sync=is_sync)
        proto_txt = str(server)

        debug = bool(int(os.getenv("PSERVER_DEBUG", "0")))
        if debug:
            print("server: \n{}".format(proto_txt))

        string_hosts = []
        for idx, ep in enumerate(endpoints):
            host, port = ep.split(":")
            pshost = fluid.core.PSHost(host, int(port), idx)
            string_hosts.append(pshost.serialize_to_string())

        self._server = fluid.core.DistFleetWrapper()
        self._server.init_server(proto_txt, string_hosts, role_id, trainers,
                                 self._server_sub_program)

        from paddle.fluid.incubate.fleet.parameter_server.ir.public import get_sparse_tablenames

        dist_varnames = get_sparse_tablenames(self.origin_main_program, True)
        sparse_varnames = get_sparse_tablenames(self.origin_main_program,
                                                False)

        distributed_varnames = dist_varnames + sparse_varnames

        if var_names is None:
            load_varnames = distributed_varnames
        else:
            for var_name in var_names:
                if var_name not in distributed_varnames:
                    raise ValueError(
                        "fleet.init server can only load sparse variables in {}"
                        .format(distributed_varnames))
            load_varnames = var_names

        if dirname is None or not load_varnames:
            return

        sparse_table_maps = {}
        for table in server.servers[0].tables:
            if table.type == "PS_SPARSE_TABLE" and table.common is not None:
                sparse_table_maps[table.common.table_name] = table.id

        dirname = os.path.normpath(dirname)
        pserver_id = self.role_maker._role_id()

        import time
        begin = time.time()
        for var_name in load_varnames:
            table_id = sparse_table_maps[var_name]
            path = os.path.join(dirname, var_name + PSERVER_SAVE_SUFFIX,
                                "{}.block{}.txt".format(var_name, pserver_id))
            meta = os.path.join(dirname, var_name + PSERVER_SAVE_SUFFIX,
                                "{}.block{}.meta".format(var_name, pserver_id))
            self._server.load_sparse(path, meta, table_id)
        end = time.time()
        print("init sparse variables: {} cost time: {}".format(
            load_varnames, end - begin))
Example #6
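A small helper that merges the distributed and non-distributed sparse table names into one deduplicated list.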
 def _get_sparse_table_names():
     dist_varnames = get_sparse_tablenames(origin_program, True)
     sparse_varnames = get_sparse_tablenames(origin_program, False)
     return list(set(dist_varnames + sparse_varnames))
Example #7
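A pserver-side pass for large-scale sparse tables. `get_sparse_tablenames(origin_program, False)` gives the non-distributed sparse tables, which are skipped; for the remaining parameters the pass either inserts fused `lookup_sparse_table_fuse_sgd`/`lookup_sparse_table_fuse_adam` ops (for SGD/Adam) or wraps the optimizer with `lookup_sparse_table_grad_split`, `lookup_sparse_table_read`, and `lookup_sparse_table_write` ops, and in the startup pass appends a `lookup_sparse_table_init` op carrying the collected table metadata.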
def large_scale_sparse_pass(program, main_program, config, is_startup=False):
    opt_value_map = {}
    opt_value_map["sgd"] = ["Param"]
    opt_value_map["adam"] = ["Param", "Moment1", "Moment2"]
    opt_value_map["adagrad"] = ["Param", "Moment"]
    opt_value_map["adamax"] = ["Param", "Moment", "InfNorm"]
    opt_value_map["momentum"] = ["Param", "Velocity"]
    opt_value_map["lars_momentum"] = ["Param", "Velocity"]
    opt_value_map["rmsprop"] = ["Param", "Moment", "MeanSquare"]
    opt_value_map["decayed_adagrad"] = ["Param", "Moment"]
    opt_value_map["ftrl"] = ["Param", "SquaredAccumulator", "LinearAccumulator"]

    geo_value_map = {}
    geo_value_map["sum"] = "Param"

    opt_init_map = {}
    opt_init_map["gaussian_random"] = ["seed", "mean", "std"]
    opt_init_map["fill_constant"] = ["value"]
    opt_init_map["uniform_random"] = ["seed", "min", "max"]
    opt_init_map["truncated_gaussian_random"] = ["seed", "mean", "std"]

    def get_entry_attr(param_name):
        origin_name = _orig_varname(param_name)
        o_main_program = config.get_origin_main_program()
        for op in o_main_program.global_block().ops:
            if is_distributed_sparse_op(op) and get_sparse_tablename(
                    op) == origin_name:
                entry = op.attr("entry")
                return entry

    def get_initializer_attrs(acture_value_names):
        l_sep = ","
        l_in = "&"
        init_attrs = []
        o_startup_program = config.get_origin_startup_program()

        for value_name in acture_value_names:
            origin_var_name = _orig_varname(value_name)
            for op in o_startup_program.global_block().ops:
                if op.type in opt_init_map.keys(
                ) and origin_var_name == op.output("Out")[0]:
                    init_attr = [op.type]
                    for attr in opt_init_map[op.type]:
                        init_attr.append(str(op.attr(attr)))
                    init_attrs.append(l_in.join(init_attr))
                    break

        return l_sep.join(init_attrs)

    def get_optimizer_values(block):
        value_names = []
        acture_names = []
        value_dims = []
        grad = None
        opt_idx = -1
        fuse = False

        for op in block.ops:
            opt_idx += 1

            if op.type not in opt_value_map.keys():
                continue

            if op.type in ["sgd", "adam"]:
                fuse = True

            grad = main_program.global_block().vars[op.input("Grad")[0]]

            for value in opt_value_map[op.type]:
                var = main_program.global_block().vars[op.input(value)[0]]
                if len(var.shape) != 2:
                    raise ValueError("sparse param's dimension must be 2")

                value_names.append(value)
                value_dims.append(var.shape[1])
                acture_names.append(var.name)

            if value_names:
                break
        return grad, opt_idx, value_names, value_dims, acture_names, fuse

    def add_fuse_large_scale_op(block, global_block, table_name, value_names,
                                acture_names, grad, is_entry, opt_idx):

        op = block.ops[opt_idx]

        if op.type == "sgd":
            grad = main_program.global_block().vars[op.input("Grad")[0]]
            lr = main_program.global_block().vars[op.input("LearningRate")[0]]

            block._insert_op(
                opt_idx,
                type="lookup_sparse_table_fuse_sgd",
                inputs={"Grad": grad,
                        "LearningRate": lr},
                attrs={
                    "is_entry": is_entry,
                    "tablename": table_name,
                    "value_names": value_names
                })

        elif op.type == "adam":
            grad = main_program.global_block().vars[op.input("Grad")[0]]
            lr = main_program.global_block().vars[op.input("LearningRate")[0]]
            beta1_pow = main_program.global_block().vars[op.input("Beta1Pow")[
                0]]
            beta2_pow = main_program.global_block().vars[op.input("Beta2Pow")[
                0]]
            beta1_pow_o = main_program.global_block().vars[op.output(
                "Beta1PowOut")[0]]
            beta2_pow_o = main_program.global_block().vars[op.output(
                "Beta2PowOut")[0]]

            beta1 = op.attr('beta1')
            beta2 = op.attr('beta2')
            epsilon = op.attr('epsilon')

            block._insert_op(
                opt_idx,
                type="lookup_sparse_table_fuse_adam",
                inputs={
                    "Grad": grad,
                    "LearningRate": lr,
                    "Beta1Pow": beta1_pow,
                    "Beta2Pow": beta2_pow
                },
                outputs={
                    "Beta1PowOut": beta1_pow_o,
                    "Beta2PowOut": beta2_pow_o
                },
                attrs={
                    "beta1": beta1,
                    "beta2": beta2,
                    "epsilon": epsilon,
                    "is_entry": is_entry,
                    "tablename": table_name,
                    "value_names": value_names
                })
        else:
            raise ValueError("only support sgd/adam optimizer now")

    def add_large_scale_op(block, global_block, table_name, value_names,
                           acture_names, grad, is_entry, opt_idx):
        ids = global_block.create_var(
            name="kSparseIDs@{}".format(table_name),
            persistable=False,
            dtype="int64",
            shape=[1, 1],
            lod_level=0)

        # insert grad split to ids and tensor op
        block._insert_op(
            opt_idx,
            type="lookup_sparse_table_grad_split",
            inputs={"Grad": grad},
            outputs={"Row": ids,
                     "Value": grad},
            attrs={"tablename": table_name,
                   "is_entry": is_entry})

        # insert read at first
        vars = [global_block.vars[acture_name] for acture_name in acture_names]
        block._insert_op(
            opt_idx + 1,
            type="lookup_sparse_table_read",
            inputs={"Ids": ids},
            outputs={"Out": vars},
            attrs={"tablename": table_name,
                   "value_names": value_names})

        # append write at last
        inputs = {"Ids": ids, "In": vars}

        block.append_op(
            type="lookup_sparse_table_write",
            inputs=inputs,
            outputs={},
            attrs={"tablename": table_name,
                   "value_names": value_names})

    op = get_op_by_type(main_program.global_block(), "listen_and_serv")

    param_blockid_map = {}
    grad_blockid_map = {}
    grad_to_params = op.attr('sparse_grad_to_param')
    grad_to_block_ids = op.attr('grad_to_block_id')

    origin_program = config.get_origin_main_program()
    sparse_varnames = get_sparse_tablenames(origin_program, False)

    for grad_to_block_id in grad_to_block_ids:
        grad, blockid = grad_to_block_id.split(":")
        grad_blockid_map[grad] = int(blockid)

    for grad_to_param in grad_to_params:
        grad, param = grad_to_param.split(":")

        if _orig_varname(param) in sparse_varnames:
            continue

        param_blockid_map[param] = grad_blockid_map[grad]

    if not is_startup:
        for param, blockid in param_blockid_map.items():
            opt_block = program.block(blockid)

            grad, opt_idx, value_names, value_dims, acture_names, fuse = \
                get_optimizer_values(opt_block)

            entry_attr = get_entry_attr(param)
            is_entry = False if entry_attr == "none" else True

            if fuse:
                add_fuse_large_scale_op(opt_block,
                                        program.global_block(), param,
                                        value_names, acture_names, grad,
                                        is_entry, opt_idx)
            else:
                add_large_scale_op(opt_block,
                                   program.global_block(), param, value_names,
                                   acture_names, grad, is_entry, opt_idx)
    else:
        large_scale_kv_metas = []
        for param, blockid in param_blockid_map.items():
            opt_block = main_program.block(blockid)

            grad, opt_idx, value_names, value_dims, acture_names, fuse = \
                get_optimizer_values(opt_block)

            entry_attr = get_entry_attr(param)

            if fuse:
                # remove the origin optimizer op
                opt_block._remove_op(opt_idx)

            # training/infer
            mode = "0"
            names_str = ",".join(value_names)
            dims_str = ",".join([str(dim) for dim in value_dims])
            ids_name = "kSparseIDs@{}".format(param)
            cached_str = ",".join(acture_names + [ids_name])
            init_attr_str = get_initializer_attrs(acture_names)

            meta_str = ":".join([
                param, names_str, dims_str, mode, grad.name, cached_str,
                init_attr_str, entry_attr
            ])
            print("large_scale_metas: {}".format(meta_str))
            large_scale_kv_metas.append(meta_str)

        program.global_block().append_op(
            type="lookup_sparse_table_init",
            inputs=None,
            outputs=None,
            attrs={"large_scale_metas": large_scale_kv_metas})

    # TODO: need to delete unused vars.
    return program