Example No. 1
    def load(self, path: str) -> None:
        """load a checkpoint from `path` and initialize models.

        Args:
            path (str): The path to load the checkpoint from.
        """
        if not config_util.api_legacy_model_io_enabled():
            check_point_v2.LoadVariables(check_point_v2.GetCheckpoint(path))
            return
        assert type(path) is str
        enable_if.unique([lazy_checkpoint_load, eager_checkpoint_load])(path)
Example No. 2
    def save(self, path: str) -> None:
        """save a checkpoint to `path`.

        Args:
            path (str): The path to save the checkpoint to.
        """
        if not config_util.api_legacy_model_io_enabled():
            check_point_v2.SaveVarDict(path)
            return
        assert type(path) is str
        enable_if.unique([lazy_checkpoint_save, eager_checkpoint_save])(path)
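
These two methods are usually invoked through a checkpoint object rather than called directly. A minimal usage sketch follows, assuming the object is exposed as flow.train.CheckPoint in the single-client namespace; that export path is an assumption, and only the load()/save() signatures come from the code above.

from oneflow.compatible import single_client as flow

# Hypothetical call site: flow.train.CheckPoint is an assumed export path;
# load()/save() take a directory path, as documented above.
check_point = flow.train.CheckPoint()
check_point.save("./model_save_dir")   # write the current model variables
check_point.load("./model_save_dir")   # restore variables from that directory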
Example No. 3
def api_gpu_device_num(val: int) -> None:
    """Set number of GPUs on each machine to run oneflow on.

    Args:
        val (int): the number of GPUs. The value is identical on every machine; in other
            words, you cannot specify a different number of GPUs for each machine.
    """
    if oneflow._oneflow_internal.flags.with_cuda():
        return enable_if.unique([gpu_device_num, do_nothing])(val)
    else:
        print(
            "INFO: for CPU-only OneFlow, oneflow.compatible.single_client.config.gpu_device_num is equivalent to oneflow.compatible.single_client.config.cpu_device_num"
        )
        print(traceback.format_stack()[-2])
        return enable_if.unique([cpu_device_num, do_nothing])(val)
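
The INFO message above names the public path this setter is exported under, so a usage sketch can be grounded in the source (the value 1 is only illustrative):

from oneflow.compatible import single_client as flow

# Request 1 GPU per machine; on a CPU-only build this call behaves like
# cpu_device_num, as the fallback branch above explains.
flow.config.gpu_device_num(1)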
Example No. 4
def api_enable_eager_execution(val: bool = True) -> None:
    """If True, job will execute in eager mode, else use lazy mode(static graph).

    Args:
        val (bool, optional): whether to enable eager execution. Defaults to True.
    """
    return enable_if.unique([enable_eager_environment])(val)
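
A hedged call-site sketch; whether the wrapper is re-exported at the package top level as flow.enable_eager_execution is an assumption based on the api_ naming convention, not something stated in the code above.

from oneflow.compatible import single_client as flow

# Assumed public name: run jobs eagerly; passing False keeps lazy (static graph) mode.
flow.enable_eager_execution(True)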
Example No. 5
def api_log_dir(val: str) -> None:
    """Specify a dir to store OneFlow's logging files. If not specified, it is `./log` by default.

    Args:
        val (str): the log directory path
    """
    return enable_if.unique([log_dir, do_nothing])(val)
Example No. 6
def api_ctrl_port(val: int) -> None:
    """Set port number used to control the execution across multiple machines. Same on every machine.

    Args:
        val: a port number accessible to peer machines
    """
    return enable_if.unique([ctrl_port, do_nothing])(val)
Example No. 7
def BoxingTo(builder, produced_blob_object, consumer_op_arg_parallel_attr):
    hob_context = BoxingHobContext(produced_blob_object,
                                   consumer_op_arg_parallel_attr)
    if enable_if.get_condition_hob(NoBoxing)(hob_context):
        return produced_blob_object
    producer_opt_mirrored_parallel = (
        produced_blob_object.op_arg_parallel_attr.opt_mirrored_parallel)
    consumer_opt_mirrored_parallel = consumer_op_arg_parallel_attr.opt_mirrored_parallel
    assert producer_opt_mirrored_parallel == consumer_opt_mirrored_parallel, (
        "\nproducer_op_arg_parallel_attr: %s\nconsumer_op_arg_parallel_attr: %s"
        % (produced_blob_object.op_arg_parallel_attr,
           consumer_op_arg_parallel_attr))

    def default(get_failed_info, *args, **kwargs):
        raise NotImplementedError(
            "%s\nno boxing method found.\nlogical_blob_name: %s\nx_arg_attribute: %s\nconsumer_op_arg_parallel_attr: %s\n"
            % (
                get_failed_info(),
                produced_blob_object.op_arg_blob_attr.logical_blob_name,
                produced_blob_object.op_arg_parallel_attr,
                consumer_op_arg_parallel_attr,
            ))

    global conditional_function_table
    function = enable_if.unique(
        conditional_function_table,
        context=BoxingHobContext(produced_blob_object,
                                 consumer_op_arg_parallel_attr),
        default=default,
    )
    return function(builder, produced_blob_object,
                    consumer_op_arg_parallel_attr)
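
BoxingTo shows the dispatch pattern behind every example here: enable_if.unique takes a list of candidate functions, checks each candidate's condition against a context, and returns the single enabled one, falling back to default when nothing matches. The toy re-implementation below is purely illustrative (it is not the real enable_if module) but captures that selection logic:

def unique(candidates, context=None, default=None):
    # Keep the candidates whose attached condition accepts the context;
    # functions without a condition are treated as always enabled.
    matched = [f for f in candidates
               if getattr(f, "condition", lambda ctx: True)(context)]
    if len(matched) == 1:
        return matched[0]
    if default is not None:
        return default
    raise AssertionError("expected exactly one enabled candidate, got %d" % len(matched))

def on_gpu(fn):
    # Attach a condition the same way a hob expression gates a function.
    fn.condition = lambda ctx: ctx == "gpu"
    return fn

@on_gpu
def run_on_gpu(job):
    return "gpu:" + job

def run_fallback(job):
    return "fallback:" + job

print(unique([run_on_gpu], context="gpu", default=run_fallback)("train"))  # gpu:train
print(unique([run_on_gpu], context="cpu", default=run_fallback)("train"))  # fallback:train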
Example No. 8
def api_nccl_enable_all_to_all(val: bool) -> None:
    """Whether or not use nccl all2all during s2s boxing

    Args:
        val (bool): True or False
    """
    return enable_if.unique([nccl_enable_all_to_all, do_nothing])(val)
Example No. 9
def api_nccl_use_compute_stream(val: bool = False) -> None:
    """Whether or not nccl use compute stream to reuse nccl memory and speedup

    Args:
        val (bool, optional): True or False. Defaults to False.
    """
    return enable_if.unique([nccl_use_compute_stream, do_nothing])(val=val)
Example No. 10
def api_enable_mem_chain_merge(val: bool = True) -> None:
    """Whether or not to enable MemChain merge.

    Args:
        val (bool, optional): True or False. Defaults to True.
    """
    return enable_if.unique([enable_mem_chain_merge, do_nothing])(val=val)
Example No. 11
def api_enable_tensor_float_32_compute(val: bool = True) -> None:
    """Whether or not to enable Tensor-float-32 on supported GPUs

    Args:
        val (bool, optional): True or False. Defaults to True.
    """
    return enable_if.unique([enable_tensor_float_32_compute, do_nothing])(val=val)
Example No. 12
def api_enable_fusion(val: bool = True) -> None:
    """Whether or not allow fusion the operators

    Args:
        val (bool, optional): True or False. Defaults to True.
    """
    return enable_if.unique([enable_fusion, do_nothing])(val=val)
Example No. 13
def api_enable_model_io_v2(val):
    """Whether or not use version2  of model input/output function.

    Args:
        val (bool): True or False
    """
    return enable_if.unique([enable_model_io_v2, do_nothing])(val)
Example No. 14
def api_enable_legacy_model_io(val: bool = True):
    """Whether or not use legacy model io.

    Args:
        val (bool, optional): True or False. Defaults to True.
    """
    return enable_if.unique([enable_legacy_model_io, do_nothing])(val)
Example No. 15
def api_enable_debug_mode(val: bool) -> None:
    """Whether use debug mode or not.

    Args:
        val (bool):  True or False
    """
    return enable_if.unique([enable_debug_mode, do_nothing])(val)
Example No. 16
def api_nccl_fusion_max_ops(val: int) -> None:
    """Maximum number of ops for nccl fusion.

    Args:
        val (int): Maximum number of ops
    """
    return enable_if.unique([nccl_fusion_max_ops, do_nothing])(val)
Example No. 17
def api_machine_num(val: int) -> None:
    """Set available number of machine/node for  running job .

    Args:
        val (int): available number of machines
    """
    return enable_if.unique([machine_num, do_nothing])(val)
Example No. 18
def api_disable_group_boxing_by_dst_parallel(val: bool = False) -> None:
    """Whether or not disable group boxing by dst parallel pass to reduce boxing memory life cycle.

    Args:
        val (bool, optional): True or False. Defaults to False.
    """
    return enable_if.unique([disable_group_boxing_by_dst_parallel, do_nothing])(val=val)
Example No. 19
def api_nccl_enable_mixed_fusion(val: bool) -> None:
    """Whether or not use nccl mixed fusion

    Args:
        val (bool): True or False
    """
    return enable_if.unique([nccl_enable_mixed_fusion, do_nothing])(val)
Example No. 20
def api_load_library_now(val: str) -> None:
    """Load necessary library for job now

    Args:
        val (str): path to shared object file
    """
    return enable_if.unique([load_library_now, do_nothing])(val)
Example No. 21
def api_cpu_device_num(val: int) -> None:
    """Set number of CPUs on each machine to run oneflow on. Usually you don't need to set this.

    Args:
        val (int): number of CPUs. It is identical on every machine.
    """
    return enable_if.unique([cpu_device_num, do_nothing])(val)
Example No. 22
def api_nccl_num_streams(val: int) -> None:
    """Set up the number of nccl parallel streams while use boxing

    Args:
        val (int): number of streams
    """
    return enable_if.unique([nccl_num_streams, do_nothing])(val)
Example No. 23
def api_get_current_machine_id():
    """Get machine id of current machine/node

    Returns:
        The id of the current machine/node.
    """
    return enable_if.unique([get_current_machine_id])()
Example No. 24
def api_nccl_fusion_threshold_mb(val: int) -> None:
    """Set up threshold for oprators fusion

    Args:
        val (int): the threshold in megabytes, e.g. 10
    """
    return enable_if.unique([nccl_fusion_threshold_mb, do_nothing])(val)
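
The NCCL fusion knobs in these examples (threshold, max ops, all-gather, broadcast) are normally set together before launching a job. The sketch below assumes they are exported under flow.config.collective_boxing; that namespace is an assumption, and only the parameter meanings come from the docstrings above.

from oneflow.compatible import single_client as flow

# Assumed export namespace (flow.config.collective_boxing); values are illustrative.
flow.config.collective_boxing.nccl_fusion_threshold_mb(16)   # fuse collectives below 16 MB
flow.config.collective_boxing.nccl_fusion_max_ops(24)        # cap ops per fused group
flow.config.collective_boxing.nccl_fusion_all_gather(True)   # also fuse all-gather
flow.config.collective_boxing.nccl_fusion_broadcast(True)    # also fuse broadcast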
Example No. 25
def api_data_port(val: int) -> None:
    """Set port number used to data transfer among multiple machines. Same on every machine.

    Args:
        val: a port number accessible to peer machines
    """
    return enable_if.unique([data_port, do_nothing])(val)
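
ctrl_port and data_port describe the two ports a multi-machine job needs, and both must be identical across machines. A hedged sketch follows, assuming the setters live under flow.env (an assumption based on the legacy single-client API, not stated above).

from oneflow.compatible import single_client as flow

# Assumed export namespace (flow.env); port numbers are illustrative.
flow.env.ctrl_port(9527)   # control-plane port, same on every machine
flow.env.data_port(9528)   # data-transfer port, same on every machine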
Example No. 26
def api_nccl_fusion_all_reduce_use_buffer(val: bool) -> None:
    """Whether or not use buffer during nccl fusion progress

    Args:
        val (bool): True or False
    """
    return enable_if.unique([nccl_fusion_all_reduce_use_buffer, do_nothing])(val)
Example No. 27
def api_logtostderr(val: int) -> None:
    """Set whether log messages go to stderr instead of logfiles

    Args:
        val (int): [description]
    """
    return enable_if.unique([logtostderr, do_nothing])(val)
Example No. 28
def api_nccl_fusion_all_gather(val: bool) -> None:
    """Whether or not use nccl fusion during all  gather progress

    Args:
        val (bool): True or False
    """
    return enable_if.unique([nccl_fusion_all_gather, do_nothing])(val)
Example No. 29
def api_env_init() -> bool:
    """Init environment for job

    Returns:
        bool: [description]
    """
    return enable_if.unique([_env_init_single_client, do_nothing])()
Example No. 30
def api_nccl_fusion_broadcast(val: bool) -> None:
    """Whether or not use nccl fusion during broadcast progress

    Args:
        val (bool): True or False
    """
    return enable_if.unique([nccl_fusion_broadcast, do_nothing])(val)