def load(self, path: str) -> None:
    """Load a checkpoint from `path` and initialize models.

    Args:
        path: A `string` of path to load checkpoint.
    """
    if not config_util.api_legacy_model_io_enabled():
        # v2 model IO: restore variables directly from the checkpoint dir.
        check_point_v2.LoadVariables(check_point_v2.GetCheckpoint(path))
        return
    # isinstance is the idiomatic type check and, unlike the previous
    # `type(path) is str`, also accepts str subclasses.
    assert isinstance(path, str)
    # Dispatch to lazy or eager loader depending on execution mode.
    enable_if.unique([lazy_checkpoint_load, eager_checkpoint_load])(path)
def save(self, path: str) -> None:
    """Save a checkpoint to `path`.

    Args:
        path: A `string` of path to save checkpoint.
    """
    if not config_util.api_legacy_model_io_enabled():
        # v2 model IO: dump the variable dict straight to `path`.
        check_point_v2.SaveVarDict(path)
        return
    # isinstance is the idiomatic type check and, unlike the previous
    # `type(path) is str`, also accepts str subclasses.
    assert isinstance(path, str)
    # Dispatch to lazy or eager saver depending on execution mode.
    enable_if.unique([lazy_checkpoint_save, eager_checkpoint_save])(path)
def api_gpu_device_num(val: int) -> None:
    """Set number of GPUs on each machine to run oneflow on.

    Args:
        val (int): number of GPUs. It is identical on every machine. In other
            words, you can't specify different number of GPUs you would like
            to use on each machine.
    """
    if oneflow._oneflow_internal.flags.with_cuda():
        # CUDA build: configure the GPU device count directly.
        return enable_if.unique([gpu_device_num, do_nothing])(val)
    # CPU-only build: inform the user and configure CPU devices instead.
    print(
        "INFO: for CPU-only OneFlow, oneflow.compatible.single_client.config.gpu_device_num is equivalent to oneflow.compatible.single_client.config.cpu_device_num"
    )
    print(traceback.format_stack()[-2])
    return enable_if.unique([cpu_device_num, do_nothing])(val)
def api_enable_eager_execution(val: bool = True) -> None:
    """Switch the job between eager mode and lazy (static graph) mode.

    Args:
        val (bool, optional): True enables eager execution. Defaults to True.
    """
    setter = enable_if.unique([enable_eager_environment])
    return setter(val)
def api_log_dir(val: str) -> None:
    """Choose the directory where OneFlow writes its logging files.

    When never called, the default location is `./log`.

    Args:
        val (str): log file path.
    """
    setter = enable_if.unique([log_dir, do_nothing])
    return setter(val)
def api_ctrl_port(val: int) -> None:
    """Configure the control port used to coordinate execution across machines.

    The same port number must be used on every machine.

    Args:
        val (int): a port number accessible to peer machines.
    """
    setter = enable_if.unique([ctrl_port, do_nothing])
    return setter(val)
def BoxingTo(builder, produced_blob_object, consumer_op_arg_parallel_attr):
    """Re-box `produced_blob_object` so it satisfies the consumer's parallel attribute.

    Args:
        builder: instruction builder handed to the selected boxing method.
        produced_blob_object: blob produced upstream, carrying its own
            `op_arg_parallel_attr` and `op_arg_blob_attr`.
        consumer_op_arg_parallel_attr: parallel attribute the consumer op requires.

    Returns:
        The blob object to feed the consumer — the original when no boxing is
        needed, otherwise whatever the matched boxing method produces.

    Raises:
        NotImplementedError: when no registered boxing method matches.
    """
    hob_context = BoxingHobContext(produced_blob_object, consumer_op_arg_parallel_attr)
    # Fast path: producer and consumer already agree, no boxing required.
    if enable_if.get_condition_hob(NoBoxing)(hob_context):
        return produced_blob_object
    producer_opt_mirrored_parallel = (
        produced_blob_object.op_arg_parallel_attr.opt_mirrored_parallel
    )
    consumer_opt_mirrored_parallel = consumer_op_arg_parallel_attr.opt_mirrored_parallel
    # Boxing across differing mirrored-parallel settings is unsupported.
    assert producer_opt_mirrored_parallel == consumer_opt_mirrored_parallel, (
        "\nproducer_op_arg_parallel_attr: %s\nconsumer_op_arg_parallel_attr: %s"
        % (produced_blob_object.op_arg_parallel_attr, consumer_op_arg_parallel_attr)
    )

    def default(get_failed_info, *args, **kwargs):
        # Fallback invoked by enable_if.unique when no condition matched;
        # raises with a diagnostic describing the failed dispatch.
        raise NotImplementedError(
            "%s\nno boxing method found.\nlogical_blob_name: %s\nx_arg_attribute: %s\nconsumer_op_arg_parallel_attr: %s\n"
            % (
                get_failed_info(),
                produced_blob_object.op_arg_blob_attr.logical_blob_name,
                produced_blob_object.op_arg_parallel_attr,
                consumer_op_arg_parallel_attr,
            )
        )

    global conditional_function_table
    # Select the unique boxing method whose hob condition holds for this context.
    function = enable_if.unique(
        conditional_function_table,
        context=BoxingHobContext(produced_blob_object, consumer_op_arg_parallel_attr),
        default=default,
    )
    return function(builder, produced_blob_object, consumer_op_arg_parallel_attr)
def api_nccl_enable_all_to_all(val: bool) -> None:
    """Enable or disable nccl all2all during s2s boxing.

    Args:
        val (bool): True or False.
    """
    setter = enable_if.unique([nccl_enable_all_to_all, do_nothing])
    return setter(val)
def api_nccl_use_compute_stream(val: bool = False) -> None:
    """Let nccl run on the compute stream to reuse nccl memory and speed up.

    Args:
        val (bool, optional): True or False. Defaults to False.
    """
    setter = enable_if.unique([nccl_use_compute_stream, do_nothing])
    return setter(val=val)
def api_enable_mem_chain_merge(val: bool = True) -> None:
    """Enable or disable MemChain merge.

    Args:
        val (bool, optional): True or False. Defaults to True.
    """
    setter = enable_if.unique([enable_mem_chain_merge, do_nothing])
    return setter(val=val)
def api_enable_tensor_float_32_compute(val: bool = True) -> None:
    """Enable or disable TensorFloat-32 compute on supported GPUs.

    Args:
        val (bool, optional): True or False. Defaults to True.
    """
    setter = enable_if.unique([enable_tensor_float_32_compute, do_nothing])
    return setter(val=val)
def api_enable_fusion(val: bool = True) -> None:
    """Allow or disallow operator fusion.

    Args:
        val (bool, optional): True or False. Defaults to True.
    """
    setter = enable_if.unique([enable_fusion, do_nothing])
    return setter(val=val)
def api_enable_model_io_v2(val):
    """Turn the version-2 model input/output functions on or off.

    Args:
        val: True or False.
    """
    setter = enable_if.unique([enable_model_io_v2, do_nothing])
    return setter(val)
def api_enable_legacy_model_io(val: bool = True):
    """Turn legacy model io on or off.

    Args:
        val (bool, optional): True or False. Defaults to True.
    """
    setter = enable_if.unique([enable_legacy_model_io, do_nothing])
    return setter(val)
def api_enable_debug_mode(val: bool) -> None:
    """Turn debug mode on or off.

    Args:
        val (bool): True or False.
    """
    setter = enable_if.unique([enable_debug_mode, do_nothing])
    return setter(val)
def api_nccl_fusion_max_ops(val: int) -> None:
    """Set the maximum number of ops fused into one nccl call.

    Args:
        val (int): maximum number of ops.
    """
    setter = enable_if.unique([nccl_fusion_max_ops, do_nothing])
    return setter(val)
def api_machine_num(val: int) -> None:
    """Set the number of machines/nodes available for running the job.

    Args:
        val (int): available number of machines.
    """
    setter = enable_if.unique([machine_num, do_nothing])
    return setter(val)
def api_disable_group_boxing_by_dst_parallel(val: bool = False) -> None:
    """Disable the group-boxing-by-dst-parallel pass to shorten boxing memory life cycles.

    Args:
        val (bool, optional): True or False. Defaults to False.
    """
    setter = enable_if.unique([disable_group_boxing_by_dst_parallel, do_nothing])
    return setter(val=val)
def api_nccl_enable_mixed_fusion(val: bool) -> None:
    """Enable or disable nccl mixed fusion.

    Args:
        val (bool): True or False.
    """
    setter = enable_if.unique([nccl_enable_mixed_fusion, do_nothing])
    return setter(val)
def api_load_library_now(val: str) -> None:
    """Load a library needed by the job immediately.

    Args:
        val (str): path to the shared object file.
    """
    setter = enable_if.unique([load_library_now, do_nothing])
    return setter(val)
def api_cpu_device_num(val: int) -> None:
    """Set the number of CPUs per machine that oneflow runs on.

    Usually you don't need to set this.

    Args:
        val (int): number of CPUs; identical on every machine.
    """
    setter = enable_if.unique([cpu_device_num, do_nothing])
    return setter(val)
def api_nccl_num_streams(val: int) -> None:
    """Set the number of parallel nccl streams used by boxing.

    Args:
        val (int): number of streams.
    """
    setter = enable_if.unique([nccl_num_streams, do_nothing])
    return setter(val)
def api_get_current_machine_id():
    """Return the machine id of the current machine/node.

    Returns:
        The id of this machine as reported by the runtime.
    """
    getter = enable_if.unique([get_current_machine_id])
    return getter()
def api_nccl_fusion_threshold_mb(val: int) -> None:
    """Set the size threshold (in MB) for operator fusion.

    Args:
        val (int): threshold in megabytes, e.g. 10.
    """
    setter = enable_if.unique([nccl_fusion_threshold_mb, do_nothing])
    return setter(val)
def api_data_port(val: int) -> None:
    """Configure the port used for data transfer among machines.

    The same port number must be used on every machine.

    Args:
        val (int): a port number accessible to peer machines.
    """
    setter = enable_if.unique([data_port, do_nothing])
    return setter(val)
def api_nccl_fusion_all_reduce_use_buffer(val: bool) -> None:
    """Choose whether a buffer is used during nccl fused all-reduce.

    Args:
        val (bool): True or False.
    """
    setter = enable_if.unique([nccl_fusion_all_reduce_use_buffer, do_nothing])
    return setter(val)
def api_logtostderr(val: int) -> None:
    """Route log messages to stderr instead of log files.

    Args:
        val (int): nonzero to log to stderr.
    """
    setter = enable_if.unique([logtostderr, do_nothing])
    return setter(val)
def api_nccl_fusion_all_gather(val: bool) -> None:
    """Enable or disable nccl fusion during all-gather.

    Args:
        val (bool): True or False.
    """
    setter = enable_if.unique([nccl_fusion_all_gather, do_nothing])
    return setter(val)
def api_env_init() -> bool:
    """Initialize the environment for the job.

    Returns:
        bool: whether initialization took place.
    """
    initializer = enable_if.unique([_env_init_single_client, do_nothing])
    return initializer()
def api_nccl_fusion_broadcast(val: bool) -> None:
    """Enable or disable nccl fusion during broadcast.

    Args:
        val (bool): True or False.
    """
    setter = enable_if.unique([nccl_fusion_broadcast, do_nothing])
    return setter(val)