示例#1
0
文件: io.py 项目: zlapp/vissl
def copy_file(input_file, destination_dir, tmp_destination_dir):
    """
    Copy a given input_file from source to the destination directory.

    Steps:
    1. We use g_pathmgr to extract the data to local path.
    2. we simply move the files from the g_pathmgr cached local directory
       to the user specified destination directory. We use rsync.
       How destination dir is chosen:
            a) If user is using slurm, we set destination_dir = slurm_dir (see get_slurm_dir)
            b) If the local path used by PathManafer is same as the input_file path,
               and the destination directory is not specified, we set
               destination_dir = tmp_destination_dir

    Returns:
        output_file (str): the new path of the file
        destination_dir (str): the destination dir that was actually used
    """
    # we first extract the local path for the files. g_pathmgr
    # determines the local path itself and copies data there.
    logging.info(f"Copying {input_file} to local path...")
    out = g_pathmgr.get_local_path(input_file)
    output_dir = os.path.dirname(out)
    logging.info(f"File coped to: {out}")

    if (out == input_file) and not destination_dir:
        destination_dir = tmp_destination_dir
        logging.info(
            f"The file wasn't copied. Copying again to temp "
            f"destination directory: {destination_dir}"
        )
    # if the user wants to copy the files to a specific location,
    # we simply move the files from the g_pathmgr cached directory
    # to the user specified directory.
    destination_dir = get_slurm_dir(destination_dir)
    if "SLURM_JOBID" in os.environ:
        destination_dir = get_slurm_dir(destination_dir)
    if destination_dir is not None:
        makedir(destination_dir)
        output_file = f"{destination_dir}/{os.path.basename(input_file)}"
        if g_pathmgr.exists(output_file):
            logging.info(f"File already copied: {output_file}")
            return output_file, destination_dir

        logging.info(f"Copying file: {input_file} to destination: {destination_dir}")
        stime = time.perf_counter()
        os.system(f"rsync -a --progress {out} {destination_dir}")
        etime = time.perf_counter()
        logging.info(
            f"Copied file | time (sec): {round(etime - stime, 4)} "
            f"size: {get_file_size(output_file)}"
        )
        return output_file, destination_dir
    else:
        return out, output_dir
示例#2
0
def get_local_path(input_file, dest_dir):
    """
    If user specified copying data to a local directory,
    get the local path where the data files were copied.

    - If input_file is just a file, we return the dest_dir/filename
    - If the intput_file is a directory, then we check if the
      environemt is SLURM and use slurm_dir or otherwise dest_dir
      to look up copy_complete file is available.
      If available, we return the directory.
    - If both above fail, we return the input_file as is.
    """
    out = ""
    if PathManager.isfile(input_file):
        out = os.path.join(dest_dir, os.path.basename(input_file))
    elif PathManager.isdir(input_file):
        data_name = input_file.strip("/").split("/")[-1]
        if "SLURM_JOBID" in os.environ:
            dest_dir = get_slurm_dir(dest_dir)
        dest_dir = os.path.join(dest_dir, data_name)
        complete_flag = os.path.join(dest_dir, "copy_complete")
        if PathManager.isfile(complete_flag):
            out = dest_dir
    if PathManager.exists(out):
        return out
    else:
        return input_file
示例#3
0
文件: io.py 项目: iseessel/vissl
def copy_dir(input_dir, destination_dir, num_threads):
    """
    Copy contents of one directory to the specified destination directory
    using the number of threads to speed up the copy. When the data is
    copied successfully, we create a copy_complete file in the
    destination_dir folder to mark the completion. If the destination_dir
    folder already exists and has the copy_complete file, we don't
    copy the file.

    useful for copying datasets like ImageNet to speed up dataloader.
    Using 20 threads for imagenet takes about 20 minutes to copy.

    Returns:
        destination_dir (str): directory where the contents were copied
    """
    # remove the backslash if user added it
    data_name = input_dir.strip("/").split("/")[-1]
    if "SLURM_JOBID" in os.environ:
        destination_dir = get_slurm_dir(destination_dir)
    destination_dir = f"{destination_dir}/{data_name}"
    makedir(destination_dir)
    complete_flag = f"{destination_dir}/copy_complete"
    if PathManager.isfile(complete_flag):
        logging.info(f"Found Data already copied: {destination_dir}...")
        return destination_dir
    logging.info(
        f"Copying {input_dir} to dir {destination_dir} using {num_threads} threads"
    )
    # We have to do multi-threaded rsync to speed up copy.
    cmd = (f"ls -d {input_dir}/* | parallel -j {num_threads} --will-cite "
           f"rsync -ruW --inplace {{}} {destination_dir}")
    os.system(cmd)
    PathManager.open(complete_flag, "a").close()
    logging.info("Copied to local directory")
    return destination_dir, destination_dir