def copy_file(input_file, destination_dir, tmp_destination_dir):
    """
    Copy a given input_file from source to the destination directory.

    Steps:
    1. We use g_pathmgr to extract the data to a local path.
    2. If a destination directory is in effect, we rsync the file from the
       g_pathmgr local path to that directory.

    How destination dir is chosen (in the order applied):
    a) If the local path returned by g_pathmgr is the same as the input_file
       path and the destination directory is not specified, we set
       destination_dir = tmp_destination_dir
    b) If the user is using slurm (SLURM_JOBID is set in the environment),
       we set destination_dir = get_slurm_dir(destination_dir)

    Args:
        input_file (str): path of the file to copy
        destination_dir (str or None): directory requested by the user
        tmp_destination_dir (str): fallback directory, see (a) above

    Returns:
        output_file (str): the new path of the file
        destination_dir (str): the destination dir that was actually used
    """
    # Local import: the file-level import block is not visible from here.
    import shlex

    # we first extract the local path for the files. g_pathmgr
    # determines the local path itself and copies data there.
    logging.info(f"Copying {input_file} to local path...")
    out = g_pathmgr.get_local_path(input_file)
    output_dir = os.path.dirname(out)
    logging.info(f"File coped to: {out}")

    if (out == input_file) and not destination_dir:
        destination_dir = tmp_destination_dir
        logging.info(
            f"The file wasn't copied. Copying again to temp "
            f"destination directory: {destination_dir}"
        )

    # The slurm directory is only applied when running under SLURM, exactly
    # once, matching how copy_dir and get_local_path resolve it.
    if "SLURM_JOBID" in os.environ:
        destination_dir = get_slurm_dir(destination_dir)

    # No destination in effect: the g_pathmgr local path is the result.
    if destination_dir is None:
        return out, output_dir

    # if the user wants to copy the files to a specific location,
    # we simply move the files from the g_pathmgr cached directory
    # to the user specified directory.
    makedir(destination_dir)
    output_file = f"{destination_dir}/{os.path.basename(input_file)}"
    if g_pathmgr.exists(output_file):
        logging.info(f"File already copied: {output_file}")
        return output_file, destination_dir

    logging.info(f"Copying file: {input_file} to destination: {destination_dir}")
    stime = time.perf_counter()
    # Quote both paths so spaces or shell metacharacters in them are passed
    # to rsync literally instead of being interpreted by the shell.
    status = os.system(
        f"rsync -a --progress {shlex.quote(out)} {shlex.quote(destination_dir)}"
    )
    etime = time.perf_counter()
    if status != 0:
        # Best-effort copy: report the failure but keep the original flow.
        logging.warning(
            f"rsync exited with status {status} while copying {out} "
            f"to {destination_dir}"
        )
    logging.info(
        f"Copied file | time (sec): {round(etime - stime, 4)} "
        f"size: {get_file_size(output_file)}"
    )
    return output_file, destination_dir
def get_local_path(input_file, dest_dir):
    """
    Resolve the local path where a previously copied input can be found.

    - When input_file is a file, the candidate is dest_dir/<file name>.
    - When input_file is a directory, the candidate is <base>/<dir name>,
      where <base> is get_slurm_dir(dest_dir) if SLURM_JOBID is set in the
      environment and dest_dir otherwise. The candidate is only used when it
      contains the copy_complete marker file.
    - When no candidate was selected, or the candidate does not exist,
      input_file is returned unchanged.
    """
    candidate = ""
    if PathManager.isfile(input_file):
        candidate = os.path.join(dest_dir, os.path.basename(input_file))
    elif PathManager.isdir(input_file):
        # last path component, ignoring any leading/trailing slashes
        dataset_name = input_file.strip("/").rsplit("/", 1)[-1]
        base_dir = get_slurm_dir(dest_dir) if "SLURM_JOBID" in os.environ else dest_dir
        local_dir = os.path.join(base_dir, dataset_name)
        if PathManager.isfile(os.path.join(local_dir, "copy_complete")):
            candidate = local_dir
    # The existence check runs even when no candidate was selected (empty
    # string); in that case we fall back to the original input path.
    return candidate if PathManager.exists(candidate) else input_file
def copy_dir(input_dir, destination_dir, num_threads):
    """
    Copy contents of one directory to the specified destination directory
    using the number of threads to speed up the copy. When the copy command
    has run, we create a copy_complete file in the destination_dir folder
    to mark the completion. If the destination_dir folder already exists
    and has the copy_complete file, we don't copy the data again.

    Useful for copying datasets like ImageNet to speed up dataloader.
    Using 20 threads for imagenet takes about 20 minutes to copy.

    Args:
        input_dir (str): directory whose contents are copied
        destination_dir (str): parent directory for the copy; replaced by
            get_slurm_dir(destination_dir) when SLURM_JOBID is set
        num_threads (int): number of parallel rsync jobs

    Returns:
        (destination_dir, destination_dir): the directory where the contents
        were copied, given twice. The same 2-tuple is returned whether the
        data was copied now or was found already copied.
    """
    # remove any leading/trailing slash if the user added it, so the last
    # path component is the dataset name
    data_name = input_dir.strip("/").split("/")[-1]
    if "SLURM_JOBID" in os.environ:
        destination_dir = get_slurm_dir(destination_dir)
    destination_dir = f"{destination_dir}/{data_name}"
    makedir(destination_dir)
    complete_flag = f"{destination_dir}/copy_complete"
    if PathManager.isfile(complete_flag):
        logging.info(f"Found Data already copied: {destination_dir}...")
        # Same shape as the fresh-copy return below, so callers can unpack
        # the result regardless of which path was taken.
        return destination_dir, destination_dir
    logging.info(
        f"Copying {input_dir} to dir {destination_dir} using {num_threads} threads"
    )
    # We have to do multi-threaded rsync to speed up copy.
    cmd = (
        f"ls -d {input_dir}/* | parallel -j {num_threads} --will-cite "
        f"rsync -ruW --inplace {{}} {destination_dir}"
    )
    # NOTE(review): the exit status is ignored, so the completion marker is
    # written even if the copy command failed - confirm this is intended.
    os.system(cmd)
    PathManager.open(complete_flag, "a").close()
    logging.info("Copied to local directory")
    return destination_dir, destination_dir