def _process(dest_tree, src_tree, collected_files, download=False):
    from concurrent.futures import ThreadPoolExecutor
    from functools import partial

    from dvc.cache.local import _log_exceptions
    from dvc.exceptions import DownloadError, UploadError
    from dvc.progress import Tqdm

    from_infos = []
    to_infos = []
    names = []
    for from_info in collected_files:
        from_infos.append(from_info)
        fname = from_info.relative_to(src_tree.path_info)
        names.append(str(fname))
        to_infos.append(dest_tree.path_info / fname)
    total = len(from_infos)

    if download:
        func = partial(
            _log_exceptions(src_tree.download, "download"),
            dir_mode=dest_tree.dir_mode,
            file_mode=dest_tree.file_mode,
        )
        desc = "Downloading"
    else:
        func = _log_exceptions(dest_tree.upload, "upload")
        desc = "Uploading"

    with Tqdm(total=total, unit="file", desc=desc) as pbar:
        func = pbar.wrap_fn(func)
        # TODO: parallelize this, currently --jobs for repro applies to
        # number of repro executors not download threads
        with ThreadPoolExecutor(max_workers=1) as dl_executor:
            fails = sum(dl_executor.map(func, from_infos, to_infos, names))

    if fails:
        if download:
            raise DownloadError(fails)
        raise UploadError(fails)
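# All of the transfer loops in these snippets rely on the wrapped
# download/upload callables returning 0 on success and 1 on failure, so
# that ``sum(executor.map(...))`` yields a failure count. A minimal sketch
# of such a wrapper (the real ``_log_exceptions`` lives in DVC internals
# and may differ in detail; ``logger`` is assumed from the module):
from functools import wraps


def _log_exceptions_sketch(func, operation):
    @wraps(func)
    def wrapper(from_info, to_info, *args, **kwargs):
        try:
            func(from_info, to_info, *args, **kwargs)
            return 0
        except Exception:  # pylint: disable=broad-except
            logger.exception(
                "failed to %s '%s' to '%s'", operation, from_info, to_info
            )
            return 1

    return wrapper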
def _upload_plans(
    self, dir_plans, file_plans, dir_contents, missing_files, processor
):
    total_fails = 0
    succeeded_dir_hashes = []
    all_file_plans = list(zip(*file_plans))

    for dir_from_info, dir_to_info, dir_name, dir_hash in zip(*dir_plans):
        # pull out the file plans that belong to this directory
        bound_file_plans = []
        directory_hashes = dir_contents[dir_hash]
        for file_plan in all_file_plans.copy():
            if file_plan[-1] in directory_hashes:
                bound_file_plans.append(file_plan)
                all_file_plans.remove(file_plan)

        dir_fails = processor(bound_file_plans)
        if dir_fails:
            logger.debug(
                "failed to upload full contents of '{}', "
                "aborting .dir file upload".format(dir_name)
            )
            logger.error(
                f"failed to upload '{dir_from_info}'"
                f" to '{dir_to_info}'"
            )
            total_fails += dir_fails + 1
        elif directory_hashes.intersection(missing_files):
            # if for some reason a file contained in this dir is
            # missing both locally and in the remote, we want to
            # push whatever file content we have, but should not
            # push .dir file
            logger.debug(
                "directory '%s' contains missing files, "
                "skipping .dir file upload",
                dir_name,
            )
        else:
            is_dir_failed = processor.transfer_func(
                dir_from_info, dir_to_info, dir_name
            )
            total_fails += is_dir_failed
            if not is_dir_failed:
                succeeded_dir_hashes.append(dir_hash)

    # insert the rest
    total_fails += processor(all_file_plans)
    if total_fails:
        raise UploadError(total_fails)

    # index successfully pushed dirs
    for dir_hash in succeeded_dir_hashes:
        file_hashes = dir_contents[dir_hash]
        logger.debug(
            "Indexing pushed dir '{}' with "
            "'{}' nested files".format(dir_hash, len(file_hashes))
        )
        self.index.update([dir_hash], file_hashes)
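# ``_upload_plans`` assumes ``processor`` is callable over a batch of
# (from_info, to_info, name, hash) plan tuples, returns the number of
# failed transfers, and exposes the raw transfer callable as
# ``processor.transfer_func``. A hypothetical minimal implementation of
# that interface (names are illustrative, not DVC's):
class SerialProcessor:
    def __init__(self, transfer_func):
        # transfer_func returns 0 on success, 1 on failure
        self.transfer_func = transfer_func

    def __call__(self, plans):
        return sum(
            self.transfer_func(from_info, to_info, name)
            for from_info, to_info, name, _hash in plans
        )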
def _process(
    self,
    named_cache,
    remote,
    jobs=None,
    show_checksums=False,
    download=False,
):
    logger.debug(
        "Preparing to {} '{}'".format(
            "download data from" if download else "upload data to",
            remote.path_info,
        )
    )

    if download:
        func = partial(
            remote.download,
            dir_mode=self._dir_mode,
            file_mode=self._file_mode,
        )
        status = STATUS_DELETED
    else:
        func = remote.upload
        status = STATUS_NEW

    if jobs is None:
        jobs = remote.JOBS

    status_info = self.status(
        named_cache,
        remote,
        jobs=jobs,
        show_checksums=show_checksums,
        download=download,
    )

    plans = self._get_plans(download, remote, status_info, status)

    if len(plans[0]) == 0:
        return 0

    if jobs > 1:
        with ThreadPoolExecutor(max_workers=jobs) as executor:
            fails = sum(executor.map(func, *plans))
    else:
        fails = sum(map(func, *plans))

    if fails:
        if download:
            raise DownloadError(fails)
        raise UploadError(fails)

    return len(plans[0])
def _process(dest_tree, src_tree, collected_files, download=False):
    import stat
    from concurrent.futures import ThreadPoolExecutor, as_completed
    from functools import partial

    from dvc.exceptions import DownloadError, UploadError
    from dvc.progress import Tqdm
    from dvc.remote.base import _log_exceptions

    from_infos = []
    to_infos = []
    names = []
    for from_info in collected_files:
        from_infos.append(from_info)
        fname = from_info.relative_to(src_tree.path_info)
        names.append(str(fname))
        to_infos.append(dest_tree.path_info / fname)
    total = len(from_infos)

    if download:
        func = partial(
            _log_exceptions(src_tree.download, "download"),
            dir_mode=dest_tree.dir_mode,
        )
        desc = "Downloading"
    else:
        func = _log_exceptions(dest_tree.upload, "upload")
        desc = "Uploading"

    with Tqdm(total=total, unit="file", desc=desc) as pbar:
        func = pbar.wrap_fn(func)
        # TODO: parallelize this, currently --jobs for repro applies to
        # number of repro executors not download threads
        with ThreadPoolExecutor(max_workers=1) as dl_executor:
            mode = None
            stat_func = getattr(src_tree, "stat", None)
            futures = []
            for from_info, to_info, name in zip(from_infos, to_infos, names):
                if stat_func:
                    # preserve the source file's permission bits on the
                    # destination
                    mode = stat.S_IMODE(stat_func(from_info).st_mode)
                futures.append(
                    dl_executor.submit(
                        func, from_info, to_info, name, file_mode=mode
                    )
                )
            fails = sum(future.result() for future in as_completed(futures))

    if fails:
        if download:
            raise DownloadError(fails)
        raise UploadError(fails)
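# For reference, ``stat.S_IMODE`` strips the file-type bits from
# ``st_mode``, leaving only the permission bits to pass as ``file_mode``:
#
# >>> import os, stat
# >>> oct(stat.S_IMODE(os.stat("setup.py").st_mode))
# '0o644'  # for a typical rw-r--r-- file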
def push(
    self,
    targets=None,
    jobs=None,
    remote=None,
    all_branches=False,
    with_deps=False,
    all_tags=False,
    recursive=False,
    all_commits=False,
    run_cache=False,
    revs=None,
    glob=False,
):
    used_run_cache = self.stage_cache.push(remote) if run_cache else []

    if isinstance(targets, str):
        targets = [targets]
    expanded_targets = glob_targets(targets, glob=glob)

    used = self.used_objs(
        expanded_targets,
        all_branches=all_branches,
        all_tags=all_tags,
        all_commits=all_commits,
        with_deps=with_deps,
        force=True,
        remote=remote,
        jobs=jobs,
        recursive=recursive,
        used_run_cache=used_run_cache,
        revs=revs,
    )

    pushed = len(used_run_cache)
    for odb, objs in used.items():
        if odb and odb.read_only:
            continue
        try:
            pushed += self.cloud.push(objs, jobs, remote=remote, odb=odb)
        except FileTransferError as exc:
            raise UploadError(exc.amount)
    return pushed
def _process(
    self,
    named_cache,
    remote,
    jobs=None,
    show_checksums=False,
    download=False,
):
    logger.debug(
        "Preparing to {} '{}'".format(
            "download data from" if download else "upload data to",
            remote.tree.path_info,
        )
    )

    if download:
        func = partial(
            _log_exceptions(remote.tree.download, "download"),
            dir_mode=self.tree.dir_mode,
            file_mode=self.tree.file_mode,
        )
        status = STATUS_DELETED
        desc = "Downloading"
    else:
        func = _log_exceptions(remote.tree.upload, "upload")
        status = STATUS_NEW
        desc = "Uploading"

    if jobs is None:
        jobs = remote.tree.JOBS

    dir_status, file_status, dir_contents = self._status(
        named_cache,
        remote,
        jobs=jobs,
        show_checksums=show_checksums,
        download=download,
    )

    dir_plans, _ = self._get_plans(download, remote, dir_status, status)
    file_plans, missing_files = self._get_plans(
        download, remote, file_status, status
    )

    total = len(dir_plans[0]) + len(file_plans[0])
    if total == 0:
        return 0

    with Tqdm(total=total, unit="file", desc=desc) as pbar:
        func = pbar.wrap_fn(func)
        with ThreadPoolExecutor(max_workers=jobs) as executor:
            if download:
                from_infos, to_infos, names, _ = (
                    d + f for d, f in zip(dir_plans, file_plans)
                )
                fails = sum(executor.map(func, from_infos, to_infos, names))
            else:
                # for uploads, push files first, and any .dir files last
                file_futures = {}
                for from_info, to_info, name, hash_ in zip(*file_plans):
                    file_futures[hash_] = executor.submit(
                        func, from_info, to_info, name
                    )
                dir_futures = {}
                for from_info, to_info, name, dir_hash in zip(*dir_plans):
                    # if for some reason a file contained in this dir is
                    # missing both locally and in the remote, we want to
                    # push whatever file content we have, but should not
                    # push .dir file
                    for file_hash in missing_files:
                        if file_hash in dir_contents[dir_hash]:
                            logger.debug(
                                "directory '%s' contains missing files, "
                                "skipping .dir file upload",
                                name,
                            )
                            break
                    else:
                        wait_futures = {
                            future
                            for file_hash, future in file_futures.items()
                            if file_hash in dir_contents[dir_hash]
                        }
                        dir_futures[dir_hash] = executor.submit(
                            self._dir_upload,
                            func,
                            wait_futures,
                            from_info,
                            to_info,
                            name,
                        )
                fails = sum(
                    future.result()
                    for future in concat(
                        file_futures.values(), dir_futures.values()
                    )
                )

    if fails:
        if download:
            remote.index.clear()
            raise DownloadError(fails)
        raise UploadError(fails)

    if not download:
        # index successfully pushed dirs
        for dir_hash, future in dir_futures.items():
            if future.result() == 0:
                file_hashes = dir_contents[dir_hash]
                logger.debug(
                    "Indexing pushed dir '{}' with "
                    "'{}' nested files".format(dir_hash, len(file_hashes))
                )
                remote.index.update([dir_hash], file_hashes)

    return len(dir_plans[0]) + len(file_plans[0])
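# ``_dir_upload`` is referenced above (and in the later variants) but not
# shown. A sketch consistent with how it is used here, as a method on the
# same class: wait on this directory's file-upload futures and only upload
# the .dir manifest if none of them failed. The real DVC helper may differ
# in detail:
@staticmethod
def _dir_upload(func, futures, from_info, to_info, name):
    for future in futures:
        if future.result():
            # do not upload this .dir file if any file in this
            # directory failed to upload
            logger.debug(
                "failed to upload full contents of '{}', "
                "aborting .dir file upload".format(name)
            )
            logger.error(f"failed to upload '{from_info}' to '{to_info}'")
            return 1
    return func(from_info, to_info, name)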
def _upload_plans(
    self,
    dir_plans,
    file_plans,
    dir_contents,
    missing_files,
    executor,
    jobs,
    func,
):
    total_fails = 0

    def insert_batched(file_plans):
        # keep a bounded window of at most ``jobs * 5`` in-flight uploads,
        # refilling the window as tasks complete
        fails = 0
        file_plan_iterator = iter(file_plans)

        def create_taskset(amount):
            return {
                executor.submit(func, from_info, to_info, name)
                for from_info, to_info, name, _ in itertools.islice(
                    file_plan_iterator, amount
                )
            }

        tasks = create_taskset(jobs * 5)
        while tasks:
            done, tasks = futures.wait(
                tasks, return_when=futures.FIRST_COMPLETED
            )
            fails += sum(task.result() for task in done)
            tasks.update(create_taskset(len(done)))
        return fails

    succeeded_dir_hashes = []
    all_file_plans = list(zip(*file_plans))
    for dir_from_info, dir_to_info, dir_name, dir_hash in zip(*dir_plans):
        bound_file_plans = []
        directory_hashes = dir_contents[dir_hash]
        for file_plan in all_file_plans.copy():
            if file_plan[-1] in directory_hashes:
                bound_file_plans.append(file_plan)
                all_file_plans.remove(file_plan)

        dir_fails = insert_batched(bound_file_plans)
        if dir_fails:
            logger.debug(
                "failed to upload full contents of '{}', "
                "aborting .dir file upload".format(dir_name)
            )
            logger.error(
                f"failed to upload '{dir_from_info}' to '{dir_to_info}'"
            )
            total_fails += dir_fails + 1
        elif directory_hashes.intersection(missing_files):
            # if for some reason a file contained in this dir is
            # missing both locally and in the remote, we want to
            # push whatever file content we have, but should not
            # push .dir file
            logger.debug(
                "directory '%s' contains missing files, "
                "skipping .dir file upload",
                dir_name,
            )
        else:
            is_dir_failed = func(dir_from_info, dir_to_info, dir_name)
            total_fails += is_dir_failed
            if not is_dir_failed:
                succeeded_dir_hashes.append(dir_hash)

    # insert the rest
    total_fails += insert_batched(all_file_plans)
    if total_fails:
        raise UploadError(total_fails)

    # index successfully pushed dirs
    for dir_hash in succeeded_dir_hashes:
        file_hashes = dir_contents[dir_hash]
        logger.debug(
            "Indexing pushed dir '{}' with "
            "'{}' nested files".format(dir_hash, len(file_hashes))
        )
        self.index.update([dir_hash], file_hashes)
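# ``insert_batched`` above is an instance of a generic bounded-window
# submission pattern. A self-contained, stdlib-only illustration of the
# same idea (the work function and plan shape are hypothetical; the same
# ``itertools`` / ``concurrent.futures`` imports are assumed by
# ``_upload_plans`` itself):
import itertools
from concurrent import futures


def run_windowed(executor, work, items, window):
    """Submit ``work`` over ``items``, keeping at most ``window`` tasks
    in flight; returns the sum of the results."""
    it = iter(items)
    total = 0
    tasks = {executor.submit(work, x) for x in itertools.islice(it, window)}
    while tasks:
        done, tasks = futures.wait(tasks, return_when=futures.FIRST_COMPLETED)
        total += sum(task.result() for task in done)
        tasks.update(
            executor.submit(work, x) for x in itertools.islice(it, len(done))
        )
    return total


# usage sketch:
# with futures.ThreadPoolExecutor(max_workers=jobs) as pool:
#     fails = run_windowed(pool, upload_one, plans, window=jobs * 5)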
def push(
    self,
    targets=None,
    jobs=None,
    remote=None,
    all_branches=False,
    with_deps=False,
    all_tags=False,
    recursive=False,
    all_commits=False,
    run_cache=False,
    revs=None,
    glob=False,
    odb: Optional["ObjectDB"] = None,
    include_imports=False,
):
    used_run_cache = (
        self.stage_cache.push(remote, odb=odb) if run_cache else []
    )

    if isinstance(targets, str):
        targets = [targets]
    expanded_targets = glob_targets(targets, glob=glob)

    used = self.used_objs(
        expanded_targets,
        all_branches=all_branches,
        all_tags=all_tags,
        all_commits=all_commits,
        with_deps=with_deps,
        force=True,
        remote=remote,
        jobs=jobs,
        recursive=recursive,
        used_run_cache=used_run_cache,
        revs=revs,
    )

    pushed = len(used_run_cache)
    if odb:
        # push everything in one batch to the explicitly requested ODB
        all_ids = set()
        for dest_odb, obj_ids in used.items():
            if not include_imports and dest_odb and dest_odb.read_only:
                continue
            all_ids.update(obj_ids)
        try:
            pushed += self.cloud.push(all_ids, jobs, remote=remote, odb=odb)
        except FileTransferError as exc:
            raise UploadError(exc.amount)
    else:
        for dest_odb, obj_ids in used.items():
            if dest_odb and dest_odb.read_only:
                continue
            try:
                pushed += self.cloud.push(
                    obj_ids, jobs, remote=remote, odb=odb or dest_odb
                )
            except FileTransferError as exc:
                raise UploadError(exc.amount)
    return pushed
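# Hypothetical usage sketch for ``push`` (the ``repo`` object and remote
# name are assumptions, not from the source):
#
# pushed_count = repo.push(jobs=4, remote="myremote", run_cache=True)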
def _process(
    self,
    named_cache,
    remote,
    jobs=None,
    show_checksums=False,
    download=False,
):
    logger.debug(
        "Preparing to {} '{}'".format(
            "download data from" if download else "upload data to",
            remote.path_info,
        )
    )

    if download:
        func = partial(
            remote.download,
            dir_mode=self._dir_mode,
            file_mode=self._file_mode,
        )
        status = STATUS_DELETED
        desc = "Downloading"
    else:
        func = remote.upload
        status = STATUS_NEW
        desc = "Uploading"

    if jobs is None:
        jobs = remote.JOBS

    dir_status, file_status, dir_contents = self._status(
        named_cache,
        remote,
        jobs=jobs,
        show_checksums=show_checksums,
        download=download,
    )

    dir_plans = self._get_plans(download, remote, dir_status, status)
    file_plans = self._get_plans(download, remote, file_status, status)

    total = len(dir_plans[0]) + len(file_plans[0])
    if total == 0:
        return 0

    with Tqdm(total=total, unit="file", desc=desc) as pbar:
        func = pbar.wrap_fn(func)
        with ThreadPoolExecutor(max_workers=jobs) as executor:
            if download:
                fails = sum(executor.map(func, *dir_plans))
                fails += sum(executor.map(func, *file_plans))
            else:
                # for uploads, push files first, and any .dir files last
                file_futures = {}
                for from_info, to_info, name, checksum in zip(*file_plans):
                    file_futures[checksum] = executor.submit(
                        func, from_info, to_info, name
                    )
                dir_futures = {}
                for from_info, to_info, name, dir_checksum in zip(*dir_plans):
                    wait_futures = {
                        future
                        for file_checksum, future in file_futures.items()
                        if file_checksum in dir_contents[dir_checksum]
                    }
                    dir_futures[dir_checksum] = executor.submit(
                        self._dir_upload,
                        func,
                        wait_futures,
                        from_info,
                        to_info,
                        name,
                    )
                fails = sum(
                    future.result()
                    for future in concat(
                        file_futures.values(), dir_futures.values()
                    )
                )

    if fails:
        if download:
            remote.index.clear()
            raise DownloadError(fails)
        raise UploadError(fails)

    if not download:
        # index successfully pushed dirs
        for dir_checksum, future in dir_futures.items():
            if future.result() == 0:
                file_checksums = dir_contents[dir_checksum]
                logger.debug(
                    "Indexing pushed dir '{}' with "
                    "'{}' nested files".format(
                        dir_checksum, len(file_checksums)
                    )
                )
                remote.index.update([dir_checksum], file_checksums)

    return len(dir_plans[0]) + len(file_plans[0])
def _process(
    self,
    named_cache,
    remote,
    jobs=None,
    show_checksums=False,
    download=False,
):
    logger.debug(
        "Preparing to {} '{}'".format(
            "download data from" if download else "upload data to",
            remote.path_info,
        )
    )

    if download:
        func = partial(
            remote.download,
            dir_mode=self._dir_mode,
            file_mode=self._file_mode,
        )
        status = STATUS_DELETED
    else:
        func = remote.upload
        status = STATUS_NEW

    if jobs is None:
        jobs = remote.JOBS

    dir_status, file_status, dir_paths = self._status(
        named_cache,
        remote,
        jobs=jobs,
        show_checksums=show_checksums,
        download=download,
    )

    dir_plans = self._get_plans(download, remote, dir_status, status)
    file_plans = self._get_plans(download, remote, file_status, status)

    if len(dir_plans[0]) + len(file_plans[0]) == 0:
        return 0

    with ThreadPoolExecutor(max_workers=jobs) as executor:
        if download:
            fails = sum(executor.map(func, *dir_plans))
            fails += sum(executor.map(func, *file_plans))
        else:
            # for uploads, push files first, and any .dir files last
            file_futures = {}
            for from_info, to_info, name in zip(*file_plans):
                file_futures[to_info] = executor.submit(
                    func, from_info, to_info, name
                )
            dir_futures = {}
            for from_info, to_info, name in zip(*dir_plans):
                wait_futures = {
                    future
                    for file_path, future in file_futures.items()
                    if file_path in dir_paths[to_info]
                }
                dir_futures[to_info] = executor.submit(
                    self._dir_upload,
                    func,
                    wait_futures,
                    from_info,
                    to_info,
                    name,
                )
            fails = sum(
                future.result()
                for future in concat(
                    file_futures.values(), dir_futures.values()
                )
            )

    if fails:
        if download:
            raise DownloadError(fails)
        raise UploadError(fails)

    return len(dir_plans[0]) + len(file_plans[0])