def _get_chunks(self, download, remote, status_info, status, jobs):
    title = "Analysing status."

    progress.set_n_total(1)
    total = len(status_info)
    current = 0

    cache = []
    path_infos = []
    names = []
    for md5, info in status_info.items():
        if info["status"] == status:
            cache.append(self.checksum_to_path_info(md5))
            path_infos.append(remote.checksum_to_path_info(md5))
            names.append(info["name"])
        current += 1
        progress.update_target(title, current, total)

    progress.finish_target(title)

    progress.set_n_total(len(names))

    if download:
        to_infos = cache
        from_infos = path_infos
    else:
        to_infos = path_infos
        from_infos = cache

    return list(
        zip(
            to_chunks(from_infos, jobs),
            to_chunks(to_infos, jobs),
            to_chunks(names, jobs),
        )
    )
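# NOTE: `to_chunks` is not shown in these excerpts. The call sites above
# (chunking several parallel lists by `jobs` and sizing the thread pool with
# `len(chunks)`) suggest a helper that splits a list into at most `jobs`
# roughly equal slices. The sketch below illustrates that assumed behavior;
# it is not the library's actual implementation.
import math


def to_chunks(lst, jobs):
    if not lst:
        return []
    # Ceiling division so that no more than `jobs` chunks are produced.
    size = int(math.ceil(len(lst) / float(jobs)))
    return [lst[i:i + size] for i in range(0, len(lst), size)]


# e.g. to_chunks([1, 2, 3, 4, 5], jobs=2) -> [[1, 2, 3], [4, 5]]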
def status(self, checksum_infos, remote, jobs=None, show_checksums=False):
    logger.info("Preparing to collect status from {}".format(remote.url))
    title = "Collecting information"

    ret = {}

    progress.set_n_total(1)
    progress.update_target(title, 0, 100)

    progress.update_target(title, 10, 100)

    ret = self._group(checksum_infos, show_checksums=show_checksums)
    md5s = list(ret.keys())

    progress.update_target(title, 30, 100)

    remote_exists = list(remote.cache_exists(md5s))

    progress.update_target(title, 90, 100)

    local_exists = self.cache_exists(md5s)

    progress.finish_target(title)

    for md5, info in ret.items():
        info["status"] = STATUS_MAP[
            (md5 in local_exists, md5 in remote_exists)
        ]

    return ret
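# NOTE: STATUS_MAP is not defined in these excerpts. Since it is indexed by a
# (local_exists, remote_exists) tuple here and below, it is presumably a
# lookup table along the lines of this sketch; the constant names and values
# are assumptions for illustration only.
STATUS_OK = 1
STATUS_MISSING = 2
STATUS_NEW = 3
STATUS_DELETED = 4

STATUS_MAP = {
    # (local_exists, remote_exists)
    (True, True): STATUS_OK,
    (False, False): STATUS_MISSING,
    (True, False): STATUS_NEW,
    (False, True): STATUS_DELETED,
}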
def status(self, checksum_infos, remote, jobs=None, show_checksums=False):
    Logger.info("Preparing to pull data from {}".format(remote.url))

    title = "Collecting information"

    progress.set_n_total(1)
    progress.update_target(title, 0, 100)

    checksum_infos, missing = self._collect(checksum_infos)
    checksum_infos += missing

    progress.update_target(title, 10, 100)

    md5s, names = self._group(checksum_infos, show_checksums=show_checksums)

    progress.update_target(title, 20, 100)

    path_infos = remote.md5s_to_path_infos(md5s)

    progress.update_target(title, 30, 100)

    remote_exists = remote.exists(path_infos)

    progress.update_target(title, 90, 100)

    local_exists = [not self.changed_cache_file(md5) for md5 in md5s]

    progress.finish_target(title)

    return [
        (name, STATUS_MAP[(l, r)])
        for name, l, r in zip(names, local_exists, remote_exists)
    ]
def test_progress_awareness(self, mocker, capsys, caplog):
    from dvc.progress import progress

    with mocker.patch("sys.stdout.isatty", return_value=True):
        progress.set_n_total(100)
        progress.update_target("progress", 1, 10)

        # logging an invisible message should not break
        # the progress bar output
        with caplog.at_level(logging.INFO, logger="dvc"):
            debug_record = logging.LogRecord(
                name="dvc",
                level=logging.DEBUG,
                pathname=__name__,
                lineno=1,
                msg="debug",
                args=(),
                exc_info=None,
            )

            formatter.format(debug_record)
            captured = capsys.readouterr()
            assert "\n" not in captured.out

        # only when the message is actually visible
        with caplog.at_level(logging.INFO, logger="dvc"):
            logger.info("some info")
            captured = capsys.readouterr()
            assert "\n" in captured.out
def _do_pull(self, checksum_infos, remote, jobs=1, no_progress_bar=False):
    md5s = [info[self.PARAM_MD5] for info in checksum_infos]

    # NOTE: filter files that are not corrupted
    md5s = list(filter(lambda md5: self.changed(md5), md5s))

    cache = [{'scheme': 'local', 'path': self.get(md5)} for md5 in md5s]
    path_infos = remote.md5s_to_path_infos(md5s)

    assert len(path_infos) == len(cache) == len(md5s)

    chunks = list(
        zip(to_chunks(path_infos, jobs),
            to_chunks(cache, jobs),
            to_chunks(md5s, jobs)))

    progress.set_n_total(len(md5s))

    if len(chunks) == 0:
        return

    futures = []
    with ThreadPoolExecutor(max_workers=len(chunks)) as executor:
        for from_infos, to_infos, md5s in chunks:
            res = executor.submit(remote.download,
                                  from_infos,
                                  to_infos,
                                  names=md5s,
                                  no_progress_bar=no_progress_bar)
            futures.append(res)

    for f in futures:
        f.result()
def push(self, checksum_infos, remote, jobs=1):
    md5s = [
        info[self.PARAM_MD5] for info in self._collect(checksum_infos)[0]
    ]

    # NOTE: verifying that our cache is not corrupted
    md5s = list(filter(lambda md5: not self.changed(md5), md5s))

    # NOTE: filter files that are already uploaded
    path_infos = remote.md5s_to_path_infos(md5s)
    md5s_exist = filter(lambda x: not x[1],
                        list(zip(md5s, remote.exists(path_infos))))
    md5s = [md5 for md5, exists in md5s_exist]

    cache = [self.get(md5) for md5 in md5s]
    path_infos = remote.md5s_to_path_infos(md5s)

    assert len(path_infos) == len(cache) == len(md5s)

    chunks = list(
        zip(to_chunks(path_infos, jobs),
            to_chunks(cache, jobs),
            to_chunks(md5s, jobs)))

    progress.set_n_total(len(md5s))

    if len(chunks) == 0:
        return

    with ThreadPoolExecutor(max_workers=len(chunks)) as executor:
        for path_infos, paths, md5s in chunks:
            executor.submit(remote.upload, paths, path_infos, names=md5s)
def status(self, checksum_infos, remote, jobs=None, show_checksums=False):
    logger.info("Preparing to collect status from {}".format(remote.url))
    title = "Collecting information"

    ret = {}

    progress.set_n_total(1)
    progress.update_target(title, 0, 100)

    progress.update_target(title, 10, 100)

    ret = self._group(checksum_infos, show_checksums=show_checksums)
    md5s = list(ret.keys())

    progress.update_target(title, 30, 100)

    remote_exists = list(remote.cache_exists(md5s))

    progress.update_target(title, 90, 100)

    local_exists = self.cache_exists(md5s)

    progress.finish_target(title)

    self._fill_statuses(ret, local_exists, remote_exists)

    self._log_missing_caches(ret)

    return ret
def _do_pull(self, checksum_infos, remote, jobs=1, show_checksums=False):
    title = "Collecting information"

    progress.set_n_total(1)
    progress.update_target(title, 0, 100)

    grouped = zip(
        *self._group(checksum_infos, show_checksums=show_checksums))

    progress.update_target(title, 10, 100)

    md5s = []
    names = []
    # NOTE: filter files that are not corrupted
    for md5, name in grouped:
        if self.changed_cache(md5):
            md5s.append(md5)
            names.append(name)

    progress.update_target(title, 30, 100)

    cache = [{'scheme': 'local', 'path': self.get(md5)} for md5 in md5s]

    progress.update_target(title, 50, 100)

    path_infos = remote.md5s_to_path_infos(md5s)

    progress.update_target(title, 60, 100)

    # NOTE: dummy call to try to establish a connection
    # to see if we need to ask user for a password.
    remote.exists(remote.md5s_to_path_infos(['000']))

    progress.update_target(title, 70, 100)

    assert len(path_infos) == len(cache) == len(md5s) == len(names)

    chunks = list(
        zip(to_chunks(path_infos, jobs),
            to_chunks(cache, jobs),
            to_chunks(names, jobs)))

    progress.finish_target(title)

    progress.set_n_total(len(names))

    if len(chunks) == 0:
        return

    futures = []
    with ThreadPoolExecutor(max_workers=len(chunks)) as executor:
        for from_infos, to_infos, names in chunks:
            res = executor.submit(remote.download,
                                  from_infos,
                                  to_infos,
                                  names=names)
            futures.append(res)

    for f in futures:
        f.result()
def map_progress(func, targets, n_threads):
    """
    Process targets in multi-threaded mode with progress bar
    """
    progress.set_n_total(len(targets))
    p = ThreadPool(processes=n_threads)

    try:
        p.map(func, targets)
    except Exception as exc:
        Logger.error('Unexpected exception while processing targets: '
                     '{}'.format(exc))
    finally:
        progress.finish()
def map_progress(func, targets, n_threads):
    """
    Process targets in multi-threaded mode with progress bar
    """
    progress.set_n_total(len(targets))
    pool = ThreadPool(processes=n_threads)
    ret = []

    wrapper = lambda t: wrap(func, t)

    try:
        ret = pool.map(wrapper, targets)
    except Exception as exc:
        raise

    return list(zip(targets, ret))
def map_progress(func, targets, n_threads):
    """
    Process targets in multi-threaded mode with progress bar
    """
    progress.set_n_total(len(targets))
    pool = ThreadPool(processes=n_threads)
    ret = []

    try:
        ret = pool.map(func, targets)
    except Exception as exc:
        raise
    finally:
        progress.finish()

    return list(zip(targets, ret))
def status(
    self,
    checksum_infos,
    remote,
    jobs=None,
    show_checksums=False,
    download=False,
):
    logger.info(
        "Preparing to collect status from {}".format(remote.path_info)
    )
    title = "Collecting information"

    ret = {}

    progress.set_n_total(1)
    progress.update_target(title, 0, 100)

    progress.update_target(title, 10, 100)

    ret = self._group(checksum_infos, show_checksums=show_checksums)
    md5s = list(ret)

    progress.update_target(title, 30, 100)

    local_exists = self.cache_exists(md5s)

    progress.update_target(title, 40, 100)

    # This is a performance optimization. We can safely assume that,
    # if the resources that we want to fetch are already cached,
    # there's no need to check the remote storage for the existence of
    # those files.
    if download and sorted(local_exists) == sorted(md5s):
        remote_exists = local_exists
    else:
        remote_exists = list(remote.cache_exists(md5s))

    progress.update_target(title, 90, 100)

    progress.finish_target(title)

    self._fill_statuses(ret, local_exists, remote_exists)

    self._log_missing_caches(ret)

    return ret
def push(self, checksum_infos, remote, jobs=1, show_checksums=False):
    checksum_infos = self._collect(checksum_infos)[0]

    # NOTE: verifying that our cache is not corrupted
    def func(info):
        return not self.changed(info[self.PARAM_MD5])

    checksum_infos = list(filter(func, checksum_infos))

    # NOTE: filter files that are already uploaded
    md5s = [i[self.PARAM_MD5] for i in checksum_infos]
    exists = remote.exists(remote.md5s_to_path_infos(md5s))

    def func(entry):
        return not entry[0]

    assert len(exists) == len(checksum_infos)
    infos_exist = list(filter(func, zip(exists, checksum_infos)))
    checksum_infos = [i for e, i in infos_exist]

    md5s, names = self._group(checksum_infos, show_checksums=show_checksums)
    cache = [{'scheme': 'local', 'path': self.get(md5)} for md5 in md5s]
    path_infos = remote.md5s_to_path_infos(md5s)

    assert len(path_infos) == len(cache) == len(md5s) == len(names)

    chunks = list(zip(to_chunks(path_infos, jobs),
                      to_chunks(cache, jobs),
                      to_chunks(names, jobs)))

    progress.set_n_total(len(names))

    if len(chunks) == 0:
        return

    futures = []
    with ThreadPoolExecutor(max_workers=len(chunks)) as executor:
        for to_infos, from_infos, names in chunks:
            res = executor.submit(remote.upload,
                                  from_infos,
                                  to_infos,
                                  names=names)
            futures.append(res)

    for f in futures:
        f.result()
def map_progress(func, targets, n_threads):
    """
    Process targets in multi-threaded mode with progress bar
    """
    progress.set_n_total(len(targets))
    pool = ThreadPool(processes=n_threads)
    ret = []

    try:
        ret = pool.map(func, targets)
    except Exception as exc:
        Logger.error(
            'Unexpected exception while processing targets: {}'.format(exc),
            exc_info=True)
    finally:
        progress.finish()

    return list(zip(targets, ret))
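# NOTE: a hypothetical usage sketch for `map_progress` as defined above.
# `process_one` and the target names are illustrative only; the import path
# for the shared progress bar is taken from the test excerpt above.
from dvc.progress import progress


def process_one(target):
    # Real per-target work (download/upload/checksum) would happen here.
    progress.finish_target(target)
    return target


# Processes targets in a thread pool and returns [(target, result), ...].
results = map_progress(process_one, ["data/a.bin", "data/b.bin"], n_threads=2)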
def _do_pull(self,
             checksum_infos,
             remote,
             jobs=1,
             no_progress_bar=False,
             show_checksums=False):
    md5s = []
    names = []
    # NOTE: filter files that are not corrupted
    for md5, name in zip(*self._group(checksum_infos,
                                      show_checksums=show_checksums)):
        if self.changed(md5):
            md5s.append(md5)
            names.append(name)

    cache = [{'scheme': 'local', 'path': self.get(md5)} for md5 in md5s]
    path_infos = remote.md5s_to_path_infos(md5s)

    assert len(path_infos) == len(cache) == len(md5s) == len(names)

    chunks = list(zip(to_chunks(path_infos, jobs),
                      to_chunks(cache, jobs),
                      to_chunks(names, jobs)))

    progress.set_n_total(len(names))

    if len(chunks) == 0:
        return

    futures = []
    with ThreadPoolExecutor(max_workers=len(chunks)) as executor:
        for from_infos, to_infos, names in chunks:
            res = executor.submit(remote.download,
                                  from_infos,
                                  to_infos,
                                  names=names,
                                  no_progress_bar=no_progress_bar)
            futures.append(res)

    for f in futures:
        f.result()
def push(self, checksum_infos, remote, jobs=1):
    checksum_infos = self._collect(checksum_infos)[0]
    md5s = [info[self.PARAM_MD5] for info in checksum_infos]

    # NOTE: verifying that our cache is not corrupted
    md5s = list(filter(lambda md5: not self.changed(md5), md5s))

    # NOTE: filter files that are already uploaded
    path_infos = remote.md5s_to_path_infos(md5s)
    lexists = remote.exists(path_infos)
    md5s_exist = filter(lambda x: not x[1], list(zip(md5s, lexists)))
    md5s = [md5 for md5, exists in md5s_exist]

    cache = [{'scheme': 'local', 'path': self.get(md5)} for md5 in md5s]
    path_infos = remote.md5s_to_path_infos(md5s)

    assert len(path_infos) == len(cache) == len(md5s)

    chunks = list(
        zip(to_chunks(path_infos, jobs),
            to_chunks(cache, jobs),
            to_chunks(md5s, jobs)))

    progress.set_n_total(len(md5s))

    if len(chunks) == 0:
        return

    futures = []
    with ThreadPoolExecutor(max_workers=len(chunks)) as executor:
        for to_infos, from_infos, md5s in chunks:
            res = executor.submit(remote.upload,
                                  from_infos,
                                  to_infos,
                                  names=md5s)
            futures.append(res)

    for f in futures:
        f.result()
def push(self, checksum_infos, remote, jobs=None, show_checksums=False):
    Logger.info("Preparing to push data to {}".format(remote.url))

    title = "Collecting information"

    progress.set_n_total(1)
    progress.update_target(title, 0, 100)

    checksum_infos = self._collect(checksum_infos)[0]

    progress.update_target(title, 10, 100)

    # NOTE: verifying that our cache is not corrupted
    def func(info):
        return not self.changed_cache_file(info[self.PARAM_MD5])

    checksum_infos = list(filter(func, checksum_infos))

    progress.update_target(title, 20, 100)

    # NOTE: filter files that are already uploaded
    md5s = [i[self.PARAM_MD5] for i in checksum_infos]
    exists = remote.exists(remote.md5s_to_path_infos(md5s))

    progress.update_target(title, 30, 100)

    def func(entry):
        return not entry[0]

    assert len(exists) == len(checksum_infos)
    infos_exist = list(filter(func, zip(exists, checksum_infos)))
    checksum_infos = [i for e, i in infos_exist]

    progress.update_target(title, 70, 100)

    md5s, names = self._group(checksum_infos,
                              show_checksums=show_checksums)
    cache = [{'scheme': 'local', 'path': self.get(md5)} for md5 in md5s]

    progress.update_target(title, 80, 100)

    path_infos = remote.md5s_to_path_infos(md5s)

    assert len(path_infos) == len(cache) == len(md5s) == len(names)

    progress.update_target(title, 90, 100)

    if jobs is None:
        jobs = remote.JOBS

    chunks = list(
        zip(to_chunks(path_infos, jobs),
            to_chunks(cache, jobs),
            to_chunks(names, jobs)))

    progress.finish_target(title)

    progress.set_n_total(len(names))

    if len(chunks) == 0:
        return

    futures = []
    with ThreadPoolExecutor(max_workers=len(chunks)) as executor:
        for to_infos, from_infos, names in chunks:
            res = executor.submit(remote.upload,
                                  from_infos,
                                  to_infos,
                                  names=names)
            futures.append(res)

    for f in futures:
        f.result()