def checkout(self, path_info, checksum_info):
    if path_info['scheme'] != 'ssh':
        raise NotImplementedError

    md5 = checksum_info.get(self.PARAM_MD5, None)
    if not md5:
        return

    if not self.changed(path_info, checksum_info):
        msg = "Data '{}' didn't change."
        logger.info(msg.format(self.to_string(path_info)))
        return

    if self.changed_cache(md5):
        msg = "Cache '{}' not found. File '{}' won't be created."
        logger.warn(msg.format(md5, self.to_string(path_info)))
        return

    if self.exists([path_info])[0]:
        msg = "Data '{}' exists. Removing before checkout."
        logger.warn(msg.format(self.to_string(path_info)))
        self.remove(path_info)
        return

    msg = "Checking out '{}' with cache '{}'."
    logger.info(msg.format(self.to_string(path_info), md5))

    src = path_info.copy()
    src['path'] = posixpath.join(self.prefix, md5[0:2], md5[2:])

    self.cp(src, path_info)
def status(self, checksum_infos, remote, jobs=None, show_checksums=False):
    logger.info("Preparing to pull data from {}".format(remote.url))

    title = "Collecting information"

    progress.set_n_total(1)
    progress.update_target(title, 0, 100)

    progress.update_target(title, 10, 100)

    md5s, names = self._group(checksum_infos,
                              show_checksums=show_checksums)

    progress.update_target(title, 20, 100)

    path_infos = remote.md5s_to_path_infos(md5s)

    progress.update_target(title, 30, 100)

    remote_exists = remote.exists(path_infos)

    progress.update_target(title, 90, 100)

    local_exists = [not self.changed_cache_file(md5) for md5 in md5s]

    progress.finish_target(title)

    return [(name, STATUS_MAP[l, r]) for name, l, r in
            zip(names, local_exists, remote_exists)]
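
# NOTE: STATUS_MAP above is keyed by the (local_exists, remote_exists) pair,
# so each name's status is derived purely from whether its cache file is
# currently present in the local cache and/or on the remote, e.g. (True, False)
# means the file exists locally but is missing from the remote.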
def _save_dir(self, path_info):
    path = path_info['path']
    md5, dir_info = self.state.update_info(path)
    dir_relpath = os.path.relpath(path)
    dir_size = len(dir_info)
    bar = dir_size > LARGE_DIR_SIZE

    logger.info("Linking directory '{}'.".format(dir_relpath))

    for processed, entry in enumerate(dir_info):
        relpath = entry[self.PARAM_RELPATH]
        m = entry[self.PARAM_MD5]
        p = os.path.join(path, relpath)
        c = self.get(m)

        if self.changed_cache(m):
            self._move(p, c)
        else:
            remove(p)

        self.link(c, p)

        if bar:
            progress.update_target(dir_relpath, processed, dir_size)

    self.state.update_link(path)

    if bar:
        progress.finish_target(dir_relpath)

    return {self.PARAM_MD5: md5}
def checkout(self, path_info, checksum_info):
    if path_info['scheme'] != 's3':
        raise NotImplementedError

    etag = checksum_info.get(self.PARAM_ETAG, None)
    if not etag:
        return

    if not self.changed(path_info, checksum_info):
        msg = "Data '{}' didn't change."
        logger.info(msg.format(self.to_string(path_info)))
        return

    if self.changed_cache(etag):
        msg = "Cache '{}' not found. File '{}' won't be created."
        logger.warn(msg.format(etag, self.to_string(path_info)))
        return

    if self.exists([path_info])[0]:
        msg = "Data '{}' exists. Removing before checkout."
        logger.warn(msg.format(self.to_string(path_info)))
        self.remove(path_info)
        return

    msg = "Checking out '{}' with cache '{}'."
    logger.info(msg.format(self.to_string(path_info), etag))

    key = posixpath.join(self.prefix, etag[0:2], etag[2:])
    from_info = {'scheme': 's3', 'bucket': self.bucket, 'key': key}

    self._copy(from_info, path_info)
def checkout(self, path_info, checksum_info):
    if path_info['scheme'] != 'hdfs':
        raise NotImplementedError

    assert path_info.get('url')

    checksum = checksum_info.get(self.PARAM_CHECKSUM, None)
    if not checksum:
        return

    if not self.changed(path_info, checksum_info):
        msg = "Data '{}' didn't change."
        logger.info(msg.format(self.to_string(path_info)))
        return

    if self.changed_cache(checksum):
        msg = "Cache '{}' not found. File '{}' won't be created."
        logger.warn(msg.format(checksum, self.to_string(path_info)))
        return

    if self.exists([path_info])[0]:
        msg = "Data '{}' exists. Removing before checkout."
        logger.warn(msg.format(self.to_string(path_info)))
        self.remove(path_info)
        return

    msg = "Checking out '{}' with cache '{}'."
    logger.info(msg.format(self.to_string(path_info), checksum))

    src = path_info.copy()
    src['url'] = posixpath.join(self.url, checksum[0:2], checksum[2:])

    self.cp(src, path_info)
def loads(project=None, cmd=None, deps=[], outs=[], outs_no_cache=[],
          metrics_no_cache=[], fname=None, cwd=os.curdir, locked=False,
          add=False, overwrite=True, ignore_build_cache=False,
          remove_outs=False):
    stage = Stage(project=project, cwd=cwd, cmd=cmd, locked=locked)

    stage.outs = output.loads_from(stage, outs, use_cache=True)
    stage.outs += output.loads_from(stage, outs_no_cache, use_cache=False)
    stage.outs += output.loads_from(stage, metrics_no_cache,
                                    use_cache=False, metric=True)
    stage.deps = dependency.loads_from(stage, deps)

    if fname is not None and os.path.basename(fname) != fname:
        msg = "Stage file name '{}' should not contain subdirectories. " \
              "Use '-c|--cwd' to change location of the stage file."
        raise StageFileBadNameError(msg.format(fname))

    fname, cwd = Stage._stage_fname_cwd(fname, cwd, stage.outs, add=add)

    Stage._check_inside_project(project, cwd)

    cwd = os.path.abspath(cwd)
    path = os.path.join(cwd, fname)

    stage.cwd = cwd
    stage.path = path

    # NOTE: remove outs before we check build cache
    if remove_outs:
        stage.remove_outs(ignore_remove=False)
        project.logger.warn("Build cache is ignored when using "
                            "--remove-outs.")
        ignore_build_cache = True
    else:
        stage.unprotect_outs()

    if os.path.exists(path):
        if not ignore_build_cache and stage.is_cached():
            logger.info('Stage is cached, skipping.')
            return None

        msg = "'{}' already exists. Do you wish to run the command and " \
              "overwrite it?".format(stage.relpath)

        if not overwrite and not project.prompt.prompt(msg, False):
            raise StageFileAlreadyExistsError(stage.relpath)

    return stage
def test_stdout(self, mock_stdout, mock_stderr):
    logger = Logger(force=True)

    non_error_message = 'non-error message'

    logger.info(non_error_message)

    self.assertEqual('', mock_stderr.getvalue())
    self.assertEqual('{}\n'.format(non_error_message),
                     mock_stdout.getvalue())
def init(root_dir=os.curdir, no_scm=False, force=False):
    """
    Initialize dvc project in directory.

    Args:
        root_dir: Path to project's root directory.

    Returns:
        Project instance.

    Raises:
        InitError: Thrown if the directory is not tracked by any supported
            scm tool, or if the project is already initialized and force
            was not specified.
    """
    import colorama
    import shutil

    from dvc.scm import SCM, Base
    from dvc.config import Config
    from dvc.logger import logger

    root_dir = os.path.abspath(root_dir)
    dvc_dir = os.path.join(root_dir, Project.DVC_DIR)

    scm = SCM(root_dir)
    if type(scm) == Base and not no_scm:
        msg = "{} is not tracked by any supported scm tool (e.g. git)."
        raise InitError(msg.format(root_dir))

    if os.path.isdir(dvc_dir):
        if not force:
            msg = "'{}' exists. Use '-f' to force."
            raise InitError(msg.format(os.path.relpath(dvc_dir)))
        shutil.rmtree(dvc_dir)

    os.mkdir(dvc_dir)

    config = Config.init(dvc_dir)
    proj = Project(root_dir)

    scm.add([config.config_file])
    if scm.ignore_file():
        scm.add([os.path.join(dvc_dir, scm.ignore_file())])

    logger.info('\nYou can now commit the changes to git.')
    logger.info(
        "\n"
        "{yellow}What's next?{nc}\n"
        "{yellow}------------{nc}\n"
        "- Check out the documentation: {blue}https://dvc.org/doc{nc}\n"
        "- Get help and share ideas: {blue}https://dvc.org/chat{nc}\n"
        "- Star us on GitHub: {blue}https://github.com/iterative/dvc{nc}"
        .format(yellow=colorama.Fore.YELLOW,
                blue=colorama.Fore.BLUE,
                green=colorama.Fore.GREEN,
                nc=colorama.Fore.RESET))

    return proj
def show(self, config, section, opt):
    if section not in config.keys():
        raise ConfigError("Section '{}' doesn't exist".format(section))

    if opt not in config[section].keys():
        raise ConfigError("Option '{}.{}' doesn't exist".format(section,
                                                                opt))

    logger.info(config[section][opt])
def run(self):
    for section in self.configobj.keys():
        r = re.match(Config.SECTION_REMOTE_REGEX, section)
        if r:
            name = r.group('name')
            url = self.configobj[section].get(Config.SECTION_REMOTE_URL,
                                              '')
            logger.info('{}\t{}'.format(name, url))
    return 0
def run(self):
    section = Config.SECTION_REMOTE_FMT.format(self.args.name)
    ret = self._set(section, Config.SECTION_REMOTE_URL, self.args.url)
    if ret != 0:
        return ret

    if self.args.default:
        msg = "Setting '{}' as a default remote.".format(self.args.name)
        logger.info(msg)
        ret = self._set(Config.SECTION_CORE,
                        Config.SECTION_CORE_REMOTE,
                        self.args.name)

    return ret
def dump(self, fname=None):
    if not fname:
        fname = self.path

    self._check_dvc_filename(fname)

    msg = "Saving information to '{}'.".format(os.path.relpath(fname))
    logger.info(msg)

    with open(fname, 'w') as fd:
        yaml.safe_dump(self.dumpd(), fd, default_flow_style=False)

    self.project._files_to_git_add.append(os.path.relpath(fname))
def save(self, path_info):
    if path_info['scheme'] != 'local':
        raise NotImplementedError

    path = path_info['path']

    msg = "Saving '{}' to cache '{}'."
    logger.info(msg.format(os.path.relpath(path),
                           os.path.relpath(self.cache_dir)))

    if os.path.isdir(path):
        return self._save_dir(path_info)
    else:
        return self._save_file(path_info)
def collect_dir_cache(self, dname):
    dir_info = []

    for root, dirs, files in os.walk(dname):
        bar = False

        if len(files) > LARGE_DIR_SIZE:
            msg = "Computing md5 for a large directory {}. " \
                  "This is only done once."
            logger.info(msg.format(os.path.relpath(root)))
            bar = True
            title = os.path.relpath(root)
            processed = 0
            total = len(files)
            progress.update_target(title, 0, total)

        for fname in files:
            path = os.path.join(root, fname)
            relpath = self.unixpath(os.path.relpath(path, dname))

            if bar:
                progress.update_target(title, processed, total)
                processed += 1

            md5 = self.state.update(path)
            dir_info.append({self.PARAM_RELPATH: relpath,
                             self.PARAM_MD5: md5})

        if bar:
            progress.finish_target(title)

    # NOTE: sorting the list by path to ensure reproducibility
    dir_info = sorted(dir_info, key=itemgetter(self.PARAM_RELPATH))

    md5 = dict_md5(dir_info) + self.MD5_DIR_SUFFIX
    if self.changed_cache(md5):
        self.dump_dir_cache(md5, dir_info)

    return (md5, dir_info)
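
# NOTE: each dir_info entry collected above is a small dict of the form
# {self.PARAM_RELPATH: <unix-style path relative to dname>,
#  self.PARAM_MD5: <md5 of that file>}. The list is sorted by relpath before
# dict_md5() hashes it, so the directory checksum does not depend on
# os.walk() ordering, and MD5_DIR_SUFFIX marks the result as a directory
# checksum rather than a plain file checksum.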
def file_md5(fname):
    """ get the (md5 hexdigest, md5 digest) of a file """
    if os.path.exists(fname):
        hash_md5 = hashlib.md5()
        binary = not istextfile(fname)
        size = os.path.getsize(fname)
        bar = False
        if size >= LARGE_FILE_SIZE:
            bar = True
            msg = "Computing md5 for a large file {}. This is only done once."
            logger.info(msg.format(os.path.relpath(fname)))
            name = os.path.relpath(fname)
            total = 0

        with open(fname, 'rb') as fobj:
            while True:
                data = fobj.read(LOCAL_CHUNK_SIZE)
                if not data:
                    break

                if bar:
                    total += len(data)
                    progress.update_target(name, total, size)

                if binary:
                    chunk = data
                else:
                    chunk = dos2unix(data)

                hash_md5.update(chunk)

        if bar:
            progress.finish_target(name)

        return (hash_md5.hexdigest(), hash_md5.digest())
    else:
        return (None, None)
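
# A minimal usage sketch for file_md5 (illustrative only; the helper below and
# the example path are assumptions added for documentation, not part of the
# original module).
def _example_file_md5():
    path = 'example.txt'  # hypothetical file used only for this sketch
    with open(path, 'wb') as fobj:
        fobj.write(b'hello world\n')

    hexdigest, digest = file_md5(path)

    # Text files are normalized with dos2unix before hashing, so the resulting
    # hexdigest is stable across CRLF/LF line-ending conventions.
    print(hexdigest)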
def ignore(self, path):
    entry, gitignore = self._get_gitignore(path)

    ignore_list = []
    if os.path.exists(gitignore):
        ignore_list = open(gitignore, 'r').readlines()
        filtered = list(filter(lambda x: x.strip() == entry.strip(),
                               ignore_list))
        if len(filtered) != 0:
            return

    msg = "Adding '{}' to '{}'.".format(os.path.relpath(path),
                                        os.path.relpath(gitignore))
    logger.info(msg)

    content = entry
    if len(ignore_list) > 0:
        content = '\n' + content

    with open(gitignore, 'a') as fd:
        fd.write(content)

    if self.project is not None:
        self.project._files_to_git_add.append(os.path.relpath(gitignore))
def _check_dvc_file(fname):
    sname = fname + Stage.STAGE_FILE_SUFFIX
    if Stage.is_stage_file(sname):
        logger.info("Do you mean '{}'?".format(sname))
def checkout(self, path_info, checksum_info, force=False):
    path = path_info['path']
    md5 = checksum_info.get(self.PARAM_MD5)
    cache = self.get(md5)

    if not cache:
        msg = "No cache info for '{}'. Skipping checkout."
        logger.warn(msg.format(os.path.relpath(path)))
        return

    if not self.changed(path_info, checksum_info):
        msg = "Data '{}' didn't change."
        logger.info(msg.format(os.path.relpath(path)))
        return

    if self.changed_cache(md5):
        msg = "Cache '{}' not found. File '{}' won't be created."
        logger.warn(msg.format(md5, os.path.relpath(path)))
        remove(path)
        return

    msg = "Checking out '{}' with cache '{}'."
    logger.info(msg.format(os.path.relpath(path), md5))

    if not self.is_dir_cache(cache):
        if os.path.exists(path):
            if force or self._already_cached(path):
                remove(path)
            else:
                self._safe_remove(path)

        self.link(cache, path)
        self.state.update_link(path)
        return

    # Create dir separately so that dir is created
    # even if there are no files in it
    if not os.path.exists(path):
        os.makedirs(path)

    dir_info = self.load_dir_cache(md5)
    dir_relpath = os.path.relpath(path)
    dir_size = len(dir_info)
    bar = dir_size > LARGE_DIR_SIZE

    logger.info("Linking directory '{}'.".format(dir_relpath))

    for processed, entry in enumerate(dir_info):
        relpath = entry[self.PARAM_RELPATH]
        m = entry[self.PARAM_MD5]
        p = os.path.join(path, relpath)
        c = self.get(m)

        entry_info = {'scheme': path_info['scheme'],
                      self.PARAM_PATH: p}

        entry_checksum_info = {self.PARAM_MD5: m}

        if self.changed(entry_info, entry_checksum_info):
            if os.path.exists(p):
                if force or self._already_cached(p):
                    remove(p)
                else:
                    self._safe_remove(p)

            self.link(c, p)

        if bar:
            progress.update_target(dir_relpath, processed, dir_size)

    self._discard_working_directory_changes(path, dir_info, force=force)

    self.state.update_link(path)

    if bar:
        progress.finish_target(dir_relpath)
def push(self, checksum_infos, remote, jobs=None, show_checksums=False):
    logger.info("Preparing to push data to {}".format(remote.url))

    title = "Collecting information"

    progress.set_n_total(1)
    progress.update_target(title, 0, 100)

    # NOTE: verifying that our cache is not corrupted
    def func(info):
        return not self.changed_cache_file(info[self.PARAM_MD5])
    checksum_infos = list(filter(func, checksum_infos))

    progress.update_target(title, 20, 100)

    # NOTE: filter files that are already uploaded
    md5s = [i[self.PARAM_MD5] for i in checksum_infos]
    exists = remote.exists(remote.md5s_to_path_infos(md5s))

    progress.update_target(title, 30, 100)

    def func(entry):
        return not entry[0]

    assert len(exists) == len(checksum_infos)
    infos_exist = list(filter(func, zip(exists, checksum_infos)))
    checksum_infos = [i for e, i in infos_exist]

    progress.update_target(title, 70, 100)

    md5s, names = self._group(checksum_infos,
                              show_checksums=show_checksums)
    cache = [{'scheme': 'local', 'path': self.get(md5)} for md5 in md5s]

    progress.update_target(title, 80, 100)

    path_infos = remote.md5s_to_path_infos(md5s)

    assert len(path_infos) == len(cache) == len(md5s) == len(names)

    progress.update_target(title, 90, 100)

    if jobs is None:
        jobs = remote.JOBS

    chunks = list(zip(to_chunks(path_infos, jobs),
                      to_chunks(cache, jobs),
                      to_chunks(names, jobs)))

    progress.finish_target(title)

    progress.set_n_total(len(names))

    if len(chunks) == 0:
        return

    futures = []
    with ThreadPoolExecutor(max_workers=len(chunks)) as executor:
        for to_infos, from_infos, names in chunks:
            res = executor.submit(remote.upload,
                                  from_infos,
                                  to_infos,
                                  names=names)
            futures.append(res)

    for f in futures:
        f.result()
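
# NOTE: the uploads above are split into at most `jobs` chunks (remote.JOBS by
# default) and each chunk is submitted to its own worker thread; calling
# f.result() afterwards re-raises any exception from a failed upload so
# failures are not silently swallowed.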