def __init__(self, root_dir):
    self.root_dir = os.path.abspath(os.path.realpath(root_dir))
    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

    self.config = Config(self.dvc_dir)
    self.scm = SCM(self.root_dir)
    self.lock = Lock(self.dvc_dir)
    self.link_state = LinkState(self.root_dir, self.dvc_dir)
    self.logger = Logger(self.config._config[Config.SECTION_CORE].get(
        Config.SECTION_CORE_LOGLEVEL, None))
    self.cache = Cache(self)
    self.cloud = DataCloud(self, config=self.config._config)
    self.updater = Updater(self.dvc_dir)

    self._ignore()

    self.updater.check()
class Project(object):
    DVC_DIR = '.dvc'

    def __init__(self, root_dir):
        from dvc.logger import Logger
        from dvc.config import Config
        from dvc.state import LinkState, State
        from dvc.lock import Lock
        from dvc.scm import SCM
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.updater import Updater
        from dvc.prompt import Prompt

        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)
        self.scm = SCM(self.root_dir)
        self.lock = Lock(self.dvc_dir)
        # NOTE: storing state and link_state in the repository itself to
        # avoid any possible state corruption in the 'shared cache dir'
        # scenario.
        self.state = State(self)
        self.link_state = LinkState(self)

        core = self.config._config[Config.SECTION_CORE]
        self.logger = Logger(core.get(Config.SECTION_CORE_LOGLEVEL, None))

        self.cache = Cache(self)
        self.cloud = DataCloud(self, config=self.config._config)
        self.updater = Updater(self.dvc_dir)
        self.prompt = Prompt()

        self._ignore()

        self.updater.check()

    @staticmethod
    def init(root_dir=os.curdir, no_scm=False, force=False):
        """Initialize a dvc project in the given directory.

        Args:
            root_dir: Path to the project's root directory.

        Returns:
            Project instance.

        Raises:
            InitError: If the directory is not tracked by a supported scm
                tool, or if a dvc directory already exists and force is not
                set.
        """
        import shutil
        from dvc.scm import SCM, Base
        from dvc.config import Config

        root_dir = os.path.abspath(root_dir)
        dvc_dir = os.path.join(root_dir, Project.DVC_DIR)

        scm = SCM(root_dir)
        if type(scm) == Base and not no_scm:
            msg = "{} is not tracked by any supported scm tool (e.g. git)."
            raise InitError(msg.format(root_dir))

        if os.path.isdir(dvc_dir):
            if not force:
                msg = "'{}' exists. Use '-f' to force."
                raise InitError(msg.format(os.path.relpath(dvc_dir)))
            shutil.rmtree(dvc_dir)

        os.mkdir(dvc_dir)

        config = Config.init(dvc_dir)
        proj = Project(root_dir)

        scm.add([config.config_file])
        if scm.ignore_file():
            scm.add([os.path.join(dvc_dir, scm.ignore_file())])

        return proj

    def destroy(self):
        import shutil

        for stage in self.stages():
            stage.remove()

        shutil.rmtree(self.dvc_dir)

    def _ignore(self):
        flist = [self.state.state_file,
                 self.lock.lock_file,
                 self.config.config_local_file,
                 self.updater.updater_file]

        if self.cache.local.cache_dir.startswith(self.root_dir):
            flist += [self.cache.local.cache_dir]

        self.scm.ignore_list(flist)

    def install(self):
        self.scm.install()

    def to_dvc_path(self, path):
        return os.path.relpath(path, self.root_dir)

    def _check_output_duplication(self, outs):
        from dvc.exceptions import OutputDuplicationError

        for stage in self.stages():
            for o in stage.outs:
                for out in outs:
                    if o.path == out.path and o.stage.path != out.stage.path:
                        stages = [o.stage.relpath, out.stage.relpath]
                        raise OutputDuplicationError(o.path, stages)

    def add(self, fname, recursive=False):
        fnames = []
        if recursive and os.path.isdir(fname):
            for root, dirs, files in os.walk(fname):
                for f in files:
                    path = os.path.join(root, f)
                    if Stage.is_stage_file(path):
                        continue
                    if os.path.basename(path) == self.scm.ignore_file():
                        continue
                    if self.scm.is_tracked(path):
                        continue
                    fnames.append(path)
        else:
            fnames = [fname]

        stages = []
        for f in fnames:
            stage = Stage.loads(project=self, outs=[f], add=True)
            self._check_output_duplication(stage.outs)
            stage.save()
            stage.dump()
            stages.append(stage)

        return stages

    def remove(self, target, outs_only=False):
        stage = Stage.load(self, target)
        if outs_only:
            stage.remove_outs()
        else:
            stage.remove()

        return stage

    def lock_stage(self, target, unlock=False):
        stage = Stage.load(self, target)
        stage.locked = False if unlock else True
        stage.dump()

        return stage

    def move(self, from_path, to_path):
        import dvc.output as Output

        from_out = Output.loads_from(Stage(self, cwd=os.curdir),
                                     [from_path])[0]

        found = False
        for stage in self.stages():
            for out in stage.outs:
                if out.path != from_out.path:
                    continue

                if not stage.is_data_source:
                    msg = 'Dvcfile \'{}\' is not a data source.'
                    raise DvcException(msg.format(stage.rel_path))

                found = True
                to_out = Output.loads_from(out.stage,
                                           [to_path],
                                           out.cache,
                                           out.metric)[0]
                out.move(to_out)

                # NOTE: strip the stage-file suffix exactly once;
                # str.rstrip() would strip any trailing characters that
                # happen to be in the suffix's character set.
                stage_base = os.path.basename(stage.path)
                if stage_base.endswith(Stage.STAGE_FILE_SUFFIX):
                    stage_base = stage_base[:-len(Stage.STAGE_FILE_SUFFIX)]

                stage_dir = os.path.dirname(stage.path)
                from_base = os.path.basename(from_path)
                to_base = os.path.basename(to_path)
                if stage_base == from_base:
                    os.unlink(stage.path)
                    path = to_base + Stage.STAGE_FILE_SUFFIX
                    stage.path = os.path.join(stage_dir, path)

                stage.dump()

        if not found:
            msg = 'Unable to find dvcfile with output \'{}\''
            raise DvcException(msg.format(from_path))

    def run(self,
            cmd=None,
            deps=[],
            outs=[],
            outs_no_cache=[],
            metrics_no_cache=[],
            fname=Stage.STAGE_FILE,
            cwd=os.curdir,
            no_exec=False,
            overwrite=False):
        stage = Stage.loads(project=self,
                            fname=fname,
                            cmd=cmd,
                            cwd=cwd,
                            outs=outs,
                            outs_no_cache=outs_no_cache,
                            metrics_no_cache=metrics_no_cache,
                            deps=deps,
                            overwrite=overwrite)

        self._check_output_duplication(stage.outs)

        if not no_exec:
            stage.run()
        stage.dump()
        return stage

    def imp(self, url, out):
        stage = Stage.loads(project=self,
                            cmd=None,
                            deps=[url],
                            outs=[out])

        self._check_output_duplication(stage.outs)

        stage.run()
        stage.dump()
        return stage

    def _reproduce_stage(self, stages, node, force, dry, interactive):
        stage = stages[node]

        if stage.locked:
            msg = 'DVC file \'{}\' is locked. Its dependencies are not ' \
                  'going to be reproduced.'
            self.logger.warn(msg.format(stage.relpath))

        stage = stage.reproduce(force=force, dry=dry, interactive=interactive)
        if not stage:
            return []

        if not dry:
            stage.dump()
        return [stage]

    def reproduce(self,
                  target,
                  recursive=True,
                  force=False,
                  dry=False,
                  interactive=False):
        import networkx as nx

        stage = Stage.load(self, target)
        G = self.graph()[1]
        stages = nx.get_node_attributes(G, 'stage')
        node = os.path.relpath(stage.path, self.root_dir)

        if not interactive:
            config = self.config
            core = config._config[config.SECTION_CORE]
            interactive = core.get(config.SECTION_CORE_INTERACTIVE, False)

        if recursive:
            return self._reproduce_stages(G, stages, node, force, dry,
                                          interactive)

        return self._reproduce_stage(stages, node, force, dry, interactive)

    def _reproduce_stages(self, G, stages, node, force, dry, interactive):
        import networkx as nx

        result = []
        for n in nx.dfs_postorder_nodes(G, node):
            try:
                result += self._reproduce_stage(stages, n, force, dry,
                                                interactive)
            except Exception as ex:
                raise ReproductionError(stages[n].relpath, ex)
        return result

    def _cleanup_unused_links(self, all_stages):
        used = []
        for stage in all_stages:
            for out in stage.outs:
                used.append(out.path)
        self.link_state.remove_unused(used)

    def checkout(self, target=None):
        all_stages = self.active_stages()

        if target:
            stages = [Stage.load(self, target)]
        else:
            stages = all_stages

        self._cleanup_unused_links(all_stages)

        for stage in stages:
            if stage.locked:
                msg = 'DVC file \'{}\' is locked. Its dependencies are not ' \
                      'going to be checked out.'
                self.logger.warn(msg.format(stage.relpath))

            stage.checkout()

    def _used_cache(self, target=None, all_branches=False, active=True):
        cache = {}
        cache['local'] = []
        cache['s3'] = []
        cache['gs'] = []
        cache['hdfs'] = []
        cache['ssh'] = []

        for branch in self.scm.brancher(all_branches=all_branches):
            if target:
                stages = [Stage.load(self, target)]
            elif active:
                stages = self.active_stages()
            else:
                stages = self.stages()

            for stage in stages:
                if active and not target and stage.locked:
                    msg = 'DVC file \'{}\' is locked. Its dependencies ' \
                          'are not going to be pushed/pulled/fetched.'
                    self.logger.warn(msg.format(stage.relpath))

                for out in stage.outs:
                    if not out.use_cache or not out.info:
                        continue

                    info = out.dumpd()
                    info['branch'] = branch

                    cache[out.path_info['scheme']] += [info]

        return cache

    def _do_gc(self, typ, func, clist):
        removed = func(clist)
        if not removed:
            self.logger.info("No unused {} cache to remove.".format(typ))

    def gc(self, all_branches=False, cloud=False, remote=None):
        clist = self._used_cache(target=None,
                                 all_branches=all_branches,
                                 active=False)

        self._do_gc('local', self.cache.local.gc, clist)

        if self.cache.s3:
            self._do_gc('s3', self.cache.s3.gc, clist)

        if self.cache.gs:
            self._do_gc('gs', self.cache.gs.gc, clist)

        if self.cache.ssh:
            self._do_gc('ssh', self.cache.ssh.gc, clist)

        if self.cache.hdfs:
            self._do_gc('hdfs', self.cache.hdfs.gc, clist)

        if self.cache.azure:
            self._do_gc('azure', self.cache.azure.gc, clist)

        if cloud:
            self._do_gc('remote',
                        self.cloud._get_cloud(remote, 'gc -c').gc,
                        clist)

    def push(self,
             target=None,
             jobs=1,
             remote=None,
             all_branches=False,
             show_checksums=False):
        self.cloud.push(self._used_cache(target, all_branches)['local'],
                        jobs,
                        remote=remote,
                        show_checksums=show_checksums)

    def fetch(self,
              target=None,
              jobs=1,
              remote=None,
              all_branches=False,
              show_checksums=False):
        self.cloud.pull(self._used_cache(target, all_branches)['local'],
                        jobs,
                        remote=remote,
                        show_checksums=show_checksums)

    def pull(self,
             target=None,
             jobs=1,
             remote=None,
             all_branches=False,
             show_checksums=False):
        self.fetch(target,
                   jobs,
                   remote=remote,
                   all_branches=all_branches,
                   show_checksums=show_checksums)
        self.checkout(target=target)

    def _local_status(self, target=None):
        status = {}

        if target:
            stages = [Stage.load(self, target)]
        else:
            stages = self.active_stages()

        for stage in stages:
            if stage.locked:
                msg = 'DVC file \'{}\' is locked. Its dependencies are not ' \
                      'going to be shown in the status output.'
                self.logger.warn(msg.format(stage.relpath))

            status.update(stage.status())

        return status

    def _cloud_status(self,
                      target=None,
                      jobs=1,
                      remote=None,
                      show_checksums=False):
        import dvc.remote.base as cloud

        status = {}
        for md5, ret in self.cloud.status(self._used_cache(target)['local'],
                                          jobs,
                                          remote=remote,
                                          show_checksums=show_checksums):
            if ret == cloud.STATUS_OK:
                continue

            prefix_map = {
                cloud.STATUS_DELETED: 'deleted',
                cloud.STATUS_NEW: 'new',
            }

            status[md5] = prefix_map[ret]

        return status

    def status(self,
               target=None,
               jobs=1,
               cloud=False,
               remote=None,
               show_checksums=False):
        if cloud:
            return self._cloud_status(target,
                                      jobs,
                                      remote=remote,
                                      show_checksums=show_checksums)
        return self._local_status(target)

    def _read_metric_json(self, fd, json_path):
        import json
        from jsonpath_rw import parse

        parser = parse(json_path)
        return [x.value for x in parser.find(json.load(fd))]

    def _do_read_metric_xsv(self, reader, row, col):
        if col is not None and row is not None:
            return [reader[row][col]]
        elif col is not None:
            return [r[col] for r in reader]
        elif row is not None:
            return reader[row]
        return None

    def _read_metric_hxsv(self, fd, hxsv_path, delimiter):
        import csv

        col, row = hxsv_path.split(',')
        row = int(row)
        reader = list(csv.DictReader(fd, delimiter=delimiter))
        return self._do_read_metric_xsv(reader, row, col)

    def _read_metric_xsv(self, fd, xsv_path, delimiter):
        import csv

        col, row = xsv_path.split(',')
        row = int(row)
        col = int(col)
        reader = list(csv.reader(fd, delimiter=delimiter))
        return self._do_read_metric_xsv(reader, row, col)

    def _read_metric(self, path, typ=None, xpath=None):
        ret = None

        if not os.path.exists(path):
            return ret

        try:
            with open(path, 'r') as fd:
                if typ == 'json':
                    ret = self._read_metric_json(fd, xpath)
                elif typ == 'csv':
                    ret = self._read_metric_xsv(fd, xpath, ',')
                elif typ == 'tsv':
                    ret = self._read_metric_xsv(fd, xpath, '\t')
                elif typ == 'hcsv':
                    ret = self._read_metric_hxsv(fd, xpath, ',')
                elif typ == 'htsv':
                    ret = self._read_metric_hxsv(fd, xpath, '\t')
                else:
                    ret = fd.read()
        except Exception as exc:
            self.logger.error('Unable to read metric in \'{}\''.format(path),
                              exc)

        return ret

    def _find_output_by_path(self, path, outs=None):
        from dvc.exceptions import OutputDuplicationError

        if not outs:
            astages = self.active_stages()
            outs = [out for stage in astages for out in stage.outs]

        abs_path = os.path.abspath(path)
        matched = [out for out in outs if out.path == abs_path]
        stages = [out.stage.relpath for out in matched]
        if len(stages) > 1:
            raise OutputDuplicationError(path, stages)

        return matched[0] if matched else None

    def metrics_show(self,
                     path=None,
                     typ=None,
                     xpath=None,
                     all_branches=False):
        res = {}
        for branch in self.scm.brancher(all_branches=all_branches):
            astages = self.active_stages()
            outs = [out for stage in astages for out in stage.outs]

            if path:
                out = self._find_output_by_path(path, outs=outs)
                stage = out.stage.path if out else None
                if out and all([out.metric,
                                not typ,
                                isinstance(out.metric, dict)]):
                    entries = [(path,
                                out.metric.get(out.PARAM_METRIC_TYPE, None),
                                out.metric.get(out.PARAM_METRIC_XPATH, None))]
                else:
                    entries = [(path, typ, xpath)]
            else:
                metrics = filter(lambda o: o.metric, outs)
                stage = None
                entries = []
                for o in metrics:
                    if not typ and isinstance(o.metric, dict):
                        t = o.metric.get(o.PARAM_METRIC_TYPE, typ)
                        x = o.metric.get(o.PARAM_METRIC_XPATH, xpath)
                    else:
                        t = typ
                        x = xpath
                    entries.append((o.path, t, x))

            for fname, t, x in entries:
                if stage:
                    self.checkout(stage)

                rel = os.path.relpath(fname)
                metric = self._read_metric(fname, typ=t, xpath=x)
                if not metric:
                    continue

                if branch not in res:
                    res[branch] = {}

                res[branch][rel] = metric

        for branch, val in res.items():
            if all_branches:
                self.logger.info('{}:'.format(branch))
            for fname, metric in val.items():
                self.logger.info('\t{}: {}'.format(fname, metric))

        if res:
            return res

        if path:
            msg = 'File \'{}\' does not exist'.format(path)
        else:
            msg = 'No metric files in this repository. ' \
                  'Use \'dvc metrics add\' to add a metric file to track.'
        raise DvcException(msg)

    def _metrics_modify(self, path, typ=None, xpath=None, delete=False):
        out = self._find_output_by_path(path)
        if not out:
            msg = 'Unable to find file \'{}\' in the pipeline'.format(path)
            raise DvcException(msg)

        if out.path_info['scheme'] != 'local':
            msg = 'Output \'{}\' scheme \'{}\' is not supported for metrics'
            raise DvcException(msg.format(out.path, out.path_info['scheme']))

        if out.use_cache:
            msg = 'Cached output \'{}\' is not supported for metrics'
            raise DvcException(msg.format(out.rel_path))

        if typ:
            if not isinstance(out.metric, dict):
                out.metric = {}
            out.metric[out.PARAM_METRIC_TYPE] = typ

        if xpath:
            if not isinstance(out.metric, dict):
                out.metric = {}
            out.metric[out.PARAM_METRIC_XPATH] = xpath

        if delete:
            out.metric = None

        out._verify_metric()

        out.stage.dump()

    def metrics_modify(self, path=None, typ=None, xpath=None):
        self._metrics_modify(path, typ, xpath)

    def metrics_add(self, path, typ=None, xpath=None):
        if not typ:
            typ = 'raw'
        self._metrics_modify(path, typ, xpath)

    def metrics_remove(self, path):
        self._metrics_modify(path, delete=True)

    def graph(self):
        import networkx as nx
        from dvc.exceptions import OutputDuplicationError

        G = nx.DiGraph()
        G_active = nx.DiGraph()
        stages = self.stages()

        outs = []
        outs_by_path = {}
        for stage in stages:
            for o in stage.outs:
                existing = outs_by_path.get(o.path, None)
                if existing is not None:
                    stages = [o.stage.relpath, existing.stage.relpath]
                    raise OutputDuplicationError(o.path, stages)
                outs.append(o)
                outs_by_path[o.path] = o

        # collect the whole DAG
        for stage in stages:
            node = os.path.relpath(stage.path, self.root_dir)

            G.add_node(node, stage=stage)
            G_active.add_node(node, stage=stage)

            for dep in stage.deps:
                for out in outs:
                    if out.path != dep.path \
                       and not dep.path.startswith(out.path + out.sep):
                        continue

                    dep_stage = out.stage
                    dep_node = os.path.relpath(dep_stage.path, self.root_dir)
                    G.add_node(dep_node, stage=dep_stage)
                    G.add_edge(node, dep_node)
                    if not stage.locked:
                        G_active.add_node(dep_node, stage=dep_stage)
                        G_active.add_edge(node, dep_node)

        return G, G_active

    def pipelines(self):
        import networkx as nx

        G, G_active = self.graph()

        if len(G.nodes()) == 0:
            return []

        # find pipeline ends aka "output stages"
        ends = [node for node, in_degree in G.in_degree() if in_degree == 0]

        # filter out subgraphs that didn't exist in the original G
        pipelines = []
        for c in nx.weakly_connected_components(G_active):
            H = G_active.subgraph(c)
            found = False
            for node in ends:
                if node in H:
                    found = True
                    break
            if found:
                pipelines.append(H)

        return pipelines

    def stages(self):
        stages = []
        outs = []
        for root, dirs, files in os.walk(self.root_dir):
            for fname in files:
                path = os.path.join(root, fname)
                if not Stage.is_stage_file(path):
                    continue
                stage = Stage.load(self, path)
                for out in stage.outs:
                    outs.append(out.path + out.sep)
                stages.append(stage)

            def filter_dirs(dname):
                path = os.path.join(root, dname)
                if path == self.dvc_dir or path == self.scm.dir:
                    return False
                for out in outs:
                    if path == os.path.normpath(out) or path.startswith(out):
                        return False
                return True

            dirs[:] = list(filter(filter_dirs, dirs))

        return stages

    def active_stages(self):
        import networkx as nx

        stages = []
        for G in self.pipelines():
            stages.extend(list(nx.get_node_attributes(G, 'stage').values()))
        return stages
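# --- Illustrative usage sketch (not part of the original source). A minimal
# walk through the Project API above, assuming it is importable as
# dvc.project.Project and that we are inside a git work tree; all file and
# stage names below are hypothetical.
from dvc.project import Project

proj = Project.init('.', no_scm=False, force=False)  # creates .dvc/
proj.add('data.csv')                  # track a file; writes data.csv.dvc
proj.run(cmd='python train.py',       # create and execute a pipeline stage
         deps=['data.csv', 'train.py'],
         outs=['model.pkl'],
         fname='train.dvc')
proj.reproduce('train.dvc', force=True)  # rebuild dependencies post-order
proj.push(jobs=4)                        # upload used cache to the remote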
class Project(object):
    DVC_DIR = '.dvc'

    def __init__(self, root_dir):
        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)
        self.scm = SCM(self.root_dir)
        self.lock = Lock(self.dvc_dir)
        # NOTE: storing state and link_state in the repository itself to
        # avoid any possible state corruption in the 'shared cache dir'
        # scenario.
        self.state = State(self)
        self.link_state = LinkState(self)
        self.logger = Logger(self.config._config[Config.SECTION_CORE].get(
            Config.SECTION_CORE_LOGLEVEL, None))
        self.cache = Cache(self)
        self.cloud = DataCloud(self, config=self.config._config)
        self.updater = Updater(self.dvc_dir)

        self._ignore()

        self.updater.check()

    @staticmethod
    def init(root_dir=os.curdir, no_scm=False):
        """Initialize a dvc project in the given directory.

        Args:
            root_dir: Path to the project's root directory.

        Returns:
            Project instance.

        Raises:
            InitError: If the directory is not tracked by a supported scm
                tool.
        """
        root_dir = os.path.abspath(root_dir)
        dvc_dir = os.path.join(root_dir, Project.DVC_DIR)

        scm = SCM(root_dir)
        if type(scm) == Base and not no_scm:
            msg = "{} is not tracked by any supported scm tool " \
                  "(e.g. git).".format(root_dir)
            raise InitError(msg)

        os.mkdir(dvc_dir)

        config = Config.init(dvc_dir)
        proj = Project(root_dir)

        scm.add([config.config_file])
        if scm.ignore_file():
            scm.add([os.path.join(dvc_dir, scm.ignore_file())])

        return proj

    def _ignore(self):
        flist = [self.link_state.state_file,
                 self.link_state._lock_file.lock_file,
                 self.lock.lock_file,
                 self.config.config_local_file,
                 self.updater.updater_file]

        if self.cache.local.cache_dir.startswith(self.root_dir):
            flist += [self.cache.local.cache_dir]

        self.scm.ignore_list(flist)

    def install(self):
        self.scm.install()

    def to_dvc_path(self, path):
        return os.path.relpath(path, self.root_dir)

    def add(self, fname):
        out = os.path.basename(os.path.normpath(fname))
        stage_fname = out + Stage.STAGE_FILE_SUFFIX
        cwd = os.path.dirname(os.path.abspath(fname))
        stage = Stage.loads(project=self,
                            cmd=None,
                            deps=[],
                            outs=[out],
                            fname=stage_fname,
                            cwd=cwd)

        stage.save()
        stage.dump()
        return stage

    def remove(self, target, outs_only=False):
        if not Stage.is_stage_file(target):
            raise NotDvcFileError(target)

        stage = Stage.load(self, target)
        if outs_only:
            stage.remove_outs()
        else:
            stage.remove()

        return stage

    def lock_stage(self, target, unlock=False):
        if not Stage.is_stage_file(target):
            raise NotDvcFileError(target)

        stage = Stage.load(self, target)
        stage.locked = False if unlock else True
        stage.dump()

        return stage

    def move(self, from_path, to_path):
        from_out = Output.loads_from(Stage(self, cwd=os.curdir),
                                     [from_path])[0]

        found = False
        for stage in self.stages():
            for out in stage.outs:
                if out.path != from_out.path:
                    continue

                if not stage.is_data_source:
                    raise DvcException(
                        'Dvcfile \'{}\' is not a data source.'.format(
                            stage.rel_path))

                found = True
                to_out = Output.loads_from(out.stage,
                                           [to_path],
                                           out.cache,
                                           out.metric)[0]
                out.move(to_out)

                # NOTE: strip the stage-file suffix exactly once;
                # str.rstrip() would strip any trailing characters that
                # happen to be in the suffix's character set.
                stage_base = os.path.basename(stage.path)
                if stage_base.endswith(Stage.STAGE_FILE_SUFFIX):
                    stage_base = stage_base[:-len(Stage.STAGE_FILE_SUFFIX)]

                stage_dir = os.path.dirname(stage.path)
                from_base = os.path.basename(from_path)
                to_base = os.path.basename(to_path)
                if stage_base == from_base:
                    os.unlink(stage.path)
                    stage.path = os.path.join(
                        stage_dir, to_base + Stage.STAGE_FILE_SUFFIX)

                stage.dump()

        if not found:
            raise DvcException(
                'Unable to find dvcfile with output \'{}\''.format(from_path))

    def run(self,
            cmd=None,
            deps=[],
            outs=[],
            outs_no_cache=[],
            metrics_no_cache=[],
            fname=Stage.STAGE_FILE,
            cwd=os.curdir,
            no_exec=False):
        stage = Stage.loads(project=self,
                            fname=fname,
                            cmd=cmd,
                            cwd=cwd,
                            outs=outs,
                            outs_no_cache=outs_no_cache,
                            metrics_no_cache=metrics_no_cache,
                            deps=deps)
        if not no_exec:
            stage.run()
        stage.dump()
        return stage

    def imp(self, url, out):
        stage_fname = out + Stage.STAGE_FILE_SUFFIX
        cwd = os.path.dirname(os.path.abspath(out))
        stage = Stage.loads(project=self,
                            cmd=None,
                            deps=[url],
                            outs=[out],
                            fname=stage_fname,
                            cwd=cwd)

        stage.run()
        stage.dump()
        return stage

    def _reproduce_stage(self, stages, node, force):
        stage = stages[node].reproduce(force=force)
        if not stage:
            return []
        stage.dump()
        return [stage]

    def reproduce(self, target, recursive=True, force=False):
        stages = nx.get_node_attributes(self.graph(), 'stage')
        node = os.path.relpath(os.path.abspath(target), self.root_dir)
        if node not in stages:
            raise NotDvcFileError(target)

        if recursive:
            return self._reproduce_stages(stages, node, force)

        return self._reproduce_stage(stages, node, force)

    def _reproduce_stages(self, stages, node, force):
        result = []
        for n in nx.dfs_postorder_nodes(self.graph(), node):
            try:
                result += self._reproduce_stage(stages, n, force)
            except Exception as ex:
                raise ReproductionError(stages[n].relpath, ex)
        return result

    def _cleanup_unused_links(self, all_stages):
        used = []
        for stage in all_stages:
            for out in stage.outs:
                used.append(out.path)
        self.link_state.remove_unused(used)

    def checkout(self, target=None):
        all_stages = self.stages()

        if target:
            if not Stage.is_stage_file(target):
                raise NotDvcFileError(target)
            stages = [Stage.load(self, target)]
        else:
            stages = all_stages

        self._cleanup_unused_links(all_stages)

        for stage in stages:
            stage.checkout()

    def _used_cache(self, target=None, all_branches=False):
        cache = {}
        cache['local'] = []
        cache['s3'] = []
        cache['gs'] = []
        cache['hdfs'] = []
        cache['ssh'] = []

        for branch in self.scm.brancher(all_branches=all_branches):
            if target:
                stages = [Stage.load(self, target)]
            else:
                stages = self.stages()

            for stage in stages:
                for out in stage.outs:
                    if not out.use_cache or not out.info:
                        continue

                    cache[out.path_info['scheme']] += [out.info]

        return cache

    def gc(self, all_branches=False):
        clist = self._used_cache(target=None, all_branches=all_branches)
        self.cache.local.gc(clist['local'])

        if self.cache.s3:
            self.cache.s3.gc(clist['s3'])

        if self.cache.gs:
            self.cache.gs.gc(clist['gs'])

        if self.cache.ssh:
            self.cache.ssh.gc(clist['ssh'])

        if self.cache.hdfs:
            self.cache.hdfs.gc(clist['hdfs'])

    def push(self, target=None, jobs=1, remote=None, all_branches=False):
        self.cloud.push(self._used_cache(target, all_branches)['local'],
                        jobs,
                        remote=remote)

    def fetch(self, target=None, jobs=1, remote=None, all_branches=False):
        self.cloud.pull(self._used_cache(target, all_branches)['local'],
                        jobs,
                        remote=remote)

    def pull(self, target=None, jobs=1, remote=None, all_branches=False):
        self.fetch(target, jobs, remote=remote, all_branches=all_branches)
        self.checkout(target=target)

    def _local_status(self, target=None):
        status = {}

        if target:
            stages = [Stage.load(self, target)]
        else:
            stages = self.stages()

        for stage in stages:
            status.update(stage.status())

        return status

    def _cloud_status(self, target=None, jobs=1, remote=None):
        import dvc.remote.base as cloud

        status = {}
        for md5, ret in self.cloud.status(self._used_cache(target)['local'],
                                          jobs,
                                          remote=remote):
            if ret == cloud.STATUS_OK:
                continue

            prefix_map = {
                cloud.STATUS_DELETED: 'deleted',
                cloud.STATUS_NEW: 'new',
            }

            status[md5] = prefix_map[ret]

        return status

    def status(self, target=None, jobs=1, cloud=False, remote=None):
        if cloud:
            return self._cloud_status(target, jobs, remote=remote)
        return self._local_status(target)

    def _read_metric_json(self, fd, json_path):
        parser = parse(json_path)
        return [x.value for x in parser.find(json.load(fd))]

    def _do_read_metric_tsv(self, reader, row, col):
        if col is not None and row is not None:
            return [reader[row][col]]
        elif col is not None:
            return [r[col] for r in reader]
        elif row is not None:
            return reader[row]
        return None

    def _read_metric_htsv(self, fd, htsv_path):
        col, row = htsv_path.split(',')
        row = int(row)
        reader = list(csv.DictReader(fd, delimiter='\t'))
        return self._do_read_metric_tsv(reader, row, col)

    def _read_metric_tsv(self, fd, tsv_path):
        col, row = tsv_path.split(',')
        row = int(row)
        col = int(col)
        reader = list(csv.reader(fd, delimiter='\t'))
        return self._do_read_metric_tsv(reader, row, col)

    def _read_metric(self, path, json_path=None, tsv_path=None,
                     htsv_path=None):
        ret = None

        if not os.path.exists(path):
            return ret

        try:
            with open(path, 'r') as fd:
                if json_path:
                    ret = self._read_metric_json(fd, json_path)
                elif tsv_path:
                    ret = self._read_metric_tsv(fd, tsv_path)
                elif htsv_path:
                    ret = self._read_metric_htsv(fd, htsv_path)
                else:
                    ret = fd.read()
        except Exception as exc:
            self.logger.debug('Unable to read metric in \'{}\''.format(path),
                              exc)

        return ret

    def metrics_show(self, path=None, json_path=None, tsv_path=None,
                     htsv_path=None, all_branches=False):
        res = {}
        for branch in self.scm.brancher(all_branches=all_branches):
            metrics = filter(lambda o: o.metric, self.outs())
            fnames = [path] if path else map(lambda o: o.path, metrics)
            for fname in fnames:
                rel = os.path.relpath(fname)
                metric = self._read_metric(fname,
                                           json_path=json_path,
                                           tsv_path=tsv_path,
                                           htsv_path=htsv_path)
                if not metric:
                    continue

                if branch not in res:
                    res[branch] = {}

                res[branch][rel] = metric

        for branch, val in res.items():
            self.logger.info('{}:'.format(branch))
            for fname, metric in val.items():
                self.logger.info('\t{}: {}'.format(fname, metric))

        return res

    def _metrics_modify(self, path, val):
        apath = os.path.abspath(path)
        for stage in self.stages():
            for out in stage.outs:
                if apath != out.path:
                    continue

                if out.path_info['scheme'] != 'local':
                    msg = 'Output \'{}\' scheme \'{}\' is not supported ' \
                          'for metrics'
                    raise DvcException(msg.format(out.path,
                                                  out.path_info['scheme']))

                if out.use_cache:
                    msg = 'Cached output \'{}\' is not supported for metrics'
                    raise DvcException(msg.format(out.rel_path))

                out.metric = val

            stage.dump()

    def metrics_add(self, path):
        self._metrics_modify(path, True)

    def metrics_remove(self, path):
        self._metrics_modify(path, False)

    def graph(self):
        G = nx.DiGraph()
        stages = self.stages()
        outs = self.outs()

        for stage in stages:
            node = os.path.relpath(stage.path, self.root_dir)
            G.add_node(node, stage=stage)

            if stage.locked:
                continue

            for dep in stage.deps:
                for out in outs:
                    if out.path != dep.path \
                       and not dep.path.startswith(out.path + out.sep):
                        continue

                    dep_stage = out.stage
                    dep_node = os.path.relpath(dep_stage.path, self.root_dir)
                    G.add_node(dep_node, stage=dep_stage)
                    G.add_edge(node, dep_node)

        return G

    def stages(self):
        stages = []
        for root, dirs, files in os.walk(self.root_dir):
            for fname in files:
                path = os.path.join(root, fname)
                if not Stage.is_stage_file(path):
                    continue
                stages.append(Stage.load(self, path))
        return stages

    def outs(self):
        outs = []
        for stage in self.stages():
            outs += stage.outs
        return outs
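# --- Illustrative sketch (not part of the original source). The tsv/htsv
# readers above address a single cell as 'col,row': for the headed variant
# the column is a header name and the row is an integer index into the data
# rows. A self-contained demonstration of that addressing on an in-memory
# TSV; the sample data is made up.
import csv
import io

fd = io.StringIO('auc\tloss\n0.92\t0.11\n0.94\t0.09\n')
col, row = 'auc,1'.split(',')
reader = list(csv.DictReader(fd, delimiter='\t'))
print([reader[int(row)][col]])  # ['0.94'] -- header 'auc', data row 1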
class Cache(object):
    CACHE_DIR = 'cache'
    CACHE_DIR_LOCK = 'cache.lock'
    CACHE_TYPES = ['reflink', 'hardlink', 'symlink', 'copy']
    CACHE_TYPE_MAP = {
        'copy': shutil.copyfile,
        'symlink': System.symlink,
        'hardlink': System.hardlink,
        'reflink': System.reflink,
    }

    def __init__(self, root_dir, dvc_dir, cache_dir=None, cache_type=None):
        self.cache_type = cache_type

        cache_dir = cache_dir if cache_dir else self.CACHE_DIR
        if os.path.isabs(cache_dir):
            self.cache_dir = cache_dir
        else:
            self.cache_dir = os.path.abspath(
                os.path.realpath(os.path.join(dvc_dir, cache_dir)))

        if not os.path.exists(self.cache_dir):
            os.mkdir(self.cache_dir)

        self.state = State(self.cache_dir)
        self.link_state = LinkState(root_dir, dvc_dir)
        self.lock = Lock(self.cache_dir, name=self.CACHE_DIR_LOCK)

    @staticmethod
    def init(root_dir, dvc_dir, cache_dir=None):
        # NOTE: pass the requested cache_dir through instead of always None.
        return Cache(root_dir, dvc_dir, cache_dir=cache_dir)

    def all(self):
        with self.lock:
            clist = []
            for entry in os.listdir(self.cache_dir):
                subdir = os.path.join(self.cache_dir, entry)
                if not os.path.isdir(subdir):
                    continue

                for cache in os.listdir(subdir):
                    path = os.path.join(subdir, cache)
                    clist.append(path)

            return clist

    def get(self, md5):
        if not md5:
            return None
        return os.path.join(self.cache_dir, md5[0:2], md5[2:])

    def path_to_md5(self, path):
        relpath = os.path.relpath(path, self.cache_dir)
        return os.path.dirname(relpath) + os.path.basename(relpath)

    def _changed(self, md5):
        cache = self.get(md5)
        if self.state.changed(cache, md5=md5):
            if os.path.exists(cache):
                Logger.warn('Corrupted cache file {}'.format(
                    os.path.relpath(cache)))
                remove(cache)
            return True
        return False

    def changed(self, md5):
        with self.lock:
            return self._changed(md5)

    def link(self, src, link):
        dname = os.path.dirname(link)
        if not os.path.exists(dname):
            os.makedirs(dname)

        if self.cache_type is not None:
            types = [self.cache_type]
        else:
            types = self.CACHE_TYPES

        for typ in types:
            try:
                self.CACHE_TYPE_MAP[typ](src, link)
                self.link_state.update(link)
                return
            except Exception as exc:
                msg = 'Cache type \'{}\' is not supported'.format(typ)
                Logger.debug(msg)
                if typ == types[-1]:
                    raise DvcException(msg, cause=exc)

    @staticmethod
    def load_dir_cache(path):
        if os.path.isabs(path):
            relpath = os.path.relpath(path)
        else:
            relpath = path

        try:
            with open(path, 'r') as fd:
                d = json.load(fd)
        except Exception as exc:
            msg = u'Failed to load dir cache \'{}\''
            Logger.error(msg.format(relpath), exc)
            return []

        if not isinstance(d, list):
            msg = u'Dir cache file format error \'{}\': skipping the file'
            Logger.error(msg.format(relpath))
            return []

        return d

    @staticmethod
    def get_dir_cache(path):
        res = {}
        d = Cache.load_dir_cache(path)
        for entry in d:
            res[entry[State.PARAM_RELPATH]] = entry[State.PARAM_MD5]
        return res

    def dir_cache(self, cache):
        res = {}
        dir_cache = self.get_dir_cache(cache)
        for relpath, md5 in dir_cache.items():
            res[relpath] = self.get(md5)
        return res

    @staticmethod
    def is_dir_cache(cache):
        return cache.endswith(State.MD5_DIR_SUFFIX)

    def _checkout(self, path, md5):
        cache = self.get(md5)

        if not cache or not os.path.exists(cache) or self._changed(md5):
            if cache:
                Logger.warn(u'\'{}({})\': cache file not found'.format(
                    os.path.relpath(cache), os.path.relpath(path)))
            remove(path)
            return

        if os.path.exists(path):
            msg = u'Data \'{}\' exists. Removing before checkout'
            Logger.debug(msg.format(os.path.relpath(path)))
            remove(path)

        msg = u'Checking out \'{}\' with cache \'{}\''
        Logger.debug(msg.format(os.path.relpath(path),
                                os.path.relpath(cache)))

        if not self.is_dir_cache(cache):
            self.link(cache, path)
            return

        dir_cache = self.dir_cache(cache)
        for relpath, c in dir_cache.items():
            p = os.path.join(path, relpath)
            self.link(c, p)

    def checkout(self, path, md5):
        with self.lock:
            return self._checkout(path, md5)

    def _save_file(self, path):
        md5 = self.state.update(path)
        cache = self.get(md5)
        if self._changed(md5):
            move(path, cache)
            self.state.update(cache)
        self._checkout(path, md5)

    def _save_dir(self, path):
        md5 = self.state.update(path)
        cache = self.get(md5)
        dname = os.path.dirname(cache)
        dir_info = self.state.collect_dir(path)

        for entry in dir_info:
            relpath = entry[State.PARAM_RELPATH]
            p = os.path.join(path, relpath)
            self._save_file(p)

        if not os.path.isdir(dname):
            os.makedirs(dname)

        with open(cache, 'w+') as fd:
            json.dump(dir_info, fd, sort_keys=True)

    def save(self, path):
        with self.lock:
            if os.path.isdir(path):
                self._save_dir(path)
            else:
                self._save_file(path)
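# --- Illustrative usage sketch (not part of the original source). A minimal
# sketch of how this Cache is driven, assuming an existing .dvc directory;
# the paths below are hypothetical. save() moves the file into the content-
# addressed cache and links it back; checkout() re-links it from the cache.
cache = Cache('/repo', '/repo/.dvc', cache_type='hardlink')

cache.save('/repo/data.csv')             # move into cache, link back in place
md5 = cache.state.update('/repo/data.csv')
print(cache.get(md5))                    # e.g. /repo/.dvc/cache/xx/yyyy...

cache.checkout('/repo/data.csv', md5)    # restore the link if it went missing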
class Project(object):
    DVC_DIR = '.dvc'

    def __init__(self, root_dir):
        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)
        self.scm = SCM(self.root_dir)
        self.lock = Lock(self.dvc_dir)
        self.link_state = LinkState(self.root_dir, self.dvc_dir)
        self.logger = Logger(self.config._config[Config.SECTION_CORE].get(
            Config.SECTION_CORE_LOGLEVEL, None))
        self.cache = Cache(self)
        self.cloud = DataCloud(cache=self.cache, config=self.config._config)
        self.updater = Updater(self.dvc_dir)

        self._ignore()

        self.updater.check()

    @staticmethod
    def init(root_dir=os.curdir, no_scm=False):
        """Initialize a dvc project in the given directory.

        Args:
            root_dir: Path to the project's root directory.

        Returns:
            Project instance.

        Raises:
            InitError: If the directory is not tracked by a supported scm
                tool.
        """
        root_dir = os.path.abspath(root_dir)
        dvc_dir = os.path.join(root_dir, Project.DVC_DIR)

        scm = SCM(root_dir)
        if type(scm) == Base and not no_scm:
            msg = "{} is not tracked by any supported scm tool " \
                  "(e.g. git).".format(root_dir)
            raise InitError(msg)

        os.mkdir(dvc_dir)

        config = Config.init(dvc_dir)
        proj = Project(root_dir)

        scm.add([config.config_file])
        if scm.ignore_file():
            scm.add([os.path.join(dvc_dir, scm.ignore_file())])

        return proj

    def _ignore(self):
        flist = [self.link_state.state_file,
                 self.link_state._lock_file.lock_file,
                 self.lock.lock_file,
                 self.config.config_local_file,
                 self.updater.updater_file]

        if self.cache.local.cache_dir.startswith(self.root_dir):
            flist += [self.cache.local.cache_dir]

        self.scm.ignore_list(flist)

    def install(self):
        self.scm.install()

    def to_dvc_path(self, path):
        return os.path.relpath(path, self.root_dir)

    def add(self, fname):
        out = os.path.basename(os.path.normpath(fname))
        stage_fname = out + Stage.STAGE_FILE_SUFFIX
        cwd = os.path.dirname(os.path.abspath(fname))
        stage = Stage.loads(project=self,
                            cmd=None,
                            deps=[],
                            outs=[out],
                            fname=stage_fname,
                            cwd=cwd)

        stage.save()
        stage.dump()
        return stage

    def remove(self, target):
        if not Stage.is_stage_file(target):
            raise StageNotFoundError(target)

        stage = Stage.load(self, target)
        for out in stage.outs:
            out.remove()

        return stage

    def run(self,
            cmd=None,
            deps=[],
            outs=[],
            outs_no_cache=[],
            fname=Stage.STAGE_FILE,
            cwd=os.curdir,
            no_exec=False):
        stage = Stage.loads(project=self,
                            fname=fname,
                            cmd=cmd,
                            cwd=cwd,
                            outs=outs,
                            outs_no_cache=outs_no_cache,
                            deps=deps)
        if not no_exec:
            stage.run()
        stage.dump()
        return stage

    def _reproduce_stage(self, stages, node, force):
        stage = stages[node].reproduce(force=force)
        if not stage:
            return []
        stage.dump()
        return [stage]

    def reproduce(self, target, recursive=True, force=False):
        stages = nx.get_node_attributes(self.graph(), 'stage')
        node = os.path.relpath(os.path.abspath(target), self.root_dir)
        if node not in stages:
            raise StageNotFoundError(target)

        if recursive:
            return self._reproduce_stages(stages, node, force)

        return self._reproduce_stage(stages, node, force)

    def _reproduce_stages(self, stages, node, force):
        result = []
        for n in nx.dfs_postorder_nodes(self.graph(), node):
            try:
                result += self._reproduce_stage(stages, n, force)
            except Exception as ex:
                raise ReproductionError(stages[n].relpath, ex)
        return result

    def checkout(self, target=None):
        if target:
            if not Stage.is_stage_file(target):
                raise StageNotFoundError(target)
            stages = [Stage.load(self, target)]
        else:
            self.link_state.remove_all()
            stages = self.stages()

        for stage in stages:
            stage.checkout()

    def _used_cache(self, target=None):
        cache_set = set()

        if target:
            stages = [Stage.load(self, target)]
        else:
            stages = self.stages()

        for stage in stages:
            for out in stage.outs:
                if out.path_info['scheme'] != 'local':
                    continue

                if not out.use_cache or not out.cache:
                    continue

                cache_set |= set([out.cache])
                if self.cache.local.is_dir_cache(out.cache) \
                   and os.path.isfile(out.cache):
                    dir_cache = self.cache.local.dir_cache(out.cache)
                    cache_set |= set(dir_cache.values())

        return list(cache_set)

    def gc(self):
        clist = self._used_cache()
        for cache in self.cache.local.all():
            if cache in clist:
                continue
            os.unlink(cache)
            self.logger.info(u'\'{}\' was removed'.format(
                self.to_dvc_path(cache)))

    def push(self, target=None, jobs=1, remote=None):
        return self.cloud.push(self._used_cache(target), jobs, remote=remote)

    def fetch(self, target=None, jobs=1, remote=None):
        return self.cloud.pull(self._used_cache(target), jobs, remote=remote)

    def pull(self, target=None, jobs=1, remote=None):
        ret = self.fetch(target, jobs, remote=remote)
        self.checkout()
        return ret

    def _local_status(self, target=None):
        status = {}

        if target:
            stages = [Stage.load(self, target)]
        else:
            stages = self.stages()

        for stage in stages:
            status.update(stage.status())

        return status

    def _cloud_status(self, target=None, jobs=1, remote=None):
        status = {}
        for target, ret in self.cloud.status(self._used_cache(target),
                                             jobs,
                                             remote=remote):
            if ret == cloud.STATUS_UNKNOWN or ret == cloud.STATUS_OK:
                continue

            prefix_map = {
                cloud.STATUS_DELETED: 'deleted',
                cloud.STATUS_MODIFIED: 'modified',
                cloud.STATUS_NEW: 'new',
            }

            path = os.path.relpath(target, self.cache.local.cache_dir)

            status[path] = prefix_map[ret]

        return status

    def status(self, target=None, jobs=1, cloud=False, remote=None):
        if cloud:
            return self._cloud_status(target, jobs, remote=remote)
        return self._local_status(target)

    def _read_metric_json(self, fd, json_path):
        parser = parse(json_path)
        return [x.value for x in parser.find(json.load(fd))]

    def _do_read_metric_tsv(self, reader, row, col):
        if col is not None and row is not None:
            return [reader[row][col]]
        elif col is not None:
            return [r[col] for r in reader]
        elif row is not None:
            return reader[row]
        return None

    def _read_metric_htsv(self, fd, htsv_path):
        col, row = htsv_path.split(',')
        row = int(row)
        reader = list(csv.DictReader(fd, delimiter='\t'))
        return self._do_read_metric_tsv(reader, row, col)

    def _read_metric_tsv(self, fd, tsv_path):
        col, row = tsv_path.split(',')
        row = int(row)
        col = int(col)
        reader = list(csv.reader(fd, delimiter='\t'))
        return self._do_read_metric_tsv(reader, row, col)

    def _read_metric(self, path, json_path=None, tsv_path=None,
                     htsv_path=None):
        ret = None
        try:
            with open(path, 'r') as fd:
                if json_path:
                    ret = self._read_metric_json(fd, json_path)
                elif tsv_path:
                    ret = self._read_metric_tsv(fd, tsv_path)
                elif htsv_path:
                    ret = self._read_metric_htsv(fd, htsv_path)
                else:
                    ret = fd.read()
        except Exception as exc:
            self.logger.error('Unable to read metric in \'{}\''.format(path),
                              exc)

        return ret

    def metrics(self, path, json_path=None, tsv_path=None, htsv_path=None):
        res = {}
        saved = self.scm.active_branch()
        for branch in self.scm.list_branches():
            self.scm.checkout(branch)
            self.checkout()
            res[branch] = self._read_metric(path,
                                            json_path=json_path,
                                            tsv_path=tsv_path,
                                            htsv_path=htsv_path)
        self.scm.checkout(saved)
        self.checkout()
        return res

    def graph(self):
        G = nx.DiGraph()
        stages = self.stages()

        outs_map = {}
        for stage in stages:
            for o in stage.outs:
                outs_map[o.path] = stage

        for stage in stages:
            node = os.path.relpath(stage.path, self.root_dir)
            G.add_node(node, stage=stage)
            for dep in stage.deps:
                dep_stage = outs_map.get(dep.path, None)
                if not dep_stage:
                    continue
                dep_node = os.path.relpath(dep_stage.path, self.root_dir)
                G.add_node(dep_node, stage=dep_stage)
                G.add_edge(node, dep_node)

        return G

    def stages(self):
        stages = []
        for root, dirs, files in os.walk(self.root_dir):
            for fname in files:
                path = os.path.join(root, fname)
                if not Stage.is_stage_file(path):
                    continue
                stages.append(Stage.load(self, path))
        return stages

    def outs(self):
        outs = []
        for stage in self.stages():
            outs += stage.outs
        return outs
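# --- Illustrative sketch (not part of the original source). reproduce()
# walks the stage graph depth-first and rebuilds dependencies before
# dependents. A toy demonstration of that ordering with networkx; the stage
# names are made up. Edges point from a stage to the stages it depends on,
# exactly as in graph() above.
import networkx as nx

G = nx.DiGraph()
G.add_edge('train.dvc', 'features.dvc')
G.add_edge('features.dvc', 'raw.dvc')

# Post-order DFS yields dependencies first, so each stage is reproduced only
# after everything it needs is already up to date.
print(list(nx.dfs_postorder_nodes(G, 'train.dvc')))
# ['raw.dvc', 'features.dvc', 'train.dvc']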