def create_state_files(self, targets, lock):
    """Create symlinks and state files for imported targets.

    Args:
        targets: iterable of ((source, data_item), processed_data_item)
            pairs; a processed_data_item of None marks a failed import
            that must be skipped.
        lock: lock flag forwarded to each StateFile.
    """
    for orig_target, processed_data_item in targets:
        # 'src' renamed from 'input' to avoid shadowing the builtin.
        src, data_item = orig_target
        dst = data_item.data.relative
        # Fix: identity comparison with None ('is'), not equality ('==').
        if processed_data_item is None:
            Logger.debug(
                'Skipping creating state file for failed import {}'.format(
                    data_item.state.relative))
            continue
        Logger.debug('Creating symlink {} --> {}'.format(
            data_item.symlink_file, data_item.data.relative))
        System.symlink(data_item.symlink_file, data_item.data.relative)
        state_file = StateFile(StateFile.COMMAND_IMPORT_FILE,
                               data_item,
                               self.settings,
                               argv=[src, dst],
                               input_files=[],
                               output_files=[dst],
                               lock=lock)
        state_file.save()
        Logger.debug('State file "{}" was created'.format(
            data_item.state.relative))
def run_command(self, cmd_args, data_items_from_args,
                not_data_items_from_args, stdout=None, stderr=None,
                shell=False):
    """Run a user command and record its results as DVC state files.

    Args:
        cmd_args: argv list of the command to execute.
        data_items_from_args: data items referenced on the command line.
        not_data_items_from_args: non-data-item paths from the command line
            (folded into code dependencies below).
        stdout, stderr: redirection targets passed to RepositoryChange.
        shell: whether the command runs through a shell.

    Returns:
        List of saved StateFile objects, one per changed data item.

    Raises:
        RunError: when file-state validation fails (newly created files are
            removed before raising).
    """
    Logger.debug(
        'Run command with args: {}. Data items from args: {}. stdout={}, stderr={}, shell={}'
        .format(' '.join(cmd_args),
                ', '.join([x.data.dvc for x in data_items_from_args]),
                stdout, stderr, shell))
    # Executes the command and tracks which repository files changed.
    repo_change = RepositoryChange(cmd_args, self.settings, stdout, stderr,
                                   shell=shell)
    if not self.no_git_actions and not self._validate_file_states(
            repo_change):
        # Roll back files the failed run created before aborting.
        self.remove_new_files(repo_change)
        raise RunError('Errors occurred.')
    # Outputs: declared outputs plus whatever the command actually changed.
    output_set = set(self.declaration_output_data_items +
                     repo_change.changed_data_items)
    output_files_dvc = [x.data.dvc for x in output_set]
    # Inputs: argument items + declared inputs, minus anything that is an
    # output (an item cannot be both).
    input_set = set(data_items_from_args +
                    self.declaration_input_data_items) - output_set
    input_files_dvc = [x.data.dvc for x in input_set]
    code_dependencies_dvc = self.git.abs_paths_to_dvc(
        self.code_dependencies + not_data_items_from_args)
    result = []
    for data_item in repo_change.changed_data_items:
        Logger.debug(
            'Move output file "{}" to cache dir "{}" and create a symlink'.
            format(data_item.data.relative, data_item.cache.relative))
        data_item.move_data_to_cache()
        Logger.debug('Create state file "{}"'.format(
            data_item.state.relative))
        # NOTE(review): stderr is converted with _stdout_to_dvc as well —
        # presumably the helper handles any stream path; confirm this is
        # not a copy-paste slip.
        state_file = StateFile(StateFile.COMMAND_RUN,
                               data_item.state.relative,
                               self.settings,
                               input_files_dvc,
                               output_files_dvc,
                               code_dependencies_dvc,
                               argv=cmd_args,
                               lock=self.lock,
                               stdout=self._stdout_to_dvc(stdout),
                               stderr=self._stdout_to_dvc(stderr),
                               shell=shell)
        state_file.save()
        result.append(state_file)
    return result
def update_state_file(settings, state):
    """Re-parse the dependency lists of *state* in place and save it.

    The 'out', 'out_git' and 'deps' attributes are each normalized through
    StateFile.parse_deps_state relative to the state's working directory.
    """
    Logger.debug('Update state file "{}"'.format(state.path))
    for attr in ('out', 'out_git', 'deps'):
        parsed = StateFile.parse_deps_state(settings, getattr(state, attr),
                                            currdir=state.cwd)
        setattr(state, attr, parsed)
    state.save()
def collect_targets(self):
    """Return state files changed by the last merge that describe locked
    data items with no command (i.e. plain data, not pipeline stages)."""
    changed = self.git.get_last_merge_changed_files()
    state_files = [f for f in changed if StateFile._is_state_file(f)]
    result = []
    for fname in state_files:
        state = StateFile.load(fname)
        if state.locked and not state.cmd:
            result.append(fname)
    return result
def import_file(self, input, output, lock=False):
    """Import a local file or URL into the repository.

    The source is copied/downloaded into the cache, a symlink is created
    at the data path, and a state file is written.

    Args:
        input: source path or URL.
        output: destination data path; if it is a directory, the source
            basename is appended.
        lock: lock flag stored in the state file.

    Raises:
        ImportFileError: for a missing/irregular source or bad destination.
    """
    # Evaluate once instead of calling is_url() twice.
    is_url = CmdImportFile.is_url(input)
    if not is_url:
        if not os.path.exists(input):
            raise ImportFileError(
                'Input file "{}" does not exist'.format(input))
        if not os.path.isfile(input):
            raise ImportFileError(
                'Input file "{}" has to be a regular file'.format(input))
    if os.path.isdir(output):
        output = os.path.join(output, os.path.basename(input))
    data_item = self.settings.path_factory.data_item(output)
    if os.path.exists(data_item.data.relative):
        raise ImportFileError('Output file "{}" already exists'.format(
            data_item.data.relative))
    if not os.path.isdir(os.path.dirname(data_item.data.relative)):
        # Fixed error-message grammar: 'does not exists' -> 'does not exist'.
        raise ImportFileError(
            'Output file directory "{}" does not exist'.format(
                os.path.dirname(data_item.data.relative)))
    cache_dir = os.path.dirname(data_item.cache.relative)
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    if is_url:
        Logger.debug('Downloading file {} ...'.format(input))
        self.download_file(input, data_item.cache.relative)
        Logger.debug('Input file "{}" was downloaded to cache "{}"'.format(
            input, data_item.cache.relative))
    else:
        copyfile(input, data_item.cache.relative)
        Logger.debug('Input file "{}" was copied to cache "{}"'.format(
            input, data_item.cache.relative))
    Logger.debug('Creating symlink {} --> {}'.format(
        data_item.symlink_file, data_item.data.relative))
    System.symlink(data_item.symlink_file, data_item.data.relative)
    state_file = StateFile(StateFile.COMMAND_IMPORT_FILE,
                           data_item.state.relative,
                           self.settings,
                           argv=[input, output],
                           input_files=[],
                           output_files=[output],
                           lock=lock)
    state_file.save()
    Logger.debug('State file "{}" was created'.format(
        data_item.state.relative))
    # Removed a dead trailing 'pass'.
def run(self):
    """Reproduce every target stage, then commit the repro if needed."""
    args = self.parsed_args
    stages = []
    for target in args.targets:
        # A target may be a state file itself or a data output of one.
        if StateFile._is_state_file(target):
            loaded = StateFile.load(target)
        else:
            loaded = StateFile.find_by_output(self.settings, target)
        if loaded:
            stages.append(loaded)
    self.repro_stages(stages, not args.single_item, args.force)
    names = [os.path.relpath(s.path) for s in stages]
    return self.commit_if_needed('DVC repro: {}'.format(names))
def __init__(self, data_item, cmd_obj, globally_changed_files, recursive,
             force):
    """Prepare reproduction of *data_item*.

    Args:
        data_item: item whose data file must be a symlink into the cache.
        cmd_obj: owning command object (supplies git and settings).
        globally_changed_files: files changed repository-wide.
        recursive: whether dependencies are checked recursively.
        force: force reproduction regardless of change detection.

    Raises:
        ReproError: if the data file is not a symlink, the state file
            cannot be loaded, or it has no argv.
    """
    self._data_item = data_item
    self.git = cmd_obj.git
    self._cmd_obj = cmd_obj
    self._globally_changed_files = globally_changed_files
    self._recursive = recursive
    self._force = force
    if not System.islink(data_item.data.relative):
        raise ReproError('data item {} is not symlink'.format(
            data_item.data.relative))
    try:
        self._state = StateFile.load(data_item.state.relative, self.git)
    except Exception as ex:
        raise ReproError(
            'Error: state file "{}" cannot be loaded: {}'.format(
                data_item.state.relative, ex))
    if not self.state.argv:
        raise ReproError(
            'Error: parameter {} is not defined in state file "{}"'.format(
                StateFile.PARAM_ARGV, data_item.state.relative))
    # Removed an unreachable 'len(argv) < 1' check: an empty argv list is
    # falsy and already raised above. Also removed a dead trailing 'pass'.
    # NOTE(review): a sibling constructor validates 'len(argv) < 2'; that
    # may have been the intent here — confirm before tightening.
    self._settings = copy.copy(self._cmd_obj.settings)
    self._settings.set_args(self.state.argv)
def lock_files(self, files, target):
    """Lock (target truthy) or unlock (target falsy) the given data files.

    Per-file failures are logged and counted; on any error (with git
    actions enabled) a warning is issued instead of committing.
    Always returns 0.
    """
    cmd = 'lock' if target else 'unlock'
    error = 0
    for fname in files:
        try:
            data_item = self.settings.path_factory.existing_data_item(fname)
            state = StateFile.load(data_item.state.relative, self.settings)
            no_op = ((state.locked and target) or
                     (not state.locked and not target))
            if no_op:
                # Already in the requested state — warn and skip.
                if target:
                    Logger.warn('Data item {} is already locked'.format(
                        data_item.data.relative))
                else:
                    Logger.warn('Data item {} is already unlocked'.format(
                        data_item.data.relative))
            else:
                state.locked = target
                Logger.debug('Saving status file for data item {}'.format(
                    data_item.data.relative))
                state.save()
                Logger.info('Data item {} was {}ed'.format(
                    data_item.data.relative, cmd))
        except Exception as ex:
            error += 1
            Logger.error('Unable to {} {}: {}'.format(cmd, fname, ex))
    if error > 0 and not self.no_git_actions:
        Logger.error(
            'Errors occurred. One or more repro cmd was not successful.')
        self.not_committed_changes_warning()
    else:
        self.commit_if_needed('DVC lock: {}'.format(' '.join(self.args)))
    return 0
def is_repro_required(self, changed_files, data_item):
    """Decide whether *data_item* needs to be reproduced.

    Locked items are never reproduced. Otherwise reproduction is required
    when dependencies changed (checked when recursive or the cache file is
    missing), when forced, when the cache file is missing, or when sources
    changed.
    """
    state = StateFile.load(data_item.state.relative, self._settings)
    if state.locked:
        Logger.debug(
            u'Repro is not required for locked data item {}'.format(
                data_item.data.relative))
        return False
    # Dependency check is normally driven by --recursive, but a missing
    # cache file forces it.
    check_deps = self._recursive
    if not check_deps and not self.is_cache_exists():
        check_deps = True
        Logger.info(
            u'Reproduction {}. Force dependency check since cache file is missing.'
            .format(self._data_item.data.relative))
    if check_deps and self.were_dependencies_changed(changed_files,
                                                     data_item.data.dvc):
        self.log_repro_reason(u'input dependencies were changed')
        return True
    if self._force:
        self.log_repro_reason(u'it was forced')
        return True
    if not self.is_cache_exists():
        self.log_repro_reason(u'cache file is missing.')
        return True
    if self.were_sources_changed(self._globally_changed_files):
        self.log_repro_reason(u'sources were changed')
        return True
    return False
def import_file(self, input, output, is_reproducible):
    """Import a local file or URL into the repository as a data item.

    The source is copied/downloaded into the cache, a symlink is created
    at the data path, and a state file is written.

    Args:
        input: source path or URL.
        output: destination data path; if it is a directory, the source
            basename is appended.
        is_reproducible: reproducibility flag stored in the state file.

    Raises:
        DataImportError: for a missing/irregular source or bad destination.
    """
    # Evaluate once instead of calling is_url() twice.
    is_url = CmdDataImport.is_url(input)
    if not is_url:
        if not os.path.exists(input):
            raise DataImportError(
                'Input file "{}" does not exist'.format(input))
        if not os.path.isfile(input):
            raise DataImportError(
                'Input file "{}" has to be a regular file'.format(input))
    if os.path.isdir(output):
        output = os.path.join(output, os.path.basename(input))
    data_item = self.settings.path_factory.data_item(output)
    if os.path.exists(data_item.data.relative):
        raise DataImportError('Output file "{}" already exists'.format(
            data_item.data.relative))
    if not os.path.isdir(os.path.dirname(data_item.data.relative)):
        # Fixed error-message grammar: 'does not exists' -> 'does not exist'.
        raise DataImportError(
            'Output file directory "{}" does not exist'.format(
                os.path.dirname(data_item.data.relative)))
    cache_dir = os.path.dirname(data_item.cache.relative)
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    if is_url:
        Logger.debug('Downloading file {} ...'.format(input))
        self.download_file(input, data_item.cache.relative)
        Logger.debug('Input file "{}" was downloaded to cache "{}"'.format(
            input, data_item.cache.relative))
    else:
        copyfile(input, data_item.cache.relative)
        Logger.debug('Input file "{}" was copied to cache "{}"'.format(
            input, data_item.cache.relative))
    data_item.create_symlink()
    Logger.debug(
        'Symlink from data file "{}" to the cache file "{}" was created'.
        format(data_item.data.relative, data_item.cache.relative))
    state_file = StateFile(data_item.state.relative, self.git, [], [output],
                           [], is_reproducible)
    state_file.save()
    Logger.debug('State file "{}" was created'.format(
        data_item.state.relative))
    # Removed a dead trailing 'pass'.
def cache(self):
    """Return the Path of this item's cache file.

    An explicitly configured cache file is resolved relative to the cache
    directory; otherwise the name is this item's md5 from the state file.
    """
    base = self.cache_dir_abs
    if self._cache_file:
        name = os.path.relpath(os.path.realpath(self._cache_file), base)
    else:
        name = str(StateFile.find_md5(self))
    return Path(os.path.join(base, name), self._git)
def cache(self):
    """Return the Path of this item's cache file.

    An explicitly configured cache file is resolved relative to the cache
    directory; otherwise the md5 loaded from the state file is used.
    """
    base = self.cache_dir
    if self._cache_file:
        name = os.path.relpath(os.path.realpath(self._cache_file), base)
    else:
        # Local import kept as in the original — presumably avoids a
        # circular module dependency.
        from dvc.state_file import StateFile
        name = StateFile.load(self, self._git).md5
    return Path(os.path.join(base, name), self._git)
def reproduce_dep(self, path, md5, recursive):
    """Check one dependency and return True when it changed.

    Plain source files are compared by md5 directly; data items are
    resolved to their producing stage (optionally reproducing it first)
    and compared against the md5 recorded in the stage's outputs.
    """
    if not self.settings.path_factory.is_data_item(path):
        actual = file_md5(os.path.join(self.git.git_dir_abs, path))[0]
        changed = actual != md5
        if changed:
            self.log_repro_reason('source {} was changed'.format(path))
        return changed
    stage = StateFile.find_by_output(self.settings, path)
    if recursive:
        ReproStage(self.settings, stage, self._recursive,
                   self._force).reproduce()
        # Reload: reproduce() may have rewritten the stage file.
        stage = StateFile.load(stage.path)
    key = os.path.relpath(path, stage.cwd)
    if stage.out[key] != md5:
        self.log_repro_reason(
            'data item {} was changed - md5 sum doesn\'t match'.format(
                path))
        return True
    return False
def run(self):
    """Garbage-collect cache files not referenced by any state file.

    Returns:
        0 (always).
    """
    # Use a set: the membership test below was O(n) per cache entry on
    # the original list, making the whole pass quadratic.
    referenced = set(str(x) for x in StateFile.find_all_cache_files(self.git))
    for cache in os.listdir(ConfigI.CACHE_DIR):
        # listdir yields bare names, so 'cache' already equals
        # os.path.basename(fname) from the original.
        if cache in referenced:
            continue
        fname = os.path.join(ConfigI.CACHE_DIR, cache)
        os.remove(fname)
        Logger.info('Cache \'{}\' was removed'.format(fname))
    return 0
def checkout_targets(self, targets):
    """Restore pre-merge state files, check out their outputs and commit."""
    items = []
    for fname in targets:
        self.git.checkout_file_before_last_merge(fname)
        state = StateFile.load(fname)
        paths = [os.path.join(state.cwd, out) for out in state.out]
        items.extend(self.settings.path_factory.data_item(p) for p in paths)
    CmdCheckout.checkout(items)
    self.commit_if_needed('DVC merge files: {}'.format(' '.join(targets)))
def create_state_files(self, targets):
    """Create a locked, command-less state file for every target item."""
    for item in targets:
        Logger.debug('Creating state file for {}'.format(
            item.data.relative))
        # State file name: data file basename plus the state suffix.
        name = os.path.basename(item.data.relative +
                                StateFile.STATE_FILE_SUFFIX)
        outputs = StateFile.parse_deps_state(self.settings,
                                             [item.data.relative],
                                             currdir=os.path.curdir)
        StateFile(fname=name,
                  cmd=None,
                  out=outputs,
                  out_git=[],
                  deps=[],
                  locked=True).save()
        Logger.debug('State file "{}" was created'.format(
            item.state.relative))
def create_empty_file(self):
    """Create an empty data file, move it to the cache and write its state.

    The empty file seeds the data directory so it can be tracked by git.
    """
    empty_data_path = os.path.join(self.parsed_args.data_dir,
                                   self.EMPTY_FILE_NAME)
    data_item = self.settings.path_factory.data_item(empty_data_path)
    # Touch the empty data file, then hand it to the cache.
    open(empty_data_path, 'w').close()
    data_item.move_data_to_cache()
    StateFile(StateFile.COMMAND_EMPTY_FILE,
              data_item,
              self.settings,
              input_files=[],
              output_files=[],
              lock=False).save(is_update_target_metrics=False)
    # Removed a dead trailing 'pass'.
def create_state_files(self, targets, lock):
    """Create a symlink and a state file for every imported target.

    Each target is a (source, destination, data_item) triple.
    """
    for t in targets:
        # Named unpacking instead of raw index access; 'src' also avoids
        # shadowing the builtin 'input'.
        src, dst, data_item = t[0], t[1], t[2]
        Logger.debug('Creating symlink {} --> {}'.format(
            data_item.symlink_file, data_item.data.relative))
        System.symlink(data_item.symlink_file, data_item.data.relative)
        StateFile(StateFile.COMMAND_IMPORT_FILE,
                  data_item.state.relative,
                  self.settings,
                  argv=[src, dst],
                  input_files=[],
                  output_files=[dst],
                  lock=lock).save()
        Logger.debug('State file "{}" was created'.format(
            data_item.state.relative))
def create_empty_file(self):
    """Create an empty data file backed by a cache entry and a state file.

    The cache entry name embeds the fixed empty-file checksum; the data
    path is a symlink pointing at it.
    """
    empty_data_path = os.path.join(self.parsed_args.data_dir,
                                   self.EMPTY_FILE_NAME)
    cache_file_suffix = self.EMPTY_FILE_NAME + '_' + self.EMPTY_FILE_CHECKSUM
    empty_cache_path = os.path.join(Config.CONFIG_DIR, Config.CACHE_DIR,
                                    cache_file_suffix)
    # Touch the cache file, then link the data path to it.
    open(empty_cache_path, 'w').close()
    System.symlink(os.path.join('..', empty_cache_path), empty_data_path)
    StateFile(StateFile.COMMAND_EMPTY_FILE,
              self.settings.path_factory.data_item(empty_data_path),
              self.settings,
              input_files=[],
              output_files=[],
              lock=False).save(is_update_target_metrics=False)
    # Removed a dead trailing 'pass'.
def process_file(self, target):
    """Add *target* and its state-file inputs/outputs to the graph.

    Edges run from each input file into the target node, and from the
    target node to each output file (skipping a self-edge).
    """
    data_item = self._get_data_item(target)
    node = data_item.data.relative
    state = StateFile.load(data_item.state.relative, self.git)
    self.g.add_node(node)
    for dep in state.input_files:
        self.g.add_node(dep)
        self.g.add_edge(dep, node)
    for out in state.output_files:
        if out == node:
            continue  # avoid a self-loop on the target itself
        self.g.add_node(out)
        self.g.add_edge(node, out)
def create_empty_file(self):
    """Create an empty data file with explicit cache and state paths.

    Unlike the Config-based variant, cache and state directories come
    from command-line arguments.
    """
    args = self.parsed_args
    empty_data_path = os.path.join(args.data_dir, self.EMPTY_FILE_NAME)
    cache_file_suffix = self.EMPTY_FILE_NAME + '_' + self.EMPTY_FILE_CHECKSUM
    empty_cache_path = os.path.join(args.cache_dir, cache_file_suffix)
    empty_state_path = os.path.join(args.state_dir,
                                    self.EMPTY_FILE_NAME + '.state')
    # Touch the cache file, then link the data path to it.
    open(empty_cache_path, 'w').close()
    System.symlink(os.path.join('..', empty_cache_path), empty_data_path)
    StateFile(StateFile.COMMAND_EMPTY_FILE,
              empty_state_path,
              self.settings,
              input_files=[],
              output_files=[],
              lock=False).save()
    # Removed a dead trailing 'pass'.
def _read_metric_from_state_file(self, hash, target, settings):
    """Read the single target metric of *target* at git revision *hash*.

    Returns None (after a warning) when the target is not a data item or
    its state file cannot be read via 'git show'.
    """
    try:
        data_item = settings.path_factory.data_item(target)
    except DataItemError as ex:
        Logger.warn('Target file {} is not data item: {}'.format(
            target, ex))
        return None
    # Fetch the state file content as of the given revision.
    show_cmd = ['git', 'show',
                '{}:{}'.format(hash, data_item.state.relative)]
    try:
        content = Executor.exec_cmd_only_success(show_cmd)
    except ExecutorError as ex:
        msg = '[dvc-git] Cannot obtain content of target symbolic file {} with hash {}: {}'
        Logger.warn(msg.format(target, hash, ex))
        return None
    return StateFile.loads(content, settings).single_target_metric
def run(self):
    """Create a stage file for the parsed command and execute it.

    Returns 1 when the stage file already exists; otherwise commits the
    result if needed.
    """
    args = self.parsed_args
    cmd = ' '.join(args.command)
    stage_file = self.get_stage_file()
    if os.path.isfile(stage_file):
        Logger.error("Stage file {} already exists".format(stage_file))
        return 1
    state = StateFile(fname=os.path.join(args.cwd, stage_file),
                      cmd=cmd,
                      out=args.out,
                      out_git=args.out_git,
                      deps=args.deps,
                      locked=args.lock,
                      cwd=args.cwd)
    self.run_command(self.settings, state)
    return self.commit_if_needed('DVC run: {}'.format(state.cmd))
def __init__(self, data_item, cmd_obj):
    """Prepare reproduction of *data_item* using its normalized argv.

    Args:
        data_item: item whose state file provides the repro command.
        cmd_obj: owning command object (supplies git; receives the code
            dependencies via its _code attribute).

    Raises:
        ReproError: when the state file has no normalized argv, or the
            argv is shorter than two elements.
    """
    self._data_item = data_item
    self.git = cmd_obj.git
    self._cmd_obj = cmd_obj
    self._state = StateFile.load(data_item.state.relative, self.git)
    cmd_obj._code = self.state.code_dependencies
    argv = self.state.norm_argv
    if not argv:
        # Fixed typo in the error message: 'is nor defined' -> 'is not defined'.
        raise ReproError(
            'Error: parameter {} is not defined in state file "{}"'.format(
                StateFile.PARAM_NORM_ARGV, data_item.state.relative))
    if len(argv) < 2:
        raise ReproError(
            'Error: reproducible cmd in state file "{}" is too short'.
            format(self.state.file))
    self._repro_argv = argv
    # Removed a dead trailing 'pass'.
def is_data_item(self, fname):
    """Return True if *fname* is the output of some state file."""
    # Identity comparison with None ('is not') instead of '!= None' (PEP 8).
    return StateFile.find_by_output(self._settings, fname) is not None
def state(self):
    """Return the Path of this item's state file."""
    found = StateFile.find(self)
    return Path(found.path, self._git)
def all_existing_data_items(self, subdir='.', cache_exists=True):
    """Return all existing data items found under *subdir*.

    NOTE(review): 'cache_exists' is currently unused by the body; it is
    kept for interface compatibility — confirm with callers before
    removing it.
    """
    data_files = StateFile.find_all_data_files(self._git, subdir)
    items = self.to_data_items(data_files)[0]
    return items