Exemplo n.º 1
0
    def create_state_files(self, targets, lock):
        """
        Create a symlink and an import state file for every successful target.

        Each element of ``targets`` is unpacked as
        ``((input, data_item), processed_data_item)``. A target whose
        ``processed_data_item`` is ``None`` is logged as a failed import and
        skipped.

        :param targets: iterable of ``((input, data_item), processed)`` pairs.
        :param lock: value forwarded to ``StateFile`` as ``lock``.
        """
        for orig_target, processed_data_item in targets:
            # ``source`` rather than ``input`` so the builtin is not shadowed.
            source, data_item = orig_target
            output = data_item.data.relative

            # Identity test: ``== None`` would dispatch to a custom ``__eq__``.
            if processed_data_item is None:
                Logger.debug(
                    'Skipping creating state file for failed import {}'.format(
                        data_item.state.relative))
                continue

            Logger.debug('Creating symlink {} --> {}'.format(
                data_item.symlink_file, data_item.data.relative))
            System.symlink(data_item.symlink_file, data_item.data.relative)

            state_file = StateFile(StateFile.COMMAND_IMPORT_FILE,
                                   data_item,
                                   self.settings,
                                   argv=[source, output],
                                   input_files=[],
                                   output_files=[output],
                                   lock=lock)
            state_file.save()
            Logger.debug('State file "{}" was created'.format(
                data_item.state.relative))
Exemplo n.º 2
0
    def run_command(self,
                    cmd_args,
                    data_items_from_args,
                    not_data_items_from_args,
                    stdout=None,
                    stderr=None,
                    shell=False):
        """
        Run a command through ``RepositoryChange`` and save its state files.

        Every data item reported in ``repo_change.changed_data_items`` is
        moved to the cache and gets a ``COMMAND_RUN`` state file.

        :param cmd_args: command line as a list of strings.
        :param data_items_from_args: data items found among the arguments.
        :param not_data_items_from_args: remaining arguments; they are added
            to ``self.code_dependencies`` before conversion to dvc paths.
        :param stdout: forwarded to ``RepositoryChange``; also recorded in
            the state file after ``self._stdout_to_dvc``.
        :param stderr: handled the same way as ``stdout``.
        :param shell: forwarded to ``RepositoryChange`` and ``StateFile``.
        :return: list of the saved ``StateFile`` objects.
        :raises RunError: when git actions are enabled and
            ``self._validate_file_states`` rejects the change.
        """
        joined_args = ' '.join(cmd_args)
        joined_items = ', '.join([x.data.dvc for x in data_items_from_args])
        Logger.debug(
            'Run command with args: {}. Data items from args: {}. stdout={}, stderr={}, shell={}'
            .format(joined_args, joined_items, stdout, stderr, shell))

        repo_change = RepositoryChange(cmd_args,
                                       self.settings,
                                       stdout,
                                       stderr,
                                       shell=shell)

        # Validation is skipped entirely when git actions are disabled.
        if not (self.no_git_actions
                or self._validate_file_states(repo_change)):
            self.remove_new_files(repo_change)
            raise RunError('Errors occurred.')

        outputs = set(self.declaration_output_data_items +
                      repo_change.changed_data_items)
        output_files_dvc = [item.data.dvc for item in outputs]

        # Anything that is also an output does not count as an input.
        inputs = set(data_items_from_args +
                     self.declaration_input_data_items) - outputs
        input_files_dvc = [item.data.dvc for item in inputs]

        code_dependencies_dvc = self.git.abs_paths_to_dvc(
            self.code_dependencies + not_data_items_from_args)

        saved_states = []
        for changed_item in repo_change.changed_data_items:
            Logger.debug(
                'Move output file "{}" to cache dir "{}" and create a symlink'.
                format(changed_item.data.relative,
                       changed_item.cache.relative))
            changed_item.move_data_to_cache()

            Logger.debug('Create state file "{}"'.format(
                changed_item.state.relative))

            new_state = StateFile(StateFile.COMMAND_RUN,
                                  changed_item.state.relative,
                                  self.settings,
                                  input_files_dvc,
                                  output_files_dvc,
                                  code_dependencies_dvc,
                                  argv=cmd_args,
                                  lock=self.lock,
                                  stdout=self._stdout_to_dvc(stdout),
                                  stderr=self._stdout_to_dvc(stderr),
                                  shell=shell)
            new_state.save()
            saved_states.append(new_state)

        return saved_states
Exemplo n.º 3
0
 def update_state_file(settings, state):
     """
     Re-parse the ``out``, ``out_git`` and ``deps`` fields of a state file.

     Each field is replaced, in that order, by the result of
     ``StateFile.parse_deps_state`` with ``currdir=state.cwd``; the state is
     then saved.

     :param settings: passed through to ``StateFile.parse_deps_state``.
     :param state: state object that is updated in place and saved.
     """
     Logger.debug('Update state file "{}"'.format(state.path))
     for field in ('out', 'out_git', 'deps'):
         refreshed = StateFile.parse_deps_state(settings,
                                                getattr(state, field),
                                                currdir=state.cwd)
         setattr(state, field, refreshed)
     state.save()
Exemplo n.º 4
0
    def collect_targets(self):
        """
        Collect locked, command-less state files changed by the last merge.

        :return: list of file names taken from
            ``self.git.get_last_merge_changed_files()`` that are state files,
            have a falsy ``cmd`` and a truthy ``locked``.
        """
        collected = []

        for path in self.git.get_last_merge_changed_files():
            if StateFile._is_state_file(path):
                loaded = StateFile.load(path)
                # Same test as "no cmd and locked".
                if not (loaded.cmd or not loaded.locked):
                    collected.append(path)

        return collected
Exemplo n.º 5
0
    def import_file(self, input, output, lock=False):
        """
        Import a local file or a URL into the cache as a new data item.

        The input is copied (or downloaded, when ``CmdImportFile.is_url``
        accepts it) to the data item's cache path, a symlink is created for
        the data file and an import state file is saved.

        :param input: path of a regular local file, or a URL.
        :param output: destination path; when it is an existing directory the
            base name of ``input`` is appended to it.
        :param lock: value forwarded to ``StateFile`` as ``lock``.
        :raises ImportFileError: if a local input is missing or is not a
            regular file, if the output already exists, or if the output
            directory does not exist.
        """
        # Local inputs are validated up front; URLs are not checked here.
        is_local = not CmdImportFile.is_url(input)
        if is_local and not os.path.exists(input):
            raise ImportFileError(
                'Input file "{}" does not exist'.format(input))
        if is_local and not os.path.isfile(input):
            raise ImportFileError(
                'Input file "{}" has to be a regular file'.format(input))

        if os.path.isdir(output):
            output = os.path.join(output, os.path.basename(input))

        data_item = self.settings.path_factory.data_item(output)

        data_path = data_item.data.relative
        if os.path.exists(data_path):
            raise ImportFileError(
                'Output file "{}" already exists'.format(data_path))

        data_dir = os.path.dirname(data_path)
        if not os.path.isdir(data_dir):
            raise ImportFileError(
                'Output file directory "{}" does not exists'.format(data_dir))

        cache_parent = os.path.dirname(data_item.cache.relative)
        if not os.path.exists(cache_parent):
            os.makedirs(cache_parent)

        if not CmdImportFile.is_url(input):
            copyfile(input, data_item.cache.relative)
            Logger.debug('Input file "{}" was copied to cache "{}"'.format(
                input, data_item.cache.relative))
        else:
            Logger.debug('Downloading file {} ...'.format(input))
            self.download_file(input, data_item.cache.relative)
            Logger.debug('Input file "{}" was downloaded to cache "{}"'.format(
                input, data_item.cache.relative))

        Logger.debug('Creating symlink {} --> {}'.format(
            data_item.symlink_file, data_item.data.relative))
        System.symlink(data_item.symlink_file, data_item.data.relative)

        new_state = StateFile(StateFile.COMMAND_IMPORT_FILE,
                              data_item.state.relative,
                              self.settings,
                              argv=[input, output],
                              input_files=[],
                              output_files=[output],
                              lock=lock)
        new_state.save()
        Logger.debug('State file "{}" was created'.format(
            data_item.state.relative))
Exemplo n.º 6
0
    def run(self):
        """
        Reproduce the stages that correspond to the requested targets.

        A target that is a state file is loaded directly; any other target
        is resolved with ``StateFile.find_by_output``. Targets that resolve
        to a falsy stage are ignored.

        :return: the result of ``self.commit_if_needed``.
        """
        stages = []
        recursive = not self.parsed_args.single_item

        for target in self.parsed_args.targets:
            stage = (StateFile.load(target)
                     if StateFile._is_state_file(target)
                     else StateFile.find_by_output(self.settings, target))
            if not stage:
                continue
            stages.append(stage)

        self.repro_stages(stages, recursive, self.parsed_args.force)

        names = []
        for stage in stages:
            names.append(os.path.relpath(stage.path))
        return self.commit_if_needed('DVC repro: {}'.format(names))
Exemplo n.º 7
0
    def __init__(self, data_item, cmd_obj, globally_changed_files, recursive,
                 force):
        """
        Prepare the reproduction of one data item.

        :param data_item: data item to reproduce; its data file must be a
            symlink and its state file must be loadable.
        :param cmd_obj: command object providing ``git`` and ``settings``.
        :param globally_changed_files: stored as-is for later use.
        :param recursive: stored as-is for later use.
        :param force: stored as-is for later use.
        :raises ReproError: if the data file is not a symlink, the state
            file cannot be loaded, or the state file defines no ``argv``.
        """
        self._data_item = data_item
        self.git = cmd_obj.git
        self._cmd_obj = cmd_obj
        self._globally_changed_files = globally_changed_files
        self._recursive = recursive
        self._force = force

        if not System.islink(data_item.data.relative):
            raise ReproError('data item {} is not symlink'.format(
                data_item.data.relative))

        try:
            self._state = StateFile.load(data_item.state.relative, self.git)
        except Exception as ex:
            raise ReproError(
                'Error: state file "{}" cannot be loaded: {}'.format(
                    data_item.state.relative, ex))

        # A truthy argv always holds at least one element, so this single
        # check also covers the "command is too short" case.
        if not self.state.argv:
            raise ReproError(
                'Error: parameter {} is not defined in state file "{}"'.format(
                    StateFile.PARAM_ARGV, data_item.state.relative))

        # Work on a shallow copy so the command object's settings keep
        # their own args.
        self._settings = copy.copy(self._cmd_obj.settings)
        self._settings.set_args(self.state.argv)
Exemplo n.º 8
0
    def lock_files(self, files, target):
        """
        Lock or unlock the state files of the given data files.

        A failure on one file is logged and counted; the remaining files
        are still processed.

        :param files: iterable of data file paths.
        :param target: truthy to lock, falsy to unlock.
        :return: always ``0``.
        """
        if target:
            cmd = 'lock'
        else:
            cmd = 'unlock'

        failures = 0
        for fname in files:
            try:
                data_item = self.settings.path_factory.existing_data_item(
                    fname)
                state = StateFile.load(data_item.state.relative, self.settings)

                if state.locked and target:
                    Logger.warn('Data item {} is already locked'.format(
                        data_item.data.relative))
                    continue

                if not (state.locked or target):
                    Logger.warn('Data item {} is already unlocked'.format(
                        data_item.data.relative))
                    continue

                state.locked = target
                Logger.debug('Saving status file for data item {}'.format(
                    data_item.data.relative))
                state.save()
                Logger.info('Data item {} was {}ed'.format(
                    data_item.data.relative, cmd))
            except Exception as ex:
                failures += 1
                Logger.error('Unable to {} {}: {}'.format(cmd, fname, ex))

        if failures <= 0 or self.no_git_actions:
            self.commit_if_needed('DVC lock: {}'.format(' '.join(self.args)))
        else:
            Logger.error(
                'Errors occurred. One or more repro cmd was not successful.')
            self.not_committed_changes_warning()

        return 0
Exemplo n.º 9
0
    def is_repro_required(self, changed_files, data_item):
        """
        Decide whether the data item has to be reproduced.

        A locked item never needs reproduction. Otherwise the checks run in
        this order: changed dependencies (only when recursive, or when the
        cache file is missing), force flag, missing cache file, changed
        sources.

        :param changed_files: passed to ``self.were_dependencies_changed``.
        :param data_item: data item whose state file is inspected.
        :return: ``True`` if reproduction is required, ``False`` otherwise.
        """
        if StateFile.load(data_item.state.relative, self._settings).locked:
            Logger.debug(
                u'Repro is not required for locked data item {}'.format(
                    data_item.data.relative))
            return False

        check_dependencies = self._recursive

        if not (check_dependencies or self.is_cache_exists()):
            check_dependencies = True
            Logger.info(
                u'Reproduction {}. Force dependency check since cache file is missing.'
                .format(self._data_item.data.relative))

        if check_dependencies and self.were_dependencies_changed(
                changed_files, data_item.data.dvc):
            self.log_repro_reason(u'input dependencies were changed')
            return True

        if self._force:
            self.log_repro_reason(u'it was forced')
            return True

        if not self.is_cache_exists():
            self.log_repro_reason(u'cache file is missing.')
            return True

        if self.were_sources_changed(self._globally_changed_files):
            self.log_repro_reason(u'sources were changed')
            return True

        return False
Exemplo n.º 10
0
    def import_file(self, input, output, is_reproducible):
        """
        Import a local file or a URL into the cache as a new data item.

        The input is copied (or downloaded, when ``CmdDataImport.is_url``
        accepts it) to the data item's cache path, the data item's symlink
        is created and a state file is saved.

        :param input: path of a regular local file, or a URL.
        :param output: destination path; when it is an existing directory the
            base name of ``input`` is appended to it.
        :param is_reproducible: forwarded to ``StateFile`` as its last
            positional argument.
        :raises DataImportError: if a local input is missing or is not a
            regular file, if the output already exists, or if the output
            directory does not exist.
        """
        # Local inputs are validated up front; URLs are not checked here.
        is_local = not CmdDataImport.is_url(input)
        if is_local and not os.path.exists(input):
            raise DataImportError(
                'Input file "{}" does not exist'.format(input))
        if is_local and not os.path.isfile(input):
            raise DataImportError(
                'Input file "{}" has to be a regular file'.format(input))

        if os.path.isdir(output):
            output = os.path.join(output, os.path.basename(input))

        data_item = self.settings.path_factory.data_item(output)

        data_path = data_item.data.relative
        if os.path.exists(data_path):
            raise DataImportError(
                'Output file "{}" already exists'.format(data_path))

        data_dir = os.path.dirname(data_path)
        if not os.path.isdir(data_dir):
            raise DataImportError(
                'Output file directory "{}" does not exists'.format(data_dir))

        cache_parent = os.path.dirname(data_item.cache.relative)
        if not os.path.exists(cache_parent):
            os.makedirs(cache_parent)

        if not CmdDataImport.is_url(input):
            copyfile(input, data_item.cache.relative)
            Logger.debug('Input file "{}" was copied to cache "{}"'.format(
                input, data_item.cache.relative))
        else:
            Logger.debug('Downloading file {} ...'.format(input))
            self.download_file(input, data_item.cache.relative)
            Logger.debug('Input file "{}" was downloaded to cache "{}"'.format(
                input, data_item.cache.relative))

        data_item.create_symlink()
        Logger.debug(
            'Symlink from data file "{}" to the cache file "{}" was created'.
            format(data_item.data.relative, data_item.cache.relative))

        new_state = StateFile(data_item.state.relative, self.git, [],
                              [output], [], is_reproducible)
        new_state.save()
        Logger.debug('State file "{}" was created'.format(
            data_item.state.relative))
Exemplo n.º 11
0
    def cache(self):
        """
        Build the ``Path`` of this item's cache file.

        When ``self._cache_file`` is set, the file name is its real path
        taken relative to ``self.cache_dir_abs``; otherwise the name is the
        string form of ``StateFile.find_md5(self)``.

        :return: ``Path`` of the cache file inside ``self.cache_dir_abs``.
        """
        cache_dir = self.cache_dir_abs

        if not self._cache_file:
            file_name = str(StateFile.find_md5(self))
        else:
            resolved = os.path.realpath(self._cache_file)
            file_name = os.path.relpath(resolved, cache_dir)

        return Path(os.path.join(cache_dir, file_name), self._git)
Exemplo n.º 12
0
    def cache(self):
        """
        Build the ``Path`` of this item's cache file.

        When ``self._cache_file`` is set, the file name is its real path
        taken relative to ``self.cache_dir``; otherwise the name is the
        ``md5`` attribute of the state loaded by ``StateFile.load``.

        :return: ``Path`` of the cache file inside ``self.cache_dir``.
        """
        cache_dir = self.cache_dir

        if not self._cache_file:
            # Imported only on this path, exactly where the name is needed.
            from dvc.state_file import StateFile
            file_name = StateFile.load(self, self._git).md5
        else:
            resolved = os.path.realpath(self._cache_file)
            file_name = os.path.relpath(resolved, cache_dir)

        return Path(os.path.join(cache_dir, file_name), self._git)
Exemplo n.º 13
0
    def reproduce_dep(self, path, md5, recursive):
        """
        Tell whether a dependency differs from its recorded md5.

        A path that is not a data item is compared against the md5 of the
        file under ``self.git.git_dir_abs``. A data item is compared against
        the value stored in the ``out`` mapping of its stage, which is
        reproduced first when ``recursive`` is truthy.

        :param path: dependency path.
        :param md5: md5 recorded for the dependency.
        :param recursive: whether to reproduce the producing stage first.
        :return: ``True`` if the dependency changed, ``False`` otherwise.
        """
        if not self.settings.path_factory.is_data_item(path):
            source_path = os.path.join(self.git.git_dir_abs, path)
            if md5 == file_md5(source_path)[0]:
                return False
            self.log_repro_reason('source {} was changed'.format(path))
            return True

        stage = StateFile.find_by_output(self.settings, path)
        if recursive:
            dep_stage = ReproStage(self.settings, stage, self._recursive,
                                   self._force)
            dep_stage.reproduce()

        # Reload so the comparison sees whatever the reproduction wrote.
        stage = StateFile.load(stage.path)
        recorded = stage.out[os.path.relpath(path, stage.cwd)]
        if md5 == recorded:
            return False

        self.log_repro_reason(
            'data item {} was changed - md5 sum doesn\'t match'.format(
                path))
        return True
Exemplo n.º 14
0
    def run(self):
        """
        Remove unreferenced entries from the cache directory.

        Every entry of ``ConfigI.CACHE_DIR`` whose base name is not among
        the names returned by ``StateFile.find_all_cache_files`` is deleted
        with ``os.remove``.

        :return: always ``0``.
        """
        # A set keeps the per-file membership test O(1) instead of scanning
        # a list for every directory entry.
        referenced = {str(x) for x in StateFile.find_all_cache_files(self.git)}

        for cache in os.listdir(ConfigI.CACHE_DIR):
            fname = os.path.join(ConfigI.CACHE_DIR, cache)
            if os.path.basename(fname) in referenced:
                continue
            os.remove(fname)
            Logger.info('Cache \'{}\' was removed'.format(fname))

        return 0
Exemplo n.º 15
0
    def checkout_targets(self, targets):
        """
        Restore the given state files and check out their outputs.

        Each state file is checked out as it was before the last merge and
        then loaded; the data items of its ``out`` entries are passed to
        ``CmdCheckout.checkout`` and the result is committed if needed.

        :param targets: iterable of state file names.
        """
        data_items = []
        for state_path in targets:
            self.git.checkout_file_before_last_merge(state_path)
            state = StateFile.load(state_path)
            for out_name in state.out:
                out_path = os.path.join(state.cwd, out_name)
                data_items.append(
                    self.settings.path_factory.data_item(out_path))

        CmdCheckout.checkout(data_items)

        self.commit_if_needed('DVC merge files: {}'.format(' '.join(targets)))
Exemplo n.º 16
0
    def create_state_files(self, targets):
        """
        Create a locked, command-less state file for every target.

        The state file name is the base name of the data file path plus
        ``StateFile.STATE_FILE_SUFFIX``.

        :param targets: iterable of data items.
        """
        for data_item in targets:
            data_path = data_item.data.relative
            Logger.debug('Creating state file for {}'.format(data_path))

            state_name = os.path.basename(
                data_item.data.relative + StateFile.STATE_FILE_SUFFIX)
            parsed_out = StateFile.parse_deps_state(
                self.settings,
                [data_item.data.relative],
                currdir=os.path.curdir)

            new_state = StateFile(fname=state_name,
                                  cmd=None,
                                  out=parsed_out,
                                  out_git=[],
                                  deps=[],
                                  locked=True)
            new_state.save()

            Logger.debug('State file "{}" was created'.format(
                data_item.state.relative))
Exemplo n.º 17
0
Arquivo: init.py Projeto: guptam/dvc
    def create_empty_file(self):
        """
        Create the empty data file, move it to the cache and save its state.

        The file is created at ``EMPTY_FILE_NAME`` inside the configured
        data directory; its state file is saved without updating target
        metrics.
        """
        empty_data_path = os.path.join(self.parsed_args.data_dir,
                                       self.EMPTY_FILE_NAME)

        data_item = self.settings.path_factory.data_item(empty_data_path)

        # Opening for writing and closing at once leaves a zero-length file.
        with open(empty_data_path, 'w'):
            pass
        data_item.move_data_to_cache()

        empty_state = StateFile(StateFile.COMMAND_EMPTY_FILE,
                                data_item,
                                self.settings,
                                input_files=[],
                                output_files=[],
                                lock=False)
        empty_state.save(is_update_target_metrics=False)
Exemplo n.º 18
0
    def create_state_files(self, targets, lock):
        """
        Create a symlink and an import state file for every target.

        :param targets: iterable of sequences whose first three elements are
            the input, the output and the data item.
        :param lock: value forwarded to ``StateFile`` as ``lock``.
        """
        for entry in targets:
            source, output, data_item = entry[0], entry[1], entry[2]

            Logger.debug('Creating symlink {} --> {}'.format(
                data_item.symlink_file, data_item.data.relative))
            System.symlink(data_item.symlink_file, data_item.data.relative)

            new_state = StateFile(StateFile.COMMAND_IMPORT_FILE,
                                  data_item.state.relative,
                                  self.settings,
                                  argv=[source, output],
                                  input_files=[],
                                  output_files=[output],
                                  lock=lock)
            new_state.save()

            Logger.debug('State file "{}" was created'.format(
                data_item.state.relative))
Exemplo n.º 19
0
    def create_empty_file(self):
        """
        Create the empty cache file, symlink it as data and save its state.

        The cache file is named ``EMPTY_FILE_NAME`` + ``_`` +
        ``EMPTY_FILE_CHECKSUM`` and lives under the config cache directory.
        The state file is saved without updating target metrics.
        """
        empty_data_path = os.path.join(self.parsed_args.data_dir,
                                       self.EMPTY_FILE_NAME)
        cache_file_suffix = '{}_{}'.format(self.EMPTY_FILE_NAME,
                                           self.EMPTY_FILE_CHECKSUM)
        empty_cache_path = os.path.join(Config.CONFIG_DIR,
                                        Config.CACHE_DIR,
                                        cache_file_suffix)

        # Opening for writing and closing at once leaves a zero-length file.
        with open(empty_cache_path, 'w'):
            pass
        link_target = os.path.join('..', empty_cache_path)
        System.symlink(link_target, empty_data_path)

        data_item = self.settings.path_factory.data_item(empty_data_path)
        empty_state = StateFile(StateFile.COMMAND_EMPTY_FILE,
                                data_item,
                                self.settings,
                                input_files=[],
                                output_files=[],
                                lock=False)
        empty_state.save(is_update_target_metrics=False)
Exemplo n.º 20
0
    def process_file(self, target):
        """
        Add one data item and its recorded inputs/outputs to ``self.g``.

        An edge runs from every input file to the item, and from the item to
        every output file other than the item itself.

        :param target: value resolved to a data item by
            ``self._get_data_item``.
        """
        data_item = self._get_data_item(target)
        state = StateFile.load(data_item.state.relative, self.git)
        name = data_item.data.relative

        self.g.add_node(name)

        for input_file in state.input_files:
            self.g.add_node(input_file)
            self.g.add_edge(input_file, name)

        for output_file in state.output_files:
            # The item itself is skipped so it gets no self-loop.
            if output_file != name:
                self.g.add_node(output_file)
                self.g.add_edge(name, output_file)
Exemplo n.º 21
0
    def create_empty_file(self):
        """
        Create the empty cache file, symlink it as data and save its state.

        The cache file is named ``EMPTY_FILE_NAME`` + ``_`` +
        ``EMPTY_FILE_CHECKSUM`` inside the configured cache directory; the
        state file is ``EMPTY_FILE_NAME`` + ``.state`` inside the configured
        state directory.
        """
        args = self.parsed_args
        empty_data_path = os.path.join(args.data_dir, self.EMPTY_FILE_NAME)
        cache_file_suffix = '{}_{}'.format(self.EMPTY_FILE_NAME,
                                           self.EMPTY_FILE_CHECKSUM)
        empty_cache_path = os.path.join(args.cache_dir, cache_file_suffix)
        empty_state_path = os.path.join(args.state_dir,
                                        self.EMPTY_FILE_NAME + '.state')

        # Opening for writing and closing at once leaves a zero-length file.
        with open(empty_cache_path, 'w'):
            pass
        link_target = os.path.join('..', empty_cache_path)
        System.symlink(link_target, empty_data_path)

        empty_state = StateFile(StateFile.COMMAND_EMPTY_FILE,
                                empty_state_path,
                                self.settings,
                                input_files=[],
                                output_files=[],
                                lock=False)
        empty_state.save()
Exemplo n.º 22
0
    def _read_metric_from_state_file(self, hash, target, settings):
        """
        Read the target metric from a state file at a given git revision.

        The state file content is obtained with ``git show <hash>:<path>``
        and parsed with ``StateFile.loads``.

        :param hash: git revision used in the ``git show`` command.
        :param target: path resolved to a data item.
        :param settings: provides ``path_factory`` and is passed to
            ``StateFile.loads``.
        :return: ``single_target_metric`` of the parsed state file, or
            ``None`` when the target is not a data item or the git command
            fails.
        """
        try:
            data_item = settings.path_factory.data_item(target)
        except DataItemError as ex:
            Logger.warn('Target file {} is not data item: {}'.format(target, ex))
            return None

        try:
            revision_spec = '{}:{}'.format(hash, data_item.state.relative)
            content = Executor.exec_cmd_only_success(
                ['git', 'show', revision_spec])
        except ExecutorError as ex:
            msg = '[dvc-git] Cannot obtain content of target symbolic file {} with hash {}: {}'
            Logger.warn(msg.format(target, hash, ex))
            return None

        return StateFile.loads(content, settings).single_target_metric
Exemplo n.º 23
0
    def run(self):
        """
        Create a stage from the parsed arguments, run it and commit.

        :return: ``1`` if the stage file already exists, otherwise the
            result of ``self.commit_if_needed``.
        """
        cmd = ' '.join(self.parsed_args.command)

        stage_file = self.get_stage_file()
        if os.path.isfile(stage_file):
            Logger.error("Stage file {} already exists".format(stage_file))
            return 1

        stage_path = os.path.join(self.parsed_args.cwd, stage_file)
        state = StateFile(fname=stage_path,
                          cmd=cmd,
                          out=self.parsed_args.out,
                          out_git=self.parsed_args.out_git,
                          deps=self.parsed_args.deps,
                          locked=self.parsed_args.lock,
                          cwd=self.parsed_args.cwd)

        self.run_command(self.settings, state)

        commit_message = 'DVC run: {}'.format(state.cmd)
        return self.commit_if_needed(commit_message)
Exemplo n.º 24
0
    def __init__(self, data_item, cmd_obj):
        """
        Load the state of a data item and validate its normalized argv.

        Also assigns the state's ``code_dependencies`` to ``cmd_obj._code``.

        :param data_item: data item whose state file is loaded.
        :param cmd_obj: command object providing ``git``.
        :raises ReproError: if ``norm_argv`` is empty or has fewer than two
            elements.
        """
        self._data_item = data_item
        self._cmd_obj = cmd_obj
        self.git = cmd_obj.git
        self._state = StateFile.load(data_item.state.relative, self.git)

        cmd_obj._code = self.state.code_dependencies

        norm_argv = self.state.norm_argv

        if not norm_argv:
            message = 'Error: parameter {} is nor defined in state file "{}"'
            raise ReproError(
                message.format(StateFile.PARAM_NORM_ARGV,
                               data_item.state.relative))

        if len(norm_argv) < 2:
            message = 'Error: reproducible cmd in state file "{}" is too short'
            raise ReproError(message.format(self.state.file))

        self._repro_argv = norm_argv
Exemplo n.º 25
0
 def is_data_item(self, fname):
     """
     Tell whether ``fname`` is the output of a known stage.

     :param fname: path looked up with ``StateFile.find_by_output``.
     :return: ``True`` when the lookup returns something other than ``None``.
     """
     # Identity test: ``!= None`` would dispatch to a custom ``__ne__``.
     return StateFile.find_by_output(self._settings, fname) is not None
Exemplo n.º 26
0
 def state(self):
     """
     Return the ``Path`` of the state file found for this object.

     :return: ``Path`` built from ``StateFile.find(self).path`` and
         ``self._git``.
     """
     found = StateFile.find(self)
     return Path(found.path, self._git)
Exemplo n.º 27
0
 def all_existing_data_items(self, subdir='.', cache_exists=True):
     """
     Return the data items for all data files found under ``subdir``.

     :param subdir: directory passed to ``StateFile.find_all_data_files``.
     :param cache_exists: accepted but not used by this method.
     :return: first element of the result of ``self.to_data_items``.
     """
     data_files = StateFile.find_all_data_files(self._git, subdir)
     converted = self.to_data_items(data_files)
     return converted[0]