Exemplo n.º 1
0
    def citations(self):
        """Merge this node's citations with those of its recorded ancestors.

        ``citations.bib`` in the provenance directory is always loaded. Each
        entry under ``artifacts/`` contributes its own ``citations.bib`` only
        when that file exists. The files are merged in that order into a
        single ``Citations`` object, which is returned.
        """
        root = self.provenance_dir
        ancestors_root = root / 'artifacts'

        # The node's own bibliography comes first and is not existence-checked.
        bib_paths = [str(root / 'citations.bib')]
        if ancestors_root.exists():
            bib_paths.extend(
                str(child / 'citations.bib')
                for child in ancestors_root.iterdir()
                if (child / 'citations.bib').exists())

        merged = Citations()
        for bib_path in bib_paths:
            merged.update(Citations.load(bib_path))

        return merged
Exemplo n.º 2
0
    def __init__(self):
        """Initialise empty capture state and register framework citations.

        Records the start time, assigns a fresh UUID, creates empty
        containers for plugins, transformers and citations, stores every
        entry of ``qiime2.__citations__`` under its own 'framework' citation
        key, and finally calls ``self._build_paths()``.
        """
        self.start = time.time()
        self.uuid = uuid.uuid4()
        self.end = None
        self.plugins = collections.OrderedDict()

        # For the purposes of this dict, `return` is a special case for output
        # we expect to transform this later when serializing, but this lets
        # us treat all transformations uniformly.
        self.transformers = collections.OrderedDict()
        self.citations = Citations()
        self._framework_citations = []

        for idx, citation in enumerate(qiime2.__citations__):
            # Pass the position through so each framework citation gets a
            # distinct key; without it every key ends in index 0 and later
            # citations silently overwrite earlier ones in self.citations.
            citation_key = self.make_citation_key('framework', index=idx)
            self.citations[citation_key.key] = citation
            self._framework_citations.append(citation_key)

        self._build_paths()
Exemplo n.º 3
0
    def __init__(self):
        """Initialise empty capture state and register framework citations.

        Records the start time, assigns a fresh UUID, creates empty
        containers for plugins, transformers and citations, stores every
        entry of ``qiime2.__citations__`` under its own 'framework' citation
        key, and finally calls ``self._build_paths()``.
        """
        self.start = time.time()
        self.uuid = uuid.uuid4()
        self.end = None
        self.plugins = collections.OrderedDict()

        # For the purposes of this dict, `return` is a special case for output
        # we expect to transform this later when serializing, but this lets
        # us treat all transformations uniformly.
        self.transformers = collections.OrderedDict()
        self.citations = Citations()
        self._framework_citations = []

        for idx, citation in enumerate(qiime2.__citations__):
            # Pass the position through so each framework citation gets a
            # distinct key; without it every key ends in index 0 and later
            # citations silently overwrite earlier ones in self.citations.
            citation_key = self.make_citation_key('framework', index=idx)
            self.citations[citation_key.key] = citation
            self._framework_citations.append(citation_key)

        self._build_paths()
Exemplo n.º 4
0
class ProvenanceCapture:
    """Accumulate provenance for one execution and write it to disk.

    An instance owns a backing directory (``self.path``) containing an
    ``artifacts`` subdirectory for ancestor provenance and an ``action``
    subdirectory for ``action.yaml``. While work is in progress it collects
    referenced plugins, transformer records and citations; ``finalize``
    writes ``action.yaml`` and ``citations.bib`` and renames the backing
    directory to its final location.

    ``write_action_yaml`` calls ``self.make_action_section()``, which is not
    defined in this class and therefore has to be provided elsewhere
    (presumably by subclasses).
    """
    ANCESTOR_DIR = 'artifacts'
    ACTION_DIR = 'action'
    ACTION_FILE = 'action.yaml'
    CITATION_FILE = 'citations.bib'

    def __init__(self):
        """Initialise empty capture state and register framework citations."""
        self.start = time.time()
        self.uuid = uuid.uuid4()
        # Set by finalize(); the execution section cannot be built before.
        self.end = None
        self.plugins = collections.OrderedDict()

        # For the purposes of this dict, `return` is a special case for output
        # we expect to transform this later when serializing, but this lets
        # us treat all transformations uniformly.
        self.transformers = collections.OrderedDict()
        self.citations = Citations()
        self._framework_citations = []

        for idx, citation in enumerate(qiime2.__citations__):
            # Pass the position through so each framework citation gets a
            # distinct key; without it every key ends in index 0 and later
            # citations silently overwrite earlier ones in self.citations.
            citation_key = self.make_citation_key('framework', index=idx)
            self.citations[citation_key.key] = citation
            self._framework_citations.append(citation_key)

        self._build_paths()

    @property
    def _destructor(self):
        """Return the ``_destructor`` of the backing provenance path."""
        return self.path._destructor

    def _build_paths(self):
        """Create a new backing directory with its two subdirectories."""
        self.path = qiime2.core.path.ProvenancePath()

        self.ancestor_dir = self.path / self.ANCESTOR_DIR
        self.ancestor_dir.mkdir()

        self.action_dir = self.path / self.ACTION_DIR
        self.action_dir.mkdir()

    def add_ancestor(self, artifact):
        """Copy an input artifact's provenance into the ancestor directory.

        Returns ``NoProvenance(artifact.uuid)`` when the artifact has no
        provenance directory, otherwise the artifact's UUID as a string.
        Ancestors that are already present are not copied again.
        """
        other_path = artifact._archiver.provenance_dir
        if other_path is None:
            # The artifact doesn't have provenance (e.g. version 0)
            # it would be possible to invent a metadata.yaml, but we won't know
            # the framework version for the VERSION file. Even if we did
            # it won't accomplish a lot and there shouldn't be enough
            # version 0 artifacts in the wild to be important in practice.
            # NOTE: this implies that it is possible for an action.yaml file to
            # contain an artifact UUID that is not in the artifacts/ directory.
            return NoProvenance(artifact.uuid)

        destination = self.ancestor_dir / str(artifact.uuid)
        # If it exists, then the artifact is already in the provenance
        # (and so are its ancestors)
        if not destination.exists():
            # Handle root node of ancestor: copy everything except its own
            # nested ancestor directory, which is flattened below instead.
            shutil.copytree(str(other_path),
                            str(destination),
                            ignore=shutil.ignore_patterns(self.ANCESTOR_DIR +
                                                          '*'))

            # Handle ancestral nodes of ancestor
            grandcestor_path = other_path / self.ANCESTOR_DIR
            if grandcestor_path.exists():
                for grandcestor in grandcestor_path.iterdir():
                    destination = self.ancestor_dir / grandcestor.name
                    if not destination.exists():
                        shutil.copytree(str(grandcestor), str(destination))

        return str(artifact.uuid)

    def make_citation_key(self,
                          domain,
                          package=None,
                          identifier=None,
                          index=0):
        """Build a ``CitationKey`` of the form ``domain|pkg:version[|id]|index``.

        For the 'framework' domain the package and version come from
        ``qiime2`` itself; for any other domain ``package`` must expose
        ``name`` and ``version`` attributes. ``identifier`` is included only
        when it is not None.
        """
        if domain == 'framework':
            package, version = 'qiime2', qiime2.__version__
        else:
            package, version = package.name, package.version
        id_block = [] if identifier is None else [identifier]

        return CitationKey('|'.join([domain, package + ':' + version] +
                                    id_block + [str(index)]))

    def make_software_entry(self, version, website, citations=()):
        """Return an ordered ``version``/``website`` mapping.

        A ``citations`` key is added only when ``citations`` is non-empty.
        """
        entry = collections.OrderedDict()

        entry['version'] = version
        entry['website'] = website
        if citations:
            entry['citations'] = citations

        return entry

    def reference_plugin(self, plugin):
        """Record a plugin and its citations; return a reference to it.

        The plugin's software entry is stored (or replaced) in
        ``self.plugins`` under the plugin's name, and a ``ForwardRef``
        pointing at that entry is returned.
        """
        plugin_citations = []
        for idx, citation in enumerate(plugin.citations):
            citation_key = self.make_citation_key('plugin', plugin, index=idx)
            self.citations[citation_key.key] = citation
            plugin_citations.append(citation_key)

        self.plugins[plugin.name] = self.make_software_entry(
            plugin.version, plugin.website, plugin_citations)

        return ForwardRef('environment:plugins:' + plugin.name)

    def capture_env(self):
        """Return an ordered name -> version mapping of the working set."""
        return collections.OrderedDict(
            (d.project_name, d.version) for d in pkg_resources.working_set)

    def transformation_recorder(self, name):
        """Return a callback that records transformations under ``name``.

        Calling this resets ``self.transformers[name]`` to a new empty list;
        each call of the returned ``recorder`` appends one entry to it.
        """
        section = self.transformers[name] = []

        def recorder(transformer_record, input_name, input_record, output_name,
                     output_record):
            entry = collections.OrderedDict()
            entry['from'] = input_name
            entry['to'] = output_name
            citation_keys = []

            if transformer_record is not None:
                plugin = transformer_record.plugin
                entry['plugin'] = self.reference_plugin(plugin)

                for idx, citation in enumerate(transformer_record.citations):
                    citation_key = self.make_citation_key(
                        'transformer', plugin,
                        '%s->%s' % (input_name, output_name), idx)
                    self.citations[citation_key.key] = citation
                    citation_keys.append(citation_key)

            # The input/output view records contribute their plugin and
            # citations as well, when they are present.
            records = []
            if input_record is not None:
                records.append(input_record)
            if output_record is not None:
                records.append(output_record)
            for record in records:
                self.reference_plugin(record.plugin)
                for idx, citation in enumerate(record.citations):
                    citation_key = self.make_citation_key(
                        'view', record.plugin, record.name, idx)
                    self.citations[citation_key.key] = citation
                    citation_keys.append(citation_key)

            if citation_keys:
                entry['citations'] = citation_keys
            section.append(entry)

        return recorder

    def _ts_to_date(self, ts):
        """Convert a POSIX timestamp to a datetime in the local time zone."""
        return datetime.fromtimestamp(ts, tzlocal.get_localzone())

    def make_execution_section(self):
        """Build the ``execution`` section: UUID plus start/end/duration.

        Requires ``self.end`` to have been set (``finalize`` does this).
        """
        execution = collections.OrderedDict()
        execution['uuid'] = str(self.uuid)
        execution['runtime'] = runtime = collections.OrderedDict()
        runtime['start'] = start = self._ts_to_date(self.start)
        runtime['end'] = end = self._ts_to_date(self.end)
        runtime['duration'] = \
            util.duration_time(relativedelta.relativedelta(end, start))

        return execution

    def make_transformers_section(self):
        """Build the ``transformers`` section from the recorded entries.

        The special ``return`` entry becomes ``output``; everything else is
        grouped under ``inputs``. ``self.transformers`` is left unmodified.
        """
        transformers = collections.OrderedDict()
        # Work on a copy so popping `return` doesn't alter the capture state.
        data = self.transformers.copy()
        output = data.pop('return', None)
        if data:
            transformers['inputs'] = data
        if output is not None:
            transformers['output'] = output
        return transformers

    def make_env_section(self):
        """Build the ``environment`` section of action.yaml."""
        env = collections.OrderedDict()
        env['platform'] = pkg_resources.get_build_platform()
        # There is a trailing whitespace in sys.version, strip so that YAML can
        # use literal formatting.
        env['python'] = LiteralString('\n'.join(
            line.strip() for line in sys.version.split('\n')))
        env['framework'] = self.make_software_entry(qiime2.__version__,
                                                    qiime2.__website__,
                                                    self._framework_citations)
        env['plugins'] = self.plugins
        env['python-packages'] = self.capture_env()

        return env

    def write_action_yaml(self):
        """Write action.yaml as separately dumped, blank-line-separated parts.

        Sections are written in the order execution, action, transformers
        (only when any were recorded) and environment.
        """
        settings = dict(default_flow_style=False, indent=4)
        with (self.action_dir / self.ACTION_FILE).open(mode='w') as fh:
            fh.write(
                yaml.dump({'execution': self.make_execution_section()},
                          **settings))
            fh.write('\n')
            fh.write(
                yaml.dump({'action': self.make_action_section()}, **settings))
            if self.transformers:  # pipelines don't have these
                fh.write('\n')
                fh.write(
                    yaml.dump(
                        {'transformers': self.make_transformers_section()},
                        **settings))
            fh.write('\n')
            fh.write(
                yaml.dump({'environment': self.make_env_section()},
                          **settings))

    def write_citations_bib(self):
        """Save the collected citations to citations.bib in the backing dir."""
        self.citations.save(str(self.path / self.CITATION_FILE))

    def finalize(self, final_path, node_members):
        """Finish the capture and move the provenance to ``final_path``.

        Stamps the end time, copies each of ``node_members`` into the backing
        directory, writes action.yaml and citations.bib, then renames the
        backing directory to ``final_path``.
        """
        self.end = time.time()

        for member in node_members:
            shutil.copy(str(member), str(self.path))

        self.write_action_yaml()
        self.write_citations_bib()

        self.path.rename(final_path)

    def fork(self):
        """Return a copy of this capture with its own backing directory.

        The plugin, transformer and citation containers are copied so the
        fork can diverge from the original.
        """
        forked = copy.copy(self)
        # Unique state for each output of an action
        forked.plugins = forked.plugins.copy()
        # NOTE(review): this is a shallow copy, so the per-name entry lists
        # are still shared with the original until a name is re-recorded.
        forked.transformers = forked.transformers.copy()
        forked.citations = forked.citations.copy()
        # Give the fork its own backing directory, pre-populated with a copy
        # of everything captured so far (the hard stuff is mostly done by
        # this point).
        forked._build_paths()
        # NOTE(review): distutils was removed from the standard library in
        # Python 3.12 (PEP 632); this call needs replacing before then.
        distutils.dir_util.copy_tree(str(self.path), str(forked.path))

        return forked
Exemplo n.º 5
0
class ProvenanceCapture:
    """Collect provenance for a single execution and serialise it.

    Each instance works inside its own backing directory (``self.path``)
    holding an ``artifacts`` subdirectory (provenance of ancestors) and an
    ``action`` subdirectory (``action.yaml``). Plugins, transformer records
    and citations are gathered in memory and written out by ``finalize``,
    which then renames the backing directory to its final location.

    ``write_action_yaml`` relies on ``self.make_action_section()``; that
    method is not defined in this class, so it has to come from elsewhere
    (presumably a subclass).
    """
    ANCESTOR_DIR = 'artifacts'
    ACTION_DIR = 'action'
    ACTION_FILE = 'action.yaml'
    CITATION_FILE = 'citations.bib'

    def __init__(self):
        """Initialise empty capture state and register framework citations."""
        self.start = time.time()
        self.uuid = uuid.uuid4()
        # Set by finalize(); the execution section cannot be built before.
        self.end = None
        self.plugins = collections.OrderedDict()

        # For the purposes of this dict, `return` is a special case for output
        # we expect to transform this later when serializing, but this lets
        # us treat all transformations uniformly.
        self.transformers = collections.OrderedDict()
        self.citations = Citations()
        self._framework_citations = []

        for idx, citation in enumerate(qiime2.__citations__):
            # The index must be forwarded: otherwise every framework citation
            # maps to the same key (index 0) and only the last one survives.
            citation_key = self.make_citation_key('framework', index=idx)
            self.citations[citation_key.key] = citation
            self._framework_citations.append(citation_key)

        self._build_paths()

    @property
    def _destructor(self):
        """Return the ``_destructor`` of the backing provenance path."""
        return self.path._destructor

    def _build_paths(self):
        """Create a new backing directory with its two subdirectories."""
        self.path = qiime2.core.path.ProvenancePath()

        self.ancestor_dir = self.path / self.ANCESTOR_DIR
        self.ancestor_dir.mkdir()

        self.action_dir = self.path / self.ACTION_DIR
        self.action_dir.mkdir()

    def add_ancestor(self, artifact):
        """Copy an input artifact's provenance into the ancestor directory.

        Returns ``NoProvenance(artifact.uuid)`` when the artifact has no
        provenance directory, otherwise the artifact's UUID as a string.
        Ancestors that are already present are not copied again.
        """
        other_path = artifact._archiver.provenance_dir
        if other_path is None:
            # The artifact doesn't have provenance (e.g. version 0)
            # it would be possible to invent a metadata.yaml, but we won't know
            # the framework version for the VERSION file. Even if we did
            # it won't accomplish a lot and there shouldn't be enough
            # version 0 artifacts in the wild to be important in practice.
            # NOTE: this implies that it is possible for an action.yaml file to
            # contain an artifact UUID that is not in the artifacts/ directory.
            return NoProvenance(artifact.uuid)

        destination = self.ancestor_dir / str(artifact.uuid)
        # If it exists, then the artifact is already in the provenance
        # (and so are its ancestors)
        if not destination.exists():
            # Handle root node of ancestor: copy everything except its own
            # nested ancestor directory, which is flattened below instead.
            shutil.copytree(
                str(other_path), str(destination),
                ignore=shutil.ignore_patterns(self.ANCESTOR_DIR + '*'))

            # Handle ancestral nodes of ancestor
            grandcestor_path = other_path / self.ANCESTOR_DIR
            if grandcestor_path.exists():
                for grandcestor in grandcestor_path.iterdir():
                    destination = self.ancestor_dir / grandcestor.name
                    if not destination.exists():
                        shutil.copytree(str(grandcestor), str(destination))

        return str(artifact.uuid)

    def make_citation_key(self, domain, package=None, identifier=None,
                          index=0):
        """Build a ``CitationKey`` of the form ``domain|pkg:version[|id]|index``.

        For the 'framework' domain the package and version come from
        ``qiime2`` itself; for any other domain ``package`` must expose
        ``name`` and ``version`` attributes. ``identifier`` is included only
        when it is not None.
        """
        if domain == 'framework':
            package, version = 'qiime2', qiime2.__version__
        else:
            package, version = package.name, package.version
        id_block = [] if identifier is None else [identifier]

        return CitationKey('|'.join(
            [domain, package + ':' + version] + id_block + [str(index)]))

    def make_software_entry(self, version, website, citations=()):
        """Return an ordered ``version``/``website`` mapping.

        A ``citations`` key is added only when ``citations`` is non-empty.
        """
        entry = collections.OrderedDict()

        entry['version'] = version
        entry['website'] = website
        if citations:
            entry['citations'] = citations

        return entry

    def reference_plugin(self, plugin):
        """Record a plugin and its citations; return a reference to it.

        The plugin's software entry is stored (or replaced) in
        ``self.plugins`` under the plugin's name, and a ``ForwardRef``
        pointing at that entry is returned.
        """
        plugin_citations = []
        for idx, citation in enumerate(plugin.citations):
            citation_key = self.make_citation_key('plugin', plugin, index=idx)
            self.citations[citation_key.key] = citation
            plugin_citations.append(citation_key)

        self.plugins[plugin.name] = self.make_software_entry(
            plugin.version, plugin.website, plugin_citations)

        return ForwardRef('environment:plugins:' + plugin.name)

    def capture_env(self):
        """Return an ordered name -> version mapping of the working set."""
        return collections.OrderedDict(
            (d.project_name, d.version) for d in pkg_resources.working_set)

    def transformation_recorder(self, name):
        """Return a callback that records transformations under ``name``.

        Calling this resets ``self.transformers[name]`` to a new empty list;
        each call of the returned ``recorder`` appends one entry to it.
        """
        section = self.transformers[name] = []

        def recorder(transformer_record, input_name, input_record, output_name,
                     output_record):
            entry = collections.OrderedDict()
            entry['from'] = input_name
            entry['to'] = output_name
            citation_keys = []

            if transformer_record is not None:
                plugin = transformer_record.plugin
                entry['plugin'] = self.reference_plugin(plugin)

                for idx, citation in enumerate(transformer_record.citations):
                    citation_key = self.make_citation_key(
                        'transformer', plugin,
                        '%s->%s' % (input_name, output_name), idx)
                    self.citations[citation_key.key] = citation
                    citation_keys.append(citation_key)

            # The input/output view records contribute their plugin and
            # citations as well, when they are present.
            records = []
            if input_record is not None:
                records.append(input_record)
            if output_record is not None:
                records.append(output_record)
            for record in records:
                self.reference_plugin(record.plugin)
                for idx, citation in enumerate(record.citations):
                    citation_key = self.make_citation_key(
                        'view', record.plugin, record.name, idx)
                    self.citations[citation_key.key] = citation
                    citation_keys.append(citation_key)

            if citation_keys:
                entry['citations'] = citation_keys
            section.append(entry)

        return recorder

    def _ts_to_date(self, ts):
        """Convert a POSIX timestamp to a datetime in the local time zone."""
        return datetime.fromtimestamp(ts, tzlocal.get_localzone())

    def make_execution_section(self):
        """Build the ``execution`` section: UUID plus start/end/duration.

        Requires ``self.end`` to have been set (``finalize`` does this).
        """
        execution = collections.OrderedDict()
        execution['uuid'] = str(self.uuid)
        execution['runtime'] = runtime = collections.OrderedDict()
        runtime['start'] = start = self._ts_to_date(self.start)
        runtime['end'] = end = self._ts_to_date(self.end)
        runtime['duration'] = \
            util.duration_time(relativedelta.relativedelta(end, start))

        return execution

    def make_transformers_section(self):
        """Build the ``transformers`` section from the recorded entries.

        The special ``return`` entry becomes ``output``; everything else is
        grouped under ``inputs``. ``self.transformers`` is left unmodified.
        """
        transformers = collections.OrderedDict()
        # Work on a copy so popping `return` doesn't alter the capture state.
        data = self.transformers.copy()
        output = data.pop('return', None)
        if data:
            transformers['inputs'] = data
        if output is not None:
            transformers['output'] = output
        return transformers

    def make_env_section(self):
        """Build the ``environment`` section of action.yaml."""
        env = collections.OrderedDict()
        env['platform'] = pkg_resources.get_build_platform()
        # There is a trailing whitespace in sys.version, strip so that YAML can
        # use literal formatting.
        env['python'] = LiteralString('\n'.join(line.strip() for line in
                                      sys.version.split('\n')))
        env['framework'] = self.make_software_entry(
            qiime2.__version__, qiime2.__website__, self._framework_citations)
        env['plugins'] = self.plugins
        env['python-packages'] = self.capture_env()

        return env

    def write_action_yaml(self):
        """Write action.yaml as separately dumped, blank-line-separated parts.

        Sections are written in the order execution, action, transformers
        (only when any were recorded) and environment.
        """
        settings = dict(default_flow_style=False, indent=4)
        with (self.action_dir / self.ACTION_FILE).open(mode='w') as fh:
            fh.write(yaml.dump({'execution': self.make_execution_section()},
                               **settings))
            fh.write('\n')
            fh.write(yaml.dump({'action': self.make_action_section()},
                               **settings))
            if self.transformers:  # pipelines don't have these
                fh.write('\n')
                fh.write(yaml.dump(
                    {'transformers': self.make_transformers_section()},
                    **settings))
            fh.write('\n')
            fh.write(yaml.dump({'environment': self.make_env_section()},
                               **settings))

    def write_citations_bib(self):
        """Save the collected citations to citations.bib in the backing dir."""
        self.citations.save(str(self.path / self.CITATION_FILE))

    def finalize(self, final_path, node_members):
        """Finish the capture and move the provenance to ``final_path``.

        Stamps the end time, copies each of ``node_members`` into the backing
        directory, writes action.yaml and citations.bib, then renames the
        backing directory to ``final_path``.
        """
        self.end = time.time()

        for member in node_members:
            shutil.copy(str(member), str(self.path))

        self.write_action_yaml()
        self.write_citations_bib()

        self.path.rename(final_path)

    def fork(self):
        """Return a copy of this capture with its own backing directory.

        The plugin, transformer and citation containers are copied so the
        fork can diverge from the original.
        """
        forked = copy.copy(self)
        # Unique state for each output of an action
        forked.plugins = forked.plugins.copy()
        # NOTE(review): this is a shallow copy, so the per-name entry lists
        # are still shared with the original until a name is re-recorded.
        forked.transformers = forked.transformers.copy()
        forked.citations = forked.citations.copy()
        # Give the fork its own backing directory, pre-populated with a copy
        # of everything captured so far (the hard stuff is mostly done by
        # this point).
        forked._build_paths()
        # NOTE(review): distutils was removed from the standard library in
        # Python 3.12 (PEP 632); this call needs replacing before then.
        distutils.dir_util.copy_tree(str(self.path), str(forked.path))

        return forked