Exemplo n.º 1
0
class EnvCreateArgs(RelatedConfigMixin):
    model = StrSequenceField(str, required=True)
    source = related.StringField(required=True)
    dataloader = StrSequenceField(str, default=[], required=False)
    env = related.StringField(default=None, required=False)
    gpu = related.BooleanField(default=False, required=False)
    tmpdir = related.StringField(default=None, required=False)
    vep = related.BooleanField(default=False, required=False)
Exemplo n.º 2
0
class DataLoaderArgument(RelatedConfigMixin):
    # MAYBE - make this a general argument class
    doc = related.StringField("", required=False)
    example = AnyField(required=False)
    name = related.StringField(required=False)
    type = related.StringField(default='str', required=False)
    optional = related.BooleanField(default=False, required=False)
    tags = StrSequenceField(str, default=[],
                            required=False)  # TODO - restrict the tags

    def __attrs_post_init__(self):
        if self.doc == "":
            logger.warn("doc empty for one of the dataloader `args` fields")
Exemplo n.º 3
0
class Info(RelatedConfigMixin):
    """Class holding information about the component.
    Parses the info section in component.yaml:

    info:
      authors:
        - name: Ziga Avsec
      doc: RBP binding prediction
      name: rbp_eclip
      version: 0.1
    """
    authors = related.SequenceField(Author, repr=True, required=False)
    doc = related.StringField("", required=False)  # free-text description of the model
    name = related.StringField(required=False)  # TODO - deprecate
    version = related.StringField(default="0.1", required=False)
    license = related.StringField(default="MIT", required=False)  # license of the model/dataloader - defaults to MIT
    tags = StrSequenceField(str, default=[], required=False)

    def __attrs_post_init__(self):
        if self.authors and self.doc == "":
            logger.warn("doc empty for the `info:` field")
Exemplo n.º 4
0
class ArraySchema(RelatedConfigMixin):
    """

    Args:
      shape: Tuple of shape (same as in Keras for the input)
      doc: Description of the array
      special_type: str, special type name. Could also be an array of special entries?
      metadata_entries: str or list of metadata
    """
    verbose = True
    shape = TupleIntField()
    doc = related.StringField("", required=False)
    # MAYBE - allow a list of strings?
    #         - could be useful when a single array can have multiple 'attributes'
    name = related.StringField(required=False)
    special_type = related.ChildField(ArraySpecialType, required=False)
    associated_metadata = StrSequenceField(str, default=[], required=False)
    column_labels = StrSequenceField(
        str, default=[], required=False
    )  # either a list or a path to a file --> need to check whether it's a list

    # TODO shall we have
    # - associated_metadata in ArraySchema
    # OR
    # - associated_array in MetadataField?

    # assert that there are no Nones in the shape, assume that channels is the only 4 or it is the last
    # update the model schema shape on calling batch_iter method
    # overwrite the batch_iter method of the returned dataloader --> decorator needed

    def print_msg(self, msg):
        if self.verbose:
            print("ArraySchema mismatch")
            print(msg)

    def _validate_list_column_labels(self):
        dim_ok = len(self.shape) >= 1
        if dim_ok and (self.shape[0] is not None):
            dim_ok &= len(self.column_labels) == self.shape[0]
        if not dim_ok:
            self.print_msg(
                "Column annotation does not match array dimension with shape %s and %d labels (%s ...)"
                % (str(self.shape), len(
                    self.column_labels), str(self.column_labels)[:30]))

    def __attrs_post_init__(self):
        if len(self.column_labels) > 1:
            # check that length is ok with columns
            self._validate_list_column_labels()
        elif len(self.column_labels) == 1:
            label = self.column_labels.list[0]
            import os
            # check if path exists raise exception only test time, but only a warning in prediction time
            if os.path.exists(label):
                with open(label, "r") as ifh:
                    object.__setattr__(self, "column_labels",
                                       [l.rstrip() for l in ifh])
            self._validate_list_column_labels()
        else:
            object.__setattr__(self, "column_labels", None)

    def compatible_with_batch(self, batch, verbose=True):
        """Checks compatibility with a particular batch of data

        Args:
          batch: numpy array
          ignore_batch_axis: if True, the batch axis is not considered
          verbose: print the fail reason
        """
        def print_msg(msg):
            if verbose:
                print("ArraySchema mismatch")
                print(msg)

        # type = np.ndarray
        if not isinstance(batch, np.ndarray):
            print_msg("Expecting a np.ndarray. Got type(batch) = {0}".format(
                type(batch)))
            return False

        if not batch.ndim >= 1:
            print_msg(
                "The array is a scalar (expecting at least the batch dimension)"
            )
            return False

        return self.compatible_with_schema(
            ArraySchema(shape=batch.shape[1:], doc=""))

    def compatible_with_schema(self,
                               schema,
                               name_self="",
                               name_schema="",
                               verbose=True):
        """Checks the compatibility with another schema

        Args:
          schema: Other ArraySchema
          name_self: How to call self in the error messages
          name_schema: analogously to name_self for the schema ArraySchema
          verbose: bool, describe what went wrong through print()
        """
        def print_msg(msg):
            if verbose:
                # print("ArraySchema mismatch")
                print(msg)

        if not isinstance(schema, ArraySchema):
            print_msg(
                "Expecting ArraySchema. Got type({0} schema) = {1}".format(
                    name_schema, type(schema)))
            return False

        def print_msg_template():
            print("ArraySchema mismatch")
            print("Array shapes don't match for the fields:")
            print("--")
            print(name_self)
            print("--")
            print(self.get_config_as_yaml())
            print("--")
            print(name_schema)
            print("--")
            print(schema.get_config_as_yaml())
            print("--")
            print(
                "Provided shape (without the batch axis): {0}, expected shape: {1} "
                .format(bshape, self.shape))

        bshape = schema.shape
        if not len(bshape) == len(self.shape):
            print_msg_template()
            return False
        for i in range(len(bshape)):
            if bshape[i] is not None and self.shape[i] is not None:
                # shapes don't match
                if not bshape[i] == self.shape[i]:
                    print_msg_template()
                    return False
        return True
Exemplo n.º 5
0
class Dependencies(RelatedConfigMixin):
    conda = StrSequenceField(str, default=[], required=False, repr=True)
    pip = StrSequenceField(str, default=[], required=False, repr=True)
    # not really required
    conda_channels = related.SequenceField(str,
                                           default=["defaults"],
                                           required=False,
                                           repr=True)

    def __attrs_post_init__(self):
        """
        In case conda or pip are filenames pointing to existing files,
        read the files and populate the package names
        """
        if len(self.conda) == 1 and self.conda[0].endswith(".txt") and \
           os.path.exists(self.conda[0]):
            # found a conda txt file
            object.__setattr__(self, "conda", read_txt(self.conda[0]))

        if len(self.pip) == 1 and self.pip[0].endswith(".txt") and \
           os.path.exists(self.pip[0]):
            # found a pip txt file
            object.__setattr__(self, "pip", read_txt(self.pip[0]))

    def install_pip(self, dry_run=False):
        print("pip dependencies to be installed:")
        print(self.pip)
        if dry_run:
            return
        else:
            kconda.install_pip(self.pip)

    def install_conda(self, dry_run=False):
        print("Conda dependencies to be installed:")
        print(self.conda)
        if dry_run:
            return
        else:
            channels, packages = self._get_channels_packages()
            kconda.install_conda(packages, channels)

    def install(self, dry_run=False):
        self.install_conda(dry_run)
        self.install_pip(dry_run)

    def merge(self, dependencies):
        """Merge one dependencies with another one

        Use case: merging the dependencies of model and dataloader

        Args:
          dependencies: Dependencies instance

        Returns:
          new Dependencies instance
        """
        return Dependencies(
            conda=unique_list(list(self.conda) + list(dependencies.conda)),
            pip=kconda.normalize_pip(list(self.pip) + list(dependencies.pip)),
            conda_channels=unique_list(
                list(self.conda_channels) + list(dependencies.conda_channels)))

    def normalized(self):
        """Normalize the list of dependencies
        """
        channels, packages = self._get_channels_packages()
        if isinstance(packages, related.types.TypedSequence):
            packages = packages.list
        if isinstance(channels, related.types.TypedSequence):
            channels = channels.list

        return Dependencies(conda=packages,
                            pip=kconda.normalize_pip(list(self.pip)),
                            conda_channels=channels)

    def _get_channels_packages(self):
        """Get conda channels and packages separated from each other (by '::')
        """
        if len(self.conda) == 0:
            return self.conda_channels, self.conda
        channels, packages = list(
            zip(*map(kconda.parse_conda_package, self.conda)))
        channels = unique_list(list(channels) + list(self.conda_channels))
        packages = unique_list(list(packages))
        return channels, packages

    def to_env_dict(self, env_name):
        deps = self.normalized()
        channels, packages = deps._get_channels_packages()
        if isinstance(packages, related.types.TypedSequence):
            packages = packages.list
        if isinstance(channels, related.types.TypedSequence):
            channels = channels.list

        env_dict = OrderedDict(
            name=env_name,
            channels=channels,
            dependencies=packages +
            [OrderedDict(pip=kconda.normalize_pip(deps.pip))])
        return env_dict

    def to_env_file(self, env_name, path):
        """Dump the dependencies to a file
        """
        with open(path, 'w') as f:
            d = self.to_env_dict(env_name)

            # add python if not present
            add_py = True
            for dep in d['dependencies']:
                if isinstance(dep, str) and dep.startswith("python"):
                    add_py = False

            if add_py:
                d['dependencies'] = ["python"] + d['dependencies']
            # -----
            # remove fields that are empty
            out = []
            for k in d:
                if not (isinstance(d[k], list) and len(d[k]) == 0):
                    out.append((k, d[k]))
            # -----

            f.write(
                yaml_ordered_dump(OrderedDict(out),
                                  indent=2,
                                  default_flow_style=False))

    def gpu(self):
        """Get the gpu-version of the dependencies
        """
        def replace_gpu(dep):
            if dep.startswith("tensorflow") and "gpu" not in dep:
                new_dep = dep.replace("tensorflow", "tensorflow-gpu")
                logger.info(
                    "use gpu: Replacing the dependency {0} with {1}".format(
                        dep, new_dep))
                return new_dep
            if dep.startswith("pytorch-cpu"):
                new_dep = dep.replace("pytorch-cpu", "pytorch")
                logger.info(
                    "use gpu: Replacing the dependency {0} with {1}".format(
                        dep, new_dep))
                return new_dep
            return dep

        deps = self.normalized()
        return Dependencies(conda=[replace_gpu(dep) for dep in deps.conda],
                            pip=[replace_gpu(dep) for dep in deps.pip],
                            conda_channels=deps.conda_channels)
Exemplo n.º 6
0
class Dependencies(RelatedConfigMixin):
    conda = StrSequenceField(str, default=[], required=False, repr=True)
    pip = StrSequenceField(str, default=[], required=False, repr=True)
    # not really required
    conda_channels = related.SequenceField(str,
                                           default=["defaults"],
                                           required=False,
                                           repr=True)

    def __attrs_post_init__(self):
        """
        In case conda or pip are filenames pointing to existing files,
        read the files and populate the package names
        """
        if len(self.conda) == 1 and self.conda[0].endswith(".txt") and \
           os.path.exists(self.conda[0]):
            # found a conda txt file
            object.__setattr__(self, "conda", read_txt(self.conda[0]))

        if len(self.pip) == 1 and self.pip[0].endswith(".txt") and \
           os.path.exists(self.pip[0]):
            # found a pip txt file
            object.__setattr__(self, "pip", read_txt(self.pip[0]))

    def all_installed(self, verbose=False):
        """Validate if all the dependencies are installed as requested

        Args:
          verbose: if True, display warnings if the dependencies are not installed

        Returns:
          (bool): True if all the required package versions are installed
            and False otherwise
        """
        norm = self.normalized()
        for pkg in list(norm.conda) + list(norm.pip):
            if not kconda.is_installed(pkg):
                if verbose:
                    pkg_name, req_version = kconda.version_split(pkg)
                    found_version = kconda.get_package_version(pkg_name)
                    if found_version is None:
                        print("Package '{}' is not installed".format(pkg_name))
                    else:
                        print("Installed package '{}={}' doesn't "
                              "comply with '{}'".format(
                                  pkg_name, found_version, pkg))
                return False
        return True

    def install_pip(self, dry_run=False):
        print("pip dependencies to be installed:")
        print(self.pip)
        if dry_run:
            return
        else:
            kconda.install_pip(self.pip)

    def install_conda(self, dry_run=False):
        print("Conda dependencies to be installed:")
        print(self.conda)
        if dry_run:
            return
        else:
            channels, packages = self._get_channels_packages()
            kconda.install_conda(packages, channels)

    def install(self, dry_run=False):
        self.install_conda(dry_run)
        self.install_pip(dry_run)

    def merge(self, dependencies):
        """Merge one dependencies with another one

        Use case: merging the dependencies of model and dataloader

        Args:
            dependencies: Dependencies instance

        Returns:
            new Dependencies instance
        """
        return Dependencies(
            conda=unique_list(list(self.conda) + list(dependencies.conda)),
            pip=kconda.normalize_pip(list(self.pip) + list(dependencies.pip)),
            conda_channels=unique_list(
                list(self.conda_channels) + list(dependencies.conda_channels)))

    def normalized(self):
        """Normalize the list of dependencies
        """
        channels, packages = self._get_channels_packages()
        if isinstance(packages, related.types.TypedSequence):
            packages = packages.list
        if isinstance(channels, related.types.TypedSequence):
            channels = channels.list

        return Dependencies(conda=packages,
                            pip=kconda.normalize_pip(list(self.pip)),
                            conda_channels=channels)

    def _get_channels_packages(self):
        """Get conda channels and packages separated from each other(by '::')
        """
        if len(self.conda) == 0:
            return self.conda_channels, self.conda
        channels, packages = list(
            zip(*map(kconda.parse_conda_package, self.conda)))
        channels = unique_list(list(channels) + list(self.conda_channels))
        packages = unique_list(list(packages))

        # Handle channel order
        if "bioconda" in channels and "conda-forge" not in channels:
            # Insert 'conda-forge' right after bioconda if it is not included
            channels.insert(channels.index("bioconda") + 1, "conda-forge")
        if "pysam" in packages and "bioconda" in channels:
            if channels.index("defaults") < channels.index("bioconda"):
                logger.warn(
                    "Swapping channel order - putting defaults last. " +
                    "Using pysam bioconda instead of anaconda")
                channels.remove("defaults")
                channels.insert(len(channels), "defaults")
        return channels, packages

    def to_env_dict(self, env_name):
        deps = self.normalized()
        channels, packages = deps._get_channels_packages()
        if isinstance(packages, related.types.TypedSequence):
            packages = packages.list
        if isinstance(channels, related.types.TypedSequence):
            channels = channels.list

        env_dict = OrderedDict(
            name=env_name,
            channels=channels,
            dependencies=packages +
            [OrderedDict(pip=kconda.normalize_pip(deps.pip))])
        return env_dict

    @classmethod
    def from_env_dict(self, dict):
        cfg = {}
        cfg["conda_channels"] = dict['channels']
        cfg["conda"] = [
            el for el in dict['dependencies']
            if not isinstance(el, OrderedDict)
        ]
        pip = [
            el for el in dict['dependencies'] if isinstance(el, OrderedDict)
        ]
        if len(pip) == 1:
            cfg["pip"] = pip[0]['pip']
        elif len(pip) > 1:
            raise Exception("Malformatted conda environment yaml!")
        return self.from_config(cfg)

    def to_env_file(self, env_name, path):
        """Dump the dependencies to a file
        """
        with open(path, 'w') as f:
            d = self.to_env_dict(env_name)

            # add python if not present
            add_py = True
            for dep in d['dependencies']:
                if isinstance(dep, str) and dep.startswith("python"):
                    add_py = False

            if add_py:
                d['dependencies'] = ["python"] + d['dependencies']
            # -----
            # remove fields that are empty
            out = []
            for k in d:
                if not (isinstance(d[k], list) and len(d[k]) == 0):
                    out.append((k, d[k]))
            # -----

            f.write(
                yaml_ordered_dump(OrderedDict(out),
                                  indent=2,
                                  default_flow_style=False))

    def gpu(self):
        """Get the gpu - version of the dependencies
        """
        def replace_gpu(dep):
            if dep.startswith("tensorflow") and "gpu" not in dep:
                new_dep = dep.replace("tensorflow", "tensorflow-gpu")
                logger.info(
                    "use gpu: Replacing the dependency {0} with {1}".format(
                        dep, new_dep))
                return new_dep
            if dep.startswith("pytorch-cpu"):
                new_dep = dep.replace("pytorch-cpu", "pytorch")
                logger.info(
                    "use gpu: Replacing the dependency {0} with {1}".format(
                        dep, new_dep))
                return new_dep
            return dep

        deps = self.normalized()
        return Dependencies(conda=[replace_gpu(dep) for dep in deps.conda],
                            pip=[replace_gpu(dep) for dep in deps.pip],
                            conda_channels=deps.conda_channels)

    def osx(self):
        """Get the os - x compatible dependencies
        """
        from sys import platform
        if platform != 'darwin':
            logger.warn(
                "Calling osx dependency conversion on non-osx platform: {}".
                format(platform))

        def replace_osx(dep):
            if dep.startswith("pytorch-cpu"):
                new_dep = dep.replace("pytorch-cpu", "pytorch")
                logger.info(
                    "osx: Replacing the dependency {0} with {1}".format(
                        dep, new_dep))
                return new_dep
            return dep

        deps = self.normalized()
        return Dependencies(conda=[replace_osx(dep) for dep in deps.conda],
                            pip=[replace_osx(dep) for dep in deps.pip],
                            conda_channels=deps.conda_channels)