Example #1
def test_sequential_model_loading():
    m2 = kipoi.get_model("example/models/extended_coda", source='dir')
    m1 = kipoi.get_model("example/models/kipoi_dataloader_decorator",
                         source='dir')

    with cd(m2.source_dir):
        next(m2.default_dataloader.init_example().batch_iter())
    with cd(m1.source_dir):
        next(m1.default_dataloader.init_example().batch_iter())
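Every example on this page relies on kipoi's `cd` helper (imported in some snippets as `from kipoi.utils import cd`) to temporarily switch into a model's source directory so that relative file paths resolve. A minimal sketch of such a context manager, assuming the real `kipoi.utils.cd` behaves equivalently:

import os
from contextlib import contextmanager

@contextmanager
def cd(path):
    # Temporarily switch the working directory; restore it on exit even if an error occurs.
    prev = os.getcwd()
    os.chdir(os.path.expanduser(path))
    try:
        yield
    finally:
        os.chdir(prev)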
Example #2
def test_predict_pipeline():
    model = kipoi.get_model("Basset", source="kipoi")
    dl_kwargs = model.default_dataloader.example_kwargs
    with cd(model.source_dir):
        ret = model.pipeline.predict(dl_kwargs)
    assert isinstance(ret, np.ndarray)
    with cd(model.source_dir):
        ret = model.pipeline.predict(dl_kwargs, layer="11")
    assert isinstance(ret, list)
    # with a model that does not implement LayerActivationMixin it should fail:
    hal_model = kipoi.get_model("HAL", source="kipoi")
    hal_dl_kwargs = hal_model.default_dataloader.example_kwargs
    with pytest.raises(Exception):
        ret = hal_model.pipeline.predict(hal_dl_kwargs, layer="11")
Example #3
def get_example_data(example, layer, writer=None):
    example_dir = "examples/{0}".format(example)
    if INSTALL_REQ:
        install_model_requirements(example_dir, "dir", and_dataloaders=True)

    model = kipoi.get_model(example_dir, source="dir")
    # The preprocessor
    Dataloader = kipoi.get_dataloader_factory(example_dir, source="dir")
    #
    with open(example_dir + "/example_files/test.json", "r") as ifh:
        dataloader_arguments = json.load(ifh)

    for k in dataloader_arguments:
        dataloader_arguments[k] = "example_files/" + dataloader_arguments[k]

    outputs = []
    with cd(model.source_dir):
        dl = Dataloader(**dataloader_arguments)
        it = dl.batch_iter(batch_size=32, num_workers=0)

        # Loop through the data, make predictions, save the output
        for i, batch in enumerate(tqdm(it)):

            # make the prediction
            pred_batch = model.input_grad(batch['inputs'], avg_func="sum", layer=layer,
                                          final_layer=False)
            # write out the predictions, metadata (, inputs, targets)
            # always keep the inputs so that input*grad can be generated!
            output_batch = prepare_batch(batch, pred_batch, keep_inputs=True)
            if writer is not None:
                writer.batch_write(output_batch)
            outputs.append(output_batch)
        if writer is not None:
            writer.close()
    return numpy_collate(outputs)
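`get_example_data` only requires the `writer` argument to expose `batch_write(batch)` and `close()`. A minimal in-memory stand-in (purely illustrative; not a kipoi class) could look like this:

class ListWriter:
    """Illustrative in-memory writer; collects batches instead of writing them to disk."""

    def __init__(self):
        self.batches = []

    def batch_write(self, batch):
        self.batches.append(batch)

    def close(self):
        pass

# Hypothetical call (example name and layer are placeholders):
# grads = get_example_data("rbp", layer=-2, writer=ListWriter())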
Example #4
def test_loading():
    model_path = "example/models/pyt/model_files/"
    model_path_class_model = "example/models/pyt_class/"
    # load model and weights explicitly
    with pytest.raises(Exception):
        m1 = PyTorchModel(weights=model_path + "only_weights.pth")
        m1 = PyTorchModel(module_file=model_path + "pyt.py",
                          weights=model_path + "only_weights.pth")
    with cd(model_path):
        m1 = PyTorchModel(module_obj="pyt.simple_model",
                          weights="only_weights.pth")
    m1 = PyTorchModel(module_file=model_path + "pyt.py",
                      weights=model_path + "only_weights.pth",
                      module_obj="simple_model")
    m1 = PyTorchModel(module_file=THISFILE,
                      weights=PYT_NET_MODEL_WEIGHTS_FILE,
                      module_class="PyTNet")
    m1 = PyTorchModel(module_file=THISFILE,
                      weights=PYT_NET_MODEL_WEIGHTS_FILE,
                      module_class="PyTNet",
                      module_kwargs={})
    m1 = PyTorchModel(module_file=THISFILE,
                      weights=PYT_NET_MODEL_WEIGHTS_FILE,
                      module_class="PyTNet",
                      module_kwargs="{}")
    # test loading class from full yaml definition with module_kwargs:
    mh = kipoi.get_model(model_path_class_model, "dir")
    # Load the test files from model source
    mh.pipeline.predict_example(batch_size=3)
Example #5
def test_predict_to_file(tmpdir):
    h5_tmpfile = str(tmpdir.mkdir("example").join("out.h5"))
    model = kipoi.get_model("Basset", source="kipoi")
    dl_kwargs = model.default_dataloader.example_kwargs
    with cd(model.source_dir):
        model.pipeline.predict_to_file(h5_tmpfile, dl_kwargs)
    preds = kipoi.readers.HDF5Reader.load(h5_tmpfile)
    assert 'preds' in preds
Example #6
def test_gradient_pipeline():
    model = kipoi.get_model("Basset", source="kipoi")
    dl_kwargs = model.default_dataloader.example_kwargs
    with cd(model.source_dir):
        ret = model.pipeline.input_grad(dl_kwargs,
                                        final_layer=True,
                                        avg_func="sum")
    assert all(k in ret for k in ['targets', 'metadata', 'inputs', 'grads'])
Example #7
    def __init__(self, data, model, source="kipoi", grad_preds=None):
        """
        Arguments:
            data: model input data batch
            model: model name as used for running `model.input_grad(...)`
            source: model source as used for running `model.input_grad(...)`
            grad_preds: return value of `model.input_grad(...)`. Can alternatively already be present in the `data`
                argument under the key `grads`; in that case `grad_preds` may be None.
        """
        self.data = data
        if grad_preds is not None:
            self.data['grads'] = grad_preds
        else:
            assert 'grads' in self.data

        # TODO: Instead of copying from kipoi.model should we rather have a get_model_descr
        # TODO-cont: function that is also called from get_model
        # Taken from get_model
        source_name = source
        source = kipoi.config.get_source(source)
        md = source.get_model_descr(model)

        if ":" in md.default_dataloader:
            dl_source, dl_path = md.default_dataloader.split(":")
        else:
            dl_source = source_name
            dl_path = md.default_dataloader

        # allow to use relative and absolute paths for referring to the dataloader
        default_dataloader_path = os.path.join("/" + model, dl_path)[1:]
        # This one loads the model!!
        # default_dataloader = kipoi.get_dataloader_factory(default_dataloader_path,
        #                                                  dl_source)

        # TODO: Is there a nicer way of getting ahold of the dataloader description?
        yaml_path = source.pull_dataloader(default_dataloader_path)
        dataloader_dir = os.path.dirname(yaml_path)
        from kipoi.components import DataLoaderDescription
        with cd(dataloader_dir):
            dl = DataLoaderDescription.load(os.path.basename(yaml_path))
            default_dataloader = dl

        try:
            self.mie = ModelInfoExtractor(md, default_dataloader)
        except:
            logger.warn(
                "Model is not enabled for variant effect prediction hence it is unclear whether there is a DNA "
                "sequence input, so (automatic) seqlogo plots are not available for this model."
            )
            self.mie = None
        self.md = md
        self.dataloader = default_dataloader

        # how can the correct model input be selected
        self.get_dataset, self.model_input_keylist = self._get_ds_extractor(
            md.schema.inputs)
Example #8
def get_dataloader_descr(model_name, source):
    from kipoi.utils import cd
    src = kipoi.get_source(source)
    md = kipoi.get_model_descr(model_name, source=source)
    if isinstance(md.default_dataloader, str):
        dl_path = os.path.join(model_name, md.default_dataloader)
        return kipoi.get_dataloader_descr(dl_path, source=source)
    else:
        with cd(src.get_model_dir(model_name)):
            return md.default_dataloader.get()
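A hedged usage sketch, assuming a model such as "Basset" from the public kipoi source whose `default_dataloader` is declared as a relative path:

# Resolve the dataloader description attached to a published model.
dl_descr = get_dataloader_descr("Basset", source="kipoi")
print(list(dl_descr.args))  # names of the dataloader arguments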
Example #9
def test_mutation_map():
    if sys.version_info[0] == 2:
        pytest.skip("rbp example not supported on python 2 ")

    # Take the rbp model
    model_dir = "examples/rbp/"
    if INSTALL_REQ:
        install_model_requirements(model_dir, "dir", and_dataloaders=True)

    model = kipoi.get_model(model_dir, source="dir")
    # The preprocessor
    Dataloader = kipoi.get_dataloader_factory(model_dir, source="dir")
    #
    dataloader_arguments = {
        "fasta_file": "example_files/hg38_chr22.fa",
        "preproc_transformer": "dataloader_files/encodeSplines.pkl",
        "gtf_file": "example_files/gencode_v25_chr22.gtf.pkl.gz",
    }
    dataloader_arguments = {
        k: model_dir + v
        for k, v in dataloader_arguments.items()
    }
    #
    # Run the actual predictions
    vcf_path = model_dir + "example_files/first_variant.vcf"
    #
    model_info = kipoi.postprocessing.variant_effects.ModelInfoExtractor(
        model, Dataloader)
    vcf_to_region = kipoi.postprocessing.variant_effects.SnvCenteredRg(
        model_info)
    mdmm = mm._generate_mutation_map(
        model,
        Dataloader,
        vcf_path,
        dataloader_args=dataloader_arguments,
        evaluation_function=analyse_model_preds,
        batch_size=32,
        vcf_to_region=vcf_to_region,
        evaluation_function_kwargs={'diff_types': {
            'diff': Diff("mean")
        }})
    with cd(model.source_dir):
        mdmm.save_to_file("example_files/first_variant_mm_totest.hdf5")
        from kipoi.postprocessing.variant_effects.utils.generic import read_hdf5
        reference = read_hdf5("example_files/first_variant_mm.hdf5")
        obs = read_hdf5("example_files/first_variant_mm.hdf5")
        compare_rec(reference[0], obs[0])
        import matplotlib
        matplotlib.pyplot.switch_backend('agg')
        mdmm.plot_mutmap(0, "seq", "diff", "rbp_prb")
        os.unlink("example_files/first_variant_mm_totest.hdf5")
Example #10
def test_var_eff_pred_varseq():
    if sys.version_info[0] == 2:
        pytest.skip("rbp example not supported on python 2 ")
    model_dir = "examples/var_seqlen_model/"
    if INSTALL_REQ:
        install_model_requirements(model_dir, "dir", and_dataloaders=True)
    #
    model = kipoi.get_model(model_dir, source="dir")
    # The preprocessor
    Dataloader = kipoi.get_dataloader_factory(model_dir, source="dir")
    #
    dataloader_arguments = {
        "fasta_file": "example_files/hg38_chr22.fa",
        "preproc_transformer": "dataloader_files/encodeSplines.pkl",
        "gtf_file": "example_files/gencode_v25_chr22.gtf.pkl.gz",
        "intervals_file": "example_files/variant_centered_intervals.tsv"
    }
    vcf_path = "example_files/variants.vcf"
    out_vcf_fpath = "example_files/variants_generated.vcf"
    ref_out_vcf_fpath = "example_files/variants_ref_out.vcf"
    #
    with cd(model.source_dir):
        vcf_path = kipoi.postprocessing.variant_effects.ensure_tabixed_vcf(
            vcf_path)
        model_info = kipoi.postprocessing.variant_effects.ModelInfoExtractor(
            model, Dataloader)
        writer = kipoi.postprocessing.variant_effects.VcfWriter(
            model, vcf_path, out_vcf_fpath)
        vcf_to_region = None
        with pytest.raises(Exception):
            # This has to raise an exception as the sequence length is None.
            vcf_to_region = kipoi.postprocessing.variant_effects.SnvCenteredRg(
                model_info)
        res = sp.predict_snvs(
            model,
            Dataloader,
            vcf_path,
            dataloader_args=dataloader_arguments,
            evaluation_function=analyse_model_preds,
            batch_size=32,
            vcf_to_region=vcf_to_region,
            evaluation_function_kwargs={'diff_types': {
                'diff': Diff("mean")
            }},
            sync_pred_writer=writer)
        writer.close()
        # pass
        # assert filecmp.cmp(out_vcf_fpath, ref_out_vcf_fpath)
        compare_vcfs(out_vcf_fpath, ref_out_vcf_fpath)
        os.unlink(out_vcf_fpath)
Example #11
def get_dataloader_factory(dataloader):

    # pull the dataloader & get the dataloader directory
    yaml_path = './model/dataloader.yaml'
    dataloader_dir = './model/'

    # --------------------------------------------
    # Setup dataloader description
    with cd(dataloader_dir):  # move to the dataloader directory temporarily
        dl = DataLoaderDescription.load(os.path.basename(yaml_path))
        file_path, obj_name = tuple(dl.defined_as.split("::"))
        CustomDataLoader = getattr(load_module(file_path), obj_name)

    # check that dl.type is correct
    if dl.type not in AVAILABLE_DATALOADERS:
        raise ValueError(
            "dataloader type: {0} is not in supported dataloaders:{1}".format(
                dl.type, list(AVAILABLE_DATALOADERS.keys())))
    # check that the extractor arguments match yaml arguments
    if not getargs(CustomDataLoader) == set(dl.args.keys()):
        raise ValueError("DataLoader arguments: \n{0}\n don't match ".format(
            set(getargs(CustomDataLoader))) +
                         "the specification in the dataloader.yaml file:\n{0}".
                         format(set(dl.args.keys())))
    # check that CustomDataLoader indeed inherits from the right DataLoader
    if dl.type in DATALOADERS_AS_FUNCTIONS:
        # transform the functions into objects
        assert isinstance(CustomDataLoader, types.FunctionType)
        CustomDataLoader = AVAILABLE_DATALOADERS[dl.type].from_fn(
            CustomDataLoader)
    else:
        if not issubclass(CustomDataLoader, AVAILABLE_DATALOADERS[dl.type]):
            raise ValueError(
                "DataLoader does't inherit from the specified dataloader: {0}".
                format(AVAILABLE_DATALOADERS[dl.type].__name__))

    # Inherit the attributes from dl
    CustomDataLoader.type = dl.type
    CustomDataLoader.defined_as = dl.defined_as
    CustomDataLoader.args = dl.args
    CustomDataLoader.info = dl.info
    CustomDataLoader.output_schema = dl.output_schema
    CustomDataLoader.dependencies = dl.dependencies
    CustomDataLoader.postprocessing = dl.postprocessing
    CustomDataLoader._yaml_path = yaml_path
    CustomDataLoader.source_dir = dataloader_dir
    #CustomDataLoader.print_args = classmethod(print_dl_kwargs)

    return CustomDataLoader
Example #12
def test_var_eff_pred2():
    if sys.version_info[0] == 2:
        pytest.skip("rbp example not supported on python 2 ")
    # Take the rbp model
    model_dir = "examples/rbp/"
    if INSTALL_REQ:
        install_model_requirements(model_dir, "dir", and_dataloaders=True)
    #
    model = kipoi.get_model(model_dir, source="dir")
    # The preprocessor
    Dataloader = kipoi.get_dataloader_factory(model_dir, source="dir")
    #
    dataloader_arguments = {
        "fasta_file": "example_files/hg38_chr22.fa",
        "preproc_transformer": "dataloader_files/encodeSplines.pkl",
        "gtf_file": "example_files/gencode_v25_chr22.gtf.pkl.gz",
    }
    #
    # Run the actual predictions
    vcf_path = "example_files/variants.vcf"
    out_vcf_fpath = "example_files/variants_generated2.vcf"
    ref_out_vcf_fpath = "example_files/variants_ref_out2.vcf"
    restricted_regions_fpath = "example_files/restricted_regions.bed"
    #
    with cd(model.source_dir):
        pbd = pb.BedTool(restricted_regions_fpath)
        model_info = kipoi.postprocessing.variant_effects.ModelInfoExtractor(
            model, Dataloader)
        vcf_to_region = kipoi.postprocessing.variant_effects.SnvPosRestrictedRg(
            model_info, pbd)
        writer = kipoi.postprocessing.variant_effects.utils.io.VcfWriter(
            model, vcf_path, out_vcf_fpath)
        res = sp.predict_snvs(
            model,
            Dataloader,
            vcf_path,
            dataloader_args=dataloader_arguments,
            evaluation_function=analyse_model_preds,
            batch_size=32,
            vcf_to_region=vcf_to_region,
            evaluation_function_kwargs={'diff_types': {
                'diff': Diff("mean")
            }},
            sync_pred_writer=writer)
        writer.close()
        # pass
        #assert filecmp.cmp(out_vcf_fpath, ref_out_vcf_fpath)
        compare_vcfs(out_vcf_fpath, ref_out_vcf_fpath)
        os.unlink(out_vcf_fpath)
Example #13
def test_gradient_function_model(example):
    """Test extractor
    """
    if example == "rbp" and sys.version_info[0] == 2:
        pytest.skip("rbp example not supported on python 2 ")

    import keras
    backend = keras.backend._BACKEND
    if backend == 'theano' and example == "rbp":
        pytest.skip("extended_coda example not with theano ")
    #
    example_dir = "examples/{0}".format(example)
    # install the dependencies
    # - TODO maybe put it implicitly in load_dataloader?
    if INSTALL_REQ:
        install_model_requirements(example_dir, "dir", and_dataloaders=True)
    #
    Dl = kipoi.get_dataloader_factory(example_dir, source="dir")
    #
    test_kwargs = get_test_kwargs(example_dir)
    #
    # install the dependencies
    # - TODO maybe put it implicitly in load_extractor?
    if INSTALL_REQ:
        install_model_requirements(example_dir, source="dir")
    #
    # get model
    model = kipoi.get_model(example_dir, source="dir")
    #
    with cd(example_dir + "/example_files"):
        # initialize the dataloader
        dataloader = Dl(**test_kwargs)
        #
        # sample a batch of data
        it = dataloader.batch_iter()
        batch = next(it)
        # predict with a model
        model.predict_on_batch(batch["inputs"])
        if backend != 'theano':
            model.input_grad(batch["inputs"],
                             Slice_conv()[:, 0],
                             pre_nonlinearity=True)
        model.input_grad(batch["inputs"],
                         Slice_conv()[:, 0],
                         pre_nonlinearity=False)
        model.input_grad(batch["inputs"], 0,
                         pre_nonlinearity=False)  # same as Slice_conv()[:, 0]
        model.input_grad(batch["inputs"], avg_func="sum")
Example #14
def get_dataloader_descr(model_name, source='kipoi'):
    """Not yet nicely integrated with Kipoi

    Args:
      model_name: model name as a string
      source: model source name (defaults to 'kipoi')

    Returns:
      (model output schema, list of required files)
    """
    dl_skip_arguments = {
        "kipoiseq.dataloaders.SeqIntervalDl":
        ['alphabet_axis', 'dummy_axis', 'alphabet', 'dtype']
    }
    md = kipoi.get_model_descr(model_name)
    src = kipoi.get_source(source)

    # get dataloader
    if isinstance(md.default_dataloader, str):
        dataloader = kipoi.get_dataloader_descr(os.path.join(
            model_name, md.default_dataloader),
                                                source=source)
        dataloader_name = md.default_dataloader
        dataloader_args = dataloader.args
    else:
        with cd(src.get_model_dir(model_name)):
            dataloader = md.default_dataloader.get()
        dataloader_name = md.default_dataloader.defined_as
        dataloader_args = OrderedDict([
            (k, v) for k, v in dataloader.args.items()
            if k not in list(md.default_dataloader.default_args) +
            dl_skip_arguments.get(dataloader_name, [])
        ])

        if md.default_dataloader.defined_as == 'kipoiseq.dataloaders.SeqIntervalDl':
            # HACK - cleanup some values for SeqIntervalDl
            if md.default_dataloader.default_args.get("ignore_targets", False):
                dataloader_args.pop('label_dtype', None)

    required_files = []
    if 'fasta_file' in dataloader.args:
        required_files.append("fasta_file")
    if 'gtf_file' in dataloader.args:
        required_files.append("gtf_file")

    return get_output_schema(md.schema.targets), required_files
Example #15
def test_activation_function_model(example):
    """Test extractor
    """
    if example == "rbp" and sys.version_info[0] == 2:
        pytest.skip("rbp example not supported on python 2 ")
    #
    import keras
    backend = keras.backend._BACKEND
    if backend == 'theano' and example == "rbp":
        pytest.skip("extended_coda example not with theano ")
    #
    example_dir = "examples/{0}".format(example)
    # install the dependencies
    # - TODO maybe put it implicitly in load_dataloader?
    if INSTALL_REQ:
        install_model_requirements(example_dir, "dir", and_dataloaders=True)
    #
    Dl = kipoi.get_dataloader_factory(example_dir, source="dir")
    #
    test_kwargs = get_test_kwargs(example_dir)
    #
    # install the dependencies
    # - TODO maybe put it implicitly in load_extractor?
    if INSTALL_REQ:
        install_model_requirements(example_dir, source="dir")
    #
    # get model
    model = kipoi.get_model(example_dir, source="dir")
    #
    with cd(example_dir + "/example_files"):
        # initialize the dataloader
        dataloader = Dl(**test_kwargs)
        #
        # sample a batch of data
        it = dataloader.batch_iter()
        batch = next(it)
        # predict with a model
        model.predict_on_batch(batch["inputs"])
        model.predict_activation_on_batch(batch["inputs"],
                                          layer=len(model.model.layers) - 2)
        if example == "rbp":
            model.predict_activation_on_batch(batch["inputs"],
                                              layer="flatten_6")
Example #16
def test_loading_old(tmpdir):
    import torch
    # load model in different ways...
    with pytest.raises(Exception):
        OldPyTorchModel()
    OldPyTorchModel(build_fn=lambda: get_simple_model())
    model_path = "example/models/pyt/model_files/"
    # load model and weights explicitly
    m1 = OldPyTorchModel(file=model_path + "pyt.py",
                         weights=model_path + "only_weights.pth",
                         build_fn="get_model")
    # load model and weights through model loader
    with cd("example/models/pyt"):
        m2 = OldPyTorchModel(file="model_files/pyt.py",
                             build_fn="get_model_w_weights")
    # assert that's identical
    check_same_weights(m1.model.state_dict(), m2.model.state_dict())
    # now test whether loading a full model works
    tmpfile = str(tmpdir.mkdir("pytorch").join("full_model.pth"))
    m = get_simple_model()
    torch.save(m, tmpfile)
    km = OldPyTorchModel(weights=tmpfile)
    check_same_weights(m.state_dict(), km.model.state_dict())
Example #17
def modified_files(git_range, source_folder, relative=True):
    """
    Returns files under the models dir that have been modified within the git
    range. Filenames are returned with `source_folder` prepended unless `relative=True`.

    Args:
      git_range : list or tuple of length 1 or 2
          For example, ['00232ffe', '10fab113'], or commonly ['master', 'HEAD']
          or ['master']. If length 2, then the commits are provided to `git diff`
          using the triple-dot syntax, `commit1...commit2`. If length 1, the
          comparison is any changes in the working tree relative to the commit.
      source_folder : str
          Root of the model source/git repo
      relative: if True (default), return paths relative to `source_folder`
    """
    assert isinstance(git_range, list)
    cmds = ['diff', '--name-only'] + git_range

    with cd(source_folder):
        code, lines = _call_command("git", cmds, use_stdout=True,
                                    return_logs_with_stdout=True)

    assert code == 0
    modified = [os.path.join(source_folder, line)
                for line in lines]

    # exclude files that were deleted in the git-range
    existing = list(filter(os.path.exists, modified))

    # if the only diff is that files were deleted, we can have ['model/'], so
    # filter on existing *files*
    existing = list(filter(os.path.isfile, existing))
    if relative:
        return [os.path.relpath(f, source_folder)
                for f in existing]
    else:
        return existing
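A hedged usage sketch (the repository path is hypothetical):

# List files changed between master and HEAD in a local clone of a model source.
changed = modified_files(["master", "HEAD"], "/path/to/kipoi-models", relative=True)
for path in changed:
    print(path)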
Example #18
def merge_deps(models, dataloaders=None, source="kipoi", vep=False, gpu=False):
    """Setup the dependencies
    """
    deps = Dependencies()
    for model in models:
        logger.info("Loading model: {0} description".format(model))

        parsed_source, parsed_model = parse_source_name(source, model)

        sub_models = list_subcomponents(parsed_model, parsed_source, "model")
        if len(sub_models) == 0:
            raise ValueError("Model {0} not found in source {1}".format(
                parsed_model, parsed_source))
        if len(sub_models) > 1:
            logger.info(
                "Found {0} models under the model name: {1}. Merging dependencies for all"
                .format(len(sub_models), parsed_model))

        for sub_model in sub_models:
            model_descr = kipoi.get_model_descr(sub_model, parsed_source)
            model_dir = kipoi.get_source(parsed_source).get_model_dir(
                sub_model)
            deps = deps.merge(model_descr.dependencies)

            # handle the dataloader=None case
            if dataloaders is None or not dataloaders:
                if isinstance(model_descr.default_dataloader,
                              DataLoaderImport):
                    # dataloader specified by the import
                    deps = deps.merge(
                        model_descr.default_dataloader.dependencies)
                    if model_descr.default_dataloader.parse_dependencies:
                        # add dependencies specified in the yaml file
                        # load from the dataloader description if you can
                        try:
                            with cd(model_dir):
                                dataloader_descr = model_descr.default_dataloader.get(
                                )
                            deps = deps.merge(dataloader_descr.dependencies)
                        except ImportError as e:
                            # package providing the dataloader is not installed yet
                            if model_descr.default_dataloader.defined_as.startswith(
                                    "kipoiseq."):
                                logger.info(
                                    "kipoiseq not installed. Using default kipoiseq dependencies for the dataloader: {}"
                                    .format(model_descr.default_dataloader.
                                            defined_as))
                                deps = deps.merge(KIPOISEQ_DEPS)
                            else:
                                logger.warn(
                                    "Unable to extract dataloader description. "
                                    "Make sure the package containing the dataloader `{}` is installed"
                                    .format(model_descr.default_dataloader.
                                            defined_as))
                else:
                    dataloader = os.path.normpath(
                        os.path.join(sub_model,
                                     str(model_descr.default_dataloader)))
                    logger.info("Inferred dataloader name: {0} from".format(
                        dataloader) + " the model.")
                    dataloader_descr = kipoi.get_dataloader_descr(
                        dataloader, parsed_source)
                    deps = deps.merge(dataloader_descr.dependencies)
    if dataloaders is not None or dataloaders:
        for dataloader in dataloaders:
            parsed_source, parsed_dataloader = parse_source_name(
                source, dataloader)
            sub_dataloaders = list_subcomponents(parsed_dataloader,
                                                 parsed_source, "dataloader")
            if len(sub_dataloaders) == 0:
                raise ValueError(
                    "Dataloader: {0} not found in source {1}".format(
                        parsed_dataloader, parsed_source))

            if len(sub_dataloaders) > 1:
                logger.info(
                    "Found {0} dataloaders under the dataloader name: {1}. Merging dependencies for all"
                    .format(len(sub_dataloaders), parsed_dataloader))
            for sub_dataloader in sub_dataloaders:
                dataloader_descr = kipoi.get_dataloader_descr(
                    sub_dataloader, parsed_source)
                deps = deps.merge(dataloader_descr.dependencies)

    # add Kipoi to the dependencies
    deps = KIPOI_DEPS.merge(deps)

    if vep:
        # add vep dependencies
        logger.info("Adding the vep dependencies")
        deps = VEP_DEPS.merge(deps)

    if gpu:
        logger.info("Using gpu-compatible dependencies")
        deps = deps.gpu()

    if platform == "darwin":
        logger.info("Using osx-type dependencies")
        deps = deps.osx()

    return deps
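A minimal usage sketch, assuming the "Basset" model from the public kipoi source; the returned `Dependencies` object can be merged further or exported (the export API is not shown here):

# Collect the merged dependencies for a single model, letting the model pick its own dataloader.
deps = merge_deps(["Basset"], dataloaders=None, source="kipoi", vep=False, gpu=False)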
Example #19
def model_list(model_name):
    """ Models list view """
    from kipoi.utils import cd
    source = current_app.config['SOURCE']
    df = get_model_list(source)
    model_name = model_name.rstrip('/')
    vtype_path = get_view(model_name, df)

    if vtype_path is None:
        # run 404
        return
        # pass
    else:
        vtype, path = vtype_path

    # render the model detail view
    if vtype == "model":
        # Model info retrieved from kipoi
        model = kipoi.get_model_descr(model_name, source=source)
        src = kipoi.get_source(source)
        model_dir = kipoi.utils.relative_path(src.get_model_dir(model_name),
                                              src.local_path)
        model_url = github_dir_tree(src.remote_url, model_dir)
        # Model dataloaders info retrieved from kipoi
        if model.default_dataloader:
            if isinstance(model.default_dataloader, str):
                dl_rel_path = True
                dataloader = kipoi.get_dataloader_descr(os.path.join(
                    model_name, model.default_dataloader),
                                                        source=source)
                dataloader_name = model.default_dataloader
                dataloader_args = dataloader.args
            else:
                dl_rel_path = False
                with cd(src.get_model_dir(model_name)):
                    dataloader = model.default_dataloader.get()
                dataloader_name = model.default_dataloader.defined_as
                dataloader_args = OrderedDict([
                    (k, v) for k, v in dataloader.args.items()
                    if k not in list(model.default_dataloader.default_args) +
                    dl_skip_arguments.get(dataloader_name, [])
                ])

                if model.default_dataloader.defined_as == 'kipoiseq.dataloaders.SeqIntervalDl':
                    # HACK - cleanup some values for SeqIntervalDl
                    if model.default_dataloader.default_args.get(
                            "ignore_targets", False):
                        dataloader_args.pop('label_dtype', None)
        else:
            dataloader = None
            dataloader_name = ''
            dataloader_args = {}
            dl_rel_path = False

        title = model_name.split('/')
        # obtain snippets
        code_snippets = get_snippets(model_name, source)
        if model_name == "SeqVec/embedding2structure":
            code_snippets["docker"] = ''
            code_snippets["singularity"] = ''
            code_snippets["cli"] = ''
            code_snippets["python"] = ''
            code_snippets["R"] = ''

        # reading the README content
        readme_dir = kipoi.get_source(
            current_app.config['SOURCE']).get_model_dir(model_name)
        try:
            # the README filename casing varies, so match it case-insensitively:
            filelists = os.listdir(readme_dir)
            readmeindx = [x.lower() for x in filelists].index("readme.md")
            filecontent = open(os.path.join(readme_dir, filelists[readmeindx]),
                               "r").read()
            readmecontent = render_markdown(filecontent)
            # remove the title because already there is a title
            readmecontent = re.sub("<[hH][12]>.*</[hH][12]>",
                                   "",
                                   readmecontent,
                                   count=1)
            readmecontent = Markup(readmecontent)
        except IOError:
            readmecontent = ""
        except ValueError:
            readmecontent = ""
        return render_template(
            "models/model_details.html",
            model_name=model_name,
            model=model,
            contributors=update_contributors(model.info.contributors,
                                             model.info.authors),
            authors=update_authors(model.info.authors, model.info.cite_as),
            dataloader=dataloader,
            dataloader_args=dataloader_args,
            dataloader_name=dataloader_name,
            model_url=model_url,
            dl_rel_path=dl_rel_path,
            cite_as=update_cite_as(model.info.cite_as),
            title=title,
            code_snippets=code_snippets,
            readmecontent=readmecontent,
            model_postprocessing=available_postprocessing(model_name))

    # run the normal model list view on a subsetted table
    elif vtype == "model_list":
        model_df = get_model_list(source)

        # TODO - augment the results

        # Filter the results
        model_df = model_df[model_df.model.str.contains("^" + path + "/")]

        filtered_models = model_df.to_dict(orient='records')
        filtered_models = [update_cite_as_dict(x) for x in filtered_models]

        # update contributors
        filtered_models = [
            update_contributors_as_dict(x) for x in filtered_models
        ]

        # update authors
        filtered_models = [update_authors_as_dict(x) for x in filtered_models]

        # get readme file
        readme_dir = os.path.join(
            kipoi.get_source(current_app.config['SOURCE']).local_path,
            model_name)
        try:
            filelists = os.listdir(readme_dir)
            readmeindx = [x.lower() for x in filelists].index("readme.md")
            filecontent = open(os.path.join(readme_dir, filelists[readmeindx]),
                               "r").read()
            readmecontent = render_markdown(filecontent)
        except IOError:
            readmecontent = ""
        except ValueError:
            readmecontent = ""

        return render_template("models/index.html",
                               models=filtered_models,
                               readmecontent=readmecontent)

    # redirect to the group list
    elif vtype == "group_list":
        return redirect(url_for('models.list_groups', group_name=path))
Example #20
File: env.py Project: dlhuang/kipoi
def merge_deps(models,
               dataloaders=None,
               source="kipoi",
               vep=False,
               interpret=False,
               gpu=False):
    """Setup the dependencies
    """

    special_envs, only_models = split_models_special_envs(models)
    deps = Dependencies()

    # Treat the handcrafted environments differently
    for special_env in special_envs:
        from related import from_yaml
        logger.info("Loading environment definition: {0}".format(special_env))

        # Load and merge the handcrafted deps.
        yaml_path = os.path.join(
            kipoi.get_source(source).local_path, special_env + ".yaml")

        if not os.path.exists(yaml_path):
            raise ValueError(
                "Environment definition file {0} not found in source {1}".
                format(yaml_path, source))

        with open(yaml_path, "r", encoding="utf-8") as fh:
            special_env_deps = Dependencies.from_env_dict(from_yaml(fh))
        deps = deps.merge(special_env_deps)

    for model in only_models:
        logger.info("Loading model: {0} description".format(model))

        parsed_source, parsed_model = parse_source_name(source, model)

        sub_models = list_subcomponents(parsed_model, parsed_source, "model")
        if len(sub_models) == 0:
            raise ValueError("Model {0} not found in source {1}".format(
                parsed_model, parsed_source))
        if len(sub_models) > 1:
            logger.info(
                "Found {0} models under the model name: {1}. Merging dependencies for all"
                .format(len(sub_models), parsed_model))

        for sub_model in sub_models:
            model_descr = kipoi.get_model_descr(sub_model, parsed_source)
            model_dir = kipoi.get_source(parsed_source).get_model_dir(
                sub_model)
            deps = deps.merge(model_descr.dependencies)

            # handle the dataloader=None case
            if dataloaders is None or not dataloaders:
                if isinstance(model_descr.default_dataloader,
                              DataLoaderImport):
                    # dataloader specified by the import
                    deps = deps.merge(
                        model_descr.default_dataloader.dependencies)
                    if model_descr.default_dataloader.parse_dependencies:
                        # add dependencies specified in the yaml file
                        # load from the dataloader description if you can
                        try:
                            with cd(model_dir):
                                dataloader_descr = model_descr.default_dataloader.get(
                                )
                            deps = deps.merge(dataloader_descr.dependencies)
                        except ImportError as e:
                            # package providing the dataloader is not installed yet
                            if model_descr.default_dataloader.defined_as.startswith(
                                    "kipoiseq."):
                                logger.info(
                                    "kipoiseq not installed. Using default kipoiseq dependencies for the dataloader: {}"
                                    .format(model_descr.default_dataloader.
                                            defined_as))
                                deps = deps.merge(KIPOISEQ_DEPS)
                            else:
                                logger.warn(
                                    "Unable to extract dataloader description. "
                                    "Make sure the package containing the dataloader `{}` is installed"
                                    .format(model_descr.default_dataloader.
                                            defined_as))
                else:
                    dataloader = os.path.normpath(
                        os.path.join(sub_model,
                                     str(model_descr.default_dataloader)))
                    logger.info("Inferred dataloader name: {0} from".format(
                        dataloader) + " the model.")
                    dataloader_descr = kipoi.get_dataloader_descr(
                        dataloader, parsed_source)
                    deps = deps.merge(dataloader_descr.dependencies)
    if dataloaders is not None or dataloaders:
        for dataloader in dataloaders:
            parsed_source, parsed_dataloader = parse_source_name(
                source, dataloader)
            sub_dataloaders = list_subcomponents(parsed_dataloader,
                                                 parsed_source, "dataloader")
            if len(sub_dataloaders) == 0:
                raise ValueError(
                    "Dataloader: {0} not found in source {1}".format(
                        parsed_dataloader, parsed_source))

            if len(sub_dataloaders) > 1:
                logger.info(
                    "Found {0} dataloaders under the dataloader name: {1}. Merging dependencies for all"
                    .format(len(sub_dataloaders), parsed_dataloader))
            for sub_dataloader in sub_dataloaders:
                dataloader_descr = kipoi.get_dataloader_descr(
                    sub_dataloader, parsed_source)
                deps = deps.merge(dataloader_descr.dependencies)

    # add Kipoi to the dependencies
    deps = KIPOI_DEPS.merge(deps)

    if vep:
        # add vep dependencies
        logger.info("Adding the vep dependencies")
        deps = VEP_DEPS.merge(deps)

    if interpret:
        # add vep dependencies
        logger.info("Adding the interpret dependencies")
        deps = INTERPRET_DEPS.merge(deps)

    if gpu:
        logger.info("Using gpu-compatible dependencies")
        deps = deps.gpu()

    if platform == "darwin":
        logger.info("Using osx-type dependencies")
        deps = deps.osx()

    return deps
Example #21
def cli_create_mutation_map(command, raw_args):
    """CLI interface to calculate mutation map data 
    """
    assert command == "create_mutation_map"
    parser = argparse.ArgumentParser(
        'kipoi postproc {}'.format(command),
        description='Predict effect of SNVs using ISM.')
    add_model(parser)
    add_dataloader(parser, with_args=True)
    parser.add_argument(
        '-r',
        '--regions_file',
        help='Region definition as VCF or bed file. Not a required input.')
    # TODO - rename path to fpath
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='Batch size to use in prediction')
    parser.add_argument(
        "-n",
        "--num_workers",
        type=int,
        default=0,
        help="Number of parallel workers for loading the dataset")
    parser.add_argument("-i",
                        "--install_req",
                        action='store_true',
                        help="Install required packages from requirements.txt")
    parser.add_argument(
        '-o',
        '--output',
        required=True,
        help="Output HDF5 file. To be used as input for plotting.")
    parser.add_argument(
        '-s',
        "--scores",
        default="diff",
        nargs="+",
        help=
        "Scoring method to be used. Only scoring methods selected in the model yaml file are"
        "available except for `diff` which is always available. Select scoring function by the"
        "`name` tag defined in the model yaml file.")
    parser.add_argument(
        '-k',
        "--score_kwargs",
        default="",
        nargs="+",
        help=
        "JSON definition of the kwargs for the scoring functions selected in --scores. The "
        "definiton can either be in JSON in the command line or the path of a .json file. The "
        "individual JSONs are expected to be supplied in the same order as the labels defined in "
        "--scores. If the defaults or no arguments should be used define '{}' for that respective "
        "scoring method.")
    parser.add_argument(
        '-l',
        "--seq_length",
        type=int,
        default=None,
        help=
        "Optional parameter: Model input sequence length - necessary if the model does not have a "
        "pre-defined input sequence length.")

    args = parser.parse_args(raw_args)

    # extract args for kipoi.variant_effects.predict_snvs

    dataloader_arguments = parse_json_file_str(args.dataloader_args)

    if args.output is None:
        raise Exception("Output file `--output` has to be set!")

    # --------------------------------------------
    # install args
    if args.install_req:
        kipoi.pipeline.install_model_requirements(args.model,
                                                  args.source,
                                                  and_dataloaders=True)
    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)

    regions_file = os.path.realpath(args.regions_file)
    output = os.path.realpath(args.output)
    with cd(model.source_dir):
        if not os.path.exists(regions_file):
            raise Exception("Regions inputs file does not exist: %s" %
                            args.regions_file)

        # Check that all the folders exist
        file_exists(regions_file, logger)
        dir_exists(os.path.dirname(output), logger)

        if args.dataloader is not None:
            Dl = kipoi.get_dataloader_factory(args.dataloader,
                                              args.dataloader_source)
        else:
            Dl = model.default_dataloader

    if not isinstance(args.scores, list):
        args.scores = [args.scores]

    dts = get_scoring_fns(model, args.scores, args.score_kwargs)

    # Load effect prediction related model info
    model_info = kipoi.postprocessing.variant_effects.ModelInfoExtractor(
        model, Dl)
    manual_seq_len = args.seq_length

    # Select the appropriate region generator and vcf or bed file input
    args.file_format = regions_file.split(".")[-1]
    bed_region_file = None
    vcf_region_file = None
    bed_to_region = None
    vcf_to_region = None
    if args.file_format == "vcf" or regions_file.endswith("vcf.gz"):
        vcf_region_file = regions_file
        if model_info.requires_region_definition:
            # Select the SNV-centered region generator
            vcf_to_region = kipoi.postprocessing.variant_effects.SnvCenteredRg(
                model_info, seq_length=manual_seq_len)
            logger.info('Using variant-centered sequence generation.')
    elif args.file_format == "bed":
        if model_info.requires_region_definition:
            # Select the SNV-centered region generator
            bed_to_region = kipoi.postprocessing.variant_effects.BedOverlappingRg(
                model_info, seq_length=manual_seq_len)
            logger.info('Using bed-file based sequence generation.')
        bed_region_file = regions_file
    else:
        raise Exception("")

    if model_info.use_seq_only_rc:
        logger.info(
            'Model SUPPORTS simple reverse complementation of input DNA sequences.'
        )
    else:
        logger.info(
            'Model DOES NOT support simple reverse complementation of input DNA sequences.'
        )

    from kipoi.postprocessing.variant_effects.mutation_map import _generate_mutation_map
    mdmm = _generate_mutation_map(
        model,
        Dl,
        vcf_fpath=vcf_region_file,
        bed_fpath=bed_region_file,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        dataloader_args=dataloader_arguments,
        vcf_to_region=vcf_to_region,
        bed_to_region=bed_to_region,
        evaluation_function_kwargs={'diff_types': dts},
    )
    mdmm.save_to_file(output)

    logger.info('Successfully generated mutation map data')