Example #1
def load_modisco_results(modisco_dir):
    """Load modisco_result - return

    Args:
      modisco_dir: directory path `output_dir` in `basepair.cli.modisco.modisco_run`
        contains: modisco.h5, strand_distances.h5, kwargs.json

    Returns:
      Tuple `(mr, tasks, grad_type)` where `mr` is a TfModiscoResults object
        containing the original track_set
    """
    import modisco
    from modisco.tfmodisco_workflow import workflow
    modisco_kwargs = read_json(f"{modisco_dir}/kwargs.json")
    grad_type = modisco_kwargs['grad_type']

    # load used strand distance filter
    included_samples = HDF5Reader.load(
        f"{modisco_dir}/strand_distances.h5")['included_samples']

    # load importance scores
    d = HDF5Reader.load(modisco_kwargs['imp_scores'])
    if 'hyp_imp' not in d:
        # backcompatibility
        d['hyp_imp'] = d['grads']

    tasks = list(d['targets']['profile'])
    if isinstance(d['inputs'], dict):
        one_hot = d['inputs']['seq']
    else:
        one_hot = d['inputs']
    thr_hypothetical_contribs = {
        f"{task}/{gt}": mean(d['hyp_imp'][task][gt])[included_samples]
        for task in tasks for gt in grad_type.split(",")
    }
    thr_one_hot = one_hot[included_samples]
    thr_contrib_scores = {
        f"{task}/{gt}": thr_hypothetical_contribs[f"{task}/{gt}"] * thr_one_hot
        for task in tasks for gt in grad_type.split(",")
    }

    track_set = modisco.tfmodisco_workflow.workflow.prep_track_set(
        task_names=tasks,
        contrib_scores=thr_contrib_scores,
        hypothetical_contribs=thr_hypothetical_contribs,
        one_hot=thr_one_hot)

    with h5py.File(os.path.join(modisco_dir, "modisco.h5"), "r") as grp:
        mr = workflow.TfModiscoResults.from_hdf5(grp, track_set=track_set)
    return mr, tasks, grad_type
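A minimal usage sketch for the loader above; the directory path is a placeholder and only the documented return value `(mr, tasks, grad_type)` is touched:

# Hypothetical usage; "output/modisco_run" stands for a finished modisco run directory
# containing modisco.h5, strand_distances.h5 and kwargs.json.
mr, tasks, grad_type = load_modisco_results("output/modisco_run")
print(tasks)                   # task names recovered from the importance-score file
print(grad_type.split(","))    # one or more gradient types stored in kwargs.json
# track-set keys follow the f"{task}/{gt}" layout used in the function body
track_keys = [f"{task}/{gt}" for task in tasks for gt in grad_type.split(",")]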
Example #2
def get_eval_predictions(tf, model, filter_dnase=False):
    """Get the predictions"""
    with HDF5Reader(os.path.join(eval_dir, tf, model + ".h5")) as r:
        y_pred = r.f['/preds'][:]

    labels_bed_file = os.path.join(root_dir,
                                   get_dl_kwargs(tf)['intervals_file'])
    df_unfiltered = pd.read_csv(labels_bed_file, sep="\t", header=None)
    df_unfiltered.columns = ['chr', 'start', 'end', 'y_true']
    if filter_dnase:
        # Filter the DNase peaks based on the overlaps
        dnase_peaks = '{ddir}/raw/tfbinding/eval/tf-DREAM/DNASE.{ctype}.relaxed.narrowPeak.gz'.format(
            ddir=ddir, ctype=TF2CT[tf])
        filtered_bed = BedTool(labels_bed_file).intersect(BedTool(dnase_peaks),
                                                          u=True,
                                                          wa=True,
                                                          f=.5)
        df_filtered = pd.read_csv(filtered_bed.fn, sep="\t", header=None)
        df_filtered.columns = ['chr', 'start', 'end', 'y_true']
        df_filtered['filtered'] = True
        keep = df_unfiltered.merge(df_filtered,
                                   how='left',
                                   on=list(
                                       df_unfiltered.columns)).filtered == True
        return df_unfiltered.y_true.values[keep], y_pred[keep]
    else:
        return df_unfiltered.y_true.values, y_pred[:]
Example #3
def modisco_score(modisco_dir,
                  imp_scores,
                  output_tsv,
                  output_seqlets_pkl=None,
                  seqlet_len=25,
                  n_cores=1,
                  method="rank",
                  trim_pattern=False):
    """Find seqlet instances using modisco
    """
    add_file_logging(os.path.dirname(output_tsv), logger, 'modisco-score')
    mr, tasks, grad_type = load_modisco_results(modisco_dir)

    # load importance scores we want to score
    d = HDF5Reader.load(imp_scores)
    if 'hyp_imp' not in d:
        # backcompatibility
        d['hyp_imp'] = d['grads']

    if isinstance(d['inputs'], dict):
        one_hot = d['inputs']['seq']
    else:
        one_hot = d['inputs']
    hypothetical_contribs = {
        f"{task}/{gt}": mean(d['hyp_imp'][task][gt])
        for task in tasks for gt in grad_type.split(",")
    }
    contrib_scores = {
        f"{task}/{gt}": hypothetical_contribs[f"{task}/{gt}"] * one_hot
        for task in tasks for gt in grad_type.split(",")
    }

    seqlets = find_instances(mr,
                             tasks,
                             contrib_scores,
                             hypothetical_contribs,
                             one_hot,
                             seqlet_len=seqlet_len,
                             n_cores=n_cores,
                             method=method,
                             trim_pattern=trim_pattern)
    if len(seqlets) == 0:
        print("ERROR: no seqlets found!!")
        return [], None

    if output_seqlets_pkl:
        write_pkl(seqlets, output_seqlets_pkl)
    df = labelled_seqlets2df(seqlets)

    dfm = pd.DataFrame(d['metadata']['range'])
    dfm.columns = ["example_" + v for v in dfm.columns]

    df = df.merge(dfm,
                  left_on="example_idx",
                  how='left',
                  right_on="example_id")

    df.to_csv(output_tsv, sep='\t')

    return seqlets, df
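A hedged call sketch for `modisco_score`; all paths are placeholders and assume an existing modisco run directory plus a second importance-score HDF5 file to scan:

# Hypothetical invocation; none of these paths ship with the examples above.
seqlets, df = modisco_score(
    modisco_dir="output/modisco_run",          # directory produced by a previous modisco run
    imp_scores="output/new_imp_scores.h5",     # importance scores to scan for seqlet instances
    output_tsv="output/modisco_instances.tsv",
    output_seqlets_pkl="output/seqlets.pkl",
    seqlet_len=25,
    n_cores=4,
    method="rank",
)
print(df.head() if df is not None else "no seqlets found")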
Example #4
    def __init__(self, fpath):
        self.fpath = fpath
        self.f = HDF5Reader(self.fpath)
        self.f.open()

        # example ranges. loaded when needed
        self.ranges = None
Example #5
def load_included_samples(modisco_dir):
    modisco_dir = Path(modisco_dir)

    kwargs = read_json(modisco_dir / "kwargs.json")

    d = ImpScoreFile(kwargs["imp_scores"])
    interval_from_task = d.get_ranges().interval_from_task
    n = len(d)
    d.close()

    included_samples = np.ones((n, ), dtype=bool)
    if not kwargs.get("skip_dist_filter", False) and (
            modisco_dir / "strand_distances.h5").exists():
        included_samples = HDF5Reader.load(
            modisco_dir /
            "strand_distances.h5")['included_samples'] & included_samples

    if kwargs.get("filter_npy", None) is not None:
        included_samples = np.load(kwargs["filter_npy"]) & included_samples

    if kwargs.get("subset_tasks", None) is not None and kwargs.get(
            "filter_subset_tasks", False):
        included_samples = interval_from_task.isin(
            kwargs['subset_tasks']).values & included_samples

    return included_samples
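For orientation, a short sketch of how the returned boolean mask might be applied; the path is a placeholder and the printed counts are purely illustrative:

import numpy as np

# Hypothetical usage; "output/modisco_run" is a placeholder modisco run directory.
mask = load_included_samples("output/modisco_run")
print(f"kept {mask.sum()} of {len(mask)} examples")
subset_idx = np.where(mask)[0]   # indices of the examples surviving all filters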
Example #6
def test_HDF5BatchWriter_array(dl_batch, pred_batch_array, tmpdir):
    tmpfile = str(tmpdir.mkdir("example").join("out.h5"))
    batch = prepare_batch(dl_batch, pred_batch_array)
    writer = HDF5BatchWriter(tmpfile, chunk_size=4)

    writer.batch_write(batch)
    writer.batch_write(batch)
    writer.close()
    with HDF5Reader(tmpfile) as f:
        assert np.all(
            list(f.batch_iter(2))[0]['metadata']['gene_id'] ==
            dl_batch['metadata']['gene_id'][:2])
        out = f.load_all()
        assert np.all(out['metadata']['gene_id'] == np.concatenate([
            dl_batch['metadata']['gene_id'], dl_batch['metadata']['gene_id']
        ]))
        assert np.all(out['metadata']['ranges']["chr"] == np.concatenate([
            dl_batch['metadata']['ranges']['chr'], dl_batch['metadata']
            ['ranges']['chr']
        ]))
        assert np.all(out['metadata']['ranges']["start"] == np.concatenate([
            dl_batch['metadata']['ranges']['start'], dl_batch['metadata']
            ['ranges']['start']
        ]))
        assert np.all(out['preds'][:3] == pred_batch_array)
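The same write-then-read round trip outside of pytest might look like the sketch below; the batch contents are invented for illustration and only mirror the nested `metadata` layout used in the test:

import numpy as np

# Hypothetical batch; a real batch would carry model inputs/preds plus nested metadata as above.
batch = {
    "preds": np.random.rand(4, 1),
    "metadata": {"gene_id": np.array([b"g1", b"g2", b"g3", b"g4"])},
}

writer = HDF5BatchWriter("out.h5", chunk_size=4)
writer.batch_write(batch)
writer.close()

with HDF5Reader("out.h5") as f:
    data = f.load_all()
    assert len(data["preds"]) == 4
    assert len(data["metadata"]["gene_id"]) == 4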
Example #7
def test_test_example(example, tmpdir):
    """kipoi test ..., add also output file writing
    """
    if example in {"rbp", "non_bedinput_model", "iris_model_template"} \
            and sys.version_info[0] == 2:
        pytest.skip("example not supported on python 2 ")

    example_dir = cp_tmpdir("example/models/{0}".format(example), tmpdir)

    args = [
        "python", "./kipoi/__main__.py", "test", "--batch_size=4", example_dir
    ]
    if INSTALL_FLAG:
        args.append(INSTALL_FLAG)
    returncode = subprocess.call(args=args)
    assert returncode == 0

    if example == 'pyt':
        # python interface, write also the output file
        output_file = os.path.join(example_dir, 'preds.h5')
        kipoi.cli.main.cli_test("test", args[3:] + ["-o", output_file])

        assert os.path.exists(output_file)
        preds = HDF5Reader.load(output_file)
        assert 'inputs' in preds
        assert 'metadata' in preds
        assert 'preds' in preds
Example #8
def test_predict_example(example, tmpdir):
    """kipoi predict ...
    """
    # TODO - test -out
    # Traceback (most recent call last):
    #   File "/home/avsec/projects-work/kipoi/kipoi/__main__.py", line 60, in <module>
    #     main()
    #   File "/home/avsec/projects-work/kipoi/kipoi/__main__.py", line 56, in main
    #     command_fn(args.command, sys.argv[2:])
    #   File "/home/avsec/bin/anaconda3/lib/python3.6/site-packages/kipoi/pipeline.py", line 273, in cli_predict
    #     pred_batch = model.predict_on_batch(batch['inputs'])
    #   File "/home/avsec/bin/anaconda3/lib/python3.6/site-packages/kipoi/model.py", line 22, in predict_on_batch
    #     raise NotImplementedError
    # NotImplementedError
    # _________________________
    if example in {"rbp", "non_bedinput_model", "iris_model_template"
                   } and sys.version_info[0] == 2:
        pytest.skip("rbp example not supported on python 2 ")

    example_dir = "examples/{0}".format(example)

    if example == "rbp":
        file_format = "tsv"
    else:
        file_format = "hdf5"

    print(example)
    print("tmpdir: {0}".format(tmpdir))
    tmpfile = str(tmpdir.mkdir("example").join("out.{0}".format(file_format)))

    # run the command
    args = [
        "python",
        os.path.abspath("./kipoi/__main__.py"),
        "predict",
        "../",  # directory
        "--source=dir",
        "--batch_size=4",
        "--dataloader_args=test.json",
        "--output",
        tmpfile
    ]
    if INSTALL_FLAG:
        args.append(INSTALL_FLAG)
    returncode = subprocess.call(args=args,
                                 cwd=os.path.realpath(example_dir +
                                                      "/example_files"))
    assert returncode == 0

    assert os.path.exists(tmpfile)

    if file_format == "hdf5":
        data = HDF5Reader.load(tmpfile)
        assert {'metadata', 'preds'} <= set(data.keys())
    else:
        data = pd.read_csv(tmpfile, sep="\t")
        assert list(data.columns) == [
            'metadata/ranges/chr', 'metadata/ranges/end', 'metadata/ranges/id',
            'metadata/ranges/start', 'metadata/ranges/strand', 'preds/0'
        ]
Example #9
    def load(cls, file_path):
        """Load the dataset from an hdf5 dataset
        """
        with HDF5Reader(file_path) as obj:
            data = obj.load_all()
            attrs = OrderedDict(obj.f.attrs)
        return cls(data, attrs)
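For context, a self-contained sketch of the kind of container class this `load` classmethod could live on; the class name, constructor, and import path are assumptions, only `load` mirrors the snippet above:

from collections import OrderedDict

from kipoi.readers import HDF5Reader  # assumed import path; adjust to the actual codebase


class HDF5Dataset:
    """Hypothetical container holding the loaded arrays plus the file-level HDF5 attributes."""

    def __init__(self, data, attrs=None):
        self.data = data
        self.attrs = attrs if attrs is not None else OrderedDict()

    @classmethod
    def load(cls, file_path):
        """Load the dataset from an hdf5 file"""
        with HDF5Reader(file_path) as obj:
            data = obj.load_all()
            attrs = OrderedDict(obj.f.attrs)
        return cls(data, attrs)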
Example #10
    def load(cls, modisco_dir):
        """Instantiate ModiscoData from tf-modisco run folder
        """
        kwargs = read_json(os.path.join(modisco_dir, "kwargs.json"))
        d = HDF5Reader.load(kwargs['imp_scores'])  # deeplift hdffile
        included_samples = np.load(kwargs["filter_npy"])
        # load modisco
        mr = ModiscoResult(os.path.join(modisco_dir, "results.hdf5"))
        mr.open()
        tasks = list(d['grads'].keys())

        return cls(mr, d, included_samples, tasks)
Example #11
def test_preproc_example(example, tmpdir):
    """kipoi preproc ...
    """
    if example in {"rbp", "non_bedinput_model", "iris_model_template"
                   } and sys.version_info[0] == 2:
        pytest.skip("example not supported on python 2 ")
    if example in {"extended_coda", "kipoi_dataloader_decorator"}:
        # extended_coda will anyway be tested in models
        pytest.skip(
            "randomly failing on circleci without any reason. Skipping this test."
        )

    example_dir = cp_tmpdir("example/models/{0}".format(example), tmpdir)
    # example_dir = "example/models/{0}".format(example)

    tmpfile = str(tmpdir.mkdir("output", ).join("out.h5"))

    # run the command
    args = [
        "python",
        os.path.abspath("./kipoi/__main__.py"),
        "preproc",
        "../",  # directory
        "--source=dir",
        "--batch_size=4",
        "--num_workers=2",
        "--dataloader_args=test.json",
        "--output",
        tmpfile
    ]
    if INSTALL_FLAG:
        args.append(INSTALL_FLAG)
    returncode = subprocess.call(args=args,
                                 cwd=os.path.realpath(example_dir +
                                                      "/example_files"))

    assert returncode == 0

    assert os.path.exists(tmpfile)

    data = HDF5Reader.load(tmpfile)

    with open(example_dir + "/dataloader.yaml", "r") as f:
        ex_descr = yaml.load(f, Loader=yaml.SafeLoader)

    if example not in {"pyt", "sklearn_iris"}:
        assert data["inputs"].keys(
        ) == ex_descr["output_schema"]["inputs"].keys()

    if example == 'pyt':
        args[-1] = tmpfile + "2.h5"
        with kipoi.utils.cd(os.path.join(example_dir, "example_files")):
            kipoi.cli.main.cli_preproc("preproc", args[3:])
Example #12
    def __init__(self, file_path,
                 include_samples=None,
                 default_imp_score='weighted'):
        self.file_path = file_path
        self.f = HDF5Reader(self.file_path)
        self.f.open()

        # use the hdf5 file handle
        self.data = self.f.f

        self.include_samples = include_samples

        self._hyp_contrib_cache = dict()
        self.default_imp_score = default_imp_score
Example #13
def test_predict_activation_example(example, tmpdir):
    """Kipoi predict --layer=x with a specific output layer specified
    """
    if example in {"rbp", "non_bedinput_model", "iris_model_template"
                   } and sys.version_info[0] == 2:
        pytest.skip("rbp example not supported on python 2 ")
    if example in {'kipoi_dataloader_decorator'}:
        pytest.skip(
            "Automatically-dowloaded input files skipped for prediction")

    example_dir = cp_tmpdir("example/models/{0}".format(example), tmpdir)
    # example_dir = "example/models/{0}".format(example)

    print(example)
    print("tmpdir: {0}".format(tmpdir))
    tmpfile = str(tmpdir.mkdir("output").join("out.h5"))

    # run the command
    args = [
        "python",
        os.path.abspath("./kipoi/__main__.py"),
        "predict",
        "../",  # directory
        "--source=dir",
        "--layer",
        predict_activation_layers[example],
        "--batch_size=4",
        "--num_workers=2",
        "--dataloader_args=test.json",
        "--output",
        tmpfile
    ]
    if INSTALL_FLAG:
        args.append(INSTALL_FLAG)
    returncode = subprocess.call(args=args,
                                 cwd=os.path.realpath(example_dir +
                                                      "/example_files"))
    assert returncode == 0

    assert os.path.exists(tmpfile)

    data = HDF5Reader.load(tmpfile)
    assert {'metadata', 'preds'} <= set(data.keys())
    if example == 'pyt':
        args[-1] = tmpfile + "2.h5"
        with kipoi.utils.cd(os.path.join(example_dir, "example_files")):
            kipoi.cli.main.cli_predict("predict", args[3:])
Example #14
def test_preproc_example(example, new_dataloader_kwargs_format, tmpdir):
    """kipoi preproc ...
    """
    if example in {"rbp", "non_bedinput_model", "iris_model_template"
                   } and sys.version_info[0] == 2:
        pytest.skip("example not supported on python 2 ")
    if example in {"extended_coda", "kipoi_dataloader_decorator"}:
        # extended_coda will anyway be tested in models
        pytest.skip(
            "randomly failing on circleci without any reason. Skipping this test."
        )

    example_dir = cp_tmpdir("example/models/{0}".format(example), tmpdir)
    # example_dir = "example/models/{0}".format(example)

    tmpfile = str(tmpdir.mkdir("output", ).join("out.h5"))

    if example in {"rbp"} and new_dataloader_kwargs_format:
        if example == "rbp":
            dataloader_args = [
                "intervals_file=intervals.tsv", "fasta_file=hg38_chr22.fa",
                "preproc_transformer=../dataloader_files/encodeSplines.pkl",
                "gtf_file=gencode_v25_chr22.gtf.pkl.gz",
                "tarOget_file=targets.tsv"
            ]
        elif example == "extended_coda":
            dataloader_args = [
                "intervals_file=intervals.tsv",
                "input_data_sources={'H3K27AC_subsampled':'H3K27AC_subsampled.bw'}",
                "batch_size=4"
            ]
        # run the command
        args = [
            "python",
            os.path.abspath("./kipoi/__main__.py"),
            "preproc",
            "../",  # directory
            "--source=dir",
            "--batch_size=4",
            "--num_workers=2",
            "--dataloader_args"
        ] + dataloader_args + ["--output", tmpfile]

    else:
        # run the command
        args = [
            "python",
            os.path.abspath("./kipoi/__main__.py"),
            "preproc",
            "../",  # directory
            "--source=dir",
            "--batch_size=4",
            "--num_workers=2",
            "--dataloader_args=test.json",
            "--output",
            tmpfile
        ]
    if INSTALL_FLAG:
        args.append(INSTALL_FLAG)
    returncode = subprocess.call(args=args,
                                 cwd=os.path.realpath(example_dir +
                                                      "/example_files"))

    assert returncode == 0

    assert os.path.exists(tmpfile)

    data = HDF5Reader.load(tmpfile)

    with open(example_dir + "/dataloader.yaml", "r") as f:
        ex_descr = yaml.load(f, Loader=yaml.SafeLoader)

    if example not in {"pyt", "sklearn_iris"}:
        assert data["inputs"].keys(
        ) == ex_descr["output_schema"]["inputs"].keys()

    if example == 'pyt':
        args[-1] = tmpfile + "2.h5"
        with kipoi_utils.utils.cd(os.path.join(example_dir, "example_files")):
            kipoi.cli.main.cli_preproc("preproc", args[3:])
Example #15
def test_predict_variants_example(example, restricted_bed, file_format,
                                  new_dataloader_kwargs_format, tmpdir):
    """kipoi predict ...
    """
    if (example not in {"rbp", "non_bedinput_model"}) or (sys.version_info[0]
                                                          == 2):
        pytest.skip(
            "Only rbp example testable at the moment, which only runs on py3")

    example_dir = "tests/models/{0}/".format(example)

    tmpdir_here = tmpdir.mkdir("example")

    # non_bedinput_model is not compatible with restricted bed files as
    # alterations in region generation have no influence on that model
    if restricted_bed and (example != "rbp"):
        pytest.skip("Resticted_bed only available for rbp_eclip")
    tmpfile = str(tmpdir_here.join("out.{0}".format(file_format)))
    vcf_tmpfile = str(tmpdir_here.join("out.{0}".format("vcf")))

    dataloader_kwargs = {
        "fasta_file": "example_files/hg38_chr22.fa",
        "preproc_transformer": "dataloader_files/encodeSplines.pkl",
        "gtf_file": "example_files/gencode_v25_chr22.gtf.pkl.gz",
        "intervals_file": "example_files/variant_intervals.tsv"
    }
    dataloader_kwargs = {
        k: example_dir + v
        for k, v in dataloader_kwargs.items()
    }
    if not new_dataloader_kwargs_format:
        import json
        dataloader_kwargs_str = json.dumps(dataloader_kwargs)

        args = [
            "python",
            os.path.abspath("./kipoi_veff/cli.py"),
            "score_variants",
            # "./",  # directory
            example_dir,
            "--source=dir",
            "--batch_size=4",
            "--dataloader_args='%s'" % dataloader_kwargs_str,
            "--input_vcf",
            temp(example_dir + "/example_files/variants.vcf", tmpdir),
            # this one was now gone in the master?!
            "--output_vcf",
            vcf_tmpfile,
            "--extra_output",
            tmpfile
        ]
    else:
        dataloader_kwargs_list = [
            "{0}={1}".format(key, val)
            for key, val in dataloader_kwargs.items()
        ]
        args = [
            "python",
            os.path.abspath("./kipoi_veff/cli.py"),
            "score_variants",
            # "./",  # directory
            example_dir,
            "--source=dir",
            "--batch_size=4",
            "--dataloader_args"
        ] + dataloader_kwargs_list + [
            "--input_vcf",
            temp(example_dir + "/example_files/variants.vcf", tmpdir),
            # this one was now gone in the master?!
            "--output_vcf",
            vcf_tmpfile,
            "--extra_output",
            tmpfile
        ]

    # run the command
    if INSTALL_FLAG:
        args.append(INSTALL_FLAG)

    if restricted_bed:
        args += [
            "--restriction_bed",
            example_dir + "/example_files/restricted_regions.bed"
        ]

    returncode = subprocess.call(args=args, cwd=".")
    assert returncode == 0

    assert os.path.exists(tmpfile)
    assert os.path.exists(vcf_tmpfile)

    if restricted_bed:
        # assert filecmp.cmp(example_dir + "/example_files/variants_ref_out2.vcf", vcf_tmpfile)
        compare_vcfs(example_dir + "/example_files/variants_ref_out2.vcf",
                     vcf_tmpfile)
    else:
        # assert filecmp.cmp(example_dir + "/example_files/variants_ref_out.vcf", vcf_tmpfile)
        compare_vcfs(example_dir + "/example_files/variants_ref_out.vcf",
                     vcf_tmpfile)

    if file_format == "hdf5":
        data = HDF5Reader.load(tmpfile)
    else:
        table_labels = []
        table_starts = []
        table_ends = []
        tables = {}
        head_line_id = "KPVEP_"
        with open(tmpfile, "r") as ifh:
            for i, l in enumerate(ifh):
                if head_line_id in l:
                    if (len(table_starts) > 0):
                        table_ends.append(i - 1)
                    table_labels.append(l.rstrip()[len(head_line_id):])
                    table_starts.append(i + 1)
            table_ends.append(i)
        for label, start, end in zip(table_labels, table_starts, table_ends):
            tables[label] = pd.read_csv(tmpfile,
                                        sep="\t",
                                        skiprows=start,
                                        nrows=end - start,
                                        index_col=0)
Example #16
def modisco_score2_single_binary(modisco_dir,
                                 output_file,
                                 imp_scores=None,
                                 trim_frac=0.08,
                                 n_jobs=20):
    """
    Equivalent of modisco_score2
    """
    import modisco
    from modisco.tfmodisco_workflow import workflow

    cm_path = os.path.join(modisco_dir, 'centroid_seqlet_matches.csv')
    dfm_norm = pd.read_csv(cm_path)
    mr = ModiscoResult(os.path.join(modisco_dir, "results.hdf5"))
    mr.open()
    tasks = mr.tasks()

    kwargs = read_json(os.path.join(modisco_dir, "kwargs.json"))
    d = HDF5Reader.load(kwargs['imp_scores'])  # deeplift hdffile
    if isinstance(d['inputs'], dict):
        one_hot = d['inputs']['seq']
    else:
        one_hot = d['inputs']
    tasks = list(d['grads'].keys())
    grad_type = list(d['grads'][tasks[0]].keys())[0]
    if kwargs.get("filter_npy", None) is not None:
        included_samples = np.load(kwargs["filter_npy"])

    hyp_contrib = {
        f"{task}":
        d['grads'][task]['deeplift']['hyp_contrib_scores'][included_samples]
        for task in tasks for gt in grad_type.split(",")
    }
    contrib = {
        f"{task}": d['grads'][task][gt]['contrib_scores'][included_samples]
        for task in tasks for gt in grad_type.split(",")
    }
    seq = one_hot[included_samples]
    ranges = pd.DataFrame({
        "chrom":
        d['metadata']['range']['chr'][:][included_samples],
        "start":
        d['metadata']['range']['start'][:][included_samples],
        "end":
        d['metadata']['range']['end'][:][included_samples],
        "strand":
        d['metadata']['range']['strand'][:][included_samples],
        "idx":
        np.arange(len(included_samples)),
        "interval_from_task":
        d['metadata']['interval_from_task'][:][included_samples],
    })

    print("Scanning for patterns")
    dfl = []
    mr_patterns = mr.patterns()  # [:2]
    for pattern_name in tqdm(mr_patterns):
        pattern = mr.get_pattern(pattern_name).trim_seq_ic(trim_frac)
        match, importance = pattern.scan_importance(contrib,
                                                    hyp_contrib,
                                                    tasks,
                                                    n_jobs=n_jobs,
                                                    verbose=False)
        seq_match = pattern.scan_seq(seq, n_jobs=n_jobs, verbose=False)
        dfm = pattern.get_instances(
            tasks,
            match,
            importance,
            seq_match,
            norm_df=dfm_norm[dfm_norm.pattern == pattern_name],
            verbose=False,
            plot=False)
        dfl.append(dfm)

    print("Merging")
    # merge and write the results
    dfp = pd.concat(dfl)
    print("Append ranges")
    ranges.columns = ["example_" + v for v in ranges.columns]
    dfp = dfp.merge(ranges, on="example_idx", how='left')
    dfp.info()
    dfp.to_parquet(output_file)

    return None
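A hedged invocation sketch for `modisco_score2_single_binary`; the paths are placeholders and assume the modisco directory already contains centroid_seqlet_matches.csv and results.hdf5:

# Hypothetical call; requires a modisco run folder from the binary (peak-classification) pipeline.
modisco_score2_single_binary(
    modisco_dir="output/modisco_run",
    output_file="output/pattern_instances.parquet",   # written via DataFrame.to_parquet above
    trim_frac=0.08,
    n_jobs=8,
)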
Example #17
def test_predict_variants_example_single_model(file_format, tmpdir):
    """kipoi predict ...
    """
    if sys.version_info[0] == 2:
        pytest.skip(
            "Only rbp example testable at the moment, which only runs on py3")

    examples = "rbp", "non_bedinput_model"
    example_dirs = ["tests/models/{0}/".format(ex) for ex in examples]
    main_example_dir = example_dirs[1]

    tmpdir_here = tmpdir.mkdir("example")

    # non_bedinput_model is not compatible with restricted bed files as
    # alterations in region generation have no influence on that model
    tmpfile = str(tmpdir_here.join("out.{0}".format(file_format)))
    vcf_tmpfile = str(tmpdir_here.join("out.{0}".format("vcf")))

    dataloader_kwargs = {
        "fasta_file": "example_files/hg38_chr22.fa",
        "preproc_transformer": "dataloader_files/encodeSplines.pkl",
        "gtf_file": "example_files/gencode_v25_chr22.gtf.pkl.gz",
        "intervals_file": "example_files/variant_intervals.tsv"
    }
    dataloader_kwargs = {
        k: main_example_dir + v
        for k, v in dataloader_kwargs.items()
    }
    import json
    dataloader_kwargs_str = json.dumps(dataloader_kwargs)

    args = [
        "python",
        os.path.abspath("./kipoi_veff/cli.py"),
        "score_variants",
        # "./",  # directory
        example_dirs[1],
        "--source=dir",
        "--batch_size=4",
        "--dataloader_args='%s'" % dataloader_kwargs_str,
        "--input_vcf",
        main_example_dir + "/example_files/variants.vcf",
        # this one was now gone in the master?!
        "--output_vcf",
        vcf_tmpfile,
        "--extra_output",
        tmpfile
    ]
    # optionally append the install flag
    if INSTALL_FLAG:
        args.append(INSTALL_FLAG)

    # run the command
    kipoi_veff.cli.cli_score_variants('score_variants', args[3:])

    for example_dir in example_dirs[1:2]:
        # assert filecmp.cmp(example_dir + "/example_files/variants_ref_out.vcf", vcf_tmpfile)
        model_name_safe = example_dir.replace("/", "_")
        vcf_tmpfile_model = vcf_tmpfile
        assert os.path.exists(vcf_tmpfile_model)
        compare_vcfs(example_dir + "/example_files/variants_ref_out.vcf",
                     vcf_tmpfile_model)
        ending = tmpfile.split('.')[-1]
        extra_output = tmpfile
        assert os.path.exists(extra_output)

        if file_format == "hdf5":
            data = HDF5Reader.load(extra_output)
        else:
            data = pd.read_table(extra_output)
Example #18
def modisco_run(
    imp_scores,
    output_dir,
    null_imp_scores=None,
    hparams=None,
    override_hparams="",
    grad_type="weighted",
    subset_tasks=None,
    filter_subset_tasks=False,
    filter_npy=None,
    exclude_chr="",
    seqmodel=False,  # interpretation glob
    # hparams=None,
    num_workers=10,
    max_strand_distance=0.1,
    overwrite=False,
    skip_dist_filter=False,
    use_all_seqlets=False,
    merge_tasks=False,
    gpu=None,
):
    """
    Run modisco

    Args:
      imp_scores: path to the hdf5 file of importance scores
      null_imp_scores: Path to the null importance scores
      grad_type: for which output to compute the importance scores
      hparams: modisco hyper-parameters: either a path to a modisco.yaml file or
        a ModiscoHParams object (None uses the defaults)
      override_hparams: hyper-parameters overriding the settings in the hparams file
      output_dir: output file directory
      filter_npy: path to a npy file containing a boolean vector used for subsetting
      exclude_chr: comma-separated list of chromosomes to exclude
      seqmodel: if enabled, the importance scores came from `imp-score-seqmodel`
      subset_tasks: comma-separated list of task names to use as a subset
      filter_subset_tasks: if True, run modisco only in the regions for that TF
      summary: which summary statistic to use for the profile gradients
      skip_dist_filter: if True, distances are not used to filter
      use_all_seqlets: if True, don't restrict the number of seqlets
      split: on which data split to compute the results
      merge_tasks: if True, importance scores for the tasks will be merged
      gpu: which gpu to use. If None, don't use any GPUs

    Note: when using subset_tasks, modisco will run on all the importance scores. If you wish
      to run it only for the importance scores for a particular task you should subset it to
      the peak regions of interest using `filter_npy`
    """
    plt.switch_backend('agg')
    add_file_logging(output_dir, logger, 'modisco-run')
    import os
    if gpu is not None:
        create_tf_session(gpu)
    else:
        # Don't use any GPU's
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
    os.environ['MKL_THREADING_LAYER'] = 'GNU'
    # import theano
    import modisco
    import modisco.tfmodisco_workflow.workflow

    if seqmodel:
        assert '/' in grad_type

    if subset_tasks == '':
        logger.warn("subset_tasks == ''. Not using subset_tasks")
        subset_tasks = None

    if subset_tasks == 'all':
        # Use all tasks, i.e. don't subset
        subset_tasks = None

    if subset_tasks is not None:
        subset_tasks = subset_tasks.split(",")
        if len(subset_tasks) == 0:
            raise ValueError("Provide one or more subset_tasks. Found None")

    if filter_subset_tasks and subset_tasks is None:
        print("Using filter_subset_tasks=False since `subset_tasks` is None")
        filter_subset_tasks = False

    if exclude_chr:
        exclude_chr = exclude_chr.split(",")
    else:
        exclude_chr = []

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    output_path = output_dir / "modisco.h5"
    remove_exists(output_path, overwrite)

    output_distances = output_dir / "strand_distances.h5"
    remove_exists(output_distances, overwrite)

    if filter_npy is not None:
        filter_npy = os.path.abspath(filter_npy)

    # save the hyper-parameters
    write_json(
        dict(
            imp_scores=os.path.abspath(imp_scores),
            grad_type=grad_type,
            output_dir=str(output_dir),
            subset_tasks=subset_tasks,
            filter_subset_tasks=filter_subset_tasks,
            hparams=hparams,
            null_imp_scores=null_imp_scores,
            # TODO - pack into hyper-parameters as well?
            filter_npy=filter_npy,
            exclude_chr=",".join(exclude_chr),
            skip_dist_filter=skip_dist_filter,
            use_all_seqlets=use_all_seqlets,
            max_strand_distance=max_strand_distance,
            gpu=gpu),
        os.path.join(output_dir, "kwargs.json"))

    print("-" * 40)
    # parse the hyper-parameters
    if hparams is None:
        print(f"Using default hyper-parameters")
        hp = ModiscoHParams()
    else:
        if isinstance(hparams, str):
            print(f"Loading hyper-parameters from file: {hparams}")
            hp = ModiscoHParams.load(hparams)
        else:
            assert isinstance(hparams, ModiscoHParams)
            hp = hparams
    if override_hparams:
        print(f"Overriding the following hyper-parameters: {override_hparams}")
    hp = tf.contrib.training.HParams(
        **hp.get_modisco_kwargs()).parse(override_hparams)

    if use_all_seqlets:
        hp.max_seqlets_per_metacluster = None

    # save the hyper-parameters
    print("Using the following hyper-parameters for modisco:")
    print("-" * 40)
    related_dump_yaml(ModiscoHParams(**hp.values()),
                      os.path.join(output_dir, "hparams.yaml"),
                      verbose=True)
    print("-" * 40)

    # TODO - replace with imp_scores
    d = HDF5Reader.load(imp_scores)
    if 'hyp_imp' not in d:
        # backcompatibility
        d['hyp_imp'] = d['grads']

    if seqmodel:
        tasks = list(d['targets'])
    else:
        tasks = list(d['targets']['profile'])

    if subset_tasks is not None:
        # validate that all the `subset_tasks`
        # are present in `tasks`
        for st in subset_tasks:
            if st not in tasks:
                raise ValueError(
                    f"subset task {st} not found in tasks: {tasks}")
        logger.info(
            f"Using the following tasks: {subset_tasks} instead of the original tasks: {tasks}"
        )
        tasks = subset_tasks

    if isinstance(d['inputs'], dict):
        one_hot = d['inputs']['seq']
    else:
        one_hot = d['inputs']

    n = len(one_hot)

    # --------------------
    # apply filters
    if not skip_dist_filter:
        print("Using profile prediction for the strand filtering")
        grad_type_filtered = 'weighted'
        distances = np.array([
            np.array([
                correlation(
                    np.ravel(d['hyp_imp'][task][grad_type_filtered][0][i]),
                    np.ravel(d['hyp_imp'][task][grad_type_filtered][1][i]))
                for i in range(n)
            ]) for task in tasks
            if len(d['hyp_imp'][task][grad_type_filtered]) == 2
        ]).T.mean(axis=-1)  # average the distances across tasks

        dist_filter = distances < max_strand_distance
        print(f"Fraction of sequences kept: {dist_filter.mean()}")

        HDF5BatchWriter.dump(output_distances, {
            "distances": distances,
            "included_samples": dist_filter
        })
    else:
        dist_filter = np.ones((n, ), dtype=bool)

    # add also the filter numpy
    if filter_npy is not None:
        print(f"Loading a filter file from {filter_npy}")
        filter_vec = np.load(filter_npy)
        dist_filter = dist_filter & filter_vec

    if filter_subset_tasks:
        assert subset_tasks is not None
        interval_from_task = pd.Series(d['metadata']['interval_from_task'])
        print(
            f"Subsetting the intervals accoring to subset_tasks: {subset_tasks}"
        )
        print(f"Number of original regions: {dist_filter.sum()}")
        dist_filter = dist_filter & interval_from_task.isin(
            subset_tasks).values
        print(
            f"Number of filtered regions after filter_subset_tasks: {dist_filter.sum()}"
        )

    # filter by chromosome
    if exclude_chr:
        logger.info(f"Excluding chromosomes: {exclude_chr}")
        chromosomes = d['metadata']['range']['chr']
        dist_filter = dist_filter & (
            ~pd.Series(chromosomes).isin(exclude_chr)).values
    # -------------------------------------------------------------
    # setup importance scores

    if seqmodel:
        thr_one_hot = one_hot[dist_filter]
        thr_hypothetical_contribs = {
            f"{task}/{gt}":
            d['hyp_imp'][task][gt.split("/")[0]][gt.split("/")[1]][dist_filter]
            for task in tasks for gt in grad_type.split(",")
        }
        thr_contrib_scores = {
            f"{task}/{gt}":
            thr_hypothetical_contribs[f"{task}/{gt}"] * thr_one_hot
            for task in tasks for gt in grad_type.split(",")
        }
        task_names = [
            f"{task}/{gt}" for task in tasks for gt in grad_type.split(",")
        ]

    else:
        if merge_tasks:
            thr_one_hot = np.concatenate([
                one_hot[dist_filter] for task in tasks
                for gt in grad_type.split(",")
            ])
            thr_hypothetical_contribs = {
                "merged":
                np.concatenate([
                    mean(d['hyp_imp'][task][gt])[dist_filter] for task in tasks
                    for gt in grad_type.split(",")
                ])
            }

            thr_contrib_scores = {
                "merged": thr_hypothetical_contribs['merged'] * thr_one_hot
            }
            task_names = ['merged']
        else:
            thr_one_hot = one_hot[dist_filter]
            thr_hypothetical_contribs = {
                f"{task}/{gt}": mean(d['hyp_imp'][task][gt])[dist_filter]
                for task in tasks for gt in grad_type.split(",")
            }
            thr_contrib_scores = {
                f"{task}/{gt}":
                thr_hypothetical_contribs[f"{task}/{gt}"] * thr_one_hot
                for task in tasks for gt in grad_type.split(",")
            }
            task_names = [
                f"{task}/{gt}" for task in tasks for gt in grad_type.split(",")
            ]

    if null_imp_scores is not None:
        logger.info(f"Using null_imp_scores: {null_imp_scores}")
        null_isf = ImpScoreFile(null_imp_scores)
        null_per_pos_scores = {
            f"{task}/{gt}": v.sum(axis=-1)
            for gt in grad_type.split(",")
            for task, v in null_isf.get_contrib(imp_score=gt).items()
            if task in tasks
        }
    else:
        # default Null distribution. Requires modisco 5.0
        logger.info(f"Using default null_imp_scores")
        null_per_pos_scores = modisco.coordproducers.LaplaceNullDist(
            num_to_samp=10000)

    # -------------------------------------------------------------
    # run modisco
    tfmodisco_results = modisco.tfmodisco_workflow.workflow.TfModiscoWorkflow(
        # Modisco defaults
        sliding_window_size=hp.sliding_window_size,
        flank_size=hp.flank_size,
        target_seqlet_fdr=hp.target_seqlet_fdr,
        min_passing_windows_frac=hp.min_passing_windows_frac,
        max_passing_windows_frac=hp.max_passing_windows_frac,
        min_metacluster_size=hp.min_metacluster_size,
        max_seqlets_per_metacluster=hp.max_seqlets_per_metacluster,
        seqlets_to_patterns_factory=modisco.tfmodisco_workflow.
        seqlets_to_patterns.TfModiscoSeqletsToPatternsFactory(
            trim_to_window_size=hp.trim_to_window_size,  # default: 30
            initial_flank_to_add=hp.initial_flank_to_add,  # default: 10
            kmer_len=hp.kmer_len,  # default: 8
            num_gaps=hp.num_gaps,  # default: 3
            num_mismatches=hp.num_mismatches,  # default: 2
            n_cores=num_workers,
            final_min_cluster_size=hp.final_min_cluster_size)  # default: 30
    )(
        task_names=task_names,
        contrib_scores=thr_contrib_scores,  # -> task score
        hypothetical_contribs=thr_hypothetical_contribs,
        one_hot=thr_one_hot,
        null_per_pos_scores=null_per_pos_scores)
    # -------------------------------------------------------------
    # save the results
    # write the results to modisco.h5
    with h5py.File(output_path, "w") as grp:
        tfmodisco_results.save_hdf5(grp)
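A hedged call sketch for `modisco_run`; all file paths and task names are placeholders rather than files referenced elsewhere in these examples:

# Hypothetical invocation of the workflow above.
modisco_run(
    imp_scores="output/imp_scores.h5",      # HDF5 file of importance scores
    output_dir="output/modisco_run",        # receives modisco.h5, strand_distances.h5, kwargs.json
    grad_type="weighted",
    subset_tasks="Oct4,Sox2",               # comma-separated task names (invented examples)
    filter_subset_tasks=True,
    exclude_chr="chrX,chrY",
    num_workers=4,
    overwrite=True,
    gpu=None,                               # run on CPU
)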
Example #19
def test_grad_predict_example(example):
    """kipoi postproc grad ...
    """
    if example in {"rbp", "non_bedinput_model", "iris_model_template"
                   } and sys.version_info[0] == 2:
        pytest.skip("rbp example not supported on python 2 ")

    example_dir = "examples/{0}".format(example)

    for file_format in ["tsv", "hdf5"]:
        print(example)
        tmpfile = os.path.realpath(
            str("./grad_outputs.{0}".format(file_format)))
        bedgraph_temp_file = os.path.realpath(str("./grad_x_input.bed"))

        # run the command
        args = [
            "python",
            os.path.abspath("./kipoi/__main__.py"),
            "postproc",
            "grad",
            "../",  # directory
            "--source=dir",
            "--batch_size=4",
            "--dataloader_args=test.json",
            "--output",
            tmpfile
        ]
        layer_args = [
            "--layer",
            predict_activation_layers[example],
        ]
        final_layer_arg = ["--final_layer"]

        if INSTALL_FLAG:
            args.append(INSTALL_FLAG)

        for la in [layer_args, final_layer_arg]:
            returncode = subprocess.call(
                args=args + la,
                cwd=os.path.realpath(example_dir + "/example_files"))
            assert returncode == 0

            assert os.path.exists(tmpfile)

            if file_format == "hdf5":
                data = HDF5Reader.load(tmpfile)
                assert {'metadata', 'preds', 'inputs'} <= set(data.keys())
                # Here we can attempt to write a bedgraph file:
                bg_args = [
                    "python",
                    os.path.abspath("./kipoi/__main__.py"),
                    "postproc",
                    "gr_inp_to_file",
                    "../",  # directory
                    "--source=dir",
                    '--output',
                    bedgraph_temp_file,
                    "--input_file",
                    tmpfile
                ]
                if grad_inputs[example] is not None:
                    bg_args += ["--model_input", grad_inputs[example]]
                returncode = subprocess.call(
                    args=bg_args,
                    cwd=os.path.realpath(example_dir + "/example_files"))

                assert returncode == 0
                assert os.path.exists(bedgraph_temp_file)
                os.unlink(bedgraph_temp_file)

            else:
                data = pd.read_csv(tmpfile, sep="\t")
                inputs_columns = data.columns.str.contains("inputs/")
                preds_columns = data.columns.str.contains("preds/")
                assert np.all(
                    np.in1d(
                        data.columns.values[preds_columns],
                        data.columns.str.replace(
                            "inputs/", "preds/").values[inputs_columns]))
                other_cols = data.columns.values[~(preds_columns
                                                   | inputs_columns)]
                expected = [
                    'metadata/ranges/chr', 'metadata/ranges/end',
                    'metadata/ranges/id', 'metadata/ranges/start',
                    'metadata/ranges/strand'
                ]
                assert np.all(np.in1d(expected, other_cols))

            os.unlink(tmpfile)
Example #20
def test_predict_variants_example_multimodel(file_format, tmpdir):
    """kipoi predict ...
    """
    if sys.version_info[0] == 2:
        pytest.skip(
            "Only rbp example testable at the moment, which only runs on py3")

    examples = "rbp", "non_bedinput_model"
    example_dirs = ["examples/{0}/".format(ex) for ex in examples]
    main_example_dir = example_dirs[1]

    tmpdir_here = tmpdir.mkdir("example")

    # non_bedinput_model is not compatible with restricted bed files as
    # alterations in region generation have no influence on that model
    tmpfile = str(tmpdir_here.join("out.{0}".format(file_format)))
    vcf_tmpfile = str(tmpdir_here.join("out.{0}".format("vcf")))

    dataloader_kwargs = {
        "fasta_file": "example_files/hg38_chr22.fa",
        "preproc_transformer": "dataloader_files/encodeSplines.pkl",
        "gtf_file": "example_files/gencode_v25_chr22.gtf.pkl.gz",
        "intervals_file": "example_files/variant_intervals.tsv"
    }
    dataloader_kwargs = {
        k: main_example_dir + v
        for k, v in dataloader_kwargs.items()
    }
    import json
    dataloader_kwargs_str = json.dumps(dataloader_kwargs)

    args = [
        "python",
        os.path.abspath("./kipoi/__main__.py"),
        "postproc",
        "score_variants",
        # "./",  # directory
        example_dirs[0],
        example_dirs[1],
        "--source=dir",
        "--batch_size=4",
        "--dataloader_args='%s'" % dataloader_kwargs_str,
        "--input_vcf",
        main_example_dir + "/example_files/variants.vcf",
        # this one was now gone in the master?!
        "--output_vcf",
        vcf_tmpfile,
        "--extra_output",
        tmpfile
    ]
    # run the command
    if INSTALL_FLAG:
        args.append(INSTALL_FLAG)

    returncode = subprocess.call(args=args,
                                 cwd=os.path.realpath(main_example_dir) +
                                 "/../../")
    assert returncode == 0

    assert os.path.exists(tmpfile)

    for example_dir in example_dirs:
        # assert filecmp.cmp(example_dir + "/example_files/variants_ref_out.vcf", vcf_tmpfile)
        vcf_tmpfile_model = vcf_tmpfile[:-4] + example_dir.replace(
            "/", "_") + ".vcf"
        assert os.path.exists(vcf_tmpfile_model)
        compare_vcfs(example_dir + "/example_files/variants_ref_out.vcf",
                     vcf_tmpfile_model)

    if file_format == "hdf5":
        data = HDF5Reader.load(tmpfile)
    else:
        table_labels = []
        table_starts = []
        table_ends = []
        tables = {}
        head_line_id = "KPVEP_"
        with open(tmpfile, "r") as ifh:
            for i, l in enumerate(ifh):
                if head_line_id in l:
                    if (len(table_starts) > 0):
                        table_ends.append(i - 1)
                    table_labels.append(l.rstrip()[len(head_line_id):])
                    table_starts.append(i + 1)
            table_ends.append(i)
        for label, start, end in zip(table_labels, table_starts, table_ends):
            tables[label] = pd.read_csv(tmpfile,
                                        sep="\t",
                                        skiprows=start,
                                        nrows=end - start,
                                        index_col=0)
Example #21
def modisco_instances_to_bed(modisco_h5,
                             instances_parq,
                             imp_score_h5,
                             output_dir,
                             trim_frac=0.08):
    from basepair.modisco.pattern_instances import load_instances

    add_file_logging(output_dir, logger, 'modisco-instances-to-bed')
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    mr = ModiscoResult(modisco_h5)
    mr.open()

    print("load task_id")
    d = HDF5Reader(imp_score_h5)
    d.open()
    if 'hyp_imp' not in d.f.keys():
        # backcompatibility
        d['hyp_imp'] = d['grads']

    id_hash = pd.DataFrame({
        "peak_id":
        d.f['/metadata/interval_from_task'][:],
        "example_idx":
        np.arange(d.f['/metadata/interval_from_task'].shape[0])
    })

    # load the instances data frame
    print("load all instances")
    df = load_instances(instances_parq, motifs=None, dedup=True)
    # import pdb
    # pdb.set_trace()
    df = df.merge(id_hash, on="example_idx")  # append peak_id

    patterns = df.pattern.unique().tolist()
    pattern_pssms = {
        pattern: mr.get_pssm(*pattern.split("/"))
        for pattern in patterns
    }
    append_pattern_loc(df, pattern_pssms, trim_frac=trim_frac)

    # write out the results
    example_cols = [
        'example_chr', 'example_start', 'example_end', 'example_id', 'peak_id'
    ]
    df_examples = df[example_cols].drop_duplicates().sort_values(
        ["example_chr", "example_start"])
    df_examples.to_csv(output_dir / "scored_regions.bed",
                       sep='\t',
                       header=False,
                       index=False)

    df["pattern_start_rel"] = df.pattern_start + df.example_start
    df["pattern_end_rel"] = df.pattern_end + df.example_start
    df["strand"] = df.revcomp.astype(bool).map({True: "-", False: "+"})

    # TODO - update this - ?
    pattern_cols = [
        'example_chr', 'pattern_start_rel', 'pattern_end_rel', 'example_id',
        'percnormed_score', 'strand', 'peak_id', 'seqlet_score'
    ]

    (output_dir /
     "README").write_text("score_regions.bed columns: " +
                          ", ".join(example_cols) + "\n" +
                          "metacluster_<>/pattern_<>.bed columns: " +
                          ", ".join(pattern_cols))
    df_pattern = df[pattern_cols]
    for pattern in df.pattern.unique():
        out_path = output_dir / (pattern + ".bed.gz")
        out_path.parent.mkdir(parents=True, exist_ok=True)
        dfp = df_pattern[df.pattern == pattern].drop_duplicates().sort_values(
            ["example_chr", "pattern_start_rel"])
        dfp.to_csv(out_path,
                   compression='gzip',
                   sep='\t',
                   header=False,
                   index=False)
Example #22
def modisco_score_single_binary(modisco_dir,
                                output_tsv,
                                output_seqlets_pkl=None,
                                seqlet_len=25,
                                n_cores=1,
                                method="rank",
                                trim_pattern=False):
    """
    Equivalent of modisco_score
    """
    import modisco
    from modisco.tfmodisco_workflow import workflow

    kwargs = read_json(os.path.join(modisco_dir, "kwargs.json"))
    d = HDF5Reader.load(kwargs['imp_scores'])  # deeplift hdffile
    if isinstance(d['inputs'], dict):
        one_hot = d['inputs']['seq']
    else:
        one_hot = d['inputs']
    tasks = list(d['grads'].keys())
    grad_type = list(d['grads'][tasks[0]].keys())[0]
    if kwargs.get("filter_npy", None) is not None:
        included_samples = np.load(kwargs["filter_npy"])

    hypothetical_contribs = {
        f"{task}":
        d['grads'][task]['deeplift']['hyp_contrib_scores'][included_samples]
        for task in tasks for gt in grad_type.split(",")
    }
    contrib_scores = {
        f"{task}": d['grads'][task][gt]['contrib_scores'][included_samples]
        for task in tasks for gt in grad_type.split(",")
    }

    print(tasks)
    track_set = workflow.prep_track_set(
        task_names=tasks,
        contrib_scores=contrib_scores,
        hypothetical_contribs=hypothetical_contribs,
        one_hot=one_hot[included_samples])

    with h5py.File(os.path.join(modisco_dir, "results.hdf5"), "r") as grp:
        mr = workflow.TfModiscoResults.from_hdf5(grp, track_set=track_set)

    seqlets = find_instances(mr,
                             tasks,
                             contrib_scores,
                             hypothetical_contribs,
                             one_hot[included_samples],
                             seqlet_len=seqlet_len,
                             n_cores=n_cores,
                             method=method,
                             trim_pattern=trim_pattern)

    if output_seqlets_pkl:
        write_pkl(seqlets, output_seqlets_pkl)
    df = labelled_seqlets2df(seqlets)

    dfm = pd.DataFrame(d['metadata']['range'])
    dfm.columns = ["example_" + v for v in dfm.columns]
    dfm['example_id'] = d['metadata']['interval_from_task']

    df = df.merge(dfm,
                  left_on="example_idx",
                  how='left',
                  right_on="example_id")

    df.to_csv(output_tsv, sep='\t')

    return seqlets, df