예제 #1
0
def test_var_eff_pred_varseq():
    """Score SNVs with the variable-sequence-length example model.

    Loads the model and dataloader from ``examples/var_seqlen_model/``,
    checks that ``SnvCenteredRg`` refuses to be built for this model, runs
    ``sp.predict_snvs`` with a synchronous VCF writer and compares the
    generated VCF with the stored reference file.

    The test is skipped on Python 2.  The generated VCF is removed again even
    when the prediction or the comparison fails, so a failing run does not
    leave stale output in the example directory.
    """
    if sys.version_info[0] == 2:
        pytest.skip("rbp example not supported on python 2 ")
    model_dir = "examples/var_seqlen_model/"
    if INSTALL_REQ:
        install_model_requirements(model_dir, "dir", and_dataloaders=True)

    model = kipoi.get_model(model_dir, source="dir")
    # The preprocessor
    Dataloader = kipoi.get_dataloader_factory(model_dir, source="dir")

    dataloader_arguments = {
        "fasta_file": "example_files/hg38_chr22.fa",
        "preproc_transformer": "dataloader_files/encodeSplines.pkl",
        "gtf_file": "example_files/gencode_v25_chr22.gtf.pkl.gz",
        "intervals_file": "example_files/variant_centered_intervals.tsv"
    }
    vcf_path = "example_files/variants.vcf"
    out_vcf_fpath = "example_files/variants_generated.vcf"
    ref_out_vcf_fpath = "example_files/variants_ref_out.vcf"

    # NOTE(review): `cd` is defined elsewhere; presumably it switches the
    # working directory so the relative paths above resolve inside the model
    # source directory -- confirm.
    with cd(model.source_dir):
        vcf_path = kipoi.postprocessing.variant_effects.ensure_tabixed_vcf(
            vcf_path)
        model_info = kipoi.postprocessing.variant_effects.ModelInfoExtractor(
            model, Dataloader)
        writer = kipoi.postprocessing.variant_effects.VcfWriter(
            model, vcf_path, out_vcf_fpath)
        try:
            vcf_to_region = None
            with pytest.raises(Exception):
                # This has to raise an exception as the sequence length is
                # None, hence `vcf_to_region` stays None for the prediction.
                vcf_to_region = kipoi.postprocessing.variant_effects.SnvCenteredRg(
                    model_info)
            try:
                sp.predict_snvs(
                    model,
                    Dataloader,
                    vcf_path,
                    dataloader_args=dataloader_arguments,
                    evaluation_function=analyse_model_preds,
                    batch_size=32,
                    vcf_to_region=vcf_to_region,
                    evaluation_function_kwargs={'diff_types': {
                        'diff': Diff("mean")
                    }},
                    sync_pred_writer=writer)
            finally:
                # Always release the writer, also when prediction fails.
                writer.close()
            compare_vcfs(out_vcf_fpath, ref_out_vcf_fpath)
        finally:
            # Clean up the generated file regardless of the test outcome; the
            # existence check avoids masking the original error when nothing
            # was written.
            if os.path.exists(out_vcf_fpath):
                os.unlink(out_vcf_fpath)
예제 #2
0
def test_var_eff_pred2():
    """Score SNVs with the rbp example model on restricted regions.

    Loads the model and dataloader from ``examples/rbp/``, builds an
    ``SnvPosRestrictedRg`` region generator from the restriction BED file,
    runs ``sp.predict_snvs`` with a synchronous VCF writer and compares the
    generated VCF with the stored reference file.

    The test is skipped on Python 2.  The generated VCF is removed again even
    when the prediction or the comparison fails, so a failing run does not
    leave stale output in the example directory.
    """
    if sys.version_info[0] == 2:
        pytest.skip("rbp example not supported on python 2 ")
    # Take the rbp model
    model_dir = "examples/rbp/"
    if INSTALL_REQ:
        install_model_requirements(model_dir, "dir", and_dataloaders=True)

    model = kipoi.get_model(model_dir, source="dir")
    # The preprocessor
    Dataloader = kipoi.get_dataloader_factory(model_dir, source="dir")

    dataloader_arguments = {
        "fasta_file": "example_files/hg38_chr22.fa",
        "preproc_transformer": "dataloader_files/encodeSplines.pkl",
        "gtf_file": "example_files/gencode_v25_chr22.gtf.pkl.gz",
    }

    # Inputs and outputs of the actual prediction run
    vcf_path = "example_files/variants.vcf"
    out_vcf_fpath = "example_files/variants_generated2.vcf"
    ref_out_vcf_fpath = "example_files/variants_ref_out2.vcf"
    restricted_regions_fpath = "example_files/restricted_regions.bed"

    # NOTE(review): `cd` is defined elsewhere; presumably it switches the
    # working directory so the relative paths above resolve inside the model
    # source directory -- confirm.
    with cd(model.source_dir):
        pbd = pb.BedTool(restricted_regions_fpath)
        model_info = kipoi.postprocessing.variant_effects.ModelInfoExtractor(
            model, Dataloader)
        vcf_to_region = kipoi.postprocessing.variant_effects.SnvPosRestrictedRg(
            model_info, pbd)
        writer = kipoi.postprocessing.variant_effects.utils.io.VcfWriter(
            model, vcf_path, out_vcf_fpath)
        try:
            try:
                sp.predict_snvs(
                    model,
                    Dataloader,
                    vcf_path,
                    dataloader_args=dataloader_arguments,
                    evaluation_function=analyse_model_preds,
                    batch_size=32,
                    vcf_to_region=vcf_to_region,
                    evaluation_function_kwargs={'diff_types': {
                        'diff': Diff("mean")
                    }},
                    sync_pred_writer=writer)
            finally:
                # Always release the writer, also when prediction fails.
                writer.close()
            compare_vcfs(out_vcf_fpath, ref_out_vcf_fpath)
        finally:
            # Clean up the generated file regardless of the test outcome; the
            # existence check avoids masking the original error when nothing
            # was written.
            if os.path.exists(out_vcf_fpath):
                os.unlink(out_vcf_fpath)
예제 #3
0
def test_predict_variants_example_single_model(file_format, tmpdir):
    """Run ``score_variants`` in-process for one example model.

    Calls ``kipoi_veff.cli.cli_score_variants`` directly (no subprocess) on
    the ``non_bedinput_model`` example, then checks that the output VCF
    matches the stored reference and that the extra output file can be
    loaded.

    Args:
        file_format: Extension of the extra output file; ``"hdf5"`` is loaded
            with ``HDF5Reader``, anything else with ``pd.read_table``.
        tmpdir: Directory object providing ``mkdir``/``join`` (presumably the
            pytest ``tmpdir`` fixture) used for all generated files.
    """
    if sys.version_info[0] == 2:
        pytest.skip(
            "Only rbp example testable at the moment, which only runs on py3")
    import json

    examples = "rbp", "non_bedinput_model"
    example_dirs = ["tests/models/{0}/".format(ex) for ex in examples]
    # Only the second example (non_bedinput_model) is exercised here.
    main_example_dir = example_dirs[1]

    tmpdir_here = tmpdir.mkdir("example")

    # non_bedinput_model is not compatible with restricted bed files as
    # alterations in region generation have no influence on that model
    tmpfile = str(tmpdir_here.join("out.{0}".format(file_format)))
    vcf_tmpfile = str(tmpdir_here.join("out.{0}".format("vcf")))

    dataloader_kwargs = {
        "fasta_file": "example_files/hg38_chr22.fa",
        "preproc_transformer": "dataloader_files/encodeSplines.pkl",
        "gtf_file": "example_files/gencode_v25_chr22.gtf.pkl.gz",
        "intervals_file": "example_files/variant_intervals.tsv"
    }
    # Make every dataloader path relative to the example directory.
    dataloader_kwargs = {
        k: main_example_dir + v
        for k, v in dataloader_kwargs.items()
    }

    # Arguments as they would follow "score_variants" on the command line.
    cli_args = [
        main_example_dir,
        "--source=dir",
        "--batch_size=4",
        "--dataloader_args='%s'" % json.dumps(dataloader_kwargs),
        "--input_vcf",
        main_example_dir + "/example_files/variants.vcf",
        "--output_vcf",
        vcf_tmpfile,
        "--extra_output",
        tmpfile
    ]
    if INSTALL_FLAG:
        cli_args.append(INSTALL_FLAG)

    # run the command
    kipoi_veff.cli.cli_score_variants('score_variants', cli_args)

    assert os.path.exists(vcf_tmpfile)
    compare_vcfs(main_example_dir + "/example_files/variants_ref_out.vcf",
                 vcf_tmpfile)
    assert os.path.exists(tmpfile)

    # Smoke check only: the extra output must be loadable, its content is
    # not inspected.
    if file_format == "hdf5":
        HDF5Reader.load(tmpfile)
    else:
        pd.read_table(tmpfile)
예제 #4
0
def test_predict_variants_example_multimodel(file_format, tmpdir):
    """Run ``kipoi postproc score_variants`` on two example models at once.

    Launches ``kipoi/__main__.py`` in a subprocess for the ``rbp`` and
    ``non_bedinput_model`` examples, expects a zero exit status, compares the
    per-model output VCFs with the stored references and checks that the
    extra output file can be loaded.

    Args:
        file_format: Extension of the extra output file; ``"hdf5"`` is loaded
            with ``HDF5Reader``, anything else is parsed as a sequence of
            tab-separated tables introduced by ``KPVEP_`` header lines.
        tmpdir: Directory object providing ``mkdir``/``join`` (presumably the
            pytest ``tmpdir`` fixture) used for all generated files.
    """
    if sys.version_info[0] == 2:
        pytest.skip(
            "Only rbp example testable at the moment, which only runs on py3")
    import json

    examples = "rbp", "non_bedinput_model"
    example_dirs = ["examples/{0}/".format(ex) for ex in examples]
    main_example_dir = example_dirs[1]

    tmpdir_here = tmpdir.mkdir("example")

    # non_bedinput_model is not compatible with restricted bed files as
    # alterations in region generation have no influence on that model
    tmpfile = str(tmpdir_here.join("out.{0}".format(file_format)))
    vcf_tmpfile = str(tmpdir_here.join("out.{0}".format("vcf")))

    dataloader_kwargs = {
        "fasta_file": "example_files/hg38_chr22.fa",
        "preproc_transformer": "dataloader_files/encodeSplines.pkl",
        "gtf_file": "example_files/gencode_v25_chr22.gtf.pkl.gz",
        "intervals_file": "example_files/variant_intervals.tsv"
    }
    # Make every dataloader path relative to the main example directory.
    dataloader_kwargs = {
        k: main_example_dir + v
        for k, v in dataloader_kwargs.items()
    }
    dataloader_kwargs_str = json.dumps(dataloader_kwargs)

    args = [
        # Use the interpreter running the tests rather than whatever
        # "python" happens to resolve to on PATH.
        sys.executable,
        os.path.abspath("./kipoi/__main__.py"),
        "postproc",
        "score_variants",
        example_dirs[0],
        example_dirs[1],
        "--source=dir",
        "--batch_size=4",
        "--dataloader_args='%s'" % dataloader_kwargs_str,
        "--input_vcf",
        main_example_dir + "/example_files/variants.vcf",
        "--output_vcf",
        vcf_tmpfile,
        "--extra_output",
        tmpfile
    ]
    if INSTALL_FLAG:
        args.append(INSTALL_FLAG)

    # Run from two levels above the example directory so the relative
    # example paths in `args` resolve.
    returncode = subprocess.call(args=args,
                                 cwd=os.path.realpath(main_example_dir) +
                                 "/../../")
    assert returncode == 0

    assert os.path.exists(tmpfile)

    for example_dir in example_dirs:
        # One VCF per model: "<output stem><model dir with '/' -> '_'>.vcf"
        vcf_tmpfile_model = vcf_tmpfile[:-4] + example_dir.replace(
            "/", "_") + ".vcf"
        assert os.path.exists(vcf_tmpfile_model)
        compare_vcfs(example_dir + "/example_files/variants_ref_out.vcf",
                     vcf_tmpfile_model)

    # Smoke check only: the extra output must be loadable, its content is
    # not inspected.
    if file_format == "hdf5":
        HDF5Reader.load(tmpfile)
    else:
        table_labels = []
        table_starts = []
        table_ends = []
        tables = {}
        head_line_id = "KPVEP_"
        last_row = -1  # stays -1 when the file has no lines at all
        with open(tmpfile, "r") as ifh:
            for last_row, line in enumerate(ifh):
                if head_line_id in line:
                    # A new header closes the previous table one row above.
                    if table_starts:
                        table_ends.append(last_row - 1)
                    table_labels.append(line.rstrip()[len(head_line_id):])
                    table_starts.append(last_row + 1)
        # Fail with a clear message instead of a NameError on an empty file.
        assert last_row >= 0, "extra output file is empty: {0}".format(tmpfile)
        table_ends.append(last_row)
        for label, start, end in zip(table_labels, table_starts, table_ends):
            # `start` is the column-header row of the table, so the table
            # holds `end - start` data rows.
            tables[label] = pd.read_csv(tmpfile,
                                        sep="\t",
                                        skiprows=start,
                                        nrows=end - start,
                                        index_col=0)
예제 #5
0
def test_predict_variants_example(example, restricted_bed, file_format,
                                  new_dataloader_kwargs_format, tmpdir):
    """Run ``kipoi_veff/cli.py score_variants`` in a subprocess.

    Expects a zero exit status, compares the output VCF with the stored
    reference (``variants_ref_out2.vcf`` when a restriction BED file is used,
    ``variants_ref_out.vcf`` otherwise) and checks that the extra output file
    can be loaded.

    Args:
        example: Name of the model directory below ``tests/models/``; the
            test is skipped unless it is ``"rbp"`` or
            ``"non_bedinput_model"``.
        restricted_bed: When true, pass ``--restriction_bed``; only run for
            the ``"rbp"`` example.
        file_format: Extension of the extra output file; ``"hdf5"`` is loaded
            with ``HDF5Reader``, anything else is parsed as a sequence of
            tab-separated tables introduced by ``KPVEP_`` header lines.
        new_dataloader_kwargs_format: When true, pass the dataloader
            arguments as ``key=value`` items after ``--dataloader_args``;
            otherwise pass them as one quoted JSON string.
        tmpdir: Directory object providing ``mkdir``/``join`` (presumably the
            pytest ``tmpdir`` fixture) used for all generated files.
    """
    if (example not in {"rbp", "non_bedinput_model"}) or (sys.version_info[0]
                                                          == 2):
        pytest.skip(
            "Only rbp example testable at the moment, which only runs on py3")

    example_dir = "tests/models/{0}/".format(example)

    tmpdir_here = tmpdir.mkdir("example")

    # non_bedinput_model is not compatible with restricted bed files as
    # alterations in region generation have no influence on that model
    if restricted_bed and (example != "rbp"):
        pytest.skip("Resticted_bed only available for rbp_eclip")
    tmpfile = str(tmpdir_here.join("out.{0}".format(file_format)))
    vcf_tmpfile = str(tmpdir_here.join("out.{0}".format("vcf")))

    dataloader_kwargs = {
        "fasta_file": "example_files/hg38_chr22.fa",
        "preproc_transformer": "dataloader_files/encodeSplines.pkl",
        "gtf_file": "example_files/gencode_v25_chr22.gtf.pkl.gz",
        "intervals_file": "example_files/variant_intervals.tsv"
    }
    # Make every dataloader path relative to the example directory.
    dataloader_kwargs = {
        k: example_dir + v
        for k, v in dataloader_kwargs.items()
    }

    # The two formats differ only in how the dataloader arguments are
    # spelled on the command line.
    if new_dataloader_kwargs_format:
        dataloader_cli_args = ["--dataloader_args"] + [
            "{0}={1}".format(key, val)
            for key, val in dataloader_kwargs.items()
        ]
    else:
        import json
        dataloader_cli_args = [
            "--dataloader_args='%s'" % json.dumps(dataloader_kwargs)
        ]

    args = [
        # Use the interpreter running the tests rather than whatever
        # "python" happens to resolve to on PATH.
        sys.executable,
        os.path.abspath("./kipoi_veff/cli.py"),
        "score_variants",
        example_dir,
        "--source=dir",
        "--batch_size=4",
    ] + dataloader_cli_args + [
        "--input_vcf",
        # NOTE(review): `temp` is defined elsewhere; presumably it places a
        # copy of the VCF in `tmpdir` and returns its path -- confirm.
        temp(example_dir + "/example_files/variants.vcf", tmpdir),
        "--output_vcf",
        vcf_tmpfile,
        "--extra_output",
        tmpfile
    ]

    if INSTALL_FLAG:
        args.append(INSTALL_FLAG)

    if restricted_bed:
        args += [
            "--restriction_bed",
            example_dir + "/example_files/restricted_regions.bed"
        ]

    returncode = subprocess.call(args=args, cwd=".")
    assert returncode == 0

    assert os.path.exists(tmpfile)
    assert os.path.exists(vcf_tmpfile)

    if restricted_bed:
        ref_vcf_name = "variants_ref_out2.vcf"
    else:
        ref_vcf_name = "variants_ref_out.vcf"
    compare_vcfs(example_dir + "/example_files/" + ref_vcf_name, vcf_tmpfile)

    # Smoke check only: the extra output must be loadable, its content is
    # not inspected.
    if file_format == "hdf5":
        HDF5Reader.load(tmpfile)
    else:
        table_labels = []
        table_starts = []
        table_ends = []
        tables = {}
        head_line_id = "KPVEP_"
        last_row = -1  # stays -1 when the file has no lines at all
        with open(tmpfile, "r") as ifh:
            for last_row, line in enumerate(ifh):
                if head_line_id in line:
                    # A new header closes the previous table one row above.
                    if table_starts:
                        table_ends.append(last_row - 1)
                    table_labels.append(line.rstrip()[len(head_line_id):])
                    table_starts.append(last_row + 1)
        # Fail with a clear message instead of a NameError on an empty file.
        assert last_row >= 0, "extra output file is empty: {0}".format(tmpfile)
        table_ends.append(last_row)
        for label, start, end in zip(table_labels, table_starts, table_ends):
            # `start` is the column-header row of the table, so the table
            # holds `end - start` data rows.
            tables[label] = pd.read_csv(tmpfile,
                                        sep="\t",
                                        skiprows=start,
                                        nrows=end - start,
                                        index_col=0)