def test_var_eff_pred_varseq():
    """Run SNV effect prediction on the variable-sequence-length example model.

    The test loads the model and dataloader from
    ``examples/var_seqlen_model/``, checks that building an
    ``SnvCenteredRg`` region generator raises (the model's sequence length
    is None), and then runs ``sp.predict_snvs`` with ``vcf_to_region=None``.
    The VCF written by the prediction is compared against the stored
    reference VCF with ``compare_vcfs``.

    The generated VCF is written next to the example files, so it is removed
    in a ``finally`` clause; the writer is likewise closed even if the
    prediction fails.
    """
    if sys.version_info[0] == 2:
        pytest.skip("rbp example not supported on python 2 ")

    model_dir = "examples/var_seqlen_model/"
    if INSTALL_REQ:
        install_model_requirements(model_dir, "dir", and_dataloaders=True)

    model = kipoi.get_model(model_dir, source="dir")
    # The preprocessor
    Dataloader = kipoi.get_dataloader_factory(model_dir, source="dir")

    dataloader_arguments = {
        "fasta_file": "example_files/hg38_chr22.fa",
        "preproc_transformer": "dataloader_files/encodeSplines.pkl",
        "gtf_file": "example_files/gencode_v25_chr22.gtf.pkl.gz",
        "intervals_file": "example_files/variant_centered_intervals.tsv"
    }
    vcf_path = "example_files/variants.vcf"
    out_vcf_fpath = "example_files/variants_generated.vcf"
    ref_out_vcf_fpath = "example_files/variants_ref_out.vcf"

    veff = kipoi.postprocessing.variant_effects

    # The relative paths above are used inside the `cd(model.source_dir)`
    # context.
    with cd(model.source_dir):
        vcf_path = veff.ensure_tabixed_vcf(vcf_path)
        model_info = veff.ModelInfoExtractor(model, Dataloader)
        writer = veff.VcfWriter(model, vcf_path, out_vcf_fpath)
        try:
            try:
                vcf_to_region = None
                with pytest.raises(Exception):
                    # This has to raise an exception as the sequence length
                    # is None.
                    vcf_to_region = veff.SnvCenteredRg(model_info)
                sp.predict_snvs(
                    model,
                    Dataloader,
                    vcf_path,
                    dataloader_args=dataloader_arguments,
                    evaluation_function=analyse_model_preds,
                    batch_size=32,
                    vcf_to_region=vcf_to_region,
                    evaluation_function_kwargs={
                        'diff_types': {'diff': Diff("mean")}
                    },
                    sync_pred_writer=writer)
            finally:
                # Close the writer exactly once, whether or not the
                # prediction succeeded.
                writer.close()
            compare_vcfs(out_vcf_fpath, ref_out_vcf_fpath)
        finally:
            # Never leave the generated file behind in the example tree,
            # even when the prediction or the comparison failed.
            if os.path.exists(out_vcf_fpath):
                os.unlink(out_vcf_fpath)
def test_var_eff_pred2():
    """Run SNV effect prediction on the rbp example with restricted regions.

    The test loads the model and dataloader from ``examples/rbp/``, builds
    an ``SnvPosRestrictedRg`` region generator from
    ``example_files/restricted_regions.bed`` and runs ``sp.predict_snvs``.
    The VCF written by the prediction is compared against the stored
    reference VCF with ``compare_vcfs``.

    The generated VCF is written next to the example files, so it is removed
    in a ``finally`` clause; the writer is likewise closed even if the
    prediction fails.
    """
    if sys.version_info[0] == 2:
        pytest.skip("rbp example not supported on python 2 ")

    # Take the rbp model
    model_dir = "examples/rbp/"
    if INSTALL_REQ:
        install_model_requirements(model_dir, "dir", and_dataloaders=True)

    model = kipoi.get_model(model_dir, source="dir")
    # The preprocessor
    Dataloader = kipoi.get_dataloader_factory(model_dir, source="dir")

    dataloader_arguments = {
        "fasta_file": "example_files/hg38_chr22.fa",
        "preproc_transformer": "dataloader_files/encodeSplines.pkl",
        "gtf_file": "example_files/gencode_v25_chr22.gtf.pkl.gz",
    }

    # Inputs and outputs of the actual prediction run
    vcf_path = "example_files/variants.vcf"
    out_vcf_fpath = "example_files/variants_generated2.vcf"
    ref_out_vcf_fpath = "example_files/variants_ref_out2.vcf"
    restricted_regions_fpath = "example_files/restricted_regions.bed"

    veff = kipoi.postprocessing.variant_effects

    # The relative paths above are used inside the `cd(model.source_dir)`
    # context.
    with cd(model.source_dir):
        pbd = pb.BedTool(restricted_regions_fpath)
        model_info = veff.ModelInfoExtractor(model, Dataloader)
        vcf_to_region = veff.SnvPosRestrictedRg(model_info, pbd)
        writer = veff.utils.io.VcfWriter(model, vcf_path, out_vcf_fpath)
        try:
            try:
                sp.predict_snvs(
                    model,
                    Dataloader,
                    vcf_path,
                    dataloader_args=dataloader_arguments,
                    evaluation_function=analyse_model_preds,
                    batch_size=32,
                    vcf_to_region=vcf_to_region,
                    evaluation_function_kwargs={
                        'diff_types': {'diff': Diff("mean")}
                    },
                    sync_pred_writer=writer)
            finally:
                # Close the writer exactly once, whether or not the
                # prediction succeeded.
                writer.close()
            compare_vcfs(out_vcf_fpath, ref_out_vcf_fpath)
        finally:
            # Never leave the generated file behind in the example tree,
            # even when the prediction or the comparison failed.
            if os.path.exists(out_vcf_fpath):
                os.unlink(out_vcf_fpath)
def test_predict_variants_example_single_model(file_format, tmpdir):
    """Score variants for one model by calling the CLI function in-process.

    ``kipoi_veff.cli.cli_score_variants`` is invoked directly (no
    subprocess) for ``tests/models/non_bedinput_model/``. Afterwards the
    output VCF must exist and match the reference VCF, and the extra output
    file must exist and be loadable.

    Args:
        file_format: Extension used for the ``--extra_output`` file.
            ``"hdf5"`` is loaded with ``HDF5Reader.load``; any other value
            is read with ``pd.read_table``.
        tmpdir: Directory object providing ``mkdir``/``join`` (pytest's
            ``tmpdir`` fixture, judging by the name -- confirm).
    """
    import json

    if sys.version_info[0] == 2:
        pytest.skip(
            "Only rbp example testable at the moment, which only runs on py3")

    example_dir = "tests/models/non_bedinput_model/"

    tmpdir_here = tmpdir.mkdir("example")
    tmpfile = str(tmpdir_here.join("out.{0}".format(file_format)))
    vcf_tmpfile = str(tmpdir_here.join("out.{0}".format("vcf")))

    dataloader_kwargs = {
        "fasta_file": "example_files/hg38_chr22.fa",
        "preproc_transformer": "dataloader_files/encodeSplines.pkl",
        "gtf_file": "example_files/gencode_v25_chr22.gtf.pkl.gz",
        "intervals_file": "example_files/variant_intervals.tsv"
    }
    # Make every dataloader path relative to the repository root.
    dataloader_kwargs = {
        key: example_dir + rel_path
        for key, rel_path in dataloader_kwargs.items()
    }

    # Only the arguments that follow the sub-command are needed, because the
    # CLI function is called directly rather than through an interpreter.
    cli_args = [
        example_dir,
        "--source=dir",
        "--batch_size=4",
        "--dataloader_args='%s'" % json.dumps(dataloader_kwargs),
        "--input_vcf", example_dir + "/example_files/variants.vcf",
        "--output_vcf", vcf_tmpfile,
        "--extra_output", tmpfile
    ]
    if INSTALL_FLAG:
        cli_args.append(INSTALL_FLAG)

    # run the command
    kipoi_veff.cli.cli_score_variants('score_variants', cli_args)

    assert os.path.exists(vcf_tmpfile)
    compare_vcfs(example_dir + "/example_files/variants_ref_out.vcf",
                 vcf_tmpfile)

    assert os.path.exists(tmpfile)
    # Loading is a smoke check that the extra output parses; the content is
    # not inspected.
    if file_format == "hdf5":
        HDF5Reader.load(tmpfile)
    else:
        pd.read_table(tmpfile)
def test_predict_variants_example_multimodel(file_format, tmpdir):
    """Score variants for two example models with a single CLI invocation.

    ``kipoi postproc score_variants`` is run in a subprocess for the ``rbp``
    and ``non_bedinput_model`` examples. For each model the output VCF is
    expected at the ``--output_vcf`` path with the model directory (slashes
    replaced by underscores) inserted before the ``.vcf`` suffix, and must
    match that model's reference VCF. The extra output file must exist and
    be loadable.

    Args:
        file_format: Extension used for the ``--extra_output`` file.
            ``"hdf5"`` is loaded with ``HDF5Reader.load``; any other value
            is parsed as tab-separated tables introduced by ``KPVEP_``
            header lines.
        tmpdir: Directory object providing ``mkdir``/``join`` (pytest's
            ``tmpdir`` fixture, judging by the name -- confirm).
    """
    import json

    if sys.version_info[0] == 2:
        pytest.skip(
            "Only rbp example testable at the moment, which only runs on py3")

    examples = "rbp", "non_bedinput_model"
    example_dirs = ["examples/{0}/".format(ex) for ex in examples]
    main_example_dir = example_dirs[1]

    tmpdir_here = tmpdir.mkdir("example")
    tmpfile = str(tmpdir_here.join("out.{0}".format(file_format)))
    vcf_tmpfile = str(tmpdir_here.join("out.{0}".format("vcf")))

    dataloader_kwargs = {
        "fasta_file": "example_files/hg38_chr22.fa",
        "preproc_transformer": "dataloader_files/encodeSplines.pkl",
        "gtf_file": "example_files/gencode_v25_chr22.gtf.pkl.gz",
        "intervals_file": "example_files/variant_intervals.tsv"
    }
    dataloader_kwargs = {
        key: main_example_dir + rel_path
        for key, rel_path in dataloader_kwargs.items()
    }

    args = [
        # Use the interpreter running this test: a bare "python" may resolve
        # to a different installation than the one the py2 guard inspected.
        sys.executable,
        os.path.abspath("./kipoi/__main__.py"),
        "postproc",
        "score_variants",
        example_dirs[0],
        example_dirs[1],
        "--source=dir",
        "--batch_size=4",
        "--dataloader_args='%s'" % json.dumps(dataloader_kwargs),
        "--input_vcf", main_example_dir + "/example_files/variants.vcf",
        "--output_vcf", vcf_tmpfile,
        "--extra_output", tmpfile
    ]
    if INSTALL_FLAG:
        args.append(INSTALL_FLAG)

    returncode = subprocess.call(
        args=args, cwd=os.path.realpath(main_example_dir) + "/../../")
    assert returncode == 0

    assert os.path.exists(tmpfile)

    for example_dir in example_dirs:
        # One VCF per model: "<out>.vcf" -> "<out><model_dir_with_underscores>.vcf"
        vcf_tmpfile_model = (
            vcf_tmpfile[:-4] + example_dir.replace("/", "_") + ".vcf")
        assert os.path.exists(vcf_tmpfile_model)
        compare_vcfs(example_dir + "/example_files/variants_ref_out.vcf",
                     vcf_tmpfile_model)

    # Loading is a smoke check that the extra output parses; the content is
    # not inspected.
    if file_format == "hdf5":
        HDF5Reader.load(tmpfile)
    else:
        head_line_id = "KPVEP_"
        table_labels = []
        table_starts = []
        table_ends = []
        last_line_no = -1
        with open(tmpfile, "r") as ifh:
            for last_line_no, line in enumerate(ifh):
                if head_line_id in line:
                    # A new header closes the previous table, if any.
                    if table_starts:
                        table_ends.append(last_line_no - 1)
                    table_labels.append(line.rstrip()[len(head_line_id):])
                    table_starts.append(last_line_no + 1)
        # Fail with a clear message instead of an unbound loop variable when
        # the command produced an empty file.
        assert last_line_no >= 0, \
            "extra output file is empty: {0}".format(tmpfile)
        # The last table runs to the end of the file.
        table_ends.append(last_line_no)

        tables = {}
        for label, start, end in zip(table_labels, table_starts, table_ends):
            # `start` is the column-header row of the table, hence
            # `end - start` data rows follow it.
            tables[label] = pd.read_csv(tmpfile,
                                        sep="\t",
                                        skiprows=start,
                                        nrows=end - start,
                                        index_col=0)
def test_predict_variants_example(example, restricted_bed, file_format,
                                  new_dataloader_kwargs_format, tmpdir):
    """Score variants for one example model through the CLI in a subprocess.

    Runs ``kipoi_veff/cli.py score_variants`` on ``tests/models/<example>/``
    and checks that the output VCF matches the stored reference and that the
    extra output file exists and is loadable.

    Args:
        example: Name of the model directory below ``tests/models/``. Only
            ``"rbp"`` and ``"non_bedinput_model"`` are run; anything else is
            skipped.
        restricted_bed: If truthy, ``--restriction_bed`` is passed and the
            result is compared against ``variants_ref_out2.vcf`` instead of
            ``variants_ref_out.vcf``. Skipped for examples other than
            ``"rbp"``.
        file_format: Extension used for the ``--extra_output`` file.
            ``"hdf5"`` is loaded with ``HDF5Reader.load``; any other value
            is parsed as tab-separated tables introduced by ``KPVEP_``
            header lines.
        new_dataloader_kwargs_format: If truthy, dataloader arguments are
            passed as separate ``key=value`` tokens after
            ``--dataloader_args``; otherwise as one quoted JSON string.
        tmpdir: Directory object providing ``mkdir``/``join`` (pytest's
            ``tmpdir`` fixture, judging by the name -- confirm).
    """
    import json

    if (example not in {"rbp", "non_bedinput_model"}) or \
            (sys.version_info[0] == 2):
        pytest.skip(
            "Only rbp example testable at the moment, which only runs on py3")

    example_dir = "tests/models/{0}/".format(example)

    tmpdir_here = tmpdir.mkdir("example")

    # non_bedinput_model is not compatible with restricted bed files as
    # alterations in region generation have no influence on that model
    if restricted_bed and (example != "rbp"):
        pytest.skip("Resticted_bed only available for rbp_eclip")

    tmpfile = str(tmpdir_here.join("out.{0}".format(file_format)))
    vcf_tmpfile = str(tmpdir_here.join("out.{0}".format("vcf")))

    dataloader_kwargs = {
        "fasta_file": "example_files/hg38_chr22.fa",
        "preproc_transformer": "dataloader_files/encodeSplines.pkl",
        "gtf_file": "example_files/gencode_v25_chr22.gtf.pkl.gz",
        "intervals_file": "example_files/variant_intervals.tsv"
    }
    dataloader_kwargs = {
        key: example_dir + rel_path
        for key, rel_path in dataloader_kwargs.items()
    }

    # The two invocation styles differ only in how the dataloader arguments
    # are encoded on the command line.
    if new_dataloader_kwargs_format:
        dataloader_cli_args = ["--dataloader_args"] + [
            "{0}={1}".format(key, val)
            for key, val in dataloader_kwargs.items()
        ]
    else:
        dataloader_cli_args = [
            "--dataloader_args='%s'" % json.dumps(dataloader_kwargs)
        ]

    args = [
        # Use the interpreter running this test: a bare "python" may resolve
        # to a different installation than the one the py2 guard inspected.
        sys.executable,
        os.path.abspath("./kipoi_veff/cli.py"),
        "score_variants",
        example_dir,
        "--source=dir",
        "--batch_size=4"
    ] + dataloader_cli_args + [
        "--input_vcf",
        # `temp` is a helper defined elsewhere in this file; presumably it
        # places a copy of the VCF under tmpdir -- confirm.
        temp(example_dir + "/example_files/variants.vcf", tmpdir),
        "--output_vcf", vcf_tmpfile,
        "--extra_output", tmpfile
    ]
    if INSTALL_FLAG:
        args.append(INSTALL_FLAG)

    if restricted_bed:
        args += [
            "--restriction_bed",
            example_dir + "/example_files/restricted_regions.bed"
        ]

    returncode = subprocess.call(args=args, cwd=".")
    assert returncode == 0

    assert os.path.exists(tmpfile)
    assert os.path.exists(vcf_tmpfile)

    if restricted_bed:
        ref_vcf = example_dir + "/example_files/variants_ref_out2.vcf"
    else:
        ref_vcf = example_dir + "/example_files/variants_ref_out.vcf"
    compare_vcfs(ref_vcf, vcf_tmpfile)

    # Loading is a smoke check that the extra output parses; the content is
    # not inspected.
    if file_format == "hdf5":
        HDF5Reader.load(tmpfile)
    else:
        head_line_id = "KPVEP_"
        table_labels = []
        table_starts = []
        table_ends = []
        last_line_no = -1
        with open(tmpfile, "r") as ifh:
            for last_line_no, line in enumerate(ifh):
                if head_line_id in line:
                    # A new header closes the previous table, if any.
                    if table_starts:
                        table_ends.append(last_line_no - 1)
                    table_labels.append(line.rstrip()[len(head_line_id):])
                    table_starts.append(last_line_no + 1)
        # Fail with a clear message instead of an unbound loop variable when
        # the command produced an empty file.
        assert last_line_no >= 0, \
            "extra output file is empty: {0}".format(tmpfile)
        # The last table runs to the end of the file.
        table_ends.append(last_line_no)

        tables = {}
        for label, start, end in zip(table_labels, table_starts, table_ends):
            # `start` is the column-header row of the table, hence
            # `end - start` data rows follow it.
            tables[label] = pd.read_csv(tmpfile,
                                        sep="\t",
                                        skiprows=start,
                                        nrows=end - start,
                                        index_col=0)