def test_var_eff_pred_varseq(tmpdir):
    """Run SNV effect prediction for DeepSEA and check an output VCF is written.

    The model is fetched from the "kipoi" source, predictions for
    ``tests/data/variants.vcf`` are streamed into a ``VcfWriter`` and the
    test passes when the output file exists after the writer is closed.
    """
    model_name = "DeepSEA/variantEffects"
    if INSTALL_REQ:
        install_model_requirements(model_name, "kipoi", and_dataloaders=True)
    model = kipoi.get_model(model_name, source="kipoi")

    # The preprocessor
    Dataloader = SeqIntervalDl

    raw_arguments = {
        "intervals_file": "example_files/intervals.bed",
        "fasta_file": "example_files/hg38_chr22.fa",
        "required_seq_len": 1000,
        "alphabet_axis": 1,
        "dummy_axis": 2,
        "label_dtype": str,
    }
    # String values are prefixed with the model source directory; every
    # other value (ints, the `str` type itself) is passed through untouched.
    dataloader_arguments = {}
    for arg_name, arg_value in raw_arguments.items():
        if isinstance(arg_value, str):
            arg_value = model.source_dir + "/" + arg_value
        dataloader_arguments[arg_name] = arg_value

    out_dir = tmpdir.mkdir("variants_generated")
    out_vcf_fpath = str(out_dir.join("out.vcf"))
    vcf_path = kipoi_veff.ensure_tabixed_vcf("tests/data/variants.vcf")

    model_info = kipoi_veff.ModelInfoExtractor(model, Dataloader)
    writer = kipoi_veff.VcfWriter(model,
                                  vcf_path,
                                  out_vcf_fpath,
                                  standardise_var_id=True)
    region_generator = kipoi_veff.SnvCenteredRg(model_info)

    sp.predict_snvs(model,
                    Dataloader,
                    vcf_path,
                    dataloader_args=dataloader_arguments,
                    batch_size=32,
                    vcf_to_region=region_generator,
                    sync_pred_writer=writer)
    writer.close()

    assert os.path.exists(out_vcf_fpath)
def add_scores(self, snp_vcf_path='../data/snp_vcfs', out_dir=None):
    """Score every VCF found in ``self.snp_vcf_path`` with ``self.model``.

    For each file in the input directory an annotated VCF named
    ``<chrom>.vcf`` is written to ``out_dir``, where ``<chrom>`` is the part
    of the input file name before the first dot. The fasta file handed to
    the dataloader is ``../data/fasta_files/chr<chrom>.fa``.

    Args:
        snp_vcf_path: Not read by this method; the input directory is taken
            from ``self.snp_vcf_path``.
            NOTE(review): confirm whether the argument or the attribute is
            meant to win before changing this.
        out_dir: Output directory, created if missing. Defaults to
            ``'../data/model_scores/' + self.model_name``.
    """
    if out_dir is None:
        out_dir = '../data/model_scores/' + self.model_name
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    for file_name in os.listdir(self.snp_vcf_path):
        chrom = file_name.split('.')[0]
        Dataloader = self.model.default_dataloader
        vcf_path = self.snp_vcf_path + '/' + file_name
        out_vcf_fpath = out_dir + '/' + chrom + '.vcf'
        print(vcf_path)
        print(out_vcf_fpath)

        writer = VcfWriter(self.model, vcf_path, out_vcf_fpath)
        try:
            model_info = kipoi_veff.ModelInfoExtractor(self.model, Dataloader)
            # vcf_to_region will generate a variant-centered regions when
            # presented a VCF record.
            vcf_to_region = kipoi_veff.SnvCenteredRg(model_info)
            dataloader_arguments = {
                "fasta_file": '../data/fasta_files/chr' + chrom + '.fa'
            }
            sp.predict_snvs(
                self.model,
                Dataloader,
                vcf_path,
                batch_size=32,
                dataloader_args=dataloader_arguments,
                vcf_to_region=vcf_to_region,
                #evaluation_function_kwargs={'diff_types': {'diff': Diff("mean"), 'deepsea_effect': DeepSEA_effect("mean")}},
                sync_pred_writer=writer)
        finally:
            # Close the writer for this file before moving on, also when
            # prediction fails, so no output handle is left open per
            # chromosome.
            writer.close()
def test_mutation_map():
    """Generate a mutation map for the rbp test model and compare it to the stored reference.

    The map is computed for ``example_files/first_variant.vcf``, saved to a
    temporary HDF5 file inside the model directory, read back and compared
    with ``example_files/first_variant_mm.hdf5``. Plotting is exercised with
    the non-interactive 'agg' backend. Skipped on Python 2.
    """
    if sys.version_info[0] == 2:
        pytest.skip("rbp example not supported on python 2 ")

    # Take the rbp model
    model_dir = "tests/models/rbp/"
    if INSTALL_REQ:
        install_model_requirements(model_dir, "dir", and_dataloaders=True)

    model = kipoi.get_model(model_dir, source="dir")
    # The preprocessor
    Dataloader = kipoi.get_dataloader_factory(model_dir, source="dir")

    dataloader_arguments = {
        "fasta_file": "example_files/hg38_chr22.fa",
        "preproc_transformer": "dataloader_files/encodeSplines.pkl",
        "gtf_file": "example_files/gencode_v25_chr22.gtf.pkl.gz",
    }
    dataloader_arguments = {
        k: model_dir + v
        for k, v in dataloader_arguments.items()
    }

    # Run the actual predictions
    vcf_path = model_dir + "example_files/first_variant.vcf"

    model_info = kipoi_veff.ModelInfoExtractor(model, Dataloader)
    vcf_to_region = kipoi_veff.SnvCenteredRg(model_info)
    mdmm = mm._generate_mutation_map(
        model,
        Dataloader,
        vcf_path,
        dataloader_args=dataloader_arguments,
        evaluation_function=analyse_model_preds,
        batch_size=32,
        vcf_to_region=vcf_to_region,
        evaluation_function_kwargs={'diff_types': {
            'diff': Diff("mean")
        }})

    with cd(model.source_dir):
        generated_fpath = "example_files/first_variant_mm_totest.hdf5"
        try:
            mdmm.save_to_file(generated_fpath)
            from kipoi_veff.utils.generic import read_hdf5
            reference = read_hdf5("example_files/first_variant_mm.hdf5")
            # Read back the file that was just written; reading the
            # reference twice would compare the reference with itself.
            obs = read_hdf5(generated_fpath)
            compare_rec(reference[0], obs[0])
            # Import the submodule explicitly: a bare `import matplotlib`
            # does not guarantee that `matplotlib.pyplot` is loaded.
            import matplotlib.pyplot
            matplotlib.pyplot.switch_backend('agg')
            mdmm.plot_mutmap(0, "seq", "diff", "rbp_prb")
        finally:
            # Remove the generated file even when a check above fails.
            if os.path.exists(generated_fpath):
                os.unlink(generated_fpath)
def _get_vcf_to_region(model_info, restriction_bed, seq_length):
    """Pick the region generator used to turn VCF records into model inputs.

    Args:
        model_info: Model information object; its
            ``requires_region_definition`` attribute is consulted and it is
            handed to the selected region generator.
        restriction_bed: Bed file passed to ``pybedtools.BedTool``, or
            ``None`` for no restriction.
        seq_length: Forwarded as ``seq_length`` to ``SnvCenteredRg``. It is
            not used when a restriction bed file is given.

    Returns:
        A ``SnvPosRestrictedRg`` when ``restriction_bed`` is given, a
        ``SnvCenteredRg`` when the model requires a region definition, and
        ``None`` otherwise.
    """
    # Select the appropriate region generator
    if restriction_bed is not None:
        # pybedtools is only needed on this branch, so import it lazily.
        import pybedtools

        # Select the restricted SNV-centered region generator
        pbd = pybedtools.BedTool(restriction_bed)
        vcf_to_region = kipoi_veff.SnvPosRestrictedRg(model_info, pbd)
        logger.info(
            'Restriction bed file defined. Only variants in defined regions will be tested. '
            'Only defined regions will be tested.')
    elif model_info.requires_region_definition:
        # Select the SNV-centered region generator
        vcf_to_region = kipoi_veff.SnvCenteredRg(model_info,
                                                 seq_length=seq_length)
        logger.info('Using variant-centered sequence generation.')
    else:
        # No regions can be defined for the given model, VCF overlap will be
        # inferred, hence tabixed VCF is necessary
        vcf_to_region = None
        logger.info(
            'Dataloader does not accept definition of a regions bed-file. Only VCF-variants that lie within '
            'produced regions can be predicted')
    return vcf_to_region
# Score the variants of a test VCF with a DeepBind model and read the
# annotated output back in.
model_name = "DeepBind/Homo_sapiens/TF/D00299.003_SELEX_ATF7"
# get the model
model = kipoi.get_model(model_name)
# get the dataloader factory
Dataloader = model.default_dataloader

vcf_path = "../data/test.vcf"
# The output vcf path, based on the input file name
out_vcf_fpath = vcf_path[:-4] + "%s.vcf" % model_name.replace("/", "_")
# The writer object that will output the annotated VCF
writer = VcfWriter(model, vcf_path, out_vcf_fpath)
try:
    # Information extraction from dataloader and model
    model_info = kipoi_veff.ModelInfoExtractor(model, Dataloader)
    # vcf_to_region will generate a variant-centered regions when presented
    # a VCF record.
    vcf_to_region = kipoi_veff.SnvCenteredRg(model_info)
    dataloader_arguments = {"fasta_file": "../data/fasta_files/chr1.fa"}
    sp.predict_snvs(
        model,
        Dataloader,
        vcf_path,
        batch_size=32,
        dataloader_args=dataloader_arguments,
        vcf_to_region=vcf_to_region,
        #evaluation_function_kwargs={'diff_types': {'diff': Diff("mean"), 'deepsea_effect': DeepSEA_effect("mean")}},
        sync_pred_writer=writer)
finally:
    # Close the writer before the output file is parsed below, so the
    # reader does not open a file that is still being written.
    writer.close()

vcf_reader = KipoiVCFParser(out_vcf_fpath)
entries = list(vcf_reader)
#print(pd.DataFrame(entries).head().iloc[:,:7])
def _create_mutation_map_parser(command):
    """Build the argument parser for `kipoi veff create_mutation_map`."""
    parser = argparse.ArgumentParser(
        'kipoi veff {}'.format(command),
        description='Predict effect of SNVs using ISM.')
    add_model(parser)
    add_dataloader(parser, with_args=True)
    parser.add_argument(
        '-r', '--regions_file',
        help='Region definition as VCF or bed file. Not a required input.')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Batch size to use in prediction')
    parser.add_argument(
        "-n", "--num_workers", type=int, default=0,
        help="Number of parallel workers for loading the dataset")
    parser.add_argument("-i", "--install_req", action='store_true',
                        help="Install required packages from requirements.txt")
    parser.add_argument(
        '-o', '--output', required=True,
        help="Output HDF5 file. To be used as input for plotting.")
    parser.add_argument(
        '-s', "--scores", default="diff", nargs="+",
        help="Scoring method to be used. Only scoring methods selected in the model yaml file are "
        "available except for `diff` which is always available. Select scoring function by the "
        "`name` tag defined in the model yaml file.")
    parser.add_argument(
        '-k', "--score_kwargs", default="", nargs="+",
        help="JSON definition of the kwargs for the scoring functions selected in --scores. The "
        "definition can either be in JSON in the command line or the path of a .json file. The "
        "individual JSONs are expected to be supplied in the same order as the labels defined in "
        "--scores. If the defaults or no arguments should be used define '{}' for that respective "
        "scoring method.")
    parser.add_argument(
        '-l', "--seq_length", type=int, default=None,
        help="Optional parameter: Model input sequence length - necessary if the model does not have a "
        "pre-defined input sequence length.")
    parser.add_argument(
        "--singularity", action='store_true',
        help="Run `kipoi predict` in the appropriate singularity container. "
        "Containers will get downloaded to ~/.kipoi/envs/ or to "
        "$SINGULARITY_CACHEDIR if set")
    return parser


def cli_create_mutation_map(command, raw_args):
    """CLI interface to calculate mutation map data

    Parses ``raw_args``, optionally re-dispatches the command into a
    singularity container, and otherwise generates the mutation map for the
    given regions file (VCF or bed) and saves it to the ``--output`` file.

    Args:
        command: Sub-command name; must be "create_mutation_map".
        raw_args: List of command line arguments following the sub-command.

    Returns:
        None.

    Raises:
        Exception: If the regions file is not given, does not exist, or is
            neither a VCF (.vcf / .vcf.gz) nor a bed (.bed) file.
    """
    assert command == "create_mutation_map"
    parser = _create_mutation_map_parser(command)
    args = parser.parse_args(raw_args)

    # extract args for kipoi.variant_effects.predict_snvs
    print("DL ARGS", args.dataloader_args)
    dataloader_arguments = parse_json_file_str_or_arglist(args.dataloader_args)

    if args.output is None:
        raise Exception("Output file `--output` has to be set!")

    if args.singularity:
        from kipoi.cli.singularity import singularity_command
        logger.info(
            "Running kipoi veff {} in the singularity container".format(
                command))
        # Drop the singularity flag
        raw_args = [x for x in raw_args if x != '--singularity']
        singularity_command(['kipoi', 'veff', command] + raw_args,
                            args.model,
                            dataloader_arguments,
                            output_files=args.output,
                            source=args.source,
                            dry_run=False)
        return None
    # --------------------------------------------

    # The parser does not mark --regions_file as required, but everything
    # below needs it; fail with a clear message instead of a TypeError from
    # os.path.realpath(None).
    if args.regions_file is None:
        raise Exception("Regions file `--regions_file` has to be set!")

    # install args
    if args.install_req:
        kipoi.pipeline.install_model_requirements(args.model,
                                                  args.source,
                                                  and_dataloaders=True)
    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)

    # Resolve to absolute paths before changing the working directory.
    regions_file = os.path.realpath(args.regions_file)
    output = os.path.realpath(args.output)
    with cd(model.source_dir):
        if not os.path.exists(regions_file):
            raise Exception("Regions inputs file does not exist: %s" %
                            args.regions_file)

        # Check that all the folders exist
        file_exists(regions_file, logger)
        dir_exists(os.path.dirname(output), logger)

        if args.dataloader is not None:
            Dl = kipoi.get_dataloader_factory(args.dataloader,
                                              args.dataloader_source)
        else:
            Dl = model.default_dataloader

    # A single score given via the default is a plain string.
    if not isinstance(args.scores, list):
        args.scores = [args.scores]

    # TODO - why is this function not a method of the model class?
    dts = get_scoring_fns(model, args.scores, args.score_kwargs)

    # Load effect prediction related model info
    model_info = kipoi_veff.ModelInfoExtractor(model, Dl)
    manual_seq_len = args.seq_length

    # Select the appropriate region generator and vcf or bed file input
    args.file_format = regions_file.split(".")[-1]
    bed_region_file = None
    vcf_region_file = None
    bed_to_region = None
    vcf_to_region = None
    if args.file_format == "vcf" or regions_file.endswith("vcf.gz"):
        vcf_region_file = regions_file
        if model_info.requires_region_definition:
            # Select the SNV-centered region generator
            vcf_to_region = kipoi_veff.SnvCenteredRg(model_info,
                                                     seq_length=manual_seq_len)
            logger.info('Using variant-centered sequence generation.')
    elif args.file_format == "bed":
        if model_info.requires_region_definition:
            # Select the bed-overlapping region generator
            bed_to_region = kipoi_veff.BedOverlappingRg(
                model_info, seq_length=manual_seq_len)
            logger.info('Using bed-file based sequence generation.')
        bed_region_file = regions_file
    else:
        raise Exception(
            "Regions file has to be a VCF (.vcf, .vcf.gz) or a bed (.bed) "
            "file: %s" % args.regions_file)

    if model_info.use_seq_only_rc:
        logger.info(
            'Model SUPPORTS simple reverse complementation of input DNA sequences.'
        )
    else:
        logger.info(
            'Model DOES NOT support simple reverse complementation of input DNA sequences.'
        )

    from kipoi_veff.mutation_map import _generate_mutation_map
    mdmm = _generate_mutation_map(
        model,
        Dl,
        vcf_fpath=vcf_region_file,
        bed_fpath=bed_region_file,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        dataloader_args=dataloader_arguments,
        vcf_to_region=vcf_to_region,
        bed_to_region=bed_to_region,
        evaluation_function_kwargs={'diff_types': dts},
    )
    mdmm.save_to_file(output)

    logger.info('Successfully generated mutation map data')