def cli_info(command, raw_args):
    """CLI interface to print dataloader keyword arguments
    """
    assert command == "info"
    parser = argparse.ArgumentParser('kipoi {}'.format(command),
                                     description="Prints dataloader keyword arguments.")
    add_model(parser)
    add_dataloader(parser, with_args=False)
    args = parser.parse_args(raw_args)

    # --------------------------------------------
    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)
    if args.dataloader is not None:
        dl_info = "dataloader '{0}' from source '{1}'".format(str(args.dataloader), str(args.dataloader_source))
        Dl = kipoi.get_dataloader_factory(args.dataloader, args.dataloader_source)
    else:
        dl_info = "default dataloader for model '{0}' from source '{1}'".format(str(model.name), str(args.source))
        Dl = model.default_dataloader

    print("-" * 80)
    print("Displaying keyword arguments for {0}".format(dl_info))
    print(Dl.print_args())
    print("-" * 80)
def cli_test(command, raw_args):
    """Runs tests on the model
    """
    assert command == "test"
    # setup the arg-parsing
    parser = argparse.ArgumentParser('kipoi {}'.format(command),
                                     description='Script to test model zoo submissions')
    add_model(parser, source="dir")
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Batch size to use in prediction')
    parser.add_argument("-i", "--install_req", action='store_true',
                        help="Install required packages from requirements.txt")
    args = parser.parse_args(raw_args)
    # --------------------------------------------
    if args.install_req:
        kipoi.pipeline.install_model_requirements(args.model, args.source, and_dataloaders=True)
    mh = kipoi.get_model(args.model, args.source)

    # model requirements should be installed
    if not mh._sufficient_deps(mh.dependencies):
        logger.warning("Required package '{0}' for model type: {1} is not listed in the dependencies".
                       format(mh.MODEL_PACKAGE, mh.type))

    # Load the test files from model source
    mh.pipeline.predict_example(batch_size=args.batch_size)
    logger.info('Successfully ran test_predict')
def cli_test(command, raw_args):
    """Runs tests on the model
    """
    assert command == "test"
    # setup the arg-parsing
    parser = argparse.ArgumentParser(
        'kipoi {}'.format(command),
        description='Script to test model zoo submissions. Example usage:\n'
                    '`kipoi test model/directory`, where `model/directory` is the '
                    'path to a directory containing a model.yaml file.')
    add_model(parser, source="dir")
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Batch size to use in prediction')
    args = parser.parse_args(raw_args)
    # --------------------------------------------
    mh = kipoi.get_model(args.model, args.source)

    # model requirements should be installed
    if not mh._sufficient_deps(mh.dependencies):
        logger.warning(
            "Required package '{0}' for model type: {1} is not listed in the dependencies"
            .format(mh.MODEL_PACKAGE, mh.type))

    # Load the test files from model source
    mh.pipeline.predict_example(batch_size=args.batch_size)
    logger.info('Successfully ran test_predict')
def cli_info(command, raw_args):
    """CLI interface to print dataloader keyword arguments
    """
    assert command == "info"
    parser = argparse.ArgumentParser('kipoi {}'.format(command),
                                     description="Prints dataloader keyword arguments.")
    parser.add_argument("-i", "--install_req", action='store_true',
                        help="Install required packages from requirements.txt")
    add_model(parser)
    add_dataloader(parser, with_args=False)
    args = parser.parse_args(raw_args)
    # --------------------------------------------
    # install args
    if args.install_req:
        kipoi.pipeline.install_model_requirements(args.model, args.source, and_dataloaders=True)

    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)
    if args.dataloader is not None:
        dl_info = "dataloader '{0}' from source '{1}'".format(str(args.dataloader), str(args.dataloader_source))
        Dl = kipoi.get_dataloader_factory(args.dataloader, args.dataloader_source)
    else:
        dl_info = "default dataloader for model '{0}' from source '{1}'".format(str(model.name), str(args.source))
        Dl = model.default_dataloader

    print("-" * 80)
    print("Displaying keyword arguments for {0}".format(dl_info))
    print(kipoi.print_dl_kwargs(Dl))
    print("-" * 80)
def cli_get_example(command, raw_args):
    """Downloads the example files to the desired directory
    """
    assert command == "get-example"
    # setup the arg-parsing
    parser = argparse.ArgumentParser('kipoi {}'.format(command),
                                     description='Get example files')
    add_model(parser, source="kipoi")
    parser.add_argument("-o", "--output", default="example", required=False,
                        help="Output directory where to store the examples. Default: 'example'")
    args = parser.parse_args(raw_args)
    # --------------------------------------------
    md = kipoi.get_model_descr(args.model, args.source)
    src = kipoi.get_source(args.source)

    # load the default dataloader
    if isinstance(md.default_dataloader, kipoi.specs.DataLoaderImport):
        with cd(src.get_model_dir(args.model)):
            dl_descr = md.default_dataloader.get()
    else:
        # load from directory
        # attach the default dataloader already to the model
        dl_descr = kipoi.get_dataloader_descr(os.path.join(args.model, md.default_dataloader),
                                              source=args.source)

    kwargs = dl_descr.download_example(output_dir=args.output, dry_run=False)

    logger.info("Example files downloaded to: {}".format(args.output))
    logger.info("use the following dataloader kwargs:")
    print(json.dumps(kwargs))
def cli_info(command, raw_args):
    """CLI interface to print model and dataloader information
    """
    assert command == "info"
    parser = argparse.ArgumentParser('kipoi {}'.format(command),
                                     description="Prints dataloader keyword arguments.")
    add_model(parser)
    add_dataloader(parser, with_args=False)
    args = parser.parse_args(raw_args)
    # --------------------------------------------
    # load model & dataloader
    md = kipoi.get_model_descr(args.model, args.source)
    src = kipoi.get_source(args.source)

    # load the default dataloader
    try:
        if isinstance(md.default_dataloader, kipoi.specs.DataLoaderImport):
            with cd(src.get_model_dir(args.model)):
                dl_descr = md.default_dataloader.get()
        else:
            # load from directory
            # attach the default dataloader already to the model
            dl_descr = kipoi.get_dataloader_descr(os.path.join(args.model, md.default_dataloader),
                                                  source=args.source)
    except ImportError:
        # if kipoiseq is not installed you get an ImportError
        dl_descr = None

    print("-" * 80)
    print("'{0}' from source '{1}'".format(str(args.model), str(args.source)))
    print("")
    print("Model information")
    print("-----------------")
    print(md.info.get_config_as_yaml())
    if dl_descr:
        print("Dataloader arguments")
        print("--------------------")
        dl_descr.print_args()
        print("--------------------\n")
    print("Run `kipoi get-example {} -o example` to download example files.\n".format(args.model))
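# Usage sketch for `cli_info` (hedged: "DeepSEA/predict" is just an
# illustrative model name, and the module path is assumed to be where the
# CLI entry points live):
#
#     from kipoi.cli.main import cli_info
#     cli_info("info", ["DeepSEA/predict"])
#
# which is equivalent to running `kipoi info DeepSEA/predict` on the command line.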
def cli_get_example(command, raw_args):
    """Downloads the example files to the desired directory
    """
    assert command == "get-example"
    # setup the arg-parsing
    parser = argparse.ArgumentParser('kipoi {}'.format(command),
                                     description='Get example files')
    add_model(parser, source="kipoi")
    parser.add_argument("-o", "--output", default="example", required=False,
                        help="Output directory where to store the examples. Default: 'example'")
    args = parser.parse_args(raw_args)
    # --------------------------------------------
    mh = kipoi.get_model(args.model, args.source)

    kwargs = mh.default_dataloader.download_example(output_dir=args.output, dry_run=False)

    logger.info("Example files downloaded to: {}".format(args.output))
    logger.info("use the following dataloader kwargs:")
    print(json.dumps(kwargs))
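# Usage sketch: `kipoi get-example <model> -o example` downloads the example
# files and prints the dataloader kwargs as JSON. The same flow via the Python
# API (a minimal sketch; "Basset" is a hypothetical model name):
#
#     import kipoi
#     mh = kipoi.get_model("Basset")
#     kwargs = mh.default_dataloader.download_example(output_dir="example", dry_run=False)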
def cli_score_variants(command, raw_args):
    """CLI interface to score variants
    """
    AVAILABLE_FORMATS = ["tsv", "hdf5", "h5"]
    import pybedtools
    assert command == "score_variants"
    parser = argparse.ArgumentParser('kipoi postproc {}'.format(command),
                                     description='Predict effect of SNVs using ISM.')
    add_model(parser)
    add_dataloader(parser, with_args=True)
    parser.add_argument('-v', '--vcf_path',
                        help='Input VCF.')
    # TODO - rename path to fpath
    parser.add_argument('-a', '--out_vcf_fpath', default=None,
                        help='Output annotated VCF file path.')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Batch size to use in prediction')
    parser.add_argument("-n", "--num_workers", type=int, default=0,
                        help="Number of parallel workers for loading the dataset")
    parser.add_argument("-i", "--install_req", action='store_true',
                        help="Install required packages from requirements.txt")
    parser.add_argument('-r', '--restriction_bed', default=None,
                        help="Regions for prediction can only be subsets of this bed file")
    parser.add_argument('-o', '--output', required=False,
                        help="Additional output file. File format is inferred from the file path ending. "
                             "Available file formats are: {0}".format(",".join(AVAILABLE_FORMATS)))
    parser.add_argument('-s', "--scoring", default="diff", nargs="+",
                        help="Scoring method to be used. Only scoring methods selected in the model yaml file are "
                             "available, except for `diff`, which is always available. Select the scoring function by the "
                             "`name` tag defined in the model yaml file.")
    parser.add_argument('-k', "--scoring_kwargs", default="", nargs="+",
                        help="JSON definition of the kwargs for the scoring functions selected in --scoring. The "
                             "definition can either be given as JSON on the command line or as the path of a .json file. The "
                             "individual JSONs are expected to be supplied in the same order as the labels defined in "
                             "--scoring. If the defaults or no arguments should be used, define '{}' for that respective "
                             "scoring method.")
    args = parser.parse_args(raw_args)

    # extract args for kipoi.variant_effects.predict_snvs
    vcf_path = args.vcf_path
    out_vcf_fpath = args.out_vcf_fpath
    dataloader_arguments = parse_json_file_str(args.dataloader_args)

    # infer the file format
    if args.output is not None:
        args.file_format = args.output.split(".")[-1]
        if args.file_format not in AVAILABLE_FORMATS:
            logger.error("File ending: {0} for file {1} not from {2}".format(
                args.file_format, args.output, AVAILABLE_FORMATS))
            sys.exit(1)

        if args.file_format in ["hdf5", "h5"]:
            # only if hdf5 output is used
            import deepdish
    else:
        args.file_format = None

    # Check that all the folders exist
    file_exists(args.vcf_path, logger)
    if args.out_vcf_fpath is not None:
        dir_exists(os.path.dirname(args.out_vcf_fpath), logger)
    if args.output is not None:
        dir_exists(os.path.dirname(args.output), logger)
    # --------------------------------------------
    # install args
    if args.install_req:
        kipoi.pipeline.install_model_requirements(args.model, args.source, and_dataloaders=True)

    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)

    if args.dataloader is not None:
        Dl = kipoi.get_dataloader_factory(args.dataloader, args.dataloader_source)
    else:
        Dl = model.default_dataloader

    if not os.path.exists(vcf_path):
        raise Exception("VCF file does not exist: %s" % vcf_path)

    if not isinstance(args.scoring, list):
        args.scoring = [args.scoring]

    dts = _get_scoring_fns(model, args.scoring, args.scoring_kwargs)

    # Load effect prediction related model info
    model_info = kipoi.postprocessing.variant_effects.ModelInfoExtractor(model, Dl)

    # Select the appropriate region generator
    if args.restriction_bed is not None:
        # Select the restricted SNV-centered region generator
        pbd = pybedtools.BedTool(args.restriction_bed)
        vcf_to_region = kipoi.postprocessing.variant_effects.SnvPosRestrictedRg(model_info, pbd)
        logger.info('Restriction bed file defined. Only variants in the defined regions will be tested.')
    elif model_info.requires_region_definition:
        # Select the SNV-centered region generator
        vcf_to_region = kipoi.postprocessing.variant_effects.SnvCenteredRg(model_info)
        logger.info('Using variant-centered sequence generation.')
    else:
        # No regions can be defined for the given model; VCF overlap will be
        # inferred, hence a tabixed VCF is necessary
        vcf_to_region = None
        # Make sure that the vcf is tabixed
        vcf_path = kipoi.postprocessing.variant_effects.ensure_tabixed_vcf(vcf_path)
        logger.info('Dataloader does not accept definition of a regions bed file. Only VCF variants that lie within '
                    'the produced regions can be predicted.')

    if model_info.use_seq_only_rc:
        logger.info('Model SUPPORTS simple reverse complementation of input DNA sequences.')
    else:
        logger.info('Model DOES NOT support simple reverse complementation of input DNA sequences.')

    # Get a vcf output writer if needed
    if out_vcf_fpath is not None:
        logger.info('Annotated VCF will be written to %s.' % str(out_vcf_fpath))
        vcf_writer = kipoi.postprocessing.variant_effects.VcfWriter(model, vcf_path, out_vcf_fpath)
    else:
        vcf_writer = None

    keep_predictions = args.output is not None

    res = kipoi.postprocessing.variant_effects.predict_snvs(
        model,
        Dl,
        vcf_path,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        dataloader_args=dataloader_arguments,
        vcf_to_region=vcf_to_region,
        evaluation_function_kwargs={"diff_types": dts},
        sync_pred_writer=vcf_writer,
        return_predictions=keep_predictions)

    # tabular files
    if args.output is not None:
        if args.file_format in ["tsv"]:
            for i, k in enumerate(res):
                # Remove an old file if it is still there...
                if i == 0:
                    try:
                        os.unlink(args.output)
                    except Exception:
                        pass
                # append so that earlier scoring methods are not overwritten
                with open(args.output, "a") as ofh:
                    ofh.write("KPVEP_%s\n" % k.upper())
                res[k].to_csv(args.output, sep="\t", mode="a")

        if args.file_format in ["hdf5", "h5"]:
            deepdish.io.save(args.output, res)

    logger.info('Successfully predicted samples')
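# Usage sketch for `cli_score_variants` (a hedged example; model and file
# names are placeholders): score the variants in `in.vcf`, write an annotated
# VCF plus an additional tsv table of `diff` scores:
#
#     kipoi postproc score_variants <model> \
#         --dataloader_args='{"fasta_file": "hg38.fa"}' \
#         -v in.vcf -a out.annotated.vcf -o scores.tsv -s diff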
def cli_predict(command, raw_args):
    """CLI interface to predict
    """
    assert command == "predict"
    parser = argparse.ArgumentParser('kipoi {}'.format(command),
                                     description='Run the model prediction.')
    add_model(parser)
    add_dataloader(parser, with_args=True)
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Batch size to use in prediction')
    parser.add_argument("-n", "--num_workers", type=int, default=0,
                        help="Number of parallel workers for loading the dataset")
    parser.add_argument("-i", "--install_req", action='store_true',
                        help="Install required packages from requirements.txt")
    parser.add_argument("-k", "--keep_inputs", action='store_true',
                        help="Keep the inputs in the output file.")
    parser.add_argument("-l", "--layer",
                        help="Which output layer to use to make the predictions. If specified, "
                             "`model.predict_activation_on_batch` will be invoked instead of `model.predict_on_batch`")
    parser.add_argument('-o', '--output', required=True, nargs="+",
                        help="Output files. File format is inferred from the file path ending. Available file formats are: " +
                             ", ".join(["." + k for k in writers.FILE_SUFFIX_MAP]))
    args = parser.parse_args(raw_args)

    dataloader_kwargs = parse_json_file_str(args.dataloader_args)

    # setup the files
    if not isinstance(args.output, list):
        args.output = [args.output]
    for o in args.output:
        ending = o.split('.')[-1]
        if ending not in writers.FILE_SUFFIX_MAP:
            logger.error("File ending: {0} for file {1} not from {2}".
                         format(ending, o, writers.FILE_SUFFIX_MAP))
            sys.exit(1)
        dir_exists(os.path.dirname(o), logger)
    # --------------------------------------------
    # install args
    if args.install_req:
        kipoi.pipeline.install_model_requirements(args.model, args.source, and_dataloaders=True)

    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)
    if args.dataloader is not None:
        Dl = kipoi.get_dataloader_factory(args.dataloader, args.dataloader_source)
    else:
        Dl = model.default_dataloader
    dataloader_kwargs = kipoi.pipeline.validate_kwargs(Dl, dataloader_kwargs)
    dl = Dl(**dataloader_kwargs)

    # setup batching
    it = dl.batch_iter(batch_size=args.batch_size, num_workers=args.num_workers)

    # Setup the writers
    use_writers = []
    for output in args.output:
        ending = output.split('.')[-1]
        W = writers.FILE_SUFFIX_MAP[ending]
        logger.info("Using {0} for file {1}".format(W.__name__, output))
        if ending == "tsv":
            assert W == writers.TsvBatchWriter
            use_writers.append(writers.TsvBatchWriter(file_path=output, nested_sep="/"))
        elif ending == "bed":
            assert W == writers.BedBatchWriter
            use_writers.append(writers.BedBatchWriter(file_path=output,
                                                      dataloader_schema=dl.output_schema.metadata,
                                                      header=True))
        elif ending in ["hdf5", "h5"]:
            assert W == writers.HDF5BatchWriter
            use_writers.append(writers.HDF5BatchWriter(file_path=output))
        else:
            logger.error("Unknown file format: {0}".format(ending))
            sys.exit(1)

    # Loop through the data, make predictions, save the output
    for i, batch in enumerate(tqdm(it)):
        # validate the data schema in the first iteration
        if i == 0 and not Dl.output_schema.compatible_with_batch(batch):
            logger.warning("First batch of data is not compatible with the dataloader schema.")

        # make the prediction
        if args.layer is None:
            pred_batch = model.predict_on_batch(batch['inputs'])
        else:
            pred_batch = model.predict_activation_on_batch(batch['inputs'], layer=args.layer)

        # write out the predictions, metadata (, inputs, targets)
        output_batch = prepare_batch(batch, pred_batch, keep_inputs=args.keep_inputs)
        for writer in use_writers:
            writer.batch_write(output_batch)

    for writer in use_writers:
        writer.close()
    logger.info('Done! Predictions stored in {0}'.format(",".join(args.output)))
def cli_test(command, raw_args):
    """Runs tests on the model
    """
    assert command == "test"
    # setup the arg-parsing
    parser = argparse.ArgumentParser(
        'kipoi {}'.format(command),
        description='Script to test model zoo submissions. Example usage:\n'
                    '`kipoi test model/directory`, where `model/directory` is the '
                    'path to a directory containing a model.yaml file.')
    add_model(parser, source="dir")
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Batch size to use in prediction')
    parser.add_argument("-o", "--output", default=None, required=False,
                        help="Output hdf5 file")
    parser.add_argument("-s", "--skip-expect", action='store_true',
                        help="Skip validating the expected predictions if the test.expect field is specified under model.yaml")
    parser.add_argument("-e", "--expect", default=None,
                        help="File path to the hdf5 file of predictions produced by `kipoi test -o file.h5` "
                             "or `kipoi predict -o file.h5 --keep_inputs`. Overrides test.expect in model.yaml")
    args = parser.parse_args(raw_args)
    # --------------------------------------------
    mh = kipoi.get_model(args.model, args.source)

    # model requirements should be installed
    if not mh._sufficient_deps(mh.dependencies):
        logger.warning("Required package '{0}' for model type: {1} is not listed in the dependencies".
                       format(mh.MODEL_PACKAGE, mh.type))

    # Load the test files from model source
    mh.pipeline.predict_example(batch_size=args.batch_size, output_file=args.output)

    if (mh.test.expect is not None or args.expect is not None) \
            and not args.skip_expect and args.output is None:
        if args.expect is not None:
            # `expect` specified from the CLI
            expect = args.expect
        else:
            # `expect` taken from model.yaml
            if isinstance(mh.test.expect, kipoi.specs.RemoteFile):
                # download the file
                output_dir = kipoi.get_source(args.source).get_model_download_dir(args.model)
                makedir_exist_ok(output_dir)
                mh.test.expect = mh.test.expect.get_file(os.path.join(output_dir, 'test.expect.h5'))
            expect = mh.test.expect
        logger.info('Testing if the predictions match the expected ones in the file: {}'.format(expect))
        logger.info('Desired precision (number of matching decimal places): {}'.format(mh.test.precision_decimal))

        # iteratively load the expected file
        expected = kipoi.readers.HDF5Reader(expect)
        expected.open()
        it = expected.batch_iter(batch_size=args.batch_size)
        for i, batch in enumerate(tqdm(it, total=len(expected) // args.batch_size)):
            if i == 0 and ('inputs' not in batch or 'preds' not in batch):
                raise ValueError("test.expect file requires 'inputs' and 'preds' "
                                 "to be specified. Available keys: {}".format(list(expected)))
            pred_batch = mh.predict_on_batch(batch['inputs'])
            # compare to the expected predictions
            try:
                compare_numpy_dict(pred_batch, batch['preds'], exact=False,
                                   decimal=mh.test.precision_decimal)
            except Exception as e:
                logger.error("Model predictions don't match the expected predictions. "
                             "expected: {}\nobserved: {}. Exception: {}".format(batch['preds'], pred_batch, e))
                expected.close()
                sys.exit(1)
        expected.close()
        logger.info('All predictions match')
    logger.info('Successfully ran test_predict')
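# Usage sketch for the expected-prediction round trip (file names are
# placeholders): first snapshot the example predictions, then validate a later
# run against that snapshot:
#
#     kipoi test model/directory -o expected.h5    # snapshot predictions
#     kipoi test model/directory -e expected.h5    # compare against snapshot
#
# The snapshot file must contain the 'inputs' and 'preds' keys, as checked above.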
def cli_predict(command, raw_args):
    """CLI interface to predict
    """
    assert command == "predict"
    parser = argparse.ArgumentParser('kipoi {}'.format(command),
                                     description='Run the model prediction.')
    add_model(parser)
    add_dataloader(parser, with_args=True)
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Batch size to use in prediction')
    parser.add_argument("-n", "--num_workers", type=int, default=0,
                        help="Number of parallel workers for loading the dataset")
    parser.add_argument("-k", "--keep_inputs", action='store_true',
                        help="Keep the inputs in the output file.")
    parser.add_argument("-l", "--layer",
                        help="Which output layer to use to make the predictions. If specified, "
                             "`model.predict_activation_on_batch` will be invoked instead of `model.predict_on_batch`")
    parser.add_argument("--singularity", action='store_true',
                        help="Run `kipoi predict` in the appropriate singularity container. "
                             "Containers will get downloaded to ~/.kipoi/envs/ or to "
                             "$SINGULARITY_CACHEDIR if set")
    parser.add_argument('-o', '--output', required=True, nargs="+",
                        help="Output files. File format is inferred from the file path ending. Available file formats are: " +
                             ", ".join(["." + k for k in writers.FILE_SUFFIX_MAP]))
    args = parser.parse_args(raw_args)

    dataloader_kwargs = parse_json_file_str_or_arglist(args.dataloader_args, parser)

    # setup the files
    if not isinstance(args.output, list):
        args.output = [args.output]
    for o in args.output:
        ending = o.split('.')[-1]
        if ending not in writers.FILE_SUFFIX_MAP:
            logger.error("File ending: {0} for file {1} not from {2}".
                         format(ending, o, writers.FILE_SUFFIX_MAP))
            sys.exit(1)
        dir_exists(os.path.dirname(o), logger)

    # singularity_command
    if args.singularity:
        from kipoi.cli.singularity import singularity_command
        logger.info("Running kipoi predict in the singularity container")
        # Drop the singularity flag
        raw_args = [x for x in raw_args if x != '--singularity']
        singularity_command(['kipoi', command] + raw_args,
                            args.model,
                            dataloader_kwargs,
                            output_files=args.output,
                            source=args.source,
                            dry_run=False)
        return None
    # --------------------------------------------
    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)
    if args.dataloader is not None:
        Dl = kipoi.get_dataloader_factory(args.dataloader, args.dataloader_source)
    else:
        Dl = model.default_dataloader
    dataloader_kwargs = kipoi.pipeline.validate_kwargs(Dl, dataloader_kwargs)
    dl = Dl(**dataloader_kwargs)

    # setup batching
    it = dl.batch_iter(batch_size=args.batch_size, num_workers=args.num_workers)

    # Setup the writers
    use_writers = []
    for output in args.output:
        writer = writers.get_writer(output, metadata_schema=dl.get_output_schema().metadata)
        if writer is None:
            logger.error("Unknown file format: {0}".format(output.split('.')[-1]))
            sys.exit(1)
        else:
            use_writers.append(writer)
    output_writers = writers.MultipleBatchWriter(use_writers)

    # Loop through the data, make predictions, save the output
    for i, batch in enumerate(tqdm(it)):
        # validate the data schema in the first iteration
        if i == 0 and not Dl.get_output_schema().compatible_with_batch(batch):
            logger.warning("First batch of data is not compatible with the dataloader schema.")

        # make the prediction
        if args.layer is None:
            pred_batch = model.predict_on_batch(batch['inputs'])
        else:
            pred_batch = model.predict_activation_on_batch(batch['inputs'], layer=args.layer)

        # write out the predictions, metadata (, inputs, targets)
        output_batch = prepare_batch(batch, pred_batch, keep_inputs=args.keep_inputs)
        output_writers.batch_write(output_batch)

    output_writers.close()
    logger.info('Done! Predictions stored in {0}'.format(",".join(args.output)))
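# Usage sketch (a hedged example; model and file names are placeholders):
#
#     kipoi predict <model> \
#         --dataloader_args='{"fasta_file": "hg38.fa", "intervals_file": "regions.bed"}' \
#         -o preds.tsv preds.h5
#
# Multiple `-o` targets are allowed; one writer per file suffix is created and
# every batch is written to all of them.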
def create_tf_session(visiblegpus, per_process_gpu_memory_fraction=0.45):
    import os
    import tensorflow as tf
    import keras.backend as K
    os.environ['CUDA_VISIBLE_DEVICES'] = str(visiblegpus)
    session_config = tf.ConfigProto()
    session_config.gpu_options.per_process_gpu_memory_fraction = per_process_gpu_memory_fraction
    session = tf.Session(config=session_config)
    K.set_session(session)
    return session


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Transfer-learn a Keras model from Kipoi')
    add_model(parser)
    add_dataloader(parser, with_args=False)
    parser.add_argument('--dl_kwargs_train',
                        help="training data-loader kwargs")
    parser.add_argument('--dl_kwargs_eval',
                        help="Evaluation data-loader kwargs")
    parser.add_argument('-t', '--tasks', type=int,
                        help='Number of transferred tasks')
    parser.add_argument('-o', '--output',
                        help='Output file directory')
    parser.add_argument('--transfer_to',
                        help='Layer to which to transfer the model')
    parser.add_argument('--freeze_to', default=None)
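# Usage sketch for `create_tf_session` (note: `tf.ConfigProto` and `tf.Session`
# are TensorFlow 1.x APIs; in TensorFlow 2.x they only exist under `tf.compat.v1`):
#
#     session = create_tf_session(visiblegpus=0, per_process_gpu_memory_fraction=0.3)
#
# This pins the process to GPU 0 via CUDA_VISIBLE_DEVICES and caps its GPU
# memory so that several transfer-learning jobs can share one device.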
def cli_create_mutation_map(command, raw_args):
    """CLI interface to calculate mutation map data
    """
    assert command == "create_mutation_map"
    parser = argparse.ArgumentParser('kipoi postproc {}'.format(command),
                                     description='Predict effect of SNVs using ISM.')
    add_model(parser)
    add_dataloader(parser, with_args=True)
    parser.add_argument('-r', '--regions_file',
                        help='Region definition as VCF or bed file. Not a required input.')
    # TODO - rename path to fpath
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Batch size to use in prediction')
    parser.add_argument("-n", "--num_workers", type=int, default=0,
                        help="Number of parallel workers for loading the dataset")
    parser.add_argument("-i", "--install_req", action='store_true',
                        help="Install required packages from requirements.txt")
    parser.add_argument('-o', '--output', required=True,
                        help="Output HDF5 file. To be used as input for plotting.")
    parser.add_argument('-s', "--scores", default="diff", nargs="+",
                        help="Scoring method to be used. Only scoring methods selected in the model yaml file are "
                             "available, except for `diff`, which is always available. Select the scoring function by the "
                             "`name` tag defined in the model yaml file.")
    parser.add_argument('-k', "--score_kwargs", default="", nargs="+",
                        help="JSON definition of the kwargs for the scoring functions selected in --scores. The "
                             "definition can either be given as JSON on the command line or as the path of a .json file. The "
                             "individual JSONs are expected to be supplied in the same order as the labels defined in "
                             "--scores. If the defaults or no arguments should be used, define '{}' for that respective "
                             "scoring method.")
    parser.add_argument('-l', "--seq_length", type=int, default=None,
                        help="Optional parameter: Model input sequence length - necessary if the model does not have a "
                             "pre-defined input sequence length.")
    args = parser.parse_args(raw_args)

    # extract args for kipoi.variant_effects.predict_snvs
    dataloader_arguments = parse_json_file_str(args.dataloader_args)

    if args.output is None:
        raise Exception("Output file `--output` has to be set!")
    # --------------------------------------------
    # install args
    if args.install_req:
        kipoi.pipeline.install_model_requirements(args.model, args.source, and_dataloaders=True)

    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)
    regions_file = os.path.realpath(args.regions_file)
    output = os.path.realpath(args.output)
    with cd(model.source_dir):
        if not os.path.exists(regions_file):
            raise Exception("Regions inputs file does not exist: %s" % args.regions_file)

        # Check that all the folders exist
        file_exists(regions_file, logger)
        dir_exists(os.path.dirname(output), logger)

        if args.dataloader is not None:
            Dl = kipoi.get_dataloader_factory(args.dataloader, args.dataloader_source)
        else:
            Dl = model.default_dataloader

    if not isinstance(args.scores, list):
        args.scores = [args.scores]

    dts = get_scoring_fns(model, args.scores, args.score_kwargs)

    # Load effect prediction related model info
    model_info = kipoi.postprocessing.variant_effects.ModelInfoExtractor(model, Dl)
    manual_seq_len = args.seq_length

    # Select the appropriate region generator and vcf or bed file input
    args.file_format = regions_file.split(".")[-1]
    bed_region_file = None
    vcf_region_file = None
    bed_to_region = None
    vcf_to_region = None
    if args.file_format == "vcf" or regions_file.endswith("vcf.gz"):
        vcf_region_file = regions_file
        if model_info.requires_region_definition:
            # Select the SNV-centered region generator
            vcf_to_region = kipoi.postprocessing.variant_effects.SnvCenteredRg(
                model_info, seq_length=manual_seq_len)
            logger.info('Using variant-centered sequence generation.')
    elif args.file_format == "bed":
        if model_info.requires_region_definition:
            # Select the bed-overlapping region generator
            bed_to_region = kipoi.postprocessing.variant_effects.BedOverlappingRg(
                model_info, seq_length=manual_seq_len)
            logger.info('Using bed-file based sequence generation.')
        bed_region_file = regions_file
    else:
        raise Exception("Regions file `--regions_file` has to be a VCF or a bed file.")

    if model_info.use_seq_only_rc:
        logger.info('Model SUPPORTS simple reverse complementation of input DNA sequences.')
    else:
        logger.info('Model DOES NOT support simple reverse complementation of input DNA sequences.')

    from kipoi.postprocessing.variant_effects.mutation_map import _generate_mutation_map
    mdmm = _generate_mutation_map(model,
                                  Dl,
                                  vcf_fpath=vcf_region_file,
                                  bed_fpath=bed_region_file,
                                  batch_size=args.batch_size,
                                  num_workers=args.num_workers,
                                  dataloader_args=dataloader_arguments,
                                  vcf_to_region=vcf_to_region,
                                  bed_to_region=bed_to_region,
                                  evaluation_function_kwargs={'diff_types': dts})
    mdmm.save_to_file(output)

    logger.info('Successfully generated mutation map data')
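# Usage sketch (a hedged example; model and file names are placeholders):
# generate mutation-map data for the variants in `in.vcf` and store it as an
# HDF5 file for later plotting:
#
#     kipoi postproc create_mutation_map <model> \
#         --dataloader_args='{"fasta_file": "hg38.fa"}' \
#         -r in.vcf -o mutmap.h5 -s diff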
def cli_grad_to_file(command, raw_args):
    """CLI to save seq inputs of grad*input to a bigwig file
    """
    assert command == "gr_inp_to_file"
    parser = argparse.ArgumentParser('kipoi postproc {}'.format(command),
                                     description='Save grad*input in a file.')
    add_model(parser)
    add_dataloader(parser, with_args=True)
    # TODO - rename path to fpath
    parser.add_argument('-f', '--input_file', required=False,
                        help="Input HDF5 file produced from `grad`")
    parser.add_argument('-o', '--output', required=False,
                        help="Output bigwig or bedgraph file")
    parser.add_argument('--sample', required=False, type=int, default=None,
                        help="Input line for which the BigWig file should be generated. If not defined, all "
                             "samples will be written.")
    parser.add_argument('--model_input', required=False, default=None,
                        help="Model input name to be used for plotting, as defined in model.yaml. Can be omitted if the "
                             "model only has one input.")
    args = parser.parse_args(raw_args)

    # Check that all the folders exist
    dir_exists(os.path.dirname(args.output), logger)
    # --------------------------------------------
    import matplotlib.pyplot
    matplotlib.pyplot.switch_backend('agg')

    from kipoi.postprocessing.gradient_vis.vis import GradPlotter
    from kipoi.writers import BedGraphWriter

    logger.info('Loading gradient results file and model info...')

    gp = GradPlotter.from_hdf5(args.input_file, model=args.model, source=args.source)

    if args.sample is not None:
        samples = [args.sample]
    else:
        samples = list(range(gp.get_num_samples(args.model_input)))

    if args.output.endswith(".bed") or args.output.endswith(".bedgraph"):
        of_obj = BedGraphWriter(args.output)
    else:
        raise Exception("Output file format not supported!")

    logger.info('Writing...')

    for sample in samples:
        gp.write(sample, model_input=args.model_input, writer_obj=of_obj)

    logger.info('Saving...')

    of_obj.close()

    logger.info('Successfully wrote grad*input to file.')
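# Usage sketch (a hedged example; model and file names are placeholders):
# convert the HDF5 output of the `grad` command into a bedgraph track:
#
#     kipoi postproc gr_inp_to_file <model> -f grads.h5 -o grads.bedgraph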
def cli_grad(command, raw_args):
    """CLI interface to compute gradients
    """
    from .main import prepare_batch
    from kipoi.model import GradientMixin
    assert command == "grad"
    from tqdm import tqdm
    parser = argparse.ArgumentParser(
        'kipoi {}'.format(command),
        description='Save gradients and inputs to a hdf5 file.')
    add_model(parser)
    add_dataloader(parser, with_args=True)
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Batch size to use in prediction')
    parser.add_argument("-n", "--num_workers", type=int, default=0,
                        help="Number of parallel workers for loading the dataset")
    parser.add_argument("-i", "--install_req", action='store_true',
                        help="Install required packages from requirements.txt")
    parser.add_argument("-l", "--layer", default=None, required=False,
                        help="Which output layer to use for computing the gradients. "
                             "If not set, `--final_layer` has to be used.")
    parser.add_argument("--final_layer", action='store_true',
                        help="Alternatively to `--layer`, this flag can be used to indicate that the last layer should "
                             "be used.")
    parser.add_argument("--pre_nonlinearity", action='store_true',
                        help="Flag indicating that it should be checked whether the selected output is post-activation "
                             "function. If a non-linear activation function is used, attempt to use its input. This "
                             "feature is not available for all models.")
    parser.add_argument("-f", "--filter_idx", default=None,
                        help="Filter index that should be inspected with gradients. If not set, all filters will "
                             "be used.")
    parser.add_argument("-a", "--avg_func", choices=GradientMixin.allowed_functions, default="sum",
                        help="Averaging function to be applied across selected filters (`--filter_idx`) in "
                             "layer `--layer`.")
    parser.add_argument('--selected_fwd_node', default=None, type=int,
                        help="If the selected layer has multiple inbound connections in "
                             "the graph then those can be selected here with an integer "
                             "index. Not necessarily supported by all models.")
    parser.add_argument('-o', '--output', required=True, nargs="+",
                        help="Output files. File format is inferred from the file path ending. Available file formats are: " +
                             ", ".join(["." + k for k in writers.FILE_SUFFIX_MAP]))
    args = parser.parse_args(raw_args)

    dataloader_kwargs = parse_json_file_str(args.dataloader_args)

    # setup the files
    if not isinstance(args.output, list):
        args.output = [args.output]
    for o in args.output:
        ending = o.split('.')[-1]
        if ending not in writers.FILE_SUFFIX_MAP:
            logger.error("File ending: {0} for file {1} not from {2}".format(
                ending, o, writers.FILE_SUFFIX_MAP))
            sys.exit(1)
        dir_exists(os.path.dirname(o), logger)
    # --------------------------------------------
    # install args
    if args.install_req:
        kipoi.pipeline.install_model_requirements(args.model, args.source, and_dataloaders=True)

    layer = args.layer
    if layer is None and not args.final_layer:
        raise Exception("A layer has to be selected explicitly using `--layer` or implicitly by using the "
                        "`--final_layer` flag.")

    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)

    if not isinstance(model, GradientMixin):
        raise Exception("Model does not support gradient calculation.")

    if args.dataloader is not None:
        Dl = kipoi.get_dataloader_factory(args.dataloader, args.dataloader_source)
    else:
        Dl = model.default_dataloader
    dataloader_kwargs = kipoi.pipeline.validate_kwargs(Dl, dataloader_kwargs)
    dl = Dl(**dataloader_kwargs)

    filter_idx_parsed = None
    if args.filter_idx is not None:
        filter_idx_parsed = parse_filter_slice(args.filter_idx)

    # setup batching
    it = dl.batch_iter(batch_size=args.batch_size, num_workers=args.num_workers)

    # Setup the writers
    use_writers = []
    for output in args.output:
        ending = output.split('.')[-1]
        W = writers.FILE_SUFFIX_MAP[ending]
        logger.info("Using {0} for file {1}".format(W.__name__, output))
        if ending == "tsv":
            assert W == writers.TsvBatchWriter
            use_writers.append(writers.TsvBatchWriter(file_path=output, nested_sep="/"))
        elif ending == "bed":
            raise Exception("Please use tsv or hdf5 output format.")
        elif ending in ["hdf5", "h5"]:
            assert W == writers.HDF5BatchWriter
            use_writers.append(writers.HDF5BatchWriter(file_path=output))
        else:
            logger.error("Unknown file format: {0}".format(ending))
            sys.exit(1)

    # Loop through the data, compute gradients, save the output
    for i, batch in enumerate(tqdm(it)):
        # validate the data schema in the first iteration
        if i == 0 and not Dl.output_schema.compatible_with_batch(batch):
            logger.warning("First batch of data is not compatible with the dataloader schema.")

        # compute the gradients
        pred_batch = model.input_grad(batch['inputs'],
                                      filter_idx=filter_idx_parsed,
                                      avg_func=args.avg_func,
                                      layer=layer,
                                      final_layer=args.final_layer,
                                      selected_fwd_node=args.selected_fwd_node,
                                      pre_nonlinearity=args.pre_nonlinearity)

        # write out the gradients together with the batch metadata
        # always keep the inputs so that input*grad can be generated!
        # output_batch = prepare_batch(batch, pred_batch, keep_inputs=True)
        output_batch = batch
        output_batch["grads"] = pred_batch
        for writer in use_writers:
            writer.batch_write(output_batch)

    for writer in use_writers:
        writer.close()
    logger.info('Done! Gradients stored in {0}'.format(",".join(args.output)))
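# Usage sketch (a hedged example; names are placeholders, and the command is
# shown as registered here under `kipoi grad`): store the gradients of the
# final layer together with the inputs:
#
#     kipoi grad <model> --final_layer \
#         --dataloader_args='{"fasta_file": "hg38.fa", "intervals_file": "regions.bed"}' \
#         -o grads.h5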
def cli_ism(command, raw_args):
    """CLI interface to run in-silico mutagenesis (ISM)
    """
    # TODO: find a way to define the model output selection
    assert command == "ism"
    from tqdm import tqdm
    from .ism import Mutation
    parser = argparse.ArgumentParser('kipoi interpret {}'.format(command),
                                     description='Calculate in-silico mutagenesis (ISM) scores.')
    add_model(parser)
    add_dataloader(parser, with_args=True)
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Batch size to use in prediction')
    parser.add_argument("-n", "--num_workers", type=int, default=0,
                        help="Number of parallel workers for loading the dataset")
    parser.add_argument("--model_input", required=True,
                        help="Name of the model input that should be scored.")
    parser.add_argument('-s', "--scores", default="diff", nargs="+",
                        help="Scoring method to be used. Only scoring methods selected in the model yaml file are "
                             "available, except for `diff`, which is always available. Select the scoring function by the "
                             "`name` tag defined in the model yaml file.")
    parser.add_argument('-k', "--score_kwargs", default=None, nargs="+",
                        help="JSON definition of the kwargs for the scoring functions selected in --scores. The "
                             "definition can either be given as JSON on the command line or as the path of a .json file. The "
                             "individual JSONs are expected to be supplied in the same order as the labels defined in "
                             "--scores. If the defaults or no arguments should be used, define '{}' for that respective "
                             "scoring method.")
    parser.add_argument("-c", "--category_axis", default=1, type=int, required=False,
                        help="Using the selected model input with `--model_input`: which "
                             "dimension of that array contains the one-hot encoded categories? "
                             "E.g. for a one-hot encoded DNA-sequence "
                             "array with input shape (1000, 4) for a single sample, "
                             "`--category_axis` is 1; for (4, 1000) it is 0.")
    parser.add_argument("-f", "--output_sel_fn", default=None, required=False,
                        help="Define an output selection function in order to return effects "
                             "on the output of the function. Example definition: "
                             "`--output_sel_fn my_file.py::my_sel_fn`")
    parser.add_argument('-o', '--output', required=True, nargs="+",
                        help="Output files. File format is inferred from the file path ending. "
                             "Available file formats are: " +
                             ", ".join(["." + k for k in writers.FILE_SUFFIX_MAP]))
    args = parser.parse_args(raw_args)

    dataloader_kwargs = parse_json_file_str(args.dataloader_args)

    # setup the files
    if not isinstance(args.output, list):
        args.output = [args.output]
    for o in args.output:
        ending = o.split('.')[-1]
        if ending not in writers.FILE_SUFFIX_MAP:
            logger.error("File ending: {0} for file {1} not from {2}".format(
                ending, o, writers.FILE_SUFFIX_MAP))
            sys.exit(1)
        dir_exists(os.path.dirname(o), logger)
    # --------------------------------------------
    if not isinstance(args.scores, list):
        args.scores = [args.scores]

    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)

    if args.dataloader is not None:
        Dl = kipoi.get_dataloader_factory(args.dataloader, args.dataloader_source)
    else:
        Dl = model.default_dataloader
    dataloader_kwargs = kipoi.pipeline.validate_kwargs(Dl, dataloader_kwargs)
    dl = Dl(**dataloader_kwargs)

    # setup batching
    it = dl.batch_iter(batch_size=args.batch_size, num_workers=args.num_workers)

    # Setup the writers
    use_writers = []
    for output in args.output:
        ending = output.split('.')[-1]
        W = writers.FILE_SUFFIX_MAP[ending]
        logger.info("Using {0} for file {1}".format(W.__name__, output))
        if ending == "tsv":
            assert W == writers.TsvBatchWriter
            use_writers.append(writers.TsvBatchWriter(file_path=output, nested_sep="/"))
        elif ending == "bed":
            raise Exception("Please use tsv or hdf5 output format.")
        elif ending in ["hdf5", "h5"]:
            assert W == writers.HDF5BatchWriter
            use_writers.append(writers.HDF5BatchWriter(file_path=output))
        else:
            logger.error("Unknown file format: {0}".format(ending))
            sys.exit(1)

    output_sel_fn = None
    if args.output_sel_fn is not None:
        file_path, obj_name = tuple(args.output_sel_fn.split("::"))
        output_sel_fn = getattr(load_module(file_path), obj_name)

    m = Mutation(model, args.model_input,
                 scores=args.scores,
                 score_kwargs=args.score_kwargs,
                 batch_size=args.batch_size,
                 output_sel_fn=output_sel_fn,
                 category_axis=args.category_axis,
                 test_ref_ref=True)

    out_batches = {}

    # Loop through the data, make predictions, save the output..
    # TODO: batch writer fails because it tries to concatenate on the highest dimension rather than the lowest!
    for i, batch in enumerate(tqdm(it)):
        # validate the data schema in the first iteration
        if i == 0 and not Dl.output_schema.compatible_with_batch(batch):
            logger.warning("First batch of data is not compatible with the dataloader schema.")

        # calculate scores without reference for the moment.
        pred_batch = m.score(batch['inputs'])

        # with the current writers it's not possible to store the scores and the model inputs in the same file
        output_batch = {}
        output_batch["scores"] = pred_batch
        for k in output_batch:
            if k not in out_batches:
                out_batches[k] = []
            out_batches[k].append(output_batch[k])

    # concatenate batches:
    full_output = {k: np.concatenate([np.array(el) for el in v])
                   for k, v in out_batches.items()}

    logger.info('Full output shape: {0}'.format(str(full_output["scores"].shape)))

    for writer in use_writers:
        writer.batch_write(full_output)

    for writer in use_writers:
        writer.close()
    logger.info('Done! ISM scores stored in {0}'.format(",".join(args.output)))
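# Usage sketch (a hedged example; model and file names are placeholders):
# ISM scores for the one-hot encoded model input named "seq", with the
# categories on axis 1:
#
#     kipoi interpret ism <model> --model_input seq -c 1 \
#         --dataloader_args='{"fasta_file": "hg38.fa"}' \
#         -o ism_scores.h5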
def cli_feature_importance(command, raw_args):
    """CLI interface to compute feature importance scores
    """
    assert command == "feature_importance"
    parser = argparse.ArgumentParser(
        'kipoi {}'.format(command),
        description='Save feature importance scores and inputs to a hdf5 file.')
    add_model(parser)
    add_dataloader(parser, with_args=True)
    parser.add_argument("--imp_score", help="Importance score name",
                        choices=available_importance_scores())
    parser.add_argument("--imp_score_kwargs", help="Importance score kwargs")
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Batch size to use in prediction')
    parser.add_argument("-n", "--num_workers", type=int, default=0,
                        help="Number of parallel workers for loading the dataset")
    # `args.install_req` is referenced below, so the flag has to be registered here
    parser.add_argument("-i", "--install_req", action='store_true',
                        help="Install required packages from requirements.txt")

    # TODO - handle the reference-based importance scores...

    # io
    parser.add_argument('-o', '--output', required=True, nargs="+",
                        help="Output files. File format is inferred from the file path ending. Available file formats are: " +
                             ", ".join(["." + k for k in writers.FILE_SUFFIX_MAP]))
    args = parser.parse_args(raw_args)

    dataloader_kwargs = parse_json_file_str(args.dataloader_args)
    imp_score_kwargs = parse_json_file_str(args.imp_score_kwargs)

    # setup the files
    if not isinstance(args.output, list):
        args.output = [args.output]
    for o in args.output:
        ending = o.split('.')[-1]
        if ending not in writers.FILE_SUFFIX_MAP:
            logger.error("File ending: {0} for file {1} not from {2}".format(
                ending, o, writers.FILE_SUFFIX_MAP))
            sys.exit(1)
        dir_exists(os.path.dirname(o), logger)
    # --------------------------------------------
    # install args
    if args.install_req:
        kipoi.pipeline.install_model_requirements(args.model, args.source, and_dataloaders=True)

    # load model & dataloader
    model = kipoi.get_model(args.model, args.source, with_dataloader=args.dataloader is None)

    if args.dataloader is not None:
        Dl = kipoi.get_dataloader_factory(args.dataloader, args.dataloader_source)
    else:
        Dl = model.default_dataloader
    dataloader_kwargs = kipoi.pipeline.validate_kwargs(Dl, dataloader_kwargs)
    dl = Dl(**dataloader_kwargs)

    # get_importance_score
    ImpScore = get_importance_score(args.imp_score)
    if not ImpScore.is_compatible(model):
        raise ValueError("model not compatible with score: {0}".format(args.imp_score))
    impscore = ImpScore(model, **imp_score_kwargs)

    # setup batching
    it = dl.batch_iter(batch_size=args.batch_size, num_workers=args.num_workers)

    # Setup the writers
    use_writers = []
    for output in args.output:
        ending = output.split('.')[-1]
        W = writers.FILE_SUFFIX_MAP[ending]
        logger.info("Using {0} for file {1}".format(W.__name__, output))
        if ending == "tsv":
            assert W == writers.TsvBatchWriter
            use_writers.append(writers.TsvBatchWriter(file_path=output, nested_sep="/"))
        elif ending == "bed":
            raise Exception("Please use tsv or hdf5 output format.")
        elif ending in ["hdf5", "h5"]:
            assert W == writers.HDF5BatchWriter
            use_writers.append(writers.HDF5BatchWriter(file_path=output))
        else:
            logger.error("Unknown file format: {0}".format(ending))
            sys.exit(1)

    # Loop through the data, compute importance scores, save the output
    for i, batch in enumerate(tqdm(it)):
        # validate the data schema in the first iteration
        if i == 0 and not Dl.output_schema.compatible_with_batch(batch):
            logger.warning("First batch of data is not compatible with the dataloader schema.")

        # compute the importance scores
        # TODO - handle the reference-based importance scores...
        importance_scores = impscore.score(batch['inputs'])

        # write out the importance scores together with the batch metadata
        # always keep the inputs so that input*grad can be generated!
        output_batch = batch
        output_batch["importance_scores"] = importance_scores
        for writer in use_writers:
            writer.batch_write(output_batch)

    for writer in use_writers:
        writer.close()
    logger.info('Done! Importance scores stored in {0}'.format(",".join(args.output)))
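# Usage sketch (a hedged example; the model, score name, and file names are
# placeholders):
#
#     kipoi feature_importance <model> --imp_score <score_name> \
#         --dataloader_args='{"fasta_file": "hg38.fa"}' \
#         -o importance.h5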
def cli_deeplift(command, raw_args):
    """CLI interface to compute DeepLIFT scores
    """
    # TODO: find a way to define the "reference" for a scored sequence.
    assert command == "deeplift"
    from tqdm import tqdm
    from .referencebased import DeepLift
    from .referencebased import get_mxts_modes
    parser = argparse.ArgumentParser('kipoi interpret {}'.format(command),
                                     description='Calculate DeepLIFT scores.')
    add_model(parser)
    add_dataloader(parser, with_args=True)
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Batch size to use in prediction')
    parser.add_argument("-n", "--num_workers", type=int, default=0,
                        help="Number of parallel workers for loading the dataset")
    parser.add_argument("-l", "--layer", type=int, default=None, required=True,
                        help="With respect to which layer the scores should be calculated.")
    parser.add_argument("--pre_nonlinearity", action='store_true',
                        help="Flag indicating that it should be checked whether the selected output is post-activation "
                             "function. If a non-linear activation function is used, attempt to use its input. This "
                             "feature is not available for all models.")
    parser.add_argument("-f", "--filter_idx", default=None, required=True, type=int,
                        help="Filter index that should be inspected with gradients")
    parser.add_argument("-m", "--mxts_mode", default='rescale_conv_revealcancel_fc',
                        help="DeepLIFT score, allowed values are: %s" % str(list(get_mxts_modes().keys())))
    parser.add_argument('-o', '--output', required=True, nargs="+",
                        help="Output files. File format is inferred from the file path ending. Available file formats are: " +
                             ", ".join(["." + k for k in writers.FILE_SUFFIX_MAP]))
    args = parser.parse_args(raw_args)

    dataloader_kwargs = parse_json_file_str(args.dataloader_args)

    # setup the files
    if not isinstance(args.output, list):
        args.output = [args.output]
    for o in args.output:
        ending = o.split('.')[-1]
        if ending not in writers.FILE_SUFFIX_MAP:
            logger.error("File ending: {0} for file {1} not from {2}".format(
                ending, o, writers.FILE_SUFFIX_MAP))
            sys.exit(1)
        dir_exists(os.path.dirname(o), logger)
    # --------------------------------------------
    # `--layer` is required, so no additional layer-selection check is needed here

    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)
    if args.dataloader is not None:
        Dl = kipoi.get_dataloader_factory(args.dataloader, args.dataloader_source)
    else:
        Dl = model.default_dataloader
    dataloader_kwargs = kipoi.pipeline.validate_kwargs(Dl, dataloader_kwargs)
    dl = Dl(**dataloader_kwargs)

    # setup batching
    it = dl.batch_iter(batch_size=args.batch_size, num_workers=args.num_workers)

    # Setup the writers
    use_writers = []
    for output in args.output:
        ending = output.split('.')[-1]
        W = writers.FILE_SUFFIX_MAP[ending]
        logger.info("Using {0} for file {1}".format(W.__name__, output))
        if ending == "tsv":
            assert W == writers.TsvBatchWriter
            use_writers.append(writers.TsvBatchWriter(file_path=output, nested_sep="/"))
        elif ending == "bed":
            raise Exception("Please use tsv or hdf5 output format.")
        elif ending in ["hdf5", "h5"]:
            assert W == writers.HDF5BatchWriter
            use_writers.append(writers.HDF5BatchWriter(file_path=output))
        else:
            logger.error("Unknown file format: {0}".format(ending))
            sys.exit(1)

    d = DeepLift(model,
                 output_layer=args.layer,
                 task_idx=args.filter_idx,
                 preact=args.pre_nonlinearity,
                 mxts_mode=args.mxts_mode,
                 batch_size=args.batch_size)

    # Loop through the data, compute the scores, save the output
    for i, batch in enumerate(tqdm(it)):
        # validate the data schema in the first iteration
        if i == 0 and not Dl.output_schema.compatible_with_batch(batch):
            logger.warning("First batch of data is not compatible with the dataloader schema.")

        # calculate scores without reference for the moment.
        pred_batch = d.score(batch['inputs'], None)

        # write out the scores together with the batch metadata
        # always keep the inputs so that input*grad can be generated!
        output_batch = batch
        output_batch["scores"] = pred_batch
        for writer in use_writers:
            writer.batch_write(output_batch)

    for writer in use_writers:
        writer.close()
    logger.info('Done! DeepLIFT scores stored in {0}'.format(",".join(args.output)))
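# Usage sketch (a hedged example; model and file names are placeholders, and
# the layer/filter indices are arbitrary illustrative values):
#
#     kipoi interpret deeplift <model> -l 12 -f 0 \
#         --dataloader_args='{"fasta_file": "hg38.fa"}' \
#         -o deeplift_scores.h5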