Example no. 1
def cli_preproc(command, raw_args):
    """Preprocess:
    - Run the dataloader and store the results to a (hdf5) file
    """
    assert command == "preproc"
    parser = argparse.ArgumentParser(
        'kipoi {}'.format(command),
        description='Run the dataloader and save the output to an hdf5 file.')
    add_dataloader_main(parser, with_args=True)
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='Batch size to use in data loading')
    parser.add_argument("-i",
                        "--install_req",
                        action='store_true',
                        help="Install required packages from requirements.txt")
    parser.add_argument(
        "-n",
        "--num_workers",
        type=int,
        default=0,
        help="Number of parallel workers for loading the dataset")
    parser.add_argument("-o",
                        "--output",
                        required=True,
                        help="Output hdf5 file")
    args = parser.parse_args(raw_args)

    dataloader_kwargs = parse_json_file_str(args.dataloader_args)

    dir_exists(os.path.dirname(args.output), logger)
    # --------------------------------------------
    # install args
    if args.install_req:
        kipoi.pipeline.install_dataloader_requirements(args.dataloader,
                                                       args.source)
    Dataloader = kipoi.get_dataloader_factory(args.dataloader, args.source)

    dataloader_kwargs = kipoi.pipeline.validate_kwargs(Dataloader,
                                                       dataloader_kwargs)
    dataloader = Dataloader(**dataloader_kwargs)

    it = dataloader.batch_iter(batch_size=args.batch_size,
                               num_workers=args.num_workers)

    logger.info("Writing to the hdf5 file: {0}".format(args.output))
    writer = writers.HDF5BatchWriter(file_path=args.output)

    for i, batch in enumerate(tqdm(it)):
        # check that the first batch was indeed correct
        if i == 0 and not Dataloader.output_schema.compatible_with_batch(
                batch):
            logger.warn(
                "First batch of data is not compatible with the dataloader schema."
            )
        writer.batch_write(batch)

    writer.close()
    logger.info("Done!")
Example no. 2
def test_gradplotter():
    from kipoi.postprocessing.gradient_vis.vis import GradPlotter, get_selector
    example = "rbp"
    if example in {"rbp", "non_bedinput_model", "iris_model_template"} and sys.version_info[0] == 2:
        pytest.skip("rbp example not supported on python 2 ")

    example_dir = "tests/models/{0}".format(example)

    output = os.path.realpath(example_dir + "/grad_outputs.hdf5")
    try:
        os.unlink(output)
    except OSError:
        pass

    writer = writers.HDF5BatchWriter(file_path=output)
    get_example_data(example, predict_activation_layers[example], writer=writer)

    gp = GradPlotter.from_hdf5(output, example_dir, source="dir")

    # test get_num_samples
    assert gp.data['inputs']['seq'].shape[0] == gp.get_num_samples("seq")

    # once we have a gp instance:
    exp_ret_fns = [gp._select_ds_dict, gp._select_ds_list, gp._select_ds_ndarray]
    for model_schema_yaml, exp_ret_fn in zip(MODEL_SCHEMA_EXAMPLES, exp_ret_fns):
        schema = ModelSchema.from_config(from_yaml(model_schema_yaml))
        ret_fn, ret_labels = gp._get_ds_extractor(schema.inputs)
        assert ret_fn == exp_ret_fn
        assert ret_labels == ['seq']
    try:
        os.unlink(output)
    except OSError:
        pass
Example no. 3
def test_deeplift():
    # return True
    example = "tal1_model"
    layer = predict_activation_layers[example]
    example_dir = "tests/models/{0}".format(example)
    if INSTALL_REQ:
        install_model_requirements(example_dir, "dir", and_dataloaders=True)

    model = kipoi.get_model(example_dir, source="dir")
    # The preprocessor
    Dataloader = kipoi.get_dataloader_factory(example_dir, source="dir")
    #
    with open(example_dir + "/example_files/test.json", "r") as ifh:
        dataloader_arguments = json.load(ifh)

    for k in dataloader_arguments:
        dataloader_arguments[k] = "example_files/" + dataloader_arguments[k]

    d = DeepLift(model,
                 output_layer=-2,
                 task_idx=0,
                 preact=None,
                 mxts_mode='grad_times_inp')

    new_ofname = model.source_dir + "/example_files/deeplift_grads_pred.hdf5"
    if os.path.exists(new_ofname):
        os.unlink(new_ofname)

    writer = writers.HDF5BatchWriter(file_path=new_ofname)

    with kipoi.utils.cd(model.source_dir):
        dl = Dataloader(**dataloader_arguments)
        it = dl.batch_iter(batch_size=32, num_workers=0)
        # Loop through the data, make predictions, save the output
        for i, batch in enumerate(tqdm(it)):
            # make the prediction
            pred_batch = d.score(batch['inputs'], None)

            # Using Avanti's recommendation to check whether the model conversion has worked.
            pred_batch_fwd = d.predict_on_batch(batch['inputs'])
            orig_pred_batch_fwd = model.predict_on_batch(batch['inputs'])
            assert np.all(pred_batch_fwd == orig_pred_batch_fwd)

        output_batch = batch
        output_batch["input_grad"] = pred_batch
        writer.batch_write(output_batch)
    writer.close()

    new_res = readers.HDF5Reader.load(new_ofname)
    ref_res = readers.HDF5Reader.load(model.source_dir +
                                      "/example_files/grads.hdf5")
    assert np.all(
        np.isclose(new_res['input_grad'],
                   (ref_res['inputs'] * ref_res['grads'])))

    if os.path.exists(new_ofname):
        os.unlink(new_ofname)
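The write/read round trip exercised by this test reduces to writers.HDF5BatchWriter plus readers.HDF5Reader.load, which returns the written dict-of-arrays structure. A small sketch with placeholder arrays and file name:

import numpy as np
from kipoi import readers, writers  # assumed import paths; may differ between kipoi versions

path = "roundtrip.hdf5"  # placeholder output file
batch = {"inputs": np.random.rand(8, 4), "grads": np.random.rand(8, 4)}

writer = writers.HDF5BatchWriter(file_path=path)
writer.batch_write(batch)
writer.close()

data = readers.HDF5Reader.load(path)  # dict of numpy arrays, as used in the assertions above
assert np.allclose(data["inputs"], batch["inputs"])
assert np.allclose(data["grads"], batch["grads"])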
Example no. 4
def test_score():
    example = "tal1_model"
    layer = predict_activation_layers[example]
    example_dir = "example/models/{0}".format(example)
    if INSTALL_REQ:
        install_model_requirements(example_dir, "dir", and_dataloaders=True)

    model = kipoi.get_model(example_dir, source="dir")
    # The preprocessor
    Dataloader = kipoi.get_dataloader_factory(example_dir, source="dir")
    #
    with open(example_dir + "/example_files/test.json", "r") as ifh:
        dataloader_arguments = json.load(ifh)

    for k in dataloader_arguments:
        dataloader_arguments[k] = "example_files/" + dataloader_arguments[k]

    g = Gradient(model, None, layer=layer, avg_func="sum")

    if os.path.exists(model.source_dir + "/example_files/grads_pred.hdf5"):
        os.unlink(model.source_dir + "/example_files/grads_pred.hdf5")

    writer = writers.HDF5BatchWriter(file_path=model.source_dir + "/example_files/grads_pred.hdf5")

    with kipoi_utils.utils.cd(model.source_dir):
        dl = Dataloader(**dataloader_arguments)
        it = dl.batch_iter(batch_size=32, num_workers=0)
        # Loop through the data, make predictions, save the output
        for i, batch in enumerate(tqdm(it)):
            # make the prediction
            pred_batch = g.score(batch['inputs'])
            output_batch = batch
            output_batch["grads"] = pred_batch
            writer.batch_write(output_batch)
        writer.close()

    obj1 = readers.HDF5Reader.load(model.source_dir + "/example_files/grads_pred.hdf5")
    obj2 = readers.HDF5Reader.load(model.source_dir + "/example_files/grads.hdf5")
    kipoi_utils.utils.compare_numpy_dict(obj1, obj2)

    if os.path.exists(model.source_dir + "/example_files/grads_pred.hdf5"):
        os.unlink(model.source_dir + "/example_files/grads_pred.hdf5")
Example no. 5
def cli_predict(command, raw_args):
    """CLI interface to predict
    """
    assert command == "predict"
    parser = argparse.ArgumentParser('kipoi {}'.format(command),
                                     description='Run the model prediction.')
    add_model(parser)
    add_dataloader(parser, with_args=True)
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Batch size to use in prediction')
    parser.add_argument("-n", "--num_workers", type=int, default=0,
                        help="Number of parallel workers for loading the dataset")
    parser.add_argument("-i", "--install_req", action='store_true',
                        help="Install required packages from requirements.txt")
    parser.add_argument("-k", "--keep_inputs", action='store_true',
                        help="Keep the inputs in the output file. ")
    parser.add_argument("-l", "--layer",
                        help="Which output layer to use to make the predictions. If specified," +
                        "`model.predict_activation_on_batch` will be invoked instead of `model.predict_on_batch`")
    parser.add_argument('-o', '--output', required=True, nargs="+",
                        help="Output files. File format is inferred from the file path ending. Available file formats are: " +
                        ", ".join(["." + k for k in writers.FILE_SUFFIX_MAP]))
    args = parser.parse_args(raw_args)

    dataloader_kwargs = parse_json_file_str(args.dataloader_args)

    # setup the files
    if not isinstance(args.output, list):
        args.output = [args.output]
    for o in args.output:
        ending = o.split('.')[-1]
        if ending not in writers.FILE_SUFFIX_MAP:
            logger.error("File ending: {0} for file {1} not from {2}".
                         format(ending, o, writers.FILE_SUFFIX_MAP))
            sys.exit(1)
        dir_exists(os.path.dirname(o), logger)
    # --------------------------------------------
    # install args
    if args.install_req:
        kipoi.pipeline.install_model_requirements(args.model,
                                                  args.source,
                                                  and_dataloaders=True)
    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)

    if args.dataloader is not None:
        Dl = kipoi.get_dataloader_factory(args.dataloader, args.dataloader_source)
    else:
        Dl = model.default_dataloader

    dataloader_kwargs = kipoi.pipeline.validate_kwargs(Dl, dataloader_kwargs)
    dl = Dl(**dataloader_kwargs)

    # setup batching
    it = dl.batch_iter(batch_size=args.batch_size,
                       num_workers=args.num_workers)

    # Setup the writers
    use_writers = []
    for output in args.output:
        ending = output.split('.')[-1]
        W = writers.FILE_SUFFIX_MAP[ending]
        logger.info("Using {0} for file {1}".format(W.__name__, output))
        if ending == "tsv":
            assert W == writers.TsvBatchWriter
            use_writers.append(writers.TsvBatchWriter(file_path=output, nested_sep="/"))
        elif ending == "bed":
            assert W == writers.BedBatchWriter
            use_writers.append(writers.BedBatchWriter(file_path=output,
                                                      dataloader_schema=dl.output_schema.metadata,
                                                      header=True))
        elif ending in ["hdf5", "h5"]:
            assert W == writers.HDF5BatchWriter
            use_writers.append(writers.HDF5BatchWriter(file_path=output))
        else:
            logger.error("Unknown file format: {0}".format(ending))
            sys.exit(1)

    # Loop through the data, make predictions, save the output
    for i, batch in enumerate(tqdm(it)):
        # validate the data schema in the first iteration
        if i == 0 and not Dl.output_schema.compatible_with_batch(batch):
            logger.warn("First batch of data is not compatible with the dataloader schema.")

        # make the prediction
        if args.layer is None:
            pred_batch = model.predict_on_batch(batch['inputs'])
        else:
            pred_batch = model.predict_activation_on_batch(batch['inputs'], layer=args.layer)

        # write out the predictions, metadata (, inputs, targets)
        output_batch = prepare_batch(batch, pred_batch, keep_inputs=args.keep_inputs)
        for writer in use_writers:
            writer.batch_write(output_batch)

    for writer in use_writers:
        writer.close()
    logger.info('Done! Predictions stored in {0}'.format(",".join(args.output)))
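cli_predict dispatches on the output file suffix via writers.FILE_SUFFIX_MAP. Below is a condensed sketch of that dispatch using only the writer classes that appear above; the helper name and the example file names are hypothetical.

from kipoi import writers  # assumed import path; may differ between kipoi versions

def open_writer(output, metadata_schema=None):
    # Pick a batch writer from the file suffix, mirroring the branches in cli_predict.
    ending = output.split('.')[-1]
    if ending == "tsv":
        return writers.TsvBatchWriter(file_path=output, nested_sep="/")
    elif ending == "bed":
        # BedBatchWriter additionally needs the dataloader metadata schema
        return writers.BedBatchWriter(file_path=output,
                                      dataloader_schema=metadata_schema,
                                      header=True)
    elif ending in ["hdf5", "h5"]:
        return writers.HDF5BatchWriter(file_path=output)
    raise ValueError("Unknown file format: {0}".format(ending))

# e.g. open_writer("predictions.tsv") or open_writer("predictions.h5")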
Example no. 6
def cli_grad(command, raw_args):
    """CLI interface to predict
    """
    from .main import prepare_batch
    from kipoi.model import GradientMixin
    assert command == "grad"
    from tqdm import tqdm
    parser = argparse.ArgumentParser(
        'kipoi {}'.format(command),
        description='Save gradients and inputs to a hdf5 file.')
    add_model(parser)
    add_dataloader(parser, with_args=True)
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='Batch size to use in prediction')
    parser.add_argument(
        "-n",
        "--num_workers",
        type=int,
        default=0,
        help="Number of parallel workers for loading the dataset")
    parser.add_argument("-i",
                        "--install_req",
                        action='store_true',
                        help="Install required packages from requirements.txt")
    parser.add_argument(
        "-l",
        "--layer",
        default=None,
        help="Which output layer to use to make the predictions. If specified, "
        "`model.predict_activation_on_batch` will be invoked instead of `model.predict_on_batch`",
        required=False)
    parser.add_argument(
        "--final_layer",
        help="As an alternative to `--layer`, this flag indicates that the last layer "
        "should be used.",
        action='store_true')
    parser.add_argument(
        "--pre_nonlinearity",
        help="Flag indicating that it should be checked whether the selected output is post-activation. "
        "If a non-linear activation function is used, attempt to use its input. This "
        "feature is not available for all models.",
        action='store_true')
    parser.add_argument(
        "-f",
        "--filter_idx",
        help="Filter index that should be inspected with gradients. If not set, all filters "
        "will be used.",
        default=None)
    parser.add_argument(
        "-a",
        "--avg_func",
        help=
        "Averaging function to be applied across selected filters (`--filter_idx`) in "
        + "layer `--layer`.",
        choices=GradientMixin.allowed_functions,
        default="sum")
    parser.add_argument(
        '--selected_fwd_node',
        help="If the selected layer has multiple inbound connections in "
        "the graph then those can be selected here with an integer "
        "index. Not necessarily supported by all models.",
        default=None,
        type=int)
    parser.add_argument(
        '-o',
        '--output',
        required=True,
        nargs="+",
        help=
        "Output files. File format is inferred from the file path ending. Available file formats are: "
        + ", ".join(["." + k for k in writers.FILE_SUFFIX_MAP]))
    args = parser.parse_args(raw_args)

    dataloader_kwargs = parse_json_file_str(args.dataloader_args)

    # setup the files
    if not isinstance(args.output, list):
        args.output = [args.output]
    for o in args.output:
        ending = o.split('.')[-1]
        if ending not in writers.FILE_SUFFIX_MAP:
            logger.error("File ending: {0} for file {1} not from {2}".format(
                ending, o, writers.FILE_SUFFIX_MAP))
            sys.exit(1)
        dir_exists(os.path.dirname(o), logger)
    # --------------------------------------------
    # install args
    if args.install_req:
        kipoi.pipeline.install_model_requirements(args.model,
                                                  args.source,
                                                  and_dataloaders=True)

    layer = args.layer
    if layer is None and not args.final_layer:
        raise Exception(
            "A layer has to be selected explicitly using `--layer` or implicitly by using the "
            "`--final_layer` flag.")

    # Not a good idea
    # if layer is not None and isint(layer):
    #    logger.warn("Interpreting `--layer` value as integer layer index!")
    #    layer = int(args.layer)

    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)

    if not isinstance(model, GradientMixin):
        raise Exception("Model does not support gradient calculation.")

    if args.dataloader is not None:
        Dl = kipoi.get_dataloader_factory(args.dataloader,
                                          args.dataloader_source)
    else:
        Dl = model.default_dataloader

    dataloader_kwargs = kipoi.pipeline.validate_kwargs(Dl, dataloader_kwargs)
    dl = Dl(**dataloader_kwargs)

    filter_idx_parsed = None
    if args.filter_idx is not None:
        filter_idx_parsed = parse_filter_slice(args.filter_idx)

    # setup batching
    it = dl.batch_iter(batch_size=args.batch_size,
                       num_workers=args.num_workers)

    # Setup the writers
    use_writers = []
    for output in args.output:
        ending = output.split('.')[-1]
        W = writers.FILE_SUFFIX_MAP[ending]
        logger.info("Using {0} for file {1}".format(W.__name__, output))
        if ending == "tsv":
            assert W == writers.TsvBatchWriter
            use_writers.append(
                writers.TsvBatchWriter(file_path=output, nested_sep="/"))
        elif ending == "bed":
            raise Exception("Please use tsv or hdf5 output format.")
        elif ending in ["hdf5", "h5"]:
            assert W == writers.HDF5BatchWriter
            use_writers.append(writers.HDF5BatchWriter(file_path=output))
        else:
            logger.error("Unknown file format: {0}".format(ending))
            sys.exit(1)

    # Loop through the data, make predictions, save the output
    for i, batch in enumerate(tqdm(it)):
        # validate the data schema in the first iteration
        if i == 0 and not Dl.output_schema.compatible_with_batch(batch):
            logger.warn(
                "First batch of data is not compatible with the dataloader schema."
            )

        # make the prediction
        pred_batch = model.input_grad(batch['inputs'],
                                      filter_idx=filter_idx_parsed,
                                      avg_func=args.avg_func,
                                      layer=layer,
                                      final_layer=args.final_layer,
                                      selected_fwd_node=args.selected_fwd_node,
                                      pre_nonlinearity=args.pre_nonlinearity)

        # write out the predictions, metadata (, inputs, targets)
        # always keep the inputs so that input*grad can be generated!
        # output_batch = prepare_batch(batch, pred_batch, keep_inputs=True)
        output_batch = batch
        output_batch["grads"] = pred_batch
        for writer in use_writers:
            writer.batch_write(output_batch)

    for writer in use_writers:
        writer.close()
    logger.info('Done! Gradients stored in {0}'.format(",".join(args.output)))
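Stripped of argument parsing, the core of cli_grad is one model.input_grad() call per batch, with the gradients written next to the batch. A minimal programmatic sketch under the same API follows; the model name and the empty dataloader kwargs are placeholders, and the model must implement GradientMixin, as checked above.

import kipoi
from kipoi import writers  # assumed import path; may differ between kipoi versions

model = kipoi.get_model("MyModel")   # placeholder model name
dl = model.default_dataloader(**{})  # fill in the dataloader arguments for the chosen model
writer = writers.HDF5BatchWriter(file_path="grads.h5")

for batch in dl.batch_iter(batch_size=32, num_workers=0):
    # gradients of the final layer w.r.t. the inputs, summed over filters
    grads = model.input_grad(batch['inputs'], avg_func="sum", final_layer=True)
    batch["grads"] = grads
    writer.batch_write(batch)
writer.close()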
Example no. 7
def cli_ism(command, raw_args):
    # TODO: find a way to define the model output selection
    """CLI interface to predict
    """
    # from .main import prepare_batch
    assert command == "ism"
    from tqdm import tqdm
    from .ism import Mutation

    parser = argparse.ArgumentParser('kipoi interpret {}'.format(command),
                                     description='Calculate in-silico mutagenesis (ISM) scores.')
    add_model(parser)
    add_dataloader(parser, with_args=True)
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='Batch size to use in prediction')
    parser.add_argument(
        "-n",
        "--num_workers",
        type=int,
        default=0,
        help="Number of parallel workers for loading the dataset")
    parser.add_argument("--model_input",
                        help="Name of the model input that should be scored.",
                        required=True)
    parser.add_argument(
        '-s',
        "--scores",
        default="diff",
        nargs="+",
        help="Scoring method to be used. Only scoring methods selected in the model yaml file are "
        "available, except for `diff`, which is always available. Select the scoring function by the "
        "`name` tag defined in the model yaml file.")
    parser.add_argument(
        '-k',
        "--score_kwargs",
        default=None,
        nargs="+",
        help="JSON definition of the kwargs for the scoring functions selected in --scores. The "
        "definition can either be given as JSON on the command line or as the path of a .json file. The "
        "individual JSONs are expected to be supplied in the same order as the labels defined in "
        "--scores. If the defaults or no arguments should be used, define '{}' for that respective "
        "scoring method.")
    parser.add_argument(
        "-c",
        "--category_axis",
        help="For the model input selected with `--model_input`: which "
        "dimension of that array contains the one-hot encoded categories? "
        "E.g. for a one-hot encoded DNA-sequence array with input shape "
        "(1000, 4) for a single sample, `--category_axis` is 1; for "
        "(4, 1000) it is 0.",
        default=1,
        type=int,
        required=False)
    parser.add_argument(
        "-f",
        "--output_sel_fn",
        help="Define an output selection function in order to return effects "
        "on the selected model output. Example definition: "
        "`--output_sel_fn my_file.py::my_sel_fn`",
        default=None,
        required=False)
    parser.add_argument(
        '-o',
        '--output',
        required=True,
        nargs="+",
        help="Output files. File format is inferred from the file path ending. "
        "Available file formats are: " +
        ", ".join(["." + k for k in writers.FILE_SUFFIX_MAP]))
    args = parser.parse_args(raw_args)

    dataloader_kwargs = parse_json_file_str(args.dataloader_args)

    # setup the files
    if not isinstance(args.output, list):
        args.output = [args.output]
    for o in args.output:
        ending = o.split('.')[-1]
        if ending not in writers.FILE_SUFFIX_MAP:
            logger.error("File ending: {0} for file {1} not from {2}".format(
                ending, o, writers.FILE_SUFFIX_MAP))
            sys.exit(1)
        dir_exists(os.path.dirname(o), logger)
    # --------------------------------------------
    if not isinstance(args.scores, list):
        args.scores = [args.scores]

    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)

    if args.dataloader is not None:
        Dl = kipoi.get_dataloader_factory(args.dataloader,
                                          args.dataloader_source)
    else:
        Dl = model.default_dataloader

    dataloader_kwargs = kipoi.pipeline.validate_kwargs(Dl, dataloader_kwargs)
    dl = Dl(**dataloader_kwargs)

    # setup batching
    it = dl.batch_iter(batch_size=args.batch_size,
                       num_workers=args.num_workers)

    # Setup the writers
    use_writers = []
    for output in args.output:
        ending = output.split('.')[-1]
        W = writers.FILE_SUFFIX_MAP[ending]
        logger.info("Using {0} for file {1}".format(W.__name__, output))
        if ending == "tsv":
            assert W == writers.TsvBatchWriter
            use_writers.append(
                writers.TsvBatchWriter(file_path=output, nested_sep="/"))
        elif ending == "bed":
            raise Exception("Please use tsv or hdf5 output format.")
        elif ending in ["hdf5", "h5"]:
            assert W == writers.HDF5BatchWriter
            use_writers.append(writers.HDF5BatchWriter(file_path=output))
        else:
            logger.error("Unknown file format: {0}".format(ending))
            sys.exit(1)

    output_sel_fn = None
    if args.output_sel_fn is not None:
        file_path, obj_name = tuple(args.output_sel_fn.split("::"))
        output_sel_fn = getattr(load_module(file_path), obj_name)

    m = Mutation(model,
                 args.model_input,
                 scores=args.scores,
                 score_kwargs=args.score_kwargs,
                 batch_size=args.batch_size,
                 output_sel_fn=output_sel_fn,
                 category_axis=args.category_axis,
                 test_ref_ref=True)

    out_batches = {}

    # Loop through the data, make predictions, save the output..
    # TODO: batch writer fails because it tries to concatenate on highest dimension rather than the lowest!
    for i, batch in enumerate(tqdm(it)):
        # validate the data schema in the first iteration
        if i == 0 and not Dl.output_schema.compatible_with_batch(batch):
            logger.warn(
                "First batch of data is not compatible with the dataloader schema."
            )

        # calculate scores without reference for the moment.
        pred_batch = m.score(batch['inputs'])

        # with the current writers it's not possible to store the scores and the model inputs in the same file
        output_batch = {}
        output_batch["scores"] = pred_batch

        for k in output_batch:
            if k not in out_batches:
                out_batches[k] = []
            out_batches[k].append(output_batch[k])

    # concatenate batches:
    full_output = {
        k: np.concatenate([np.array(el) for el in v])
        for k, v in out_batches.items()
    }
    logger.info('Full output shape: {0}'.format(
        str(full_output["scores"].shape)))

    for writer in use_writers:
        writer.batch_write(full_output)

    for writer in use_writers:
        writer.close()
    logger.info('Done! ISM stored in {0}'.format(",".join(args.output)))
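Unlike the other commands above, cli_ism accumulates all score batches in memory and writes one concatenated array at the end (see the TODO about the batch writer). The accumulation itself reduces to the following pattern; the per-batch score arrays are placeholders for what Mutation.score() returns.

import numpy as np

# Placeholder per-batch scores (e.g. three batches of 32, 32 and 16 samples).
score_batches = [np.random.rand(32, 10), np.random.rand(32, 10), np.random.rand(16, 10)]

out_batches = {"scores": []}
for pred_batch in score_batches:
    out_batches["scores"].append(pred_batch)

# concatenate along the sample axis, as in cli_ism above
full_output = {k: np.concatenate([np.array(el) for el in v])
               for k, v in out_batches.items()}
assert full_output["scores"].shape == (80, 10)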
Example no. 8
def cli_feature_importance(command, raw_args):
    """CLI interface to predict
    """
    # from .main import prepare_batch
    assert command == "feature_importance"
    parser = argparse.ArgumentParser(
        'kipoi {}'.format(command),
        description='Compute feature importance scores and save them, together with the inputs, to a file.')
    add_model(parser)
    add_dataloader(parser, with_args=True)
    parser.add_argument("--imp_score",
                        help="Importance score name",
                        choices=available_importance_scores())
    parser.add_argument("--imp_score_kwargs", help="Importance score kwargs")
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='Batch size to use in prediction')
    parser.add_argument(
        "-n",
        "--num_workers",
        type=int,
        default=0,
        help="Number of parallel workers for loading the dataset")
    parser.add_argument("-i",
                        "--install_req",
                        action='store_true',
                        help="Install required packages from requirements.txt")
    # TODO - handle the reference-based importance scores...

    # io
    parser.add_argument(
        '-o',
        '--output',
        required=True,
        nargs="+",
        help=
        "Output files. File format is inferred from the file path ending. Available file formats are: "
        + ", ".join(["." + k for k in writers.FILE_SUFFIX_MAP]))
    args = parser.parse_args(raw_args)

    dataloader_kwargs = parse_json_file_str(args.dataloader_args)
    imp_score_kwargs = parse_json_file_str(args.imp_score_kwargs)

    # setup the files
    if not isinstance(args.output, list):
        args.output = [args.output]
    for o in args.output:
        ending = o.split('.')[-1]
        if ending not in writers.FILE_SUFFIX_MAP:
            logger.error("File ending: {0} for file {1} not from {2}".format(
                ending, o, writers.FILE_SUFFIX_MAP))
            sys.exit(1)
        dir_exists(os.path.dirname(o), logger)
    # --------------------------------------------
    # install args
    if args.install_req:
        kipoi.pipeline.install_model_requirements(args.model,
                                                  args.source,
                                                  and_dataloaders=True)

    # load model & dataloader
    model = kipoi.get_model(args.model,
                            args.source,
                            with_dataloader=args.dataloader is None)

    if args.dataloader is not None:
        Dl = kipoi.get_dataloader_factory(args.dataloader,
                                          args.dataloader_source)
    else:
        Dl = model.default_dataloader

    dataloader_kwargs = kipoi.pipeline.validate_kwargs(Dl, dataloader_kwargs)
    dl = Dl(**dataloader_kwargs)

    # get_importance_score
    ImpScore = get_importance_score(args.imp_score)
    if not ImpScore.is_compatible(model):
        raise ValueError("model not compatible with score: {0}".format(
            args.imp_score))
    impscore = ImpScore(model, **imp_score_kwargs)

    # setup batching
    it = dl.batch_iter(batch_size=args.batch_size,
                       num_workers=args.num_workers)

    # Setup the writers
    use_writers = []
    for output in args.output:
        ending = output.split('.')[-1]
        W = writers.FILE_SUFFIX_MAP[ending]
        logger.info("Using {0} for file {1}".format(W.__name__, output))
        if ending == "tsv":
            assert W == writers.TsvBatchWriter
            use_writers.append(
                writers.TsvBatchWriter(file_path=output, nested_sep="/"))
        elif ending == "bed":
            raise Exception("Please use tsv or hdf5 output format.")
        elif ending in ["hdf5", "h5"]:
            assert W == writers.HDF5BatchWriter
            use_writers.append(writers.HDF5BatchWriter(file_path=output))
        else:
            logger.error("Unknown file format: {0}".format(ending))
            sys.exit(1)

    # Loop through the data, make predictions, save the output
    for i, batch in enumerate(tqdm(it)):
        # validate the data schema in the first iteration
        if i == 0 and not Dl.output_schema.compatible_with_batch(batch):
            logger.warn(
                "First batch of data is not compatible with the dataloader schema."
            )

        # make the prediction
        # TODO - handle the reference-based importance scores...
        importance_scores = impscore.score(batch['inputs'])

        # write out the predictions, metadata (, inputs, targets)
        # always keep the inputs so that input*grad can be generated!
        # output_batch = prepare_batch(batch, pred_batch, keep_inputs=True)
        output_batch = batch
        output_batch["importance_scores"] = importance_scores
        for writer in use_writers:
            writer.batch_write(output_batch)

    for writer in use_writers:
        writer.close()
    logger.info('Done! Importance scores stored in {0}'.format(",".join(
        args.output)))
Example no. 9
def cli_deeplift(command, raw_args):
    """CLI interface to predict
    """
    # TODO: find a way to define the "reference" for a scored sequence.
    # from .main import prepare_batch
    assert command == "deeplift"
    from tqdm import tqdm
    from .referencebased import DeepLift
    from .referencebased import get_mxts_modes
    parser = argparse.ArgumentParser('kipoi interpret {}'.format(command),
                                     description='Calculate DeepLIFT scores.')
    add_model(parser)
    add_dataloader(parser, with_args=True)
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='Batch size to use in prediction')
    parser.add_argument(
        "-n",
        "--num_workers",
        type=int,
        default=0,
        help="Number of parallel workers for loading the dataset")
    parser.add_argument(
        "-l",
        "--layer",
        type=int,
        default=None,
        help="With respect to which layer the scores should be calculated.",
        required=True)
    parser.add_argument(
        "--pre_nonlinearity",
        help="Flag indicating that it should be checked whether the selected output is post-activation. "
        "If a non-linear activation function is used, attempt to use its input. This "
        "feature is not available for all models.",
        action='store_true')
    parser.add_argument(
        "-f",
        "--filter_idx",
        help="Filter index that should be inspected with gradients",
        default=None,
        required=True,
        type=int)
    parser.add_argument("-m",
                        "--mxts_mode",
                        help="Deeplift score, allowed values are: %s" %
                        str(list(get_mxts_modes().keys())),
                        default='rescale_conv_revealcancel_fc')
    parser.add_argument(
        '-o',
        '--output',
        required=True,
        nargs="+",
        help=
        "Output files. File format is inferred from the file path ending. Available file formats are: "
        + ", ".join(["." + k for k in writers.FILE_SUFFIX_MAP]))
    args = parser.parse_args(raw_args)

    dataloader_kwargs = parse_json_file_str(args.dataloader_args)

    # setup the files
    if not isinstance(args.output, list):
        args.output = [args.output]
    for o in args.output:
        ending = o.split('.')[-1]
        if ending not in writers.FILE_SUFFIX_MAP:
            logger.error("File ending: {0} for file {1} not from {2}".format(
                ending, o, writers.FILE_SUFFIX_MAP))
            sys.exit(1)
        dir_exists(os.path.dirname(o), logger)
    # --------------------------------------------

    layer = args.layer
    if layer is None:
        # `--layer` is required for this command; there is no `--final_layer` flag here.
        raise Exception("A layer has to be selected explicitly using `--layer`.")

    # Not a good idea
    # if layer is not None and isint(layer):
    #    logger.warn("Interpreting `--layer` value as integer layer index!")
    #    layer = int(args.layer)

    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)

    if args.dataloader is not None:
        Dl = kipoi.get_dataloader_factory(args.dataloader,
                                          args.dataloader_source)
    else:
        Dl = model.default_dataloader

    dataloader_kwargs = kipoi.pipeline.validate_kwargs(Dl, dataloader_kwargs)
    dl = Dl(**dataloader_kwargs)

    # setup batching
    it = dl.batch_iter(batch_size=args.batch_size,
                       num_workers=args.num_workers)

    # Setup the writers
    use_writers = []
    for output in args.output:
        ending = output.split('.')[-1]
        W = writers.FILE_SUFFIX_MAP[ending]
        logger.info("Using {0} for file {1}".format(W.__name__, output))
        if ending == "tsv":
            assert W == writers.TsvBatchWriter
            use_writers.append(
                writers.TsvBatchWriter(file_path=output, nested_sep="/"))
        elif ending == "bed":
            raise Exception("Please use tsv or hdf5 output format.")
        elif ending in ["hdf5", "h5"]:
            assert W == writers.HDF5BatchWriter
            use_writers.append(writers.HDF5BatchWriter(file_path=output))
        else:
            logger.error("Unknown file format: {0}".format(ending))
            sys.exit(1)

    d = DeepLift(model,
                 output_layer=args.layer,
                 task_idx=args.filter_idx,
                 preact=args.pre_nonlinearity,
                 mxts_mode=args.mxts_mode,
                 batch_size=args.batch_size)

    # Loop through the data, make predictions, save the output
    for i, batch in enumerate(tqdm(it)):
        # validate the data schema in the first iteration
        if i == 0 and not Dl.output_schema.compatible_with_batch(batch):
            logger.warn(
                "First batch of data is not compatible with the dataloader schema."
            )

        # calculate scores without reference for the moment.
        pred_batch = d.score(batch['inputs'], None)

        # write out the predictions, metadata (, inputs, targets)
        # always keep the inputs so that input*grad can be generated!
        # output_batch = prepare_batch(batch, pred_batch, keep_inputs=True)
        output_batch = batch
        output_batch["scores"] = pred_batch
        for writer in use_writers:
            writer.batch_write(output_batch)

    for writer in use_writers:
        writer.close()
    logger.info('Done! DeepLIFT scores stored in {0}'.format(",".join(args.output)))