Example No. 1
    def predict_example(self, batch_size=32, output_file=None, **kwargs):
        """Run model prediction for the example file

        # Arguments
            batch_size: batch size used when iterating over the dataloader
            output_file: if not None, inputs and predictions are stored to the `output_file` path
            **kwargs: Further arguments passed to batch_iter and get_writer; if
                `keep_metadata=True`, batch metadata is kept alongside the predictions
        """
        logger.info('Initialized data generator. Running batches...')

        from kipoi.writers import get_writer
        from kipoi.cli.main import prepare_batch

        if output_file is not None:
            output_file = os.path.abspath(output_file)
            if os.path.exists(output_file):
                raise ValueError(
                    "Output file: {} already exists.".format(output_file))
        with cd(self.dataloader_cls.source_dir):
            # init the dataloader
            dl = self.dataloader_cls.init_example()
            logger.info('Returned data schema correct')

            if output_file is not None:
                writer = get_writer(output_file,
                                    dl.get_output_schema().metadata, **kwargs)

            it = dl.batch_iter(batch_size=batch_size)

            # test that all predictions go through
            pred_list = []
            for i, batch in enumerate(tqdm(it)):
                if i == 0 and not self.dataloader_cls.get_output_schema(
                ).compatible_with_batch(batch):
                    logger.warning(
                        "First batch of data is not compatible with the dataloader schema."
                    )
                pred_batch = self.model.predict_on_batch(batch['inputs'])
                if kwargs.get('keep_metadata') and 'metadata' in batch:
                    pred_list.append({
                        'preds': pred_batch,
                        'metadata': batch['metadata']
                    })
                else:
                    pred_list.append(pred_batch)
                if output_file is not None:
                    output_batch = prepare_batch(
                        batch,
                        pred_batch,
                        keep_inputs=True,
                        keep_metadata=bool(kwargs.get('keep_metadata')))
                    writer.batch_write(output_batch)

            if output_file is not None:
                writer.close()

        logger.info('predict_example done!')
        return numpy_collate_concat(pred_list)
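
A minimal usage sketch for this method, assuming it is reached through a Kipoi model's `pipeline` attribute; the model name and output path below are illustrative placeholders, not part of the snippet above:

import kipoi

# load a model from the Kipoi zoo; the name below is only illustrative
model = kipoi.get_model("DeepSEA/predict")

# run predictions on the model's bundled example files and also write them to disk;
# the return value is the collated prediction array (numpy_collate_concat above)
preds = model.pipeline.predict_example(batch_size=16,
                                       output_file="example_preds.h5")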
Example No. 2
    def predict_to_file(self,
                        output_file,
                        dataloader_kwargs,
                        batch_size=32,
                        keep_inputs=False,
                        keep_metadata=False,
                        **kwargs):
        """Make predictions and write them iteratively to a file

        # Arguments
            output_file: output file path. File format is inferred from the file path ending. Available file formats are:
                 'bed', 'h5', 'hdf5', 'tsv'
            dataloader_kwargs: Keyword arguments passed to the dataloader
            batch_size: Batch size used for the dataloader
            keep_inputs: if True, inputs and targets will also be written to the output file.
            keep_metadata: if True, metadata will also be written to the output file.
            **kwargs: Further arguments passed to batch_iter
        """
        from kipoi.writers import get_writer
        from kipoi.cli.main import prepare_batch

        # setup dataloader
        validate_kwargs(self.dataloader_cls, dataloader_kwargs)
        dl = self.dataloader_cls(**dataloader_kwargs)
        it = dl.batch_iter(batch_size=batch_size, **kwargs)
        writer = get_writer(output_file,
                            dl.get_output_schema().metadata, **kwargs)

        for i, batch in enumerate(tqdm(it)):
            if i == 0 and not self.dataloader_cls.get_output_schema(
            ).compatible_with_batch(batch):
                logger.warning(
                    "First batch of data is not compatible with the dataloader schema."
                )
            pred_batch = self.model.predict_on_batch(batch['inputs'])
            output_batch = prepare_batch(batch,
                                         pred_batch,
                                         keep_inputs=keep_inputs,
                                         keep_metadata=keep_metadata)
            writer.batch_write(output_batch)
        writer.close()
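
A hedged usage sketch for `predict_to_file`; the dataloader keyword arguments below are placeholders and depend entirely on the particular model's dataloader:

import kipoi

model = kipoi.get_model("DeepSEA/predict")  # illustrative model name

# dataloader kwargs differ per model; these keys and paths are placeholders
dl_kwargs = {"fasta_file": "hg19.fa", "intervals_file": "regions.bed"}

# stream predictions batch-by-batch into a TSV file
model.pipeline.predict_to_file("preds.tsv",
                               dl_kwargs,
                               batch_size=64,
                               keep_inputs=False,
                               keep_metadata=True)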
Example No. 3
def cli_predict(command, raw_args):
    """CLI interface to predict
    """
    assert command == "predict"
    parser = argparse.ArgumentParser('kipoi {}'.format(command),
                                     description='Run the model prediction.')
    add_model(parser)
    add_dataloader(parser, with_args=True)
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Batch size to use in prediction')
    parser.add_argument("-n", "--num_workers", type=int, default=0,
                        help="Number of parallel workers for loading the dataset")
    parser.add_argument("-k", "--keep_inputs", action='store_true',
                        help="Keep the inputs in the output file. ")
    parser.add_argument("-l", "--layer",
                        help="Which output layer to use to make the predictions. If specified," +
                        "`model.predict_activation_on_batch` will be invoked instead of `model.predict_on_batch`")
    parser.add_argument("--singularity", action='store_true',
                        help="Run `kipoi predict` in the appropriate singularity container. "
                        "Containters will get downloaded to ~/.kipoi/envs/ or to "
                        "$SINGULARITY_CACHEDIR if set")
    parser.add_argument('-o', '--output', required=True, nargs="+",
                        help="Output files. File format is inferred from the file path ending. Available file formats are: " +
                        ", ".join(["." + k for k in writers.FILE_SUFFIX_MAP]))
    args = parser.parse_args(raw_args)

    dataloader_kwargs = parse_json_file_str_or_arglist(args.dataloader_args, parser)

    # setup the files
    if not isinstance(args.output, list):
        args.output = [args.output]
    for o in args.output:
        ending = o.split('.')[-1]
        if ending not in writers.FILE_SUFFIX_MAP:
            logger.error("File ending: {0} for file {1} not from {2}".
                         format(ending, o, writers.FILE_SUFFIX_MAP))
            sys.exit(1)
        dir_exists(os.path.dirname(o), logger)

    # singularity_command
    if args.singularity:
        from kipoi.cli.singularity import singularity_command
        logger.info("Running kipoi predict in the singularity container")
        # Drop the singularity flag
        raw_args = [x for x in raw_args if x != '--singularity']
        singularity_command(['kipoi', command] + raw_args,
                            args.model,
                            dataloader_kwargs,
                            output_files=args.output,
                            source=args.source,
                            dry_run=False)
        return None
    # --------------------------------------------
    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)

    if args.dataloader is not None:
        Dl = kipoi.get_dataloader_factory(args.dataloader, args.dataloader_source)
    else:
        Dl = model.default_dataloader

    dataloader_kwargs = kipoi.pipeline.validate_kwargs(Dl, dataloader_kwargs)
    dl = Dl(**dataloader_kwargs)

    # setup batching
    it = dl.batch_iter(batch_size=args.batch_size,
                       num_workers=args.num_workers)

    # Setup the writers
    use_writers = []
    for output in args.output:
        writer = writers.get_writer(output, metadata_schema=dl.get_output_schema().metadata)
        if writer is None:
            logger.error("Unknown file format: {0}".format(ending))
            sys.exit()
        else:
            use_writers.append(writer)
    output_writers = writers.MultipleBatchWriter(use_writers)

    # Loop through the data, make predictions, save the output
    for i, batch in enumerate(tqdm(it)):
        # validate the data schema in the first iteration
        if i == 0 and not Dl.get_output_schema().compatible_with_batch(batch):
            logger.warning("First batch of data is not compatible with the dataloader schema.")

        # make the prediction
        if args.layer is None:
            pred_batch = model.predict_on_batch(batch['inputs'])
        else:
            pred_batch = model.predict_activation_on_batch(batch['inputs'], layer=args.layer)

        # write out the predictions, metadata (, inputs, targets)
        output_batch = prepare_batch(batch, pred_batch, keep_inputs=args.keep_inputs)
        output_writers.batch_write(output_batch)

    output_writers.close()
    logger.info('Done! Predictions stored in {0}'.format(",".join(args.output)))
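
For reference, a programmatic invocation of this handler might look as follows, assuming `cli_predict` lives in `kipoi.cli.main` as the imports above suggest; the model name, dataloader arguments and output path are illustrative placeholders:

from kipoi.cli.main import cli_predict

# roughly equivalent to: kipoi predict DeepSEA/predict --dataloader_args='{...}' -o preds.tsv
raw_args = [
    "DeepSEA/predict",                       # illustrative model name
    "--dataloader_args", '{"fasta_file": "hg19.fa", "intervals_file": "regions.bed"}',
    "--batch_size", "64",
    "-n", "4",
    "-o", "preds.tsv",
]
cli_predict("predict", raw_args)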
Example No. 4
def cli_score_variants(command, raw_args):
    """CLI interface to score variants
    """
    # Updated argument names:
    # - scoring -> scores
    # - --vcf_path -> --input_vcf, -i
    # - --out_vcf_fpath -> --output_vcf, -o
    # - --output -> -e, --extra_output
    # - removed --install_req
    # - scoring_kwargs -> score_kwargs
    AVAILABLE_FORMATS = [k for k in writers.FILE_SUFFIX_MAP if k != 'bed']
    assert command == "score_variants"
    parser = argparse.ArgumentParser(
        'kipoi veff {}'.format(command),
        description='Predict effect of SNVs using ISM.')
    parser.add_argument('model', help='Model name.')
    parser.add_argument(
        '--source',
        default="kipoi",
        choices=list(kipoi.config.model_sources().keys()),
        help='Model source to use. Specified in ~/.kipoi/config.yaml' +
        " under model_sources. " +
        "'dir' is an additional source referring to the local folder.")

    add_dataloader(parser=parser, with_args=True)

    parser.add_argument('-i', '--input_vcf', required=True, help='Input VCF.')
    parser.add_argument('-o',
                        '--output_vcf',
                        help='Output annotated VCF file path.',
                        default=None)
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='Batch size to use in prediction')
    parser.add_argument(
        "-n",
        "--num_workers",
        type=int,
        default=0,
        help="Number of parallel workers for loading the dataset")
    parser.add_argument(
        '-r',
        '--restriction_bed',
        default=None,
        help="Regions for prediction can only be subsets of this bed file")
    parser.add_argument(
        '-e',
        '--extra_output',
        type=str,
        default=None,
        required=False,
        help=
        "Additional output files in other (non-vcf) formats. File format is inferred from the file path ending"
        + ". Available file formats are: {0}".format(", ".join(
            ["." + k for k in AVAILABLE_FORMATS])))
    parser.add_argument(
        '-s',
        "--scores",
        default="diff",
        nargs="+",
        help=
        "Scoring method to be used. Only scoring methods selected in the model yaml file are "
        "available, except for `diff` which is always available. Select the scoring function by the "
        "`name` tag defined in the model yaml file.")
    parser.add_argument(
        '-k',
        "--score_kwargs",
        default="",
        nargs="+",
        help=
        "JSON definition of the kwargs for the scoring functions selected in --scores. The "
        "definition can either be given as JSON on the command line or as the path of a .json file. The "
        "individual JSONs are expected to be supplied in the same order as the labels defined in "
        "--scores. If the defaults or no arguments should be used, define '{}' for that respective "
        "scoring method.")
    parser.add_argument(
        '-l',
        "--seq_length",
        type=int,
        default=None,
        help=
        "Optional parameter: Model input sequence length - necessary if the model does not have a "
        "pre-defined input sequence length.")
    parser.add_argument(
        '--std_var_id',
        action="store_true",
        help="If set then variant IDs in the annotated"
        " VCF will be replaced with a standardised, unique ID.")

    parser.add_argument(
        "--model_outputs",
        type=str,
        default=None,
        nargs="+",
        help=
        "Optional parameter: Only return predictions for the selected model outputs. Naming "
        "according to the definition in model.yaml > schema > targets > column_labels"
    )

    parser.add_argument(
        "--model_outputs_i",
        type=int,
        default=None,
        nargs="+",
        help=
        "Optional parameter: Only return predictions for the selected model outputs. Give integer "
        "indices of the selected model output(s).")

    parser.add_argument(
        "--singularity",
        action='store_true',
        help="Run `kipoi veff score_variants` in the appropriate singularity container. "
        "Containers will get downloaded to ~/.kipoi/envs/ or to "
        "$SINGULARITY_CACHEDIR if set")

    args = parser.parse_args(raw_args)

    # OBSOLETE
    # Make sure all the multi-model arguments like source, dataloader etc. fit together
    #_prepare_multi_model_args(args)

    # Check that all the folders exist
    file_exists(args.input_vcf, logger)

    if args.output_vcf is None and args.extra_output is None:
        logger.error(
            "One of the two needs to be specified: --output_vcf or --extra_output"
        )
        sys.exit(1)

    if args.extra_output is not None:
        dir_exists(os.path.dirname(args.extra_output), logger)
        ending = args.extra_output.split('.')[-1]
        if ending not in AVAILABLE_FORMATS:
            logger.error("File ending: {0} for file {1} not from {2}".format(
                ending, args.extra_output, AVAILABLE_FORMATS))
            sys.exit(1)

    # singularity_command
    if args.singularity:
        from kipoi.cli.singularity import singularity_command
        logger.info(
            "Running kipoi veff {} in the singularity container".format(
                command))

        # Drop the singularity flag
        raw_args = [x for x in raw_args if x != '--singularity']

        dataloader_kwargs = parse_json_file_str_or_arglist(
            args.dataloader_args)

        # create output files
        output_files = []
        if args.output_vcf is not None:
            output_files.append(args.output_vcf)
        if args.extra_output is not None:
            output_files.append(args.extra_output)

        singularity_command(['kipoi', 'veff', command] + raw_args,
                            model=args.model,
                            dataloader_kwargs=dataloader_kwargs,
                            output_files=output_files,
                            source=args.source,
                            dry_run=False)
        return None

    if not isinstance(args.scores, list):
        args.scores = [args.scores]

    score_kwargs = []
    if len(args.score_kwargs) > 0:
        score_kwargs = args.score_kwargs
        if len(args.scores) >= 1:
            # Check if all scoring functions should be used:
            if args.scores == ["all"]:
                if len(score_kwargs) >= 1:
                    raise ValueError(
                        "`--score_kwargs` cannot be defined in combination with `--scores all`!"
                    )
            else:
                score_kwargs = [parse_json_file_str(el) for el in score_kwargs]
                if not len(args.scores) == len(score_kwargs):
                    raise ValueError(
                        "When defining `--score_kwargs`, a JSON representation of arguments (or the "
                        "path of a file containing them) must be given for every "
                        "`--scores` function.")

    # VCF writer
    output_vcf_model = None
    if args.output_vcf is not None:
        dir_exists(os.path.dirname(args.output_vcf), logger)
        output_vcf_model = args.output_vcf

    # Other writers
    if args.extra_output is not None:
        dir_exists(os.path.dirname(args.extra_output), logger)
        extra_output = args.extra_output
        writer = writers.get_writer(extra_output, metadata_schema=None)
        assert writer is not None
        extra_writers = [SyncBatchWriter(writer)]
    else:
        extra_writers = []

    dataloader_arguments = parse_json_file_str_or_arglist(args.dataloader_args)

    # --------------------------------------------
    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)

    if args.dataloader is not None:
        Dl = kipoi.get_dataloader_factory(args.dataloader,
                                          args.dataloader_source)
    else:
        Dl = model.default_dataloader

    # Load effect prediction related model info
    model_info = kipoi_veff.ModelInfoExtractor(model, Dl)

    if model_info.use_seq_only_rc:
        logger.info(
            'Model SUPPORTS simple reverse complementation of input DNA sequences.'
        )
    else:
        logger.info(
            'Model DOES NOT support simple reverse complementation of input DNA sequences.'
        )

    if output_vcf_model is not None:
        logger.info('Annotated VCF will be written to %s.' %
                    str(output_vcf_model))

    model_outputs = None
    if args.model_outputs is not None:
        model_outputs = args.model_outputs

    elif args.model_outputs_i is not None:
        model_outputs = args.model_outputs_i

    kipoi_veff.score_variants(model,
                              dataloader_arguments,
                              args.input_vcf,
                              output_vcf=output_vcf_model,
                              output_writers=extra_writers,
                              scores=args.scores,
                              score_kwargs=score_kwargs,
                              num_workers=args.num_workers,
                              batch_size=args.batch_size,
                              seq_length=args.seq_length,
                              std_var_id=args.std_var_id,
                              restriction_bed=args.restriction_bed,
                              return_predictions=False,
                              model_outputs=model_outputs)

    logger.info('Successfully predicted samples')
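
A comparable sketch for the variant-scoring handler; the import path `kipoi_veff.cli`, the model name and every file path below are assumptions, and only the `diff` score is requested because, per the help text above, it is always available:

from kipoi_veff.cli import cli_score_variants  # module path is an assumption

# illustrative invocation; model name, VCF paths and dataloader args are placeholders
raw_args = [
    "DeepSEA/predict",
    "--dataloader_args", '{"fasta_file": "hg19.fa"}',
    "-i", "input.vcf",
    "-o", "annotated.vcf",
    "-s", "diff",
    "--batch_size", "32",
]
cli_score_variants("score_variants", raw_args)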