Example #1
def test_HDF5BatchWriter_array(dl_batch, pred_batch_array, tmpdir):
    tmpfile = str(tmpdir.mkdir("example").join("out.h5"))
    batch = prepare_batch(dl_batch, pred_batch_array)
    writer = HDF5BatchWriter(tmpfile, chunk_size=4)

    writer.batch_write(batch)
    writer.batch_write(batch)
    writer.close()
    with HDF5Reader(tmpfile) as f:
        assert np.all(
            list(f.batch_iter(2))[0]['metadata']['gene_id'] ==
            dl_batch['metadata']['gene_id'][:2])
        out = f.load_all()
        assert np.all(out['metadata']['gene_id'] == np.concatenate([
            dl_batch['metadata']['gene_id'], dl_batch['metadata']['gene_id']
        ]))
        assert np.all(out['metadata']['ranges']["chr"] == np.concatenate([
            dl_batch['metadata']['ranges']['chr'], dl_batch['metadata']
            ['ranges']['chr']
        ]))
        assert np.all(out['metadata']['ranges']["start"] == np.concatenate([
            dl_batch['metadata']['ranges']['start'], dl_batch['metadata']
            ['ranges']['start']
        ]))
        assert np.all(out['preds'][:3] == pred_batch_array)
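The writer tests in this listing share two pytest fixtures, dl_batch and pred_batch_array, that are not shown here. Below is a minimal sketch of what such fixtures could look like, inferred from the assertions (three examples per batch, three prediction columns, string-typed range ids so the Parquet round-trip keeps them as strings); the concrete values and the plain-dict ranges layout are assumptions, not the original fixtures.

import numpy as np
import pytest


@pytest.fixture
def dl_batch():
    # Hypothetical dataloader batch: three examples with ranges and gene_id metadata.
    return {
        "inputs": np.arange(3),
        "metadata": {
            "ranges": {
                "chr": np.array(["chr1", "chr1", "chr1"]),
                "start": np.array([1, 2, 3]),
                "end": np.array([11, 12, 13]),
                "id": np.arange(3).astype(str),
                "strand": np.array(["*", "*", "*"]),
            },
            "gene_id": np.arange(3).astype(str),
        },
    }


@pytest.fixture
def pred_batch_array():
    # Hypothetical model output: three rows matching the batch, three prediction
    # columns (written out as preds/0, preds/1, preds/2).
    return np.arange(9).reshape((3, 3))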
Example #2
    def predict_example(self, batch_size=32, output_file=None, **kwargs):
        """Run model prediction for the example file

        # Arguments
            batch_size: batch size used by the dataloader
            output_file: if not None, inputs and predictions are written to `output_file`
            **kwargs: Further arguments passed to batch_iter
        """
        logger.info('Initialized data generator. Running batches...')

        from kipoi.writers import get_writer
        from kipoi.cli.main import prepare_batch

        if output_file is not None:
            output_file = os.path.abspath(output_file)
            if os.path.exists(output_file):
                raise ValueError(
                    "Output file: {} already exists.".format(output_file))
        with cd(self.dataloader_cls.source_dir):
            # init the dataloader
            dl = self.dataloader_cls.init_example()
            logger.info('Returned data schema correct')

            if output_file is not None:
                writer = get_writer(output_file,
                                    dl.get_output_schema().metadata, **kwargs)

            it = dl.batch_iter(batch_size=batch_size)

            # test that all predictions go through
            pred_list = []
            for i, batch in enumerate(tqdm(it)):
                if i == 0 and not self.dataloader_cls.get_output_schema().compatible_with_batch(batch):
                    logger.warning(
                        "First batch of data is not compatible with the dataloader schema.")
                pred_batch = self.model.predict_on_batch(batch['inputs'])
                keep_metadata = bool(kwargs.get('keep_metadata'))
                if keep_metadata and 'metadata' in batch:
                    pred_list.append({
                        'preds': pred_batch,
                        'metadata': batch['metadata']
                    })
                else:
                    pred_list.append(pred_batch)
                if output_file is not None:
                    output_batch = prepare_batch(batch,
                                                 pred_batch,
                                                 keep_inputs=True,
                                                 keep_metadata=keep_metadata)
                    writer.batch_write(output_batch)

            if output_file is not None:
                writer.close()

        logger.info('predict_example done!')
        return numpy_collate_concat(pred_list)
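As a usage note: judging from the self.model / self.dataloader_cls attributes above, this method lives on a pipeline-style wrapper bundled with each Kipoi model, so a call could look roughly like the sketch below. The model name and output path are illustrative placeholders, not values taken from the snippet.

import kipoi

# Illustrative only: model name and output path are placeholders.
model = kipoi.get_model("DeepSEA/predict")
preds = model.pipeline.predict_example(batch_size=32, output_file="example_preds.h5")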
Example #3
def get_example_data(example, layer, writer=None):
    example_dir = "examples/{0}".format(example)
    if INSTALL_REQ:
        install_model_requirements(example_dir, "dir", and_dataloaders=True)

    model = kipoi.get_model(example_dir, source="dir")
    # The preprocessor
    Dataloader = kipoi.get_dataloader_factory(example_dir, source="dir")
    #
    with open(example_dir + "/example_files/test.json", "r") as ifh:
        dataloader_arguments = json.load(ifh)

    for k in dataloader_arguments:
        dataloader_arguments[k] = "example_files/" + dataloader_arguments[k]

    outputs = []
    with cd(model.source_dir):
        dl = Dataloader(**dataloader_arguments)
        it = dl.batch_iter(batch_size=32, num_workers=0)

        # Loop through the data, make predictions, save the output
        for i, batch in enumerate(tqdm(it)):

            # make the prediction
            pred_batch = model.input_grad(batch['inputs'], avg_func="sum", layer=layer,
                                          final_layer=False)
            # write out the predictions, metadata (, inputs, targets)
            # always keep the inputs so that input*grad can be generated!
            output_batch = prepare_batch(batch, pred_batch, keep_inputs=True)
            if writer is not None:
                writer.batch_write(output_batch)
            outputs.append(output_batch)
        if writer is not None:
            writer.close()
    return numpy_collate(outputs)
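A possible call to the helper above, streaming the gradient batches to an HDF5 file while also collecting them in memory. The example name and layer identifier are placeholders, not values from the original test; note that the helper closes the writer itself.

from kipoi.writers import HDF5BatchWriter

# Placeholders: "pyt" and layer "3" stand in for a real example directory and layer name.
writer = HDF5BatchWriter("grads.h5")
grads = get_example_data("pyt", layer="3", writer=writer)  # writer is closed inside the helper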
Example #4
def test_BedBatchWriter(dl_batch, pred_batch_array, metadata_schema, tmpdir):
    tmpfile = str(tmpdir.mkdir("example").join("out.tsv"))
    writer = BedBatchWriter(tmpfile, metadata_schema=metadata_schema)
    batch = prepare_batch(dl_batch, pred_batch_array)
    writer.batch_write(batch)
    writer.batch_write(batch)
    writer.close()
    df = pd.read_csv(tmpfile, sep="\t")

    assert list(df.columns) == [
        'chr', 'start', 'end', 'name', 'score', 'strand', 'preds/0', 'preds/1',
        'preds/2'
    ]
    assert list(df['name']) == [0, 1, 2, 0, 1, 2]
Example #5
def test_TsvBatchWriter_array(dl_batch, pred_batch_array, tmpdir):
    tmpfile = str(tmpdir.mkdir("example").join("out.tsv"))
    writer = TsvBatchWriter(tmpfile)
    batch = prepare_batch(dl_batch, pred_batch_array)
    writer.batch_write(batch)
    writer.batch_write(batch)
    writer.close()
    df = pd.read_csv(tmpfile, sep="\t")

    assert set(list(df.columns)) == {
        'metadata/ranges/id', 'metadata/ranges/strand', 'metadata/ranges/chr',
        'metadata/ranges/start', 'metadata/ranges/end', 'metadata/gene_id',
        'preds/0', 'preds/1', 'preds/2'
    }
    assert list(df['metadata/ranges/id']) == [0, 1, 2, 0, 1, 2]
Example #6
def test_ParquetBatchWriter_array(dl_batch, pred_batch_array, tmpdir):
    tmpfile = str(tmpdir.mkdir("example").join("out.pq"))
    writer = ParquetBatchWriter(tmpfile)
    batch = prepare_batch(dl_batch, pred_batch_array)
    writer.batch_write(batch)
    writer.batch_write(batch)
    writer.close()
    df = pd.read_parquet(tmpfile, engine='fastparquet')

    assert set(list(df.columns)) == {
        'metadata/ranges/id', 'metadata/ranges/strand', 'metadata/ranges/chr',
        'metadata/ranges/start', 'metadata/ranges/end', 'metadata/gene_id',
        'preds/0', 'preds/1', 'preds/2'
    }
    assert list(df['metadata/ranges/id']) == ['0', '1', '2', '0', '1', '2']
Example #7
def test_MultipleBatchWriter(dl_batch, pred_batch_array, tmpdir):
    tmpdir = tmpdir.mkdir("example")
    h5_tmpfile = str(tmpdir.join("out.h5"))
    tsv_tmpfile = str(tmpdir.join("out.tsv"))
    batch = prepare_batch(dl_batch, pred_batch_array)
    writer = MultipleBatchWriter(
        [TsvBatchWriter(tsv_tmpfile),
         HDF5BatchWriter(h5_tmpfile)])
    writer.batch_write(batch)
    writer.batch_write(batch)
    writer.close()
    assert os.path.exists(h5_tmpfile)
    assert os.path.exists(tsv_tmpfile)
    df = pd.read_csv(tsv_tmpfile, sep="\t")
    assert set(list(df.columns)) == {
        'metadata/ranges/id', 'metadata/ranges/strand', 'metadata/ranges/chr',
        'metadata/ranges/start', 'metadata/ranges/end', 'metadata/gene_id',
        'preds/0', 'preds/1', 'preds/2'
    }
    assert list(df['metadata/ranges/id']) == [0, 1, 2, 0, 1, 2]
Example #8
    def predict_to_file(self,
                        output_file,
                        dataloader_kwargs,
                        batch_size=32,
                        keep_inputs=False,
                        keep_metadata=False,
                        **kwargs):
        """Make predictions and write them iteratively to a file

        # Arguments
            output_file: output file path. The file format is inferred from the file extension. Available formats:
                 'bed', 'h5', 'hdf5', 'tsv'
            dataloader_kwargs: Keyword arguments passed to the dataloader
            batch_size: Batch size used for the dataloader
            keep_inputs: if True, inputs and targets will also be written to the output file.
            keep_metadata: if True, metadata will also be written to the output file.
            **kwargs: Further arguments passed to batch_iter
        """
        from kipoi.writers import get_writer
        from kipoi.cli.main import prepare_batch

        # setup dataloader
        validate_kwargs(self.dataloader_cls, dataloader_kwargs)
        dl = self.dataloader_cls(**dataloader_kwargs)
        it = dl.batch_iter(batch_size=batch_size, **kwargs)
        writer = get_writer(output_file,
                            dl.get_output_schema().metadata, **kwargs)

        for i, batch in enumerate(tqdm(it)):
            if i == 0 and not self.dataloader_cls.get_output_schema().compatible_with_batch(batch):
                logger.warning(
                    "First batch of data is not compatible with the dataloader schema.")
            pred_batch = self.model.predict_on_batch(batch['inputs'])
            output_batch = prepare_batch(batch,
                                         pred_batch,
                                         keep_inputs=keep_inputs,
                                         keep_metadata=keep_metadata)
            writer.batch_write(output_batch)
        writer.close()
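A sketch of how this method might be called through the same pipeline-style wrapper; the model name and dataloader arguments are illustrative assumptions, and the exact keyword arguments depend on the chosen model's dataloader.

import kipoi

# Illustrative only: model name and dataloader arguments are placeholders.
model = kipoi.get_model("DeepSEA/predict")
model.pipeline.predict_to_file("preds.tsv",
                               dataloader_kwargs={"intervals_file": "intervals.bed",
                                                  "fasta_file": "hg19.fa"},
                               batch_size=32,
                               keep_metadata=True)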