def test_sequence_config():
    """JangguSequence configuration on dummy data.

    Construction from raw numpy arrays or Dataset objects,
    with tuple (as_dict=False) and dict (as_dict=True) batches.
    """
    inputs = Array("x", np.random.random((100, 10)))
    outputs = Array('y', np.random.randint(2, size=(100, 1)),
                    conditions=['random'])

    # construct the sequence from the raw numpy arrays
    jseq = JangguSequence(inputs.data, outputs.data, batch_size=10, as_dict=False)
    assert len(jseq) == 10
    for x, y, _ in jseq:
        assert x[0].shape == (10, 10)
        assert y[0].shape == (10, 1)
        break

    # construct the sequence directly from the Dataset objects
    jseq = JangguSequence(inputs, outputs, batch_size=10, as_dict=False)
    assert len(jseq) == 10
    for x, y, _ in jseq:
        assert x[0].shape == (10, 10)
        assert y[0].shape == (10, 1)
        break

    # with as_dict=True the batches are keyed by the dataset names
    jseq = JangguSequence(inputs, outputs, batch_size=10, as_dict=True)
    assert len(jseq) == 10
    for x, y, _ in jseq:
        assert x['x'].shape == (10, 10)
        assert y['y'].shape == (10, 1)
        break

def test_janggu_train_predict_sequence(tmpdir):
    """Train, predict and evaluate on dummy data.

    create: YES
    Input args: Dataset
    validation_set: YES
    batch_size: None
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    inputs = {'x': Array("x", np.random.random((100, 10)))}
    outputs = {'y': Array('y', np.random.randint(2, size=(100, 1)),
                          conditions=['random'])}

    jseq = JangguSequence(10, inputs, outputs)

    @inputlayer
    @outputdense('sigmoid')
    def _model(inputs, inp, oup, params):
        return inputs, inputs[0]

    bwm = Janggu.create(_model,
                        inputs=jseq.inputs['x'],
                        outputs=jseq.outputs['y'],
                        name='nptest')

    bwm.compile(optimizer='adadelta', loss='binary_crossentropy')

    storage = bwm._storage_path(bwm.name, outputdir=tmpdir.strpath)

    print('storage', storage)
    print('env', os.environ['JANGGU_OUTPUT'])
    print('name', bwm.name)
    print('outputdir', bwm.outputdir)
    assert not os.path.exists(storage)

    bwm.fit(jseq, epochs=2, validation_data=jseq,
            use_multiprocessing=False)

    assert os.path.exists(storage)

    pred = bwm.predict(jseq, use_multiprocessing=False)

    np.testing.assert_equal(len(pred[:, np.newaxis]), len(inputs['x']))
    np.testing.assert_equal(pred.shape, outputs['y'].shape)
    bwm.evaluate(jseq, use_multiprocessing=False)

def evaluate(self, inputs=None, outputs=None,  # pylint: disable=too-many-locals
             batch_size=None,
             sample_weight=None,
             steps=None,
             datatags=None,
             callbacks=None,
             use_multiprocessing=False,
             workers=1):
    """Evaluates the performance.

    This method is used to evaluate a given model.
    All of the parameters are directly delegated to the
    evaluate_generator of the keras model.
    See https://keras.io/models/model/#methods.

    Parameters
    ----------
    inputs : :code:`Dataset`, list(Dataset) or Sequence (keras.utils.Sequence)
        Input Dataset or Sequence to use for evaluating the model.
    outputs : :code:`Dataset`, list(Dataset) or None
        Output Dataset containing the training targets. If a Sequence
        is used for inputs, outputs will have no effect.
    batch_size : int or None
        Batch size. If set to None a batch size of 32 is used.
    sample_weight : np.array or None
        Sample weights. See https://keras.io.
    steps : int, None.
        Number of predict steps. If None, this value is determined from
        the dataset size and the batch_size.
    datatags : list(str) or None
        Tags to annotate the evaluation results. Default: None.
    callbacks : List(:code:`Scorer` or str)
        Scorer instances to be applied on the predictions. Furthermore,
        commonly used scoring metrics can be added by name, including
        'roc', 'auroc', 'prc', 'auprc' for evaluating binary classification
        applications and 'cor' (for Pearson's correlation), 'mae', 'mse'
        and 'var_explained' for regression applications.
    use_multiprocessing : boolean
        Whether to use multiprocessing for the prediction. Default: False.
    workers : int
        Number of workers to use. Default: 1.

    Examples
    --------

    .. code-block:: python

      model.evaluate(DATA, LABELS)

      # binary classification evaluation with callbacks
      model.evaluate(DATA, LABELS, callbacks=['auprc', 'auroc'])

    """

    self.logger.info('Evaluate: %s', self.name)
    if isinstance(inputs, Sequence):
        inputs_ = _convert_data(self.kerasmodel, inputs.inputs, 'input_layers')
        outputs_ = _convert_data(self.kerasmodel, inputs.outputs, 'output_layers')
        self.logger.info('Using custom Sequence.')
        self.logger.info("Input:")
        self.__dim_logging(inputs_)
        self.logger.info("Output:")
        self.__dim_logging(outputs_)
    else:
        inputs_ = _convert_data(self.kerasmodel, inputs, 'input_layers')
        outputs_ = _convert_data(self.kerasmodel, outputs, 'output_layers')
        self.logger.info("Input:")
        self.__dim_logging(inputs_)
        self.logger.info("Output:")
        self.__dim_logging(outputs_)
    self.timer = time.time()

    if not batch_size:
        batch_size = 32

    if isinstance(inputs, Sequence):
        jseq = inputs
    else:
        jseq = JangguSequence(batch_size, inputs_, outputs_, sample_weight)

    try:
        values = self.kerasmodel.evaluate_generator(
            jseq,
            steps=steps,
            use_multiprocessing=use_multiprocessing,
            workers=workers)
    except Exception:  # pragma: no cover
        self.logger.exception('evaluate_generator failed:')
        raise

    self.logger.info('#' * 40)
    values = _to_list(values)

    for i, value in enumerate(values):
        self.logger.info('%s: %f', self.kerasmodel.metrics_names[i], value)
    self.logger.info('#' * 40)

    self.logger.info("Evaluation finished in %1.3f s",
                     time.time() - self.timer)

    preds = self.kerasmodel.predict_generator(
        jseq, steps=steps,
        use_multiprocessing=use_multiprocessing,
        workers=workers)
    preds = _convert_data(self.kerasmodel, preds, 'output_layers')

    for callback in callbacks or []:
        callback = get_scorer(callback)
        callback.score(self, preds, outputs=outputs_, datatags=datatags)
    return values

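# Hedged usage sketch (not part of the library source): evaluating a trained
# model with the named scoring callbacks listed in the docstring above.
# `model`, DATA and LABELS are placeholders for a compiled Janggu model and
# its input/output Datasets; the 'test_set' tag is arbitrary.

# plain evaluation: returns the keras loss/metric values
values = model.evaluate(DATA, LABELS)

# additionally score the predictions with AUC metrics and tag the results
model.evaluate(DATA, LABELS,
               callbacks=['auroc', 'auprc'],
               datatags=['test_set'])
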
def predict(self, inputs,  # pylint: disable=too-many-locals
            batch_size=None,
            verbose=0,
            steps=None,
            layername=None,
            datatags=None,
            callbacks=None,
            use_multiprocessing=False,
            workers=1):
    """Performs a prediction.

    This method predicts the targets.
    All of the parameters are directly delegated to the
    predict_generator of the keras model.
    See https://keras.io/models/model/#methods.

    Parameters
    ----------
    inputs : :code:`Dataset`, list(Dataset) or Sequence (keras.utils.Sequence)
        Input Dataset or Sequence to use for the prediction.
    batch_size : int or None
        Batch size. If set to None a batch size of 32 is used.
    verbose : int
        Verbosity level. See https://keras.io.
    steps : int, None.
        Number of predict steps. If None, this value is determined from
        the dataset size and the batch_size.
    layername : str or None
        Layername for which the prediction should be performed. If None,
        the output layer will be used automatically.
    datatags : list(str) or None
        Tags to annotate the evaluation results. Default: None.
    callbacks : List(:code:`Scorer`)
        Scorer instances to be applied on the predictions.
    use_multiprocessing : boolean
        Whether to use multiprocessing for the prediction. Default: False.
    workers : int
        Number of workers to use. Default: 1.

    Examples
    --------

    .. code-block:: python

      model.predict(DATA)

    """

    if not isinstance(inputs, Sequence):
        inputs = _convert_data(self.kerasmodel, inputs, 'input_layers')

    self.logger.info('Predict: %s', self.name)
    if isinstance(inputs, Sequence):
        self.logger.info('using custom Sequence')
    else:
        self.logger.info("Input:")
        self.__dim_logging(inputs)
    self.timer = time.time()

    # if a desired layername is specified, the features
    # will be predicted.
    if layername:
        model = Janggu(self.kerasmodel.input,
                       self.kerasmodel.get_layer(layername).output,
                       name=self.name)
    else:
        model = self

    if not batch_size:
        batch_size = 32

    if isinstance(inputs, Sequence):
        jseq = inputs
    else:
        jseq = JangguSequence(batch_size, inputs, None, None)

    try:
        preds = model.kerasmodel.predict_generator(
            jseq,
            steps=steps,
            use_multiprocessing=use_multiprocessing,
            workers=workers,
            verbose=verbose)
    except Exception:  # pragma: no cover
        self.logger.exception('predict_generator failed:')
        raise

    prd = _convert_data(model.kerasmodel, preds, 'output_layers')
    if layername is not None:
        # no need to set an extra datatag.
        # if layername is present, it will be added to the tags
        if datatags is None:
            datatags = [layername]
        else:
            datatags.append(layername)
    for callback in callbacks or []:
        callback.score(model, prd, datatags=datatags)
    return preds

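# Hedged usage sketch (not part of the library source): predicting from an
# intermediate layer via `layername`. `model` and DNA are placeholders, and
# 'motif' is a hypothetical layer name that must exist in the keras model.

# ordinary prediction from the output layer
pred = model.predict(DNA)

# activations of the hypothetical 'motif' layer; the layer name is also
# appended to the datatags handed to any Scorer callbacks
features = model.predict(DNA, layername='motif')
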
def fit(self,  # pylint: disable=too-many-locals
        inputs=None,
        outputs=None,
        batch_size=None,
        epochs=1,
        verbose=1,
        callbacks=None,
        validation_data=None,
        shuffle=True,
        class_weight=None,
        sample_weight=None,
        initial_epoch=0,
        steps_per_epoch=None,
        use_multiprocessing=False,
        workers=1):
    """Model fitting.

    This method is used to fit a given model.
    Most of the parameters are directly delegated to the
    fit_generator of the keras model.

    Parameters
    ----------
    inputs : :code:`Dataset`, list(Dataset) or Sequence (keras.utils.Sequence)
        Input Dataset or Sequence to use for fitting the model.
    outputs : :code:`Dataset`, list(Dataset) or None
        Output Dataset containing the training targets. If a Sequence
        is used for inputs, outputs will have no effect.
    batch_size : int or None
        Batch size. If set to None a batch size of 32 is used.
    epochs : int
        Number of epochs. Default: 1.
    verbose : int
        Verbosity level. See https://keras.io.
    callbacks : List(keras.callbacks.Callback)
        Callbacks to be applied during training. See https://keras.io/callbacks
    validation_data : tuple, Sequence or None
        Validation data can be a tuple (input_dataset, output_dataset),
        or (input_dataset, output_dataset, sample_weights), or
        a keras.utils.Sequence instance, or a list of validation chromosomes.
        The latter choice only works when using Cover and Bioseq datasets.
        This allows you to train on a dedicated set of chromosomes
        and to validate the performance on the respective held-out
        chromosomes. If None, validation is not applied.
    shuffle : boolean
        Shuffle batches. Default: True.
    class_weight : dict
        Class weights. See https://keras.io.
    sample_weight : np.array or None
        Sample weights. See https://keras.io.
    initial_epoch : int
        Initial epoch at which to start training.
    steps_per_epoch : int, None.
        Number of steps per epoch. If None, this value is determined from
        the dataset size and the batch_size.
    use_multiprocessing : boolean
        Whether to use multiprocessing. See https://keras.io. Default: False.
    workers : int
        Number of workers to use in multiprocessing mode. Default: 1.

    Examples
    --------

    .. code-block:: python

      model.fit(DATA, LABELS)

    """
    if not isinstance(inputs, Sequence):
        inputs = _convert_data(self.kerasmodel, inputs, 'input_layers')
        outputs = _convert_data(self.kerasmodel, outputs, 'output_layers')

    hyper_params = {
        'epochs': epochs,
        'batch_size': batch_size,
        'shuffle': shuffle,
        'class_weight': class_weight,
        'initial_epoch': initial_epoch,
        'steps_per_epoch': steps_per_epoch,
        'use_multiprocessing': use_multiprocessing,
        'workers': workers
    }

    self.logger.info('Fit: %s', self.name)
    if isinstance(inputs, Sequence):
        self.logger.info('using custom Sequence')
    else:
        self.logger.info("Input:")
        self.__dim_logging(inputs)
        self.logger.info("Output:")
        self.__dim_logging(outputs)
    self.timer = time.time()
    history = None
    self.logger.info("Hyper-parameters:")
    for par_ in hyper_params:
        self.logger.info('%s: %s', par_, str(hyper_params[par_]))

    callbacks = [] if callbacks is None else callbacks

    callbacks.append(LambdaCallback(on_epoch_end=lambda epoch, logs: self.logger.info(
        "epoch %s: %s",
        epoch + 1,
        ' '.join(["{}=".format(k) +
                  ('{:.4f}' if abs(logs[k]) > 1e-3
                   else '{:.4e}').format(logs[k]) for k in logs]))))

    if not os.path.exists(os.path.join(self.outputdir, 'evaluation')):
        os.mkdir(os.path.join(self.outputdir, 'evaluation'))
    if not os.path.exists(os.path.join(self.outputdir, 'evaluation', self.name)):
        os.mkdir(os.path.join(self.outputdir, 'evaluation', self.name))

    callbacks.append(CSVLogger(os.path.join(self.outputdir,
                                            'evaluation',
                                            self.name,
                                            'training.log')))

    if not batch_size:
        batch_size = 32

    if isinstance(inputs, Sequence):
        # input could be a sequence
        jseq = inputs
    else:
        jseq = JangguSequence(batch_size, inputs, outputs, sample_weight,
                              shuffle=shuffle)

    if isinstance(validation_data, tuple):
        valinputs = _convert_data(self.kerasmodel, validation_data[0],
                                  'input_layers')
        valoutputs = _convert_data(self.kerasmodel, validation_data[1],
                                   'output_layers')
        sweights = validation_data[2] if len(validation_data) == 3 else None
        valjseq = JangguSequence(batch_size, valinputs, valoutputs,
                                 sweights, shuffle=False)
    elif isinstance(validation_data, Sequence):
        valjseq = validation_data
    elif isinstance(validation_data, list) and isinstance(validation_data[0], str):
        # if the validation data is a list of chromosomes that should
        # be used as validation dataset we end up here.
        # This is only possible, however, if all input and output datasets
        # are Cover or Bioseq dataset.
        if not all(hasattr(datum, 'gindexer')
                   for datum in [jseq.inputs[k] for k in jseq.inputs] +
                                [jseq.outputs[k] for k in jseq.outputs]):
            raise ValueError("Not all datasets are Cover or Bioseq datasets,"
                             " which is required for this option.")

        # then split the original dataset into training and validation set.
        train, val = split_train_test((jseq.inputs, jseq.outputs),
                                      validation_data)

        traininp, trainoup = train
        valinp, valoup = val

        self.logger.info("Split in training and validation set:")
        self.logger.info("Training-Input:")
        self.__dim_logging(traininp)
        self.logger.info("Training-Output:")
        self.__dim_logging(trainoup)
        self.logger.info("Validation-Input:")
        self.__dim_logging(valinp)
        self.logger.info("Validation-Output:")
        self.__dim_logging(valoup)

        jseq = JangguSequence(jseq.batch_size,
                              _convert_data(self.kerasmodel, traininp,
                                            'input_layers'),
                              _convert_data(self.kerasmodel, trainoup,
                                            'output_layers'),
                              sample_weights=None, shuffle=jseq.shuffle)
        valjseq = JangguSequence(jseq.batch_size,
                                 _convert_data(self.kerasmodel, valinp,
                                               'input_layers'),
                                 _convert_data(self.kerasmodel, valoup,
                                               'output_layers'),
                                 sample_weights=None, shuffle=False)

    else:
        valjseq = None

    try:
        history = self.kerasmodel.fit_generator(
            jseq,
            epochs=epochs,
            validation_data=valjseq,
            class_weight=class_weight,
            initial_epoch=initial_epoch,
            shuffle=shuffle,
            use_multiprocessing=use_multiprocessing,
            max_queue_size=50,
            workers=workers,
            verbose=verbose,
            callbacks=callbacks)
    except Exception:  # pragma: no cover
        # ignore the linter warning, the exception
        # is reraised anyways.
        self.logger.exception('fit_generator failed:')
        raise

    self.logger.info('#' * 40)
    for k in history.history:
        self.logger.info('%s: %f', k, history.history[k][-1])
    self.logger.info('#' * 40)

    self.save()
    self._save_hyper(hyper_params)

    self.logger.info("Training finished after %1.3f s",
                     time.time() - self.timer)
    return history

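# Hedged usage sketch (not part of the library source): the three accepted
# forms of validation_data in fit. DATA, LABELS, VAL_DATA, VAL_LABELS, the
# sequences and the chromosome names are placeholders; the chromosome variant
# requires that all inputs and outputs are Cover or Bioseq datasets
# (i.e. they expose a gindexer).

# 1) a (inputs, outputs) tuple
model.fit(DATA, LABELS, epochs=30,
          validation_data=(VAL_DATA, VAL_LABELS))

# 2) a keras.utils.Sequence, e.g. a JangguSequence
model.fit(train_seq, epochs=30, validation_data=val_seq)

# 3) a list of held-out chromosomes (Cover/Bioseq datasets only)
model.fit(DATA, LABELS, epochs=30, validation_data=['chr1', 'chr10'])
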
layer = Dense(1, activation='sigmoid')(layer)

# the last one is used to make the dimensionality compatible with
# the coverage dataset dimensions.
# Alternatively, the ReduceDim dataset wrapper may be used to transform
# the output to a 2D dataset object.
output = Reshape((1, 1, 1), name="peaks")(layer)
model = Model(xin, output)
model.compile(optimizer='adadelta', loss='binary_crossentropy',
              metrics=['acc'])
model.summary()

trainseq = JangguSequence(DNA, LABELS, batch_size=32)

valseq = JangguSequence(DNA_TEST, LABELS_TEST)

hist = model.fit(trainseq, epochs=500, validation_data=valseq)

print('#' * 40)
print('loss: {}, acc: {}'.format(hist.history['loss'][-1],
                                 hist.history['acc'][-1]))
print('#' * 40)

# convert the prediction to a cover object
pred = model.predict(valseq)
cov_pred = Cover.create_from_array('BindingProba', pred, LABELS_TEST.gindexer)

print('Prediction score examples for Oct4')
for i in range(4):