def test_split_train_test():
    """split_train_test must partition a dataset (or list of datasets) by chromosome."""
    resource_dir = pkg_resources.resource_filename('janggu', 'resources/')
    roi_bed = os.path.join(resource_dir, 'sample.bed')
    genome_fa = os.path.join(resource_dir, 'sample_genome.fa')

    dataset = Bioseq.create_from_refgenome('dna',
                                           refgenome=genome_fa,
                                           storage='ndarray',
                                           roi=roi_bed,
                                           binsize=200,
                                           stepsize=200,
                                           order=1,
                                           store_whole_genome=True)

    # Single dataset: holding out chr2 should split the regions in half.
    train_part, test_part = split_train_test(dataset, holdout_chroms='chr2')
    assert len(train_part) == 50
    assert len(test_part) == 50
    assert len(dataset) == len(train_part) + len(test_part)

    # List of datasets: each element is split the same way.
    train_part, test_part = split_train_test([dataset, dataset],
                                             holdout_chroms='chr2')
    assert len(train_part[0]) == 50
    assert len(test_part[0]) == 50
    assert len(dataset) == len(train_part[0]) + len(test_part[0])
def fit(self,  # pylint: disable=too-many-locals
        inputs=None,
        outputs=None,
        batch_size=None,
        epochs=1,
        verbose=1,
        callbacks=None,
        validation_data=None,
        shuffle=True,
        class_weight=None,
        sample_weight=None,
        initial_epoch=0,
        steps_per_epoch=None,
        use_multiprocessing=False,
        workers=1):
    """Model fitting.

    This method is used to fit a given model.
    Most of the parameters are directly delegated to
    fit_generator of the keras model.

    Parameters
    ----------
    inputs : :code:`Dataset`, list(Dataset) or Sequence (keras.utils.Sequence)
        Input Dataset or Sequence to use for fitting the model.
    outputs : :code:`Dataset`, list(Dataset) or None
        Output Dataset containing the training targets. If a Sequence
        is used for inputs, outputs will have no effect.
    batch_size : int or None
        Batch size. If set to None a batch size of 32 is used.
    epochs : int
        Number of epochs. Default: 1.
    verbose : int
        Verbosity level. See https://keras.io.
    callbacks : List(keras.callbacks.Callback)
        Callbacks to be applied during training. See https://keras.io/callbacks
    validation_data : tuple, Sequence or None
        Validation data can be a tuple (input_dataset, output_dataset),
        or (input_dataset, output_dataset, sample_weights) or
        a keras.utils.Sequence instance or a list of validation chromosomes.
        The latter choice only works when using Cover and Bioseq datasets.
        This allows you to train on a dedicated set of chromosomes
        and to validate the performance on respective heldout chromosomes.
        If None, validation is not applied.
    shuffle : boolean
        shuffle batches. Default: True.
    class_weight : dict
        Class weights. See https://keras.io.
    sample_weight : np.array or None
        Sample weights. See https://keras.io.
    initial_epoch : int
        Initial epoch at which to start training.
    steps_per_epoch : int, None.
        Number of steps per epoch. If None, this value is determined from
        the dataset size and the batch_size.
    use_multiprocessing : boolean
        Whether to use multiprocessing. See https://keras.io. Default: False.
    workers : int
        Number of workers to use in multiprocessing mode. Default: 1.

    Examples
    --------

    .. code-block:: python

      model.fit(DATA, LABELS)
    """
    # Plain Datasets are converted to the keras layer-name keyed dicts;
    # a Sequence is assumed to already deliver properly keyed batches.
    if not isinstance(inputs, Sequence):
        inputs = _convert_data(self.kerasmodel, inputs, 'input_layers')
        outputs = _convert_data(self.kerasmodel, outputs, 'output_layers')

    # Collected purely for logging and for _save_hyper below.
    hyper_params = {
        'epochs': epochs,
        'batch_size': batch_size,
        'shuffle': shuffle,
        'class_weight': class_weight,
        'initial_epoch': initial_epoch,
        'steps_per_epoch': steps_per_epoch,
        'use_multiprocessing': use_multiprocessing,
        'workers': workers
    }

    self.logger.info('Fit: %s', self.name)
    if isinstance(inputs, Sequence):
        self.logger.info('using custom Sequence')
    else:
        self.logger.info("Input:")
        self.__dim_logging(inputs)
        self.logger.info("Output:")
        self.__dim_logging(outputs)
    self.timer = time.time()
    history = None
    self.logger.info("Hyper-parameters:")
    for par_ in hyper_params:
        self.logger.info('%s: %s', par_, str(hyper_params[par_]))

    # Never mutate a caller-supplied callbacks list in place via append
    # without first making our own list when None was passed.
    callbacks = [] if callbacks is None else callbacks

    # Log per-epoch metrics; small values switch to scientific notation
    # so they remain readable in the log.
    callbacks.append(LambdaCallback(on_epoch_end=lambda epoch, logs: self.logger.info(
        "epoch %s: %s",
        epoch + 1,
        ' '.join(["{}=".format(k) +
                  ('{:.4f}' if abs(logs[k]) > 1e-3
                   else '{:.4e}').format(logs[k]) for k in logs]))))

    # Ensure <outputdir>/evaluation/<name>/ exists for the CSV training log.
    if not os.path.exists(os.path.join(self.outputdir, 'evaluation')):
        os.mkdir(os.path.join(self.outputdir, 'evaluation'))
    if not os.path.exists(os.path.join(self.outputdir, 'evaluation', self.name)):
        os.mkdir(os.path.join(self.outputdir, 'evaluation', self.name))

    callbacks.append(CSVLogger(os.path.join(self.outputdir,
                                            'evaluation',
                                            self.name,
                                            'training.log')))

    if not batch_size:
        batch_size = 32

    if isinstance(inputs, Sequence):
        # input could be a sequence
        jseq = inputs
    else:
        jseq = JangguSequence(batch_size, inputs, outputs, sample_weight,
                              shuffle=shuffle)

    if isinstance(validation_data, tuple):
        # (inputs, outputs[, sample_weights]) tuple of Datasets.
        valinputs = _convert_data(self.kerasmodel, validation_data[0],
                                  'input_layers')
        valoutputs = _convert_data(self.kerasmodel, validation_data[1],
                                   'output_layers')
        sweights = validation_data[2] if len(validation_data) == 3 else None
        valjseq = JangguSequence(batch_size, valinputs, valoutputs,
                                 sweights, shuffle=False)
    elif isinstance(validation_data, Sequence):
        valjseq = validation_data
    elif isinstance(validation_data, list) and isinstance(validation_data[0], str):
        # if the validation data is a list of chromosomes that should
        # be used as validation dataset we end up here.
        # This is only possible, however, if all input and output datasets
        # are Cover or Bioseq dataset.
        if not all(hasattr(datum, 'gindexer') \
            for datum in [jseq.inputs[k] for k in jseq.inputs] +
                         [jseq.outputs[k] for k in jseq.outputs]):
            raise ValueError("Not all dataset are Cover or Bioseq dataset"
                             " which is required for this options.")

        # then split the original dataset into training and validation set.
        train, val = split_train_test((jseq.inputs, jseq.outputs),
                                      validation_data)

        traininp, trainoup = train
        valinp, valoup = val

        self.logger.info("Split in training and validation set:")
        self.logger.info("Training-Input:")
        self.__dim_logging(traininp)
        self.logger.info("Training-Output:")
        self.__dim_logging(trainoup)
        self.logger.info("Validation-Input:")
        self.__dim_logging(valinp)
        self.logger.info("Validation-Output:")
        self.__dim_logging(valoup)

        # Rebuild both sequences; per-batch sample weights are not carried
        # over into the chromosome-based split.
        jseq = JangguSequence(jseq.batch_size,
                              _convert_data(self.kerasmodel, traininp,
                                            'input_layers'),
                              _convert_data(self.kerasmodel, trainoup,
                                            'output_layers'),
                              sample_weights=None, shuffle=jseq.shuffle)
        valjseq = JangguSequence(jseq.batch_size,
                                 _convert_data(self.kerasmodel, valinp,
                                               'input_layers'),
                                 _convert_data(self.kerasmodel, valoup,
                                               'output_layers'),
                                 sample_weights=None, shuffle=False)
    else:
        valjseq = None

    try:
        history = self.kerasmodel.fit_generator(
            jseq,
            epochs=epochs,
            validation_data=valjseq,
            class_weight=class_weight,
            initial_epoch=initial_epoch,
            shuffle=shuffle,
            use_multiprocessing=use_multiprocessing,
            max_queue_size=50,
            workers=workers,
            verbose=verbose,
            callbacks=callbacks)
    except Exception:  # pragma: no cover
        # ignore the linter warning, the exception
        # is reraised anyways.
        self.logger.exception('fit_generator failed:')
        raise

    self.logger.info('#' * 40)
    for k in history.history:
        self.logger.info('%s: %f', k, history.history[k][-1])
    self.logger.info('#' * 40)

    self.save()
    self._save_hyper(hyper_params)

    self.logger.info("Training finished after %1.3f s",
                     time.time() - self.timer)
    return history
def objective(params):
    """Hyperopt objective: train and evaluate one model configuration.

    Builds (or reloads) a model according to ``params``, fits it on the
    training chromosomes with chromosome-based validation, evaluates on
    train/val/test splits and returns a hyperopt result dict whose loss
    is the validation MAE.
    """
    print(params)
    try:
        train_data = get_data(params)
        # First hold out the test chromosome, then the validation chromosome.
        train_data, test = split_train_test(train_data, [test_chrom])
        train, val = split_train_test(train_data, [params['val_chrom']])

        # define a keras model only based on DNA
        K.clear_session()
        if params['inputs'] == 'epi_dna':
            # Joint model: concatenate the penultimate layers of the two
            # previously trained single-modality models.
            dnam = Janggu.create_by_name('cage_promoters_dna_only')
            epim = Janggu.create_by_name('cage_promoters_epi_only')
            layer = Concatenate()([dnam.kerasmodel.layers[-2].output,
                                   epim.kerasmodel.layers[-2].output])
            layer = Dense(1, name='geneexpr')(layer)
            model = Janggu([dnam.kerasmodel.input] + epim.kerasmodel.input,
                           layer, name='cage_promoters_epi_dna')

            if not params['pretrained']:
                # This part randomly reinitializes the network
                # so that we can train it from scratch
                newjointmodel = model_from_json(model.kerasmodel.to_json())
                newjointmodel = Janggu(newjointmodel.inputs,
                                       newjointmodel.outputs,
                                       name='cage_promoters_epi_dna_randominit')
                model = newjointmodel
        else:
            model = Janggu.create(get_model, params,
                                  train_data[0], train_data[1],
                                  name='cage_promoters_{}'.format(
                                      params['inputs']))
    except ValueError:
        # A ValueError during data loading / model construction marks this
        # hyperparameter combination as failed rather than aborting the search.
        main_logger.exception('objective:')
        return {'status': 'fail'}

    model.compile(optimizer=get_opt(params['opt']), loss='mae',
                  metrics=['mse'])

    # validation_data as a chromosome list triggers janggu's internal
    # chromosome-based validation split.
    hist = model.fit(train_data[0], train_data[1],
                     epochs=params['epochs'],
                     batch_size=64,
                     validation_data=[params['val_chrom']],
                     callbacks=[EarlyStopping(patience=5,
                                              restore_best_weights=True)])

    print('#' * 40)
    for key in hist.history:
        print('{}: {}'.format(key, hist.history[key][-1]))
    print('#' * 40)

    pred_train = model.predict(train[0])
    pred_val = model.predict(val[0])
    pred_test = model.predict(test[0])

    model.evaluate(train[0], train[1],
                   callbacks=['var_explained', 'mse', 'mae', 'cor'],
                   datatags=['train'])
    mae_val = model.evaluate(val[0], val[1],
                             callbacks=['var_explained', 'mse', 'mae', 'cor'],
                             datatags=['val'])
    # First entry of the evaluate output is the loss (MAE) — used as
    # the hyperopt objective value.
    mae_val = mae_val[0]
    model.evaluate(test[0], test[1],
                   callbacks=['var_explained', 'mse', 'mae', 'cor'],
                   datatags=['test'])

    # Pearson correlation between observed and predicted signal per split.
    cor_train = np.corrcoef(train[1][:][:, 0], pred_train[:, 0])[0, 1]
    cor_val = np.corrcoef(val[1][:][:, 0], pred_val[:, 0])[0, 1]
    cor_test = np.corrcoef(test[1][:][:, 0], pred_test[:, 0])[0, 1]

    model.summary()
    main_logger.info('cor [train/val/test]: {:.2f}/{:.2f}/{:.2f}'.format(
        cor_train, cor_val, cor_test))

    return {'loss': mae_val,
            'status': 'ok',
            'all_losses': hist.history,
            'cor_train': cor_train,
            'cor_val': cor_val,
            'cor_test': cor_test,
            'model_config': model.kerasmodel.to_json(),
            'model_weights': model.kerasmodel.get_weights(),
            'concrete_params': params}
shared_space['inputs'] = 'epi_dna' shared_space['pretrained'] = False res = objective(shared_space) write_results(shared_space, res) else: print('no training') shared_space['val_chrom'] = "chr22" shared_space['order'] = dnaorder shared_space['pretrained'] = False shared_space['seq_dropout'] = 0.2 shared_space['inputs'] = 'epi_dna' params = shared_space train_data = get_data(params) train, test = split_train_test(train_data, [test_chrom]) model = Janggu.create_by_name('cage_promoters_epi_dna') testpred = model.predict(test[0]) fig, ax = plt.subplots() ax.scatter(test[1][:], testpred) ax.set_xlabel('Observed normalized CAGE signal') ax.set_ylabel('Predicted normalized CAGE signal') fig.savefig( os.path.join(os.environ['JANGGU_OUTPUT'], 'cage_promoter_testchrom_agreement.png')) fig, ax = plt.subplots() ax.scatter(test[1][:], testpred)
def get_data(params):
    """Load labels and input datasets and split them by chromosome.

    Builds the JunD peak labels and, depending on ``params['type']``
    ('dna_only', 'dnase_dna' or 'dnase_bam_only'), the DNA sequence and/or
    DNase accessibility datasets.

    Returns
    -------
    tuple
        ((train_inputs, train_labels), (val_inputs, val_labels),
        (test_inputs, test_labels)). Training is on chr1, validation on
        chr2, testing on chr3 (with swapped DNase samples for the test set).
    """
    binsize = params['binsize']
    # PEAKS
    LABELS = ReduceDim(Cover.create_from_bed('peaks', bedfiles=PEAKS, roi=ROI,
                                             binsize=binsize,
                                             conditions=['JunD'],
                                             resolution=binsize,
                                             store_whole_genome=True,
                                             storage='sparse',
                                             cache=True),
                       aggregator='max')

    # training on chr1, validation on chr2, test on chr3
    # with swapped Dnase samples
    LABELS, LABELS_TEST = split_train_test(LABELS, 'chr3')
    LABELS_TRAIN, LABELS_VAL = split_train_test(LABELS, 'chr2')

    if params['type'] in ['dna_only', 'dnase_dna']:
        dnaflank = params['dnaflank']
        order = params['order']
        # DNA
        DNA = Bioseq.create_from_refgenome('dna', refgenome=REFGENOME,
                                           roi=ROI, binsize=binsize,
                                           flank=dnaflank, order=order,
                                           cache=True,
                                           store_whole_genome=True)
        DNA, DNA_TEST = split_train_test(DNA, 'chr3')
        DNA_TRAIN, DNA_VAL = split_train_test(DNA, 'chr2')

    if params['type'] in ['dnase_bam_only', 'dnase_dna']:
        dnaseflank = params['dnaseflank']
        # ACCESSIBILITY
        # Test set uses the two DNase samples in swapped order so the model
        # is tested on the sample assignment it was not trained on.
        ACCESS_TEST = Cover.create_from_bam(
            'dnase',
            bamfiles=[DNASE_STAM_ENCODE, DNASE_STAM_ROADMAP],
            roi=ROI, binsize=binsize,
            conditions=['Encode', 'Roadmap'],
            flank=dnaseflank, resolution=50,
            normalizer=params['normalize'],
            store_whole_genome=True, cache=True)
        ACCESS = Cover.create_from_bam(
            'dnase', roi=ROI,
            bamfiles=[DNASE_STAM_ROADMAP, DNASE_STAM_ENCODE],
            binsize=binsize,
            conditions=['Roadmap', 'Encode'],
            resolution=50, flank=dnaseflank,
            normalizer=params['normalize'],
            store_whole_genome=True, cache=True)

        _, ACCESS_TEST = split_train_test(ACCESS_TEST, 'chr3')
        ACCESS, _ = split_train_test(ACCESS, 'chr3')
        ACCESS_TRAIN, ACCESS_VAL = split_train_test(ACCESS, 'chr2')

        # BUGFIX: this guard previously tested for 'dna_dnase', a type value
        # that does not exist anywhere else in this function (valid values
        # are 'dna_only', 'dnase_dna', 'dnase_bam_only'), so augmentation was
        # silently never applied to the combined DNA+DNase model.
        if params['type'] in ['dnase_dna', 'dnase_bam_only']:
            if params['augment'] == 'orient':
                ACCESS_TRAIN = RandomOrientation(ACCESS_TRAIN)
            if params['augment'] == 'scale':
                ACCESS_TRAIN = RandomSignalScale(ACCESS_TRAIN, 0.1)
            if params['augment'] == 'both':
                ACCESS_TRAIN = RandomSignalScale(
                    RandomOrientation(ACCESS_TRAIN), 0.1)

    if params['type'] == 'dna_only':
        return (DNA_TRAIN, LABELS_TRAIN), \
               (DNA_VAL, LABELS_VAL), \
               (DNA_TEST, LABELS_TEST)
    elif params['type'] == 'dnase_dna':
        return ([DNA_TRAIN, ACCESS_TRAIN], LABELS_TRAIN), \
               ([DNA_VAL, ACCESS_VAL], LABELS_VAL), \
               ([DNA_TEST, ACCESS_TEST], LABELS_TEST)
    elif params['type'] in ['dnase_bam_only']:
        return ([ACCESS_TRAIN], LABELS_TRAIN), \
               ([ACCESS_VAL], LABELS_VAL), \
               ([ACCESS_TEST], LABELS_TEST)