Example #1
import os

import pkg_resources

from janggu.data import Bioseq
from janggu.data import split_train_test


def test_split_train_test():
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(data_path, 'sample.bed')

    refgenome = os.path.join(data_path, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna',
                                       refgenome=refgenome,
                                       storage='ndarray',
                                       roi=bed_file,
                                       binsize=200,
                                       stepsize=200,
                                       order=1,
                                       store_whole_genome=True)

    traindna, testdna = split_train_test(dna, holdout_chroms='chr2')

    assert len(traindna) == 50
    assert len(testdna) == 50
    assert len(dna) == len(traindna) + len(testdna)

    traindna, testdna = split_train_test([dna, dna], holdout_chroms='chr2')

    assert len(traindna[0]) == 50
    assert len(testdna[0]) == 50
    assert len(dna) == len(traindna[0]) + len(testdna[0])
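
The same helper also accepts a list of chromosome names for holdout_chroms,
which is how the fit method in the next example passes validation
chromosomes through. A minimal sketch on the fixtures above:

    traindna, testdna = split_train_test(dna, holdout_chroms=['chr2'])
    # the two subsets still partition the original dataset
    assert len(dna) == len(traindna) + len(testdna)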
Example #2
    def fit(self,  # pylint: disable=too-many-locals
            inputs=None,
            outputs=None,
            batch_size=None,
            epochs=1,
            verbose=1,
            callbacks=None,
            validation_data=None,
            shuffle=True,
            class_weight=None,
            sample_weight=None,
            initial_epoch=0,
            steps_per_epoch=None,
            use_multiprocessing=False,
            workers=1):
        """Model fitting.

        This method fits a given model.
        Most of the parameters are delegated directly to the
        fit_generator method of the underlying keras model.

        Parameters
        ----------
        inputs : :code:`Dataset`, list(Dataset) or Sequence (keras.utils.Sequence)
            Input Dataset or Sequence to use for fitting the model.
        outputs : :code:`Dataset`, list(Dataset) or None
            Output Dataset containing the training targets. If a Sequence
            is used for inputs, outputs will have no effect.
        batch_size : int or None
            Batch size. If set to None a batch size of 32 is used.
        epochs : int
            Number of epochs. Default: 1.
        verbose : int
            Verbosity level. See https://keras.io.
        callbacks : List(keras.callbacks.Callback)
            Callbacks to be applied during training. See https://keras.io/callbacks
        validation_data : tuple, Sequence or None
            Validation data can be a tuple (input_dataset, output_dataset),
            a tuple (input_dataset, output_dataset, sample_weights),
            a keras.utils.Sequence instance, or a list of validation
            chromosome names. The latter option only works with Cover and
            Bioseq datasets. It allows you to train on a dedicated set of
            chromosomes and to validate the performance on the respective
            held-out chromosomes. If None, no validation is applied.
        shuffle : boolean
            Whether to shuffle the batches. Default: True.
        class_weight : dict
            Class weights. See https://keras.io.
        sample_weight : np.array or None
            Sample weights. See https://keras.io.
        initial_epoch : int
            Initial epoch at which to start training.
        steps_per_epoch : int or None
            Number of steps per epoch. If None, this value is determined from
            the dataset size and the batch_size.
        use_multiprocessing : boolean
            Whether to use multiprocessing. See https://keras.io. Default: False.
        workers : int
            Number of workers to use in multiprocessing mode. Default: 1.

        Examples
        --------

        .. code-block:: python

          model.fit(DATA, LABELS)
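
          # Validation on held-out chromosomes works for Cover/Bioseq
          # datasets ('chr3' is a hypothetical example):
          model.fit(DATA, LABELS, epochs=10, validation_data=['chr3'])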

        """

        if not isinstance(inputs, Sequence):
            inputs = _convert_data(self.kerasmodel, inputs, 'input_layers')
            outputs = _convert_data(self.kerasmodel, outputs, 'output_layers')

        hyper_params = {
            'epochs': epochs,
            'batch_size': batch_size,
            'shuffle': shuffle,
            'class_weight': class_weight,
            'initial_epoch': initial_epoch,
            'steps_per_epoch': steps_per_epoch,
            'use_multiprocessing': use_multiprocessing,
            'workers': workers
        }

        self.logger.info('Fit: %s', self.name)
        if isinstance(inputs, Sequence):
            self.logger.info('using custom Sequence')
        else:
            self.logger.info("Input:")
            self.__dim_logging(inputs)
            self.logger.info("Output:")
            self.__dim_logging(outputs)
        self.timer = time.time()
        history = None
        self.logger.info("Hyper-parameters:")
        for par_ in hyper_params:
            self.logger.info('%s: %s', par_, str(hyper_params[par_]))

        callbacks = [] if callbacks is None else callbacks

        callbacks.append(LambdaCallback(on_epoch_end=lambda epoch, logs: self.logger.info(
            "epoch %s: %s",
            epoch + 1,
            ' '.join(["{}=".format(k) +
                      ('{:.4f}' if
                       abs(logs[k]) > 1e-3
                       else '{:.4e}').format(logs[k]) for k in logs]))))

        # Ensure <outputdir>/evaluation/<name>/ exists and log the
        # per-epoch metrics to a training.log file within it.
        if not os.path.exists(os.path.join(self.outputdir, 'evaluation')):
            os.mkdir(os.path.join(self.outputdir, 'evaluation'))
        if not os.path.exists(os.path.join(self.outputdir, 'evaluation', self.name)):
            os.mkdir(os.path.join(self.outputdir, 'evaluation', self.name))

        callbacks.append(CSVLogger(os.path.join(self.outputdir,
                                                'evaluation',
                                                self.name,
                                                'training.log')))

        if not batch_size:
            batch_size = 32

        if isinstance(inputs, Sequence):
            # input could be a sequence
            jseq = inputs
        else:
            jseq = JangguSequence(batch_size, inputs, outputs, sample_weight, shuffle=shuffle)

        if isinstance(validation_data, tuple):
            valinputs = _convert_data(self.kerasmodel, validation_data[0],
                                      'input_layers')
            valoutputs = _convert_data(self.kerasmodel, validation_data[1],
                                       'output_layers')
            sweights = validation_data[2] if len(validation_data) == 3 else None
            valjseq = JangguSequence(batch_size, valinputs, valoutputs, sweights, shuffle=False)
        elif isinstance(validation_data, Sequence):
            valjseq = validation_data
        elif isinstance(validation_data, list) and isinstance(validation_data[0], str):
            # if the validation data is a list of chromosomes that should
            # be used as the validation set, we end up here.

            # This is only possible, however, if all input and output
            # datasets are Cover or Bioseq datasets.
            if not all(hasattr(datum, 'gindexer') \
                for datum in [jseq.inputs[k] for k in jseq.inputs] +
                       [jseq.outputs[k] for k in jseq.outputs]):
                raise ValueError("Not all datasets are Cover or Bioseq "
                                 "datasets, which is required for this option.")

            # then split the original dataset into training and validation set.
            train, val = split_train_test((jseq.inputs, jseq.outputs), validation_data)

            traininp, trainoup = train
            valinp, valoup = val

            self.logger.info("Split in training and validation set:")
            self.logger.info("Training-Input:")
            self.__dim_logging(traininp)
            self.logger.info("Training-Output:")
            self.__dim_logging(trainoup)
            self.logger.info("Validation-Input:")
            self.__dim_logging(valinp)
            self.logger.info("Validation-Output:")
            self.__dim_logging(valoup)
            jseq = JangguSequence(jseq.batch_size,
                                  _convert_data(self.kerasmodel, traininp,
                                                'input_layers'),
                                  _convert_data(self.kerasmodel, trainoup,
                                                'output_layers'),
                                  sample_weights=None, shuffle=jseq.shuffle)
            valjseq = JangguSequence(jseq.batch_size,
                                     _convert_data(self.kerasmodel, valinp,
                                                   'input_layers'),
                                     _convert_data(self.kerasmodel, valoup,
                                                   'output_layers'),
                                     sample_weights=None, shuffle=False)

        else:
            valjseq = None


        try:
            history = self.kerasmodel.fit_generator(
                jseq,
                epochs=epochs,
                validation_data=valjseq,
                class_weight=class_weight,
                initial_epoch=initial_epoch,
                shuffle=shuffle,
                use_multiprocessing=use_multiprocessing,
                max_queue_size=50,
                workers=workers,
                verbose=verbose,
                callbacks=callbacks)
        except Exception:  # pragma: no cover
            # ignore the linter warning, the exception
            # is reraised anyways.
            self.logger.exception('fit_generator failed:')
            raise

        self.logger.info('#' * 40)
        for k in history.history:
            self.logger.info('%s: %f', k, history.history[k][-1])
        self.logger.info('#' * 40)

        self.save()
        self._save_hyper(hyper_params)

        self.logger.info("Training finished after %1.3f s",
                         time.time() - self.timer)
        return history
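
For context, a minimal sketch of how this method is typically reached,
assuming DNA is a Bioseq dataset named 'dna', LABELS is a Cover dataset, and
the network template follows janggu's decorator convention (all names below
are placeholders):

    from keras.layers import Conv2D, Flatten

    from janggu import Janggu, inputlayer, outputdense


    @inputlayer
    @outputdense('sigmoid')
    def _cnn_template(inputs, inp, oup, params):
        # a single convolution over the one-hot encoded sequence
        layer = Conv2D(params[0], (params[1], 1),
                       activation='relu')(inputs['dna'])
        layer = Flatten()(layer)
        return inputs, layer


    model = Janggu.create(_cnn_template, modelparams=(30, 21),
                          inputs=DNA, outputs=LABELS,
                          name='example_cnn')
    model.compile(optimizer='adam', loss='binary_crossentropy')

    # 'chr2' is held out for validation; this exercises the
    # list-of-chromosomes branch implemented above.
    history = model.fit(DNA, LABELS, epochs=5, validation_data=['chr2'])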
Example #3
def objective(params):
    print(params)
    try:
        train_data = get_data(params)
        train_data, test = split_train_test(train_data, [test_chrom])
        train, val = split_train_test(train_data, [params['val_chrom']])
        # assemble the keras model according to params['inputs']
        K.clear_session()
        if params['inputs'] == 'epi_dna':
            dnam = Janggu.create_by_name('cage_promoters_dna_only')
            epim = Janggu.create_by_name('cage_promoters_epi_only')
            layer = Concatenate()([
                dnam.kerasmodel.layers[-2].output,
                epim.kerasmodel.layers[-2].output
            ])
            layer = Dense(1, name='geneexpr')(layer)
            model = Janggu([dnam.kerasmodel.input] + epim.kerasmodel.input,
                           layer,
                           name='cage_promoters_epi_dna')

            if not params['pretrained']:
                # This part randomly reinitializes the network
                # so that we can train it from scratch
                newjointmodel = model_from_json(model.kerasmodel.to_json())

                newjointmodel = Janggu(
                    newjointmodel.inputs,
                    newjointmodel.outputs,
                    name='cage_promoters_epi_dna_randominit')
                model = newjointmodel
        else:
            model = Janggu.create(get_model,
                                  params,
                                  train_data[0],
                                  train_data[1],
                                  name='cage_promoters_{}'.format(
                                      params['inputs']))
    except ValueError:
        main_logger.exception('objective:')
        return {'status': 'fail'}
    model.compile(optimizer=get_opt(params['opt']),
                  loss='mae',
                  metrics=['mse'])
    hist = model.fit(
        train_data[0],
        train_data[1],
        epochs=params['epochs'],
        batch_size=64,
        validation_data=[params['val_chrom']],
        callbacks=[EarlyStopping(patience=5, restore_best_weights=True)])
    print('#' * 40)
    for key in hist.history:
        print('{}: {}'.format(key, hist.history[key][-1]))
    print('#' * 40)
    pred_train = model.predict(train[0])
    pred_val = model.predict(val[0])
    pred_test = model.predict(test[0])
    model.evaluate(train[0],
                   train[1],
                   callbacks=['var_explained', 'mse', 'mae', 'cor'],
                   datatags=['train'])
    mae_val = model.evaluate(val[0],
                             val[1],
                             callbacks=['var_explained', 'mse', 'mae', 'cor'],
                             datatags=['val'])
    mae_val = mae_val[0]
    model.evaluate(test[0],
                   test[1],
                   callbacks=['var_explained', 'mse', 'mae', 'cor'],
                   datatags=['test'])

    cor_train = np.corrcoef(train[1][:][:, 0], pred_train[:, 0])[0, 1]
    cor_val = np.corrcoef(val[1][:][:, 0], pred_val[:, 0])[0, 1]
    cor_test = np.corrcoef(test[1][:][:, 0], pred_test[:, 0])[0, 1]

    model.summary()
    main_logger.info('cor [train/val/test]: {:.2f}/{:.2f}/{:.2f}'.format(
        cor_train, cor_val, cor_test))
    return {
        'loss': mae_val,
        'status': 'ok',
        'all_losses': hist.history,
        'cor_train': cor_train,
        'cor_val': cor_val,
        'cor_test': cor_test,
        'model_config': model.kerasmodel.to_json(),
        'model_weights': model.kerasmodel.get_weights(),
        'concrete_params': params
    }
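
The dictionary returned above ('status', 'loss', plus extras) follows the
hyperopt protocol, so objective can be driven, for example, as sketched
below. The search-space keys are illustrative; objective and get_data read
additional keys not shown here:

    from hyperopt import Trials, fmin, hp, tpe

    space = {
        'inputs': hp.choice('inputs', ['dna_only', 'epi_only', 'epi_dna']),
        'opt': hp.choice('opt', ['adam', 'amsgrad']),
        'epochs': 30,
        'val_chrom': 'chr22',
        'pretrained': True,
    }

    trials = Trials()
    best = fmin(objective, space, algo=tpe.suggest,
                max_evals=20, trials=trials)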
Example #4
        shared_space['inputs'] = 'epi_dna'
        shared_space['pretrained'] = False
        res = objective(shared_space)
        write_results(shared_space, res)
else:
    print('no training')

shared_space['val_chrom'] = "chr22"
shared_space['order'] = dnaorder
shared_space['pretrained'] = False
shared_space['seq_dropout'] = 0.2
shared_space['inputs'] = 'epi_dna'
params = shared_space
train_data = get_data(params)
train, test = split_train_test(train_data, [test_chrom])

model = Janggu.create_by_name('cage_promoters_epi_dna')

testpred = model.predict(test[0])

fig, ax = plt.subplots()
ax.scatter(test[1][:], testpred)
ax.set_xlabel('Observed normalized CAGE signal')
ax.set_ylabel('Predicted normalized CAGE signal')
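# Hypothetical annotation mirroring example #3: report the Pearson
# correlation between observed and predicted test signal on the figure
# (assumes numpy is imported as np).
cor_test = np.corrcoef(test[1][:][:, 0], testpred[:, 0])[0, 1]
ax.set_title('Pearson r = {:.2f}'.format(cor_test))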
fig.savefig(
    os.path.join(os.environ['JANGGU_OUTPUT'],
                 'cage_promoter_testchrom_agreement.png'))

Example #5
def get_data(params):
    binsize = params['binsize']

    # PEAKS
    LABELS = ReduceDim(Cover.create_from_bed('peaks',
                                             bedfiles=PEAKS,
                                             roi=ROI,
                                             binsize=binsize,
                                             conditions=['JunD'],
                                             resolution=binsize,
                                             store_whole_genome=True,
                                             storage='sparse',
                                             cache=True),
                       aggregator='max')

    # training on chr1, validation on chr2, test on chr3 with swapped Dnase samples
    LABELS, LABELS_TEST = split_train_test(LABELS, 'chr3')
    LABELS_TRAIN, LABELS_VAL = split_train_test(LABELS, 'chr2')
    if params['type'] in ['dna_only', 'dnase_dna']:
        dnaflank = params['dnaflank']
        order = params['order']
        # DNA
        DNA = Bioseq.create_from_refgenome('dna',
                                           refgenome=REFGENOME,
                                           roi=ROI,
                                           binsize=binsize,
                                           flank=dnaflank,
                                           order=order,
                                           cache=True,
                                           store_whole_genome=True)

        DNA, DNA_TEST = split_train_test(DNA, 'chr3')
        DNA_TRAIN, DNA_VAL = split_train_test(DNA, 'chr2')
    if params['type'] in ['dnase_bam_only', 'dnase_dna']:

        dnaseflank = params['dnaseflank']
        # ACCESSIBILITY
        ACCESS_TEST = Cover.create_from_bam(
            'dnase',
            bamfiles=[DNASE_STAM_ENCODE, DNASE_STAM_ROADMAP],
            roi=ROI,
            binsize=binsize,
            conditions=['Encode', 'Roadmap'],
            flank=dnaseflank,
            resolution=50,
            normalizer=params['normalize'],
            store_whole_genome=True,
            cache=True)
        ACCESS = Cover.create_from_bam(
            'dnase',
            roi=ROI,
            bamfiles=[DNASE_STAM_ROADMAP, DNASE_STAM_ENCODE],
            binsize=binsize,
            conditions=['Roadmap', 'Encode'],
            resolution=50,
            flank=dnaseflank,
            normalizer=params['normalize'],
            store_whole_genome=True,
            cache=True)

        _, ACCESS_TEST = split_train_test(ACCESS_TEST, 'chr3')
        ACCESS, _ = split_train_test(ACCESS, 'chr3')
        ACCESS_TRAIN, ACCESS_VAL = split_train_test(ACCESS, 'chr2')

    if params['type'] in ['dnase_dna', 'dnase_bam_only']:
        if params['augment'] == 'orient':
            ACCESS_TRAIN = RandomOrientation(ACCESS_TRAIN)
        if params['augment'] == 'scale':
            ACCESS_TRAIN = RandomSignalScale(ACCESS_TRAIN, 0.1)
        if params['augment'] == 'both':
            ACCESS_TRAIN = RandomSignalScale(RandomOrientation(ACCESS_TRAIN),
                                             0.1)

    if params['type'] == 'dna_only':
        return (DNA_TRAIN, LABELS_TRAIN), (DNA_VAL, LABELS_VAL), \
               (DNA_TEST, LABELS_TEST)
    elif params['type'] == 'dnase_dna':
        return ([DNA_TRAIN, ACCESS_TRAIN], LABELS_TRAIN), \
                ([DNA_VAL, ACCESS_VAL], LABELS_VAL),\
               ([DNA_TEST, ACCESS_TEST], LABELS_TEST)
    elif params['type'] in ['dnase_bam_only']:
        return ([ACCESS_TRAIN], LABELS_TRAIN), \
               ([ACCESS_VAL], LABELS_VAL), \
               ([ACCESS_TEST], LABELS_TEST)
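
A sketch of how get_data might be invoked; the keys below are exactly the
ones read above, while the values are hypothetical:

    params = {'type': 'dnase_dna',
              'binsize': 200,
              'dnaflank': 150,
              'order': 2,
              'dnaseflank': 450,
              'normalize': 'zscorelog',
              'augment': 'both'}

    (train_in, train_out), (val_in, val_out), (test_in, test_out) = \
        get_data(params)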