# Exemplo n.º 1 (score: 0)
def top_level_task():
    """FlexFlow driver: load the Uno combined drug-response dataset,
    optionally export it (CSV or HDF5) and return, otherwise build the
    combined model and train it for ``args.epochs`` epochs.

    All settings come from ``initialize_parameters()``; this function
    takes no arguments and returns nothing.
    """
    params = initialize_parameters()
    args = Struct(**params)
    # Creating FFConfig/FFModel initializes the FlexFlow runtime; the
    # ffmodel handle itself is unused because build_model() constructs
    # the network that is actually trained.
    ffconfig = FFConfig()
    ffmodel = FFModel(ffconfig)
    loader = CombinedDataLoader(seed=args.rng_seed)
    print(loader)
    loader.load(
        cache=args.cache,
        ncols=args.feature_subsample,
        agg_dose=args.agg_dose,
        cell_features=args.cell_features,
        drug_features=args.drug_features,
        drug_median_response_min=args.drug_median_response_min,
        drug_median_response_max=args.drug_median_response_max,
        use_landmark_genes=args.use_landmark_genes,
        use_filtered_genes=args.use_filtered_genes,
        cell_feature_subset_path=args.cell_feature_subset_path
        or args.feature_subset_path,
        drug_feature_subset_path=args.drug_feature_subset_path
        or args.feature_subset_path,
        preprocess_rnaseq=args.preprocess_rnaseq,
        single=args.single,
        train_sources=args.train_sources,
        test_sources=args.test_sources,
        embed_feature_source=not args.no_feature_source,
        encode_response_source=not args.no_response_source,
        use_exported_data=args.use_exported_data,
    )

    # Response column to predict: per-dose 'Growth' unless dose-aggregated.
    target = args.agg_dose or 'Growth'
    val_split = args.val_split
    train_split = 1 - val_split

    if args.export_csv:
        # Export mode 1: dump the full train+val table to a TSV and return.
        fname = args.export_csv
        loader.partition_data(cv_folds=args.cv,
                              train_split=train_split,
                              val_split=val_split,
                              cell_types=args.cell_types,
                              by_cell=args.by_cell,
                              by_drug=args.by_drug,
                              cell_subset_path=args.cell_subset_path,
                              drug_subset_path=args.drug_subset_path)
        train_gen = CombinedDataGenerator(loader,
                                          batch_size=args.batch_size,
                                          shuffle=args.shuffle)
        val_gen = CombinedDataGenerator(loader,
                                        partition='val',
                                        batch_size=args.batch_size,
                                        shuffle=args.shuffle)

        x_train_list, y_train = train_gen.get_slice(size=train_gen.size,
                                                    dataframe=True,
                                                    single=args.single)
        x_val_list, y_val = val_gen.get_slice(size=val_gen.size,
                                              dataframe=True,
                                              single=args.single)
        # Response column first, then all feature frames, train above val.
        df_train = pd.concat([y_train] + x_train_list, axis=1)
        df_val = pd.concat([y_val] + x_val_list, axis=1)
        df = pd.concat([df_train, df_val]).reset_index(drop=True)
        if args.growth_bins > 1:
            df = uno_data.discretize(df, 'Growth', bins=args.growth_bins)
        df.to_csv(fname, sep='\t', index=False, float_format="%.3g")
        return

    if args.export_data:
        # Export mode 2: stream batches into an HDF5 store and return.
        fname = args.export_data
        loader.partition_data(cv_folds=args.cv,
                              train_split=train_split,
                              val_split=val_split,
                              cell_types=args.cell_types,
                              by_cell=args.by_cell,
                              by_drug=args.by_drug,
                              cell_subset_path=args.cell_subset_path,
                              drug_subset_path=args.drug_subset_path)
        train_gen = CombinedDataGenerator(loader,
                                          batch_size=args.batch_size,
                                          shuffle=args.shuffle)
        val_gen = CombinedDataGenerator(loader,
                                        partition='val',
                                        batch_size=args.batch_size,
                                        shuffle=args.shuffle)
        store = pd.HDFStore(fname, complevel=9, complib='blosc:snappy')

        # Minimum string-column widths for the HDF5 table format.
        config_min_itemsize = {'Sample': 30, 'Drug1': 10}
        if not args.single:
            config_min_itemsize['Drug2'] = 10

        for partition in ['train', 'val']:
            gen = train_gen if partition == 'train' else val_gen
            for i in range(gen.steps):
                x_list, y = gen.get_slice(size=args.batch_size,
                                          dataframe=True,
                                          single=args.single)
                for j, input_feature in enumerate(x_list):
                    input_feature.columns = [''] * len(input_feature.columns)
                    # NOTE(review): pandas HDFStore.append expects
                    # 'data_columns'; 'data_column' may be silently
                    # ignored -- verify against the DataFeeder reader.
                    store.append('x_{}_{}'.format(partition, j),
                                 input_feature.astype('float32'),
                                 format='table',
                                 data_column=True)
                store.append('y_{}'.format(partition),
                             y.astype({target: 'float32'}),
                             format='table',
                             data_column=True,
                             min_itemsize=config_min_itemsize)
                print('Generating {} dataset. {} / {}'.format(
                    partition, i, gen.steps))

        # save input_features and feature_shapes from loader
        store.put('model', pd.DataFrame())
        store.get_storer('model').attrs.input_features = loader.input_features
        store.get_storer('model').attrs.feature_shapes = loader.feature_shapes

        store.close()
        print('Completed generating {}'.format(fname))
        return

    # Pre-exported data already carries its partitioning.
    if args.use_exported_data is None:
        loader.partition_data(cv_folds=args.cv,
                              train_split=train_split,
                              val_split=val_split,
                              cell_types=args.cell_types,
                              by_cell=args.by_cell,
                              by_drug=args.by_drug,
                              cell_subset_path=args.cell_subset_path,
                              drug_subset_path=args.drug_subset_path)

    model = build_model(loader, args)
    print(model.summary())
    opt = flexflow.keras.optimizers.SGD()
    model.compile(optimizer=opt,
                  loss='mean_squared_error',
                  metrics=['accuracy', 'mean_squared_error'])

    # This driver trains a single data fold (no cross-validation loop).
    # Fix: 'fold' was referenced below without ever being assigned, which
    # raised NameError whenever use_exported_data was unset.
    fold = 0

    if args.use_exported_data is not None:
        train_gen = DataFeeder(filename=args.use_exported_data,
                               batch_size=args.batch_size,
                               shuffle=args.shuffle,
                               single=args.single,
                               agg_dose=args.agg_dose)
        val_gen = DataFeeder(partition='val',
                             filename=args.use_exported_data,
                             batch_size=args.batch_size,
                             shuffle=args.shuffle,
                             single=args.single,
                             agg_dose=args.agg_dose)
        test_gen = DataFeeder(partition='test',
                              filename=args.use_exported_data,
                              batch_size=args.batch_size,
                              shuffle=args.shuffle,
                              single=args.single,
                              agg_dose=args.agg_dose)
    else:
        train_gen = CombinedDataGenerator(loader,
                                          fold=fold,
                                          batch_size=args.batch_size,
                                          shuffle=args.shuffle,
                                          single=args.single)
        val_gen = CombinedDataGenerator(loader,
                                        partition='val',
                                        fold=fold,
                                        batch_size=args.batch_size,
                                        shuffle=args.shuffle,
                                        single=args.single)
        test_gen = CombinedDataGenerator(loader,
                                         partition='test',
                                         fold=fold,
                                         batch_size=args.batch_size,
                                         shuffle=args.shuffle,
                                         single=args.single)

    if args.no_gen:
        # In-memory path: materialize the full training slice at once.
        x_train_list, y_train = train_gen.get_slice(size=train_gen.size,
                                                    single=args.single)
        model.fit(x_train_list,
                  y_train,
                  batch_size=args.batch_size,
                  epochs=args.epochs)
    else:
        # Generator path: pull everything from the feeder and convert each
        # input to a contiguous NumPy array as FlexFlow expects.
        x_train_list, y_train = train_gen.getall()
        x_train_list_np = []
        for x_train in x_train_list:
            x_train_np = np.ascontiguousarray(x_train.to_numpy())
            x_train_list_np.append(x_train_np)
        y_train_np = np.ascontiguousarray(y_train.to_numpy())
        y_train_np = np.reshape(y_train_np, (-1, 1))
        model.fit(x_train_list_np,
                  y_train_np,
                  batch_size=args.batch_size,
                  epochs=args.epochs)
def run(params):
    """Train the Uno combined drug-response model configured by ``params``.

    Depending on flags, this either exports the assembled dataset
    (``export_csv``/``export_data``) and returns early, or trains one
    model per cross-validation fold, writes combined fold predictions to
    ``<prefix>.predicted.tsv``, and evaluates each held-out test source.

    Note: ``params`` is mutated in place with the trainable-parameter
    counts computed for each fold's model.

    Returns the Keras ``History`` object of the last trained fold.
    """
    args = Struct(**params)
    set_seed(args.rng_seed)
    ext = extension_from_parameters(args)
    verify_path(args.save_path)
    prefix = args.save_path + ext
    logfile = args.logfile if args.logfile else prefix + '.log'
    set_up_logger(logfile, args.verbose)
    logger.info('Params: {}'.format(params))

    # Restrict TensorFlow to the requested GPUs, with memory growth (TF1 API).
    if (len(args.gpus) > 0):
        import tensorflow as tf
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = ",".join(map(str, args.gpus))
        K.set_session(tf.Session(config=config))

    # Assemble cell/drug feature matrices and the response table.
    loader = CombinedDataLoader(seed=args.rng_seed)
    loader.load(
        cache=args.cache,
        ncols=args.feature_subsample,
        agg_dose=args.agg_dose,
        cell_features=args.cell_features,
        drug_features=args.drug_features,
        drug_median_response_min=args.drug_median_response_min,
        drug_median_response_max=args.drug_median_response_max,
        use_landmark_genes=args.use_landmark_genes,
        use_filtered_genes=args.use_filtered_genes,
        cell_feature_subset_path=args.cell_feature_subset_path
        or args.feature_subset_path,
        drug_feature_subset_path=args.drug_feature_subset_path
        or args.feature_subset_path,
        preprocess_rnaseq=args.preprocess_rnaseq,
        single=args.single,
        train_sources=args.train_sources,
        test_sources=args.test_sources,
        embed_feature_source=not args.no_feature_source,
        encode_response_source=not args.no_response_source,
    )

    # Response column to predict: per-dose 'Growth' unless dose-aggregated.
    target = args.agg_dose or 'Growth'
    val_split = args.validation_split
    train_split = 1 - val_split

    # Export mode 1: dump the full train+val table to a TSV and return.
    if args.export_csv:
        fname = args.export_csv
        loader.partition_data(cv_folds=args.cv,
                              train_split=train_split,
                              val_split=val_split,
                              cell_types=args.cell_types,
                              by_cell=args.by_cell,
                              by_drug=args.by_drug,
                              cell_subset_path=args.cell_subset_path,
                              drug_subset_path=args.drug_subset_path)
        train_gen = CombinedDataGenerator(loader,
                                          batch_size=args.batch_size,
                                          shuffle=args.shuffle)
        val_gen = CombinedDataGenerator(loader,
                                        partition='val',
                                        batch_size=args.batch_size,
                                        shuffle=args.shuffle)

        x_train_list, y_train = train_gen.get_slice(size=train_gen.size,
                                                    dataframe=True,
                                                    single=args.single)
        x_val_list, y_val = val_gen.get_slice(size=val_gen.size,
                                              dataframe=True,
                                              single=args.single)
        # Response column first, then all feature frames, train above val.
        df_train = pd.concat([y_train] + x_train_list, axis=1)
        df_val = pd.concat([y_val] + x_val_list, axis=1)
        df = pd.concat([df_train, df_val]).reset_index(drop=True)
        if args.growth_bins > 1:
            df = uno_data.discretize(df, 'Growth', bins=args.growth_bins)
        df.to_csv(fname, sep='\t', index=False, float_format="%.3g")
        return

    # Export mode 2: stream batches into an HDF5 store and return.
    if args.export_data:
        fname = args.export_data
        loader.partition_data(cv_folds=args.cv,
                              train_split=train_split,
                              val_split=val_split,
                              cell_types=args.cell_types,
                              by_cell=args.by_cell,
                              by_drug=args.by_drug,
                              cell_subset_path=args.cell_subset_path,
                              drug_subset_path=args.drug_subset_path)
        train_gen = CombinedDataGenerator(loader,
                                          batch_size=args.batch_size,
                                          shuffle=args.shuffle)
        val_gen = CombinedDataGenerator(loader,
                                        partition='val',
                                        batch_size=args.batch_size,
                                        shuffle=args.shuffle)
        store = pd.HDFStore(fname, complevel=9, complib='blosc:snappy')

        # Minimum string-column widths for the HDF5 table format.
        config_min_itemsize = {'Sample': 30, 'Drug1': 10}
        if not args.single:
            config_min_itemsize['Drug2'] = 10

        for partition in ['train', 'val']:
            gen = train_gen if partition == 'train' else val_gen
            for i in range(gen.steps):
                x_list, y = gen.get_slice(size=args.batch_size,
                                          dataframe=True,
                                          single=args.single)

                for j, input_feature in enumerate(x_list):
                    input_feature.columns = [''] * len(input_feature.columns)
                    # NOTE(review): pandas HDFStore.append expects
                    # 'data_columns'; 'data_column' may be silently
                    # ignored -- verify against the DataFeeder reader.
                    store.append('x_{}_{}'.format(partition, j),
                                 input_feature.astype('float32'),
                                 format='table',
                                 data_column=True)
                store.append('y_{}'.format(partition),
                             y.astype({target: 'float32'}),
                             format='table',
                             data_column=True,
                             min_itemsize=config_min_itemsize)
                logger.info('Generating {} dataset. {} / {}'.format(
                    partition, i, gen.steps))
        store.close()
        logger.info('Completed generating {}'.format(fname))
        return

    # Normal training path: split into train/val(/test) partitions.
    loader.partition_data(cv_folds=args.cv,
                          train_split=train_split,
                          val_split=val_split,
                          cell_types=args.cell_types,
                          by_cell=args.by_cell,
                          by_drug=args.by_drug,
                          cell_subset_path=args.cell_subset_path,
                          drug_subset_path=args.drug_subset_path)

    model = build_model(loader, args)
    logger.info('Combined model:')
    model.summary(print_fn=logger.info)
    # plot_model(model, to_file=prefix+'.model.png', show_shapes=True)

    # Optionally save the architecture as JSON for later reloading.
    if args.cp:
        model_json = model.to_json()
        with open(prefix + '.model.json', 'w') as f:
            print(model_json, file=f)

    # Linear warmup over the first 5 epochs toward the target lr.
    # NOTE: closes over 'base_lr' and 'model', which are (re)assigned
    # inside the fold loop below before this callback can fire.
    def warmup_scheduler(epoch):
        lr = args.learning_rate or base_lr * args.batch_size / 100
        if epoch <= 5:
            K.set_value(model.optimizer.lr,
                        (base_lr * (5 - epoch) + lr * epoch) / 5)
        logger.debug('Epoch {}: lr={:.5g}'.format(
            epoch, K.get_value(model.optimizer.lr)))
        return K.get_value(model.optimizer.lr)

    df_pred_list = []

    cv_ext = ''
    cv = args.cv if args.cv > 1 else 1

    # Train one model per cross-validation fold.
    for fold in range(cv):
        if args.cv > 1:
            logger.info('Cross validation fold {}/{}:'.format(fold + 1, cv))
            cv_ext = '.cv{}'.format(fold + 1)

        # Fresh weights for each fold (optionally warm-started).
        template_model = build_model(loader, args, silent=True)
        if args.initial_weights:
            logger.info("Loading weights from {}".format(args.initial_weights))
            template_model.load_weights(args.initial_weights)

        if len(args.gpus) > 1:
            from keras.utils import multi_gpu_model
            gpu_count = len(args.gpus)
            logger.info("Multi GPU with {} gpus".format(gpu_count))
            model = multi_gpu_model(template_model,
                                    cpu_merge=False,
                                    gpus=gpu_count)
        else:
            model = template_model

        # Build the optimizer by name with default config, then override lr.
        optimizer = optimizers.deserialize({
            'class_name': args.optimizer,
            'config': {}
        })
        base_lr = args.base_lr or K.get_value(optimizer.lr)
        if args.learning_rate:
            K.set_value(optimizer.lr, args.learning_rate)

        model.compile(loss=args.loss, optimizer=optimizer, metrics=[mae, r2])

        # calculate trainable and non-trainable params
        params.update(candle.compute_trainable_params(model))

        candle_monitor = candle.CandleRemoteMonitor(params=params)
        timeout_monitor = candle.TerminateOnTimeOut(params['timeout'])

        reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                      factor=0.5,
                                      patience=5,
                                      min_lr=0.00001)
        warmup_lr = LearningRateScheduler(warmup_scheduler)
        checkpointer = MultiGPUCheckpoint(prefix + cv_ext + '.model.h5',
                                          save_best_only=True)
        tensorboard = TensorBoard(
            log_dir="tb/{}{}{}".format(args.tb_prefix, ext, cv_ext))
        history_logger = LoggingCallback(logger.debug)

        # Assemble the callback list according to the enabled flags.
        callbacks = [candle_monitor, timeout_monitor, history_logger]
        if args.reduce_lr:
            callbacks.append(reduce_lr)
        if args.warmup_lr:
            callbacks.append(warmup_lr)
        if args.cp:
            callbacks.append(checkpointer)
        if args.tb:
            callbacks.append(tensorboard)
        if args.save_weights:
            callbacks.append(
                SimpleWeightSaver(args.save_path + '/' + args.save_weights))

        # Data feeders: pre-exported HDF5 store or on-the-fly generators.
        if args.use_exported_data is not None:
            train_gen = DataFeeder(filename=args.use_exported_data,
                                   batch_size=args.batch_size,
                                   shuffle=args.shuffle,
                                   single=args.single,
                                   agg_dose=args.agg_dose)
            val_gen = DataFeeder(partition='val',
                                 filename=args.use_exported_data,
                                 batch_size=args.batch_size,
                                 shuffle=args.shuffle,
                                 single=args.single,
                                 agg_dose=args.agg_dose)
        else:
            train_gen = CombinedDataGenerator(loader,
                                              fold=fold,
                                              batch_size=args.batch_size,
                                              shuffle=args.shuffle,
                                              single=args.single)
            val_gen = CombinedDataGenerator(loader,
                                            partition='val',
                                            fold=fold,
                                            batch_size=args.batch_size,
                                            shuffle=args.shuffle,
                                            single=args.single)

        # Baseline: score y_val against a random permutation of itself.
        df_val = val_gen.get_response(copy=True)
        y_val = df_val[target].values
        y_shuf = np.random.permutation(y_val)
        log_evaluation(evaluate_prediction(y_val, y_shuf),
                       description='Between random pairs in y_val:')

        if args.no_gen:
            # In-memory path: materialize whole slices and fit directly.
            x_train_list, y_train = train_gen.get_slice(size=train_gen.size,
                                                        single=args.single)
            x_val_list, y_val = val_gen.get_slice(size=val_gen.size,
                                                  single=args.single)
            history = model.fit(x_train_list,
                                y_train,
                                batch_size=args.batch_size,
                                epochs=args.epochs,
                                callbacks=callbacks,
                                validation_data=(x_val_list, y_val))
        else:
            logger.info('Data points per epoch: train = %d, val = %d',
                        train_gen.size, val_gen.size)
            logger.info('Steps per epoch: train = %d, val = %d',
                        train_gen.steps, val_gen.steps)
            history = model.fit_generator(train_gen,
                                          train_gen.steps,
                                          epochs=args.epochs,
                                          callbacks=callbacks,
                                          validation_data=val_gen,
                                          validation_steps=val_gen.steps)

        if args.no_gen:
            y_val_pred = model.predict(x_val_list, batch_size=args.batch_size)
        else:
            # One extra step so the final partial batch is included,
            # then trim back to the true validation size.
            val_gen.reset()
            y_val_pred = model.predict_generator(val_gen, val_gen.steps + 1)
            y_val_pred = y_val_pred[:val_gen.size]

        y_val_pred = y_val_pred.flatten()

        scores = evaluate_prediction(y_val, y_val_pred)
        log_evaluation(scores)

        # df_val = df_val.assign(PredictedGrowth=y_val_pred, GrowthError=y_val_pred - y_val)
        df_val['Predicted' + target] = y_val_pred
        df_val[target + 'Error'] = y_val_pred - y_val
        df_pred_list.append(df_val)

        if hasattr(history, 'loss'):
            plot_history(prefix, history, 'loss')
        if hasattr(history, 'r2'):
            plot_history(prefix, history, 'r2')

    # Combine per-fold predictions, sort deterministically, and save.
    pred_fname = prefix + '.predicted.tsv'
    df_pred = pd.concat(df_pred_list)
    if args.agg_dose:
        if args.single:
            df_pred.sort_values(['Sample', 'Drug1', target], inplace=True)
        else:
            df_pred.sort_values(['Source', 'Sample', 'Drug1', 'Drug2', target],
                                inplace=True)
    else:
        if args.single:
            df_pred.sort_values(['Sample', 'Drug1', 'Dose1', 'Growth'],
                                inplace=True)
        else:
            df_pred.sort_values(
                ['Sample', 'Drug1', 'Drug2', 'Dose1', 'Dose2', 'Growth'],
                inplace=True)
    df_pred.to_csv(pred_fname, sep='\t', index=False, float_format='%.4g')

    if args.cv > 1:
        scores = evaluate_prediction(df_pred[target],
                                     df_pred['Predicted' + target])
        log_evaluation(scores, description='Combining cross validation folds:')

    # Evaluate the last fold's model on each held-out test source.
    for test_source in loader.test_sep_sources:
        test_gen = CombinedDataGenerator(loader,
                                         partition='test',
                                         batch_size=args.batch_size,
                                         source=test_source)
        df_test = test_gen.get_response(copy=True)
        y_test = df_test[target].values
        n_test = len(y_test)
        if n_test == 0:
            continue
        if args.no_gen:
            x_test_list, y_test = test_gen.get_slice(size=test_gen.size,
                                                     single=args.single)
            y_test_pred = model.predict(x_test_list,
                                        batch_size=args.batch_size)
        else:
            y_test_pred = model.predict_generator(
                test_gen.flow(single=args.single), test_gen.steps)
            y_test_pred = y_test_pred[:test_gen.size]
        y_test_pred = y_test_pred.flatten()
        scores = evaluate_prediction(y_test, y_test_pred)
        log_evaluation(scores,
                       description='Testing on data from {} ({})'.format(
                           test_source, n_test))

    # Free the TF graph/session and detach this run's log handlers.
    if K.backend() == 'tensorflow':
        K.clear_session()

    logger.handlers = []

    return history
# Exemplo n.º 3 (score: 0)
def main():
    """Load a trained Uno model and report repeated-prediction statistics.

    Runs ``args.n_pred`` repeated predictions over the requested data
    partition(s), writes every per-row prediction to ``uno_pred.all.tsv``,
    a per-row summary (mean/std/min/max across repeats) to ``uno_pred.tsv``,
    and prints aggregate mse/mae/r2/corr statistics across repeats.
    """
    parser = get_parser()
    args = parser.parse_args()

    candle.register_permanent_dropout()
    # Accept either an architecture JSON plus a separate weights file, or
    # a single self-contained model file.
    if args.model_file.split('.')[-1] == 'json':
        with open(args.model_file, 'r') as model_file:
            model_json = model_file.read()
            model = keras.models.model_from_json(model_json)
            model.load_weights(args.weights_file)
    else:
        model = keras.models.load_model(args.model_file)

    model.summary()

    cv_pred_list = []  # one concatenated prediction vector per repeat
    cv_y_list = []  # response frames, collected once on the first repeat
    df_pred_list = []
    cv_stats = {'mae': [], 'mse': [], 'r2': [], 'corr': []}
    # Response column to predict: per-dose 'Growth' unless dose-aggregated.
    target = args.agg_dose or 'Growth'

    for cv in range(args.n_pred):
        cv_pred = []
        if args.partition == 'all':
            dataset = ['train', 'val']
        else:
            dataset = [args.partition]
        for partition in dataset:
            test_gen = DataFeeder(filename=args.data,
                                  partition=partition,
                                  batch_size=1024,
                                  single=args.single,
                                  agg_dose=args.agg_dose)
            # One extra step so the final partial batch is included, then
            # trim back to the true dataset size.
            y_test_pred = model.predict_generator(test_gen, test_gen.steps + 1)
            y_test_pred = y_test_pred[:test_gen.size]
            y_test_pred = y_test_pred.flatten()

            df_y = test_gen.get_response(copy=True)
            y_test = df_y[target].values

            df_pred = df_y.assign(
                **{
                    f'Predicted{target}': y_test_pred,
                    f'{target}Error': y_test_pred - y_test
                })
            df_pred_list.append(df_pred)
            test_gen.close()

            if cv == 0:
                cv_y_list.append(df_y)
            cv_pred.append(y_test_pred)
        cv_pred_list.append(np.concatenate(cv_pred))

        # Calculate mse/mae/r2/corr for this repeat.
        # NOTE(review): df_pred here is only the LAST partition's frame, so
        # with partition == 'all' these stats exclude 'train' -- confirm
        # whether that is intended.
        scores = evaluate_prediction(df_pred[target],
                                     df_pred[f'Predicted{target}'])
        # log_evaluation(scores, description=cv)
        for key, value in scores.items():
            cv_stats[key].append(value)

    df_pred = pd.concat(df_pred_list)
    cv_y = pd.concat(cv_y_list)

    # Sort key mirrors the identifier columns present in the exported data.
    headers = ['Sample', 'Drug1']
    if not args.single:
        headers.append('Drug2')
    if not args.agg_dose:
        headers.append('Dose1')
    if not args.single and not args.agg_dose:
        headers.append('Dose2')
    headers.append(target)

    df_pred.sort_values(headers, inplace=True)
    df_pred.to_csv('uno_pred.all.tsv',
                   sep='\t',
                   index=False,
                   float_format='%.6g')

    # Per-row summary across repeats; copy so cv_y itself stays untouched.
    df_sum = cv_y.copy()
    df_sum[f'Pred{target}Mean'] = np.mean(cv_pred_list, axis=0)
    df_sum[f'Pred{target}Std'] = np.std(cv_pred_list, axis=0)
    df_sum[f'Pred{target}Min'] = np.min(cv_pred_list, axis=0)
    df_sum[f'Pred{target}Max'] = np.max(cv_pred_list, axis=0)

    df_sum.to_csv('uno_pred.tsv', index=False, sep='\t', float_format='%.6g')

    # Overall evaluation across all repeats and partitions.
    scores = evaluate_prediction(df_pred[f'{target}'],
                                 df_pred[f'Predicted{target}'])
    log_evaluation(
        scores,
        description='Testing on data from {} on partition {} ({} rows)'.format(
            args.data, args.partition, len(cv_y)))

    print('     mean    std    min    max')
    for key in ['mse', 'mae', 'r2', 'corr']:
        print('{}: {:.4f}, {:.4f}, {:.4f}, {:.4f}'.format(
            key, np.around(np.mean(cv_stats[key], axis=0), decimals=4),
            np.around(np.std(cv_stats[key], axis=0), decimals=4),
            np.around(np.min(cv_stats[key], axis=0), decimals=4),
            np.around(np.max(cv_stats[key], axis=0), decimals=4)))