Example 1
def run_kxsmfb_experiment(tgt_gis, src_gis, sim_scores, L_tgt, L_src,
                          space, val_hf, test_hf,
                          n_repeats, hp_iters, hp_seed):
    all_results = []
    all_params  = []
    all_models = []
    log = get_logger()
    param_search_training_curves = []
    hp_trials = []

    src_X_scaled = MCScaler(mode='std').fit_transform(src_gis['values'])
    for i in range(n_repeats):
        log.info('[Outer fold: %i]' % i)
        scaler = MCScaler(mode='std')
        X_train, X_test, eval_mask = gi_train_test_split(tgt_gis, test_hf)
        X_train = scaler.fit_transform(X_train)
        X_train_all = X_train.copy()
        
        tgt_gis['values'] = X_train
        log.info('- Holding out %.3f fraction of data for validation' % val_hf)
        X_train, X_val, _ = gi_train_test_split(tgt_gis, val_hf)
        
        log.info('- Performing hyperparameter search for %i iterations' % hp_iters)
        trials = hyperopt.Trials()
        # NB: ignore the parameters returned by hyperopt; we want to know which
        # parameters were used even when they were default values for keyword arguments
        _ = hyperopt.fmin(fn=get_kxsmfb_obj(X_train, X_val, src_X_scaled, sim_scores, L_tgt, L_src), 
                        space=space, 
                        algo=hyperopt.tpe.suggest,
                        max_evals = hp_iters,
                        trials=trials,
                        show_progressbar=True,
                        rstate=np.random.RandomState(hp_seed))
        # NB: random state of hyperopt cannot be set globally, so we pass a 
        # np.RandomState object for reproducibility...
        hp_seed += 1
        best_trial = trials.best_trial['result']

        # NB: the parameter dictionary in trial['params'] is specified *explicitly*,
        # so retraining new models uses *no optional arguments*.
        # This makes reporting and retraining easy and unambiguous
        best_params = best_trial['params']

        # TODO:
        # # It's too fussy to serialize the models and save them as attachments in hyperopt.
        # # Instead, we retrain a model and compute training curves.
        # log.info('- Retraining model with validation data to get training curve')
        # training_curve = compute_training_curve(retrain_model, X_train, X_val, best_params, 
        #                                         fit_params=fit_params)
        # param_search_training_curves.append(training_curve)

        # Retrain model using the number of iterations and parameters found in hp search
        log.info('- Retraining model without validation to get best model')
        best_model = KXSMF_b(X_tgt=X_train_all, X_val=None, X_src=src_X_scaled,  
                            sim_scores=sim_scores, L_tgt=L_tgt, L_src=L_src,
                            **best_params)
       
        best_model.fit()

        # Make predictions and evaluate the model
        X_fitted = best_model.X_fitted
        X_fitted = scaler.inverse_transform(X_fitted)

        if len(tgt_gis['rows']) == len(tgt_gis['cols']) and np.all(tgt_gis['rows'] == tgt_gis['cols']):
            log.info('* Averaging over pairs because input is symmetric')
            X_fitted = (X_fitted.T + X_fitted) / 2.
        
        results = evaluate_model(X_test[eval_mask], X_fitted[eval_mask])
        log.info('[Results for fold %i]' % i)
        log.info('- Best params for model')
        log_dict(log.info, best_params)
        log.info('- Results:')
        log_dict(log.info, results)
        
        hp_trials.append(trials.results)
        all_results.append(results)
        all_params.append(best_params)
        all_models.append(best_model)
        
    # Collate the results and return
    summarized, collected = summarize_results(all_results)
    return dict(summary=summarized, 
                fold_results=collected, 
                best_params=all_params), \
           all_models, \
           param_search_training_curves, \
           hp_trials
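
The pattern above depends on the objective echoing back its fully resolved parameter dictionary, so that trials.best_trial['result']['params'] can be read directly. Below is a minimal sketch of what get_kxsmfb_obj could look like under that contract; the val_loss attribute and the training details are assumptions for illustration, not the project's actual implementation.

import hyperopt

def get_kxsmfb_obj_sketch(X_train, X_val, X_src, sim_scores, L_tgt, L_src):
    def objective(params):
        # Train a candidate model on the training split; the validation data
        # is passed so the model can report a held-out loss.
        model = KXSMF_b(X_tgt=X_train, X_val=X_val, X_src=X_src,
                        sim_scores=sim_scores, L_tgt=L_tgt, L_src=L_src,
                        **params)
        model.fit()
        # Echo the *explicit* params so best_trial['result']['params'] records
        # every value actually used, including keyword-argument defaults.
        return {'loss': model.val_loss,  # assumed attribute
                'status': hyperopt.STATUS_OK,
                'params': params}
    return objective
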
Example 2
def main():
    args = parse_args()
    setup_logging(args.logfile)

    log = get_logger()
    assert 0 <= args.hidden_fraction <= 1
    
    np.random.seed(args.random_seed)
    tf.set_random_seed(args.random_seed)

    log.info('*' * 100)
    log.info('[Starting MC experiment]')
    log_dict(log.info, vars(args))

    log.info('[Loading input data]')

    with open(args.target_gis, 'rb') as f:
        gi_data = cpkl.load(f)

    row_genes = gi_data['rows']

    log.info('\t- setting up training and test sets')
    train_test_sets = [gi_train_test_split(gi_data, args.hidden_fraction) for _ in range(args.n_repeats)]
    
    train_Xs, test_Xs, test_masks = zip(*train_test_sets)
    if args.mc_alg == 'NGMC':
        scalers = [MCScaler('0-1') for _ in range(args.n_repeats)]
    else:
        scalers = [MCScaler('std') for _ in range(args.n_repeats)]

    train_Xs = [scaler.fit_transform(X) for scaler, X in zip(scalers, train_Xs)]

    if args.mc_alg == 'PMF':
        imputed_Xs, models_info = train_pmf_models(train_Xs = train_Xs,
                                                   rank = args.rank,
                                                   iters = args.iters,
                                                   lr = args.lr,
                                                   lam = args.lambda_f,
                                                   report_every = args.report_every)
    elif args.mc_alg == 'PMF_b':
        imputed_Xs, models_info = train_pmf_b_models(train_Xs = train_Xs,
                                                   rank = args.rank,
                                                   iters = args.iters,
                                                   lr = args.lr,
                                                   lam = args.lambda_f,
                                                   lam_b = args.lambda_b,
                                                   report_every = args.report_every)
    elif args.mc_alg == 'KPMF':
        L = get_laplacian(list(row_genes), args.target_ppi)
        imputed_Xs, models_info = train_kpmf_models(train_Xs = train_Xs,
                                                    L = L,
                                                    rank = args.rank,
                                                    iters = args.iters,
                                                    lr = args.lr,
                                                    lambda_f = args.lambda_f,
                                                    lambda_h = args.lambda_h,
                                                    rl_lambda = args.rl_lambda,
                                                    report_every = args.report_every)
    elif args.mc_alg == 'KPMF_b':
        L = get_laplacian(list(row_genes), args.target_ppi)
        imputed_Xs, models_info = train_kpmf_b_models(train_Xs = train_Xs,
                                                    L = L,
                                                    rank = args.rank,
                                                    iters = args.iters,
                                                    lr = args.lr,
                                                    lambda_b = args.lambda_b,
                                                    lambda_f = args.lambda_f,
                                                    lambda_h = args.lambda_h,
                                                    rl_lambda = args.rl_lambda,
                                                    report_every = args.report_every)
    elif args.mc_alg == 'NGMC':
        ppi = nx.read_edgelist(args.target_ppi)
        A = get_ppi_data(list(row_genes), ppi, mode='normalized_adjacency')
        imputed_Xs, models_info = train_ngmc_models(train_Xs = train_Xs,
                                                    A = A,
                                                    rank = args.rank,
                                                    iters = args.iters,
                                                    lr = args.lr,
                                                    alpha_p = args.alpha_p,
                                                    lambda_f = args.lambda_f,
                                                    lambda_h = args.lambda_h,
                                                    lambda_p = args.lambda_p)
    elif args.mc_alg == 'XSMF':
        with open(args.source_gis, 'rb') as f:
            src_gi_data = cpkl.load(f)
        X_src = src_gi_data['values']
        X_src = MCScaler(mode='std').fit_transform(X_src)

        log.info('[Loading sim scores]')
        with open(args.sim_scores, 'rb') as f:
            sim_scores_data = cpkl.load(f)
        sim_scores = sim_scores_data['values']
        sim_scores = sim_scores / np.max(sim_scores) # Normalize

        imputed_Xs, models_info = train_xsmf_models(train_Xs = train_Xs,
                                                    X_src = X_src,
                                                    sim_scores=sim_scores,
                                                    rank = args.rank,
                                                    iters = args.iters,
                                                    lr = args.lr,
                                                    lambda_sim = args.lambda_sim,
                                                    lambda_src = args.lambda_src,
                                                    lambda_u = args.lambda_u,
                                                    lambda_v = args.lambda_v,
                                                    lambda_us = args.lambda_us,
                                                    lambda_vs = args.lambda_vs,
                                                    report_every = args.report_every)
    elif args.mc_alg == 'KXSMF':
        with open(args.source_gis, 'rb') as f:
            src_gi_data = cpkl.load(f)
        X_src = src_gi_data['values']
        X_src = MCScaler(mode='std').fit_transform(X_src)

        log.info('[Loading sim scores]')
        with open(args.sim_scores, 'rb') as f:
            sim_scores_data = cpkl.load(f)
        sim_scores = sim_scores_data['values']
        sim_scores = sim_scores / np.max(sim_scores) # Normalize

        L_tgt = get_laplacian(list(gi_data['rows']), args.target_ppi)
        L_src = get_laplacian(list(src_gi_data['rows']), args.source_ppi)
        log.info('L_src shape: %s, %s' % L_src.shape)
        log.info('X_src shape: %s, %s' % X_src.shape)

        imputed_Xs, models_info = train_kxsmf_models(train_Xs = train_Xs,
                                                    X_src = X_src,
                                                    L_tgt=L_tgt,
                                                    L_src=L_src,
                                                    sim_scores=sim_scores,
                                                    rank = args.rank,
                                                    iters = args.iters,
                                                    lr = args.lr,
                                                    lambda_sim = args.lambda_sim,
                                                    lambda_src = args.lambda_src,
                                                    lambda_u = args.lambda_u,
                                                    lambda_v = args.lambda_v,
                                                    lambda_us = args.lambda_us,
                                                    lambda_vs = args.lambda_vs,
                                                    lambda_tgt_rl = args.lambda_tgt_rl,
                                                    lambda_src_rl = args.lambda_src_rl,
                                                    report_every = args.report_every)
    elif args.mc_alg == 'KXSMF_b':
        with open(args.source_gis, 'rb') as f:
            src_gi_data = cpkl.load(f)
        X_src = src_gi_data['values']
        X_src = MCScaler(mode='std').fit_transform(X_src)

        log.info('[Loading sim scores]')
        with open(args.sim_scores, 'rb') as f:
            sim_scores_data = cpkl.load(f)
        sim_scores = sim_scores_data['values']
        sim_scores = sim_scores / np.max(sim_scores) # Normalize

        L_tgt = get_laplacian(list(gi_data['rows']), args.target_ppi)
        L_src = get_laplacian(list(src_gi_data['rows']), args.source_ppi)
        log.info('L_src shape: %s, %s' % L_src.shape)
        log.info('X_src shape: %s, %s' % X_src.shape)

        imputed_Xs, models_info = train_kxsmfb_models(train_Xs = train_Xs,
                                                    X_src = X_src,
                                                    L_tgt=L_tgt,
                                                    L_src=L_src,
                                                    sim_scores=sim_scores,
                                                    rank = args.rank,
                                                    iters = args.iters,
                                                    lr = args.lr,
                                                    lambda_b= args.lambda_b,
                                                    lambda_sim = args.lambda_sim,
                                                    lambda_src = args.lambda_src,
                                                    lambda_u = args.lambda_u,
                                                    lambda_v = args.lambda_v,
                                                    lambda_us = args.lambda_us,
                                                    lambda_vs = args.lambda_vs,
                                                    lambda_tgt_rl = args.lambda_tgt_rl,
                                                    lambda_src_rl = args.lambda_src_rl,
                                                    report_every = args.report_every)
    else:
        raise NotImplementedError('Unknown mc_alg: %s' % args.mc_alg)
    
    imputed_Xs = [scaler.inverse_transform(X) for scaler, X in zip(scalers, imputed_Xs)] # Take transposes here for XSMF, KXSMF

    results = evaluate_preds(test_Xs, imputed_Xs, test_masks)
    results, fold_results = summarize_results(results)
    log_results(results)

    results_dict = dict(summary=results, collected=fold_results, args=vars(args))

    pvals_data = None
    if args.pval_file:
        # Evaluate separately on significant interactions when a p-value file is given
        with open(args.pval_file, 'rb') as f:
            pvals_data = cpkl.load(f)
        assert(np.all(pvals_data['cols'] == gi_data['cols']))
        assert(np.all(pvals_data['rows'] == gi_data['rows']))

        pvals = pvals_data['values']
        pvals_filled = np.where(np.isnan(pvals), 1000, pvals)  # NaN p-values can never pass the threshold
        sig_mask = pvals_filled < args.pval_thresh

        sig_test_Xs = [np.where(sig_mask, _X, np.nan) for _X in test_Xs]
        sig_imputed_Xs = [np.where(sig_mask, _X, np.nan) for _X in imputed_Xs]

        sig_results = evaluate_preds(sig_test_Xs, sig_imputed_Xs, test_masks)
        sig_results, sig_fold_results = summarize_results(sig_results)
        log_results(sig_results)

        results_dict['sig_summary'] = sig_results
        results_dict['sig_collected'] = sig_fold_results

    with open(args.results_output, 'w') as f:
        json.dump(results_dict, f, indent=2)

    serialized_data = {
        'GIs': gi_data,
        'alg': args.mc_alg,
        'fold_data': dict(train_Xs=train_Xs, test_Xs=test_Xs, masks=test_masks),
        'imputed_Xs': imputed_Xs,
        'models_info': models_info,
        'pvals': pvals_data
    }

    with open(args.models_output, 'wb') as f:
        cpkl.dump(serialized_data, f)
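
main() relies on a simple scaling contract: each fold's scaler is fit only on that fold's training matrix, and the imputed matrix is mapped back through inverse_transform before evaluation. A minimal NaN-aware sketch of the assumed MCScaler(mode='std') semantics follows (the real class may differ, e.g. in the '0-1' mode used for NGMC):

import numpy as np

class StdScalerSketch:
    """NaN-aware standardizer approximating MCScaler(mode='std') (assumed semantics)."""

    def fit_transform(self, X):
        # Statistics ignore the held-out (NaN) entries.
        self.mu = np.nanmean(X)
        self.sigma = np.nanstd(X)
        return (X - self.mu) / self.sigma

    def inverse_transform(self, X):
        # Map model output back to the original measurement scale.
        return X * self.sigma + self.mu
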
Example 3
def run_mc_alg(gis,
               fold_objective,
               retrain_model,
               space,
               scaler,
               val_hidden_fraction,
               hidden_fraction,
               fit_params=None,
               train_with_validation=False,
               n_repeats=1,
               hyperopt_iters=3,
               hyperopt_seed=None):

    all_results = []
    all_params = []
    all_models = []
    log = get_logger()
    param_search_training_curves = []
    hp_trials = []

    for i in range(n_repeats):
        log.info('[Outer fold: %i]' % i)
        X_train, X_test, test_mask = gi_train_test_split(gis, hidden_fraction)
        X_train = scaler.fit_transform(X_train)
        X_train_all = X_train.copy()
        if train_with_validation:
            gis['values'] = X_train_all
            log.info('- Holding out %f fraction of data for validation' %
                     val_hidden_fraction)
            X_train, X_val, _ = gi_train_test_split(gis, val_hidden_fraction)
        else:
            # Without a validation split, the objective receives X_val=None
            X_val = None
        log.info('- Performing hyperparameter search for %i iterations' %
                 hyperopt_iters)
        trials = hyperopt.Trials()

        # NB: ignore the parameters returned by hyperopt; we want to know which
        # parameters were used even when they were default values for keyword arguments
        _ = hyperopt.fmin(fn=fold_objective(X_train,
                                            X_val,
                                            fit_params=fit_params),
                          space=space,
                          algo=hyperopt.tpe.suggest,
                          max_evals=hyperopt_iters,
                          trials=trials,
                          show_progressbar=True,
                          rstate=np.random.RandomState(hyperopt_seed))

        # NB: random state of hyperopt cannot be set globally, so we pass a
        # np.RandomState object for reproducibility...
        if hyperopt_seed is not None:
            hyperopt_seed += 1
        best_trial = trials.best_trial['result']

        # NB: the parameter dictionary in trial['params'] is specified *explicitly*,
        # so retraining new models uses *no optional arguments*.
        # This makes reporting and retraining easy and unambiguous
        best_params = best_trial['params']

        # It's too fussy to serialize the models and save them as attachments in hyperopt.
        # Instead, we retrain a model and compute training curves.
        log.info(
            '- Retraining model with validation data to get training curve')
        training_curve = compute_training_curve(retrain_model,
                                                X_train,
                                                X_val,
                                                best_params,
                                                fit_params=fit_params)
        param_search_training_curves.append(training_curve)

        # Retrain model using the number of iterations and parameters found in hp search
        log.info('- Retraining model without validation to get best model')
        best_model = retrain_model(X_train_all,
                                   best_params,
                                   fit_params=fit_params)

        # Make predictions and evaluate the model
        X_fitted = best_model.X_fitted
        X_fitted = scaler.inverse_transform(X_fitted)

        if X_train.shape == X_train.T.shape and np.allclose(
                X_train, X_train.T, equal_nan=True):
            log.info('- Data was symmetric, averaging predictions...')
            X_fitted = (X_fitted.T + X_fitted) / 2.

        results = evaluate_model(X_test[test_mask], X_fitted[test_mask])
        log.info('[Results for fold %i]' % i)
        log.info('- Best params for model')
        log_dict(log.info, best_params)
        log.info('- Results:')
        log_dict(log.info, results)

        hp_trials.append(trials.results)
        all_results.append(results)
        all_params.append(best_params)
        all_models.append(best_model)

    # Collate the results and return
    summarized, collected = summarize_results(all_results)
    return dict(summary=summarized,
                fold_results=collected,
                best_params=all_params), \
           all_models, \
           param_search_training_curves, \
           hp_trials
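
run_mc_alg is generic over two callables: fold_objective(X_train, X_val, fit_params=...) must return a hyperopt objective, and retrain_model(X, params, fit_params=...) must return a fitted model exposing an X_fitted attribute. A hedged sketch of a conforming retrain_model follows, with a hypothetical PMF constructor standing in for any concrete model:

def retrain_model_sketch(X_train, best_params, fit_params=None):
    # Contract assumed by run_mc_alg: return an object whose X_fitted
    # attribute holds the completed matrix in the scaled space.
    # PMF and its constructor signature are hypothetical placeholders.
    fit_params = fit_params or {}
    model = PMF(X=X_train, **best_params)
    model.fit(**fit_params)
    return model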