Exemplo n.º 1
0
def main():
    """Run the XSMF experiment described by the command-line arguments.

    Loads the pickled target GIs, source GIs and similarity scores, divides
    the similarity scores by their maximum, and passes everything to
    ``run_xsmf_experiment`` together with the search space returned by
    ``xsmf_param_space``. The results are written as JSON to
    ``args.results_output``; the training curves and trials are pickled to
    their respective output paths.
    """
    def _read_pickle(path):
        # Read one pickled object from ``path``.
        with open(path, 'rb') as handle:
            return cpkl.load(handle)

    def _write_pickle(path, payload):
        # Pickle ``payload`` into ``path``.
        with open(path, 'wb') as handle:
            cpkl.dump(payload, handle)

    args = parse_args()
    setup_logging(args.logfile)
    log = get_logger()

    assert 0 <= args.hidden_fraction <= 1

    # Seed both NumPy and TensorFlow with the same user-supplied seed.
    np.random.seed(args.random_seed)
    tf.set_random_seed(args.random_seed)

    log.info('*' * 100)
    log.info('[Starting MC experiment]')
    log_dict(log.info, vars(args))

    log.info('[Loading target GIs]')
    tgt_gis = _read_pickle(args.target_gis)

    log.info('[Loading source GIs]')
    src_gis = _read_pickle(args.source_gis)

    log.info('[Loading sim scores]')
    raw_scores = _read_pickle(args.sim_scores)['values']
    sim_scores = raw_scores / np.max(raw_scores)  # scale so the largest score is 1

    search_space = xsmf_param_space(args)
    experiment_output = run_xsmf_experiment(
        tgt_gis=tgt_gis,
        src_gis=src_gis,
        space=search_space,
        sim_scores=sim_scores,
        val_hf=args.val_hidden_fraction,
        test_hf=args.hidden_fraction,
        n_repeats=args.n_repeats,
        hp_iters=args.n_hyperopt_iters,
        hp_seed=args.random_seed,
    )
    results, models, training_curves, trials = experiment_output

    # Persist the results and the accompanying artefacts.
    log_results(results['summary'])
    with open(args.results_output, 'w') as handle:
        json.dump(results, handle, indent=2)

    _write_pickle(args.training_curve_output, training_curves)

    # TODO: the models cannot be pickled at the moment, so the trials are
    # written to the models output path as a stand-in. Saving the models
    # themselves will need to-dict / from-dict methods.
    _write_pickle(args.models_output, trials)

    _write_pickle(args.trials_output, trials)
Exemplo n.º 2
0
def _load_pickle(path):
    """Return the object pickled in the file at ``path``.

    NOTE: unpickling can execute arbitrary code; only point this at trusted
    files.
    """
    with open(path, 'rb') as f:
        return cpkl.load(f)


def _load_source_inputs(args, log):
    """Load the source-species inputs shared by the XSMF-family algorithms.

    Returns:
        A tuple ``(src_gi_data, X_src, sim_scores)``: the unpickled source GI
        data, its ``'values'`` entry after ``MCScaler(mode='std')`` scaling,
        and the similarity scores divided by their maximum.
    """
    src_gi_data = _load_pickle(args.source_gis)
    X_src = src_gi_data['values']
    X_src = MCScaler(mode='std').fit_transform(X_src)

    log.info('[Loading sim scores]')
    sim_scores = _load_pickle(args.sim_scores)['values']
    sim_scores = sim_scores / np.max(sim_scores)  # Normalize
    return src_gi_data, X_src, sim_scores


def _load_kernelized_inputs(args, log, row_genes):
    """Load the source inputs plus both Laplacians for KXSMF / KXSMF_b.

    Returns:
        A tuple ``(X_src, sim_scores, L_tgt, L_src)``.
    """
    src_gi_data, X_src, sim_scores = _load_source_inputs(args, log)
    L_tgt = get_laplacian(list(row_genes), args.target_ppi)
    L_src = get_laplacian(list(src_gi_data['rows']), args.source_ppi)
    # Shapes are logged so a mismatch between the two is easy to spot.
    log.warning('%s, %s' % L_src.shape)
    log.warning('%s, %s' % X_src.shape)
    return X_src, sim_scores, L_tgt, L_src


def _xsmf_common_kwargs(args):
    """Return the keyword arguments shared by all XSMF-family trainers."""
    return dict(rank=args.rank,
                iters=args.iters,
                lr=args.lr,
                lambda_sim=args.lambda_sim,
                lambda_src=args.lambda_src,
                lambda_u=args.lambda_u,
                lambda_v=args.lambda_v,
                lambda_us=args.lambda_us,
                lambda_vs=args.lambda_vs,
                report_every=args.report_every)


def main():
    """Train and evaluate one matrix-completion algorithm on repeated splits.

    Parses the command line, creates ``args.n_repeats`` train/test splits of
    the target GI data with ``gi_train_test_split``, scales each training
    matrix, trains the models selected by ``args.mc_alg``, maps the imputed
    matrices back through the scalers and evaluates them. When
    ``args.pval_file`` is given, the evaluation is repeated on the entries
    whose p-value is below ``args.pval_thresh``. The result summary is written
    as JSON to ``args.results_output``; fold data, imputed matrices and model
    info are pickled to ``args.models_output``.

    Raises:
        NotImplementedError: if ``args.mc_alg`` is not a handled algorithm.
    """
    args = parse_args()
    setup_logging(args.logfile)

    log = get_logger()
    assert 0 <= args.hidden_fraction <= 1

    np.random.seed(args.random_seed)
    tf.set_random_seed(args.random_seed)

    log.info('*' * 100)
    log.info('[Starting MC experiment]')
    log_dict(log.info, vars(args))

    log.info('[Loading input data]')
    gi_data = _load_pickle(args.target_gis)
    row_genes = gi_data['rows']

    log.info('\t- setting up training and test sets')
    train_test_sets = [gi_train_test_split(gi_data, args.hidden_fraction)
                       for _ in range(args.n_repeats)]
    train_Xs, test_Xs, test_masks = zip(*train_test_sets)

    # NGMC is trained on 0-1 scaled data; every other algorithm uses 'std'.
    scaler_mode = '0-1' if args.mc_alg == 'NGMC' else 'std'
    scalers = [MCScaler(scaler_mode) for _ in range(args.n_repeats)]
    train_Xs = [scaler.fit_transform(X)
                for scaler, X in zip(scalers, train_Xs)]

    if args.mc_alg == 'PMF':
        imputed_Xs, models_info = train_pmf_models(
            train_Xs=train_Xs,
            rank=args.rank,
            iters=args.iters,
            lr=args.lr,
            lam=args.lambda_f,
            report_every=args.report_every)
    elif args.mc_alg == 'PMF_b':
        imputed_Xs, models_info = train_pmf_b_models(
            train_Xs=train_Xs,
            rank=args.rank,
            iters=args.iters,
            lr=args.lr,
            lam=args.lambda_f,
            lam_b=args.lambda_b,
            report_every=args.report_every)
    elif args.mc_alg == 'KPMF':
        L = get_laplacian(list(row_genes), args.target_ppi)
        imputed_Xs, models_info = train_kpmf_models(
            train_Xs=train_Xs,
            L=L,
            rank=args.rank,
            iters=args.iters,
            lr=args.lr,
            lambda_f=args.lambda_f,
            lambda_h=args.lambda_h,
            rl_lambda=args.rl_lambda,
            report_every=args.report_every)
    elif args.mc_alg == 'KPMF_b':
        L = get_laplacian(list(row_genes), args.target_ppi)
        imputed_Xs, models_info = train_kpmf_b_models(
            train_Xs=train_Xs,
            L=L,
            rank=args.rank,
            iters=args.iters,
            lr=args.lr,
            lambda_b=args.lambda_b,
            lambda_f=args.lambda_f,
            lambda_h=args.lambda_h,
            rl_lambda=args.rl_lambda,
            report_every=args.report_every)
    elif args.mc_alg == 'NGMC':
        ppi = nx.read_edgelist(args.target_ppi)
        A = get_ppi_data(list(row_genes), ppi, mode='normalized_adjacency')
        imputed_Xs, models_info = train_ngmc_models(
            train_Xs=train_Xs,
            A=A,
            rank=args.rank,
            iters=args.iters,
            lr=args.lr,
            alpha_p=args.alpha_p,
            lambda_f=args.lambda_f,
            lambda_h=args.lambda_h,
            lambda_p=args.lambda_p)
    elif args.mc_alg == 'XSMF':
        _, X_src, sim_scores = _load_source_inputs(args, log)
        imputed_Xs, models_info = train_xsmf_models(
            train_Xs=train_Xs,
            X_src=X_src,
            sim_scores=sim_scores,
            **_xsmf_common_kwargs(args))
    elif args.mc_alg == 'KXSMF':
        X_src, sim_scores, L_tgt, L_src = _load_kernelized_inputs(
            args, log, row_genes)
        imputed_Xs, models_info = train_kxsmf_models(
            train_Xs=train_Xs,
            X_src=X_src,
            L_tgt=L_tgt,
            L_src=L_src,
            sim_scores=sim_scores,
            lambda_tgt_rl=args.lambda_tgt_rl,
            lambda_src_rl=args.lambda_src_rl,
            **_xsmf_common_kwargs(args))
    elif args.mc_alg == 'KXSMF_b':
        X_src, sim_scores, L_tgt, L_src = _load_kernelized_inputs(
            args, log, row_genes)
        imputed_Xs, models_info = train_kxsmfb_models(
            train_Xs=train_Xs,
            X_src=X_src,
            L_tgt=L_tgt,
            L_src=L_src,
            sim_scores=sim_scores,
            lambda_b=args.lambda_b,
            lambda_tgt_rl=args.lambda_tgt_rl,
            lambda_src_rl=args.lambda_src_rl,
            **_xsmf_common_kwargs(args))
    else:
        raise NotImplementedError(
            '{} option is invalid or not implemented'.format(args.mc_alg))

    # Undo the per-fold scaling so predictions are on the original scale.
    # Take transposes here for XSMF, KXSMF
    imputed_Xs = [scaler.inverse_transform(X)
                  for scaler, X in zip(scalers, imputed_Xs)]

    results = evaluate_preds(test_Xs, imputed_Xs, test_masks)
    results, fold_results = summarize_results(results)
    log_results(results)

    results_dict = dict(summary=results, collected=fold_results,
                        args=vars(args))

    pvals_data = None
    if args.pval_file:
        # A p-value file was given: also evaluate on significant entries only.
        pvals_data = _load_pickle(args.pval_file)
        assert np.all(pvals_data['cols'] == gi_data['cols'])
        assert np.all(pvals_data['rows'] == gi_data['rows'])

        pvals = pvals_data['values']
        # Missing p-values are replaced by 1000 so they never pass the
        # threshold comparison below.
        pvals_filled = np.where(np.isnan(pvals), 1000, pvals)
        sig_mask = pvals_filled < args.pval_thresh

        sig_test_Xs = [np.where(sig_mask, _X, np.nan) for _X in test_Xs]
        sig_imputed_Xs = [np.where(sig_mask, _X, np.nan)
                          for _X in imputed_Xs]

        sig_results = evaluate_preds(sig_test_Xs, sig_imputed_Xs, test_masks)
        sig_results, sig_fold_results = summarize_results(sig_results)
        log_results(sig_results)

        results_dict['sig_summary'] = sig_results
        results_dict['sig_collected'] = sig_fold_results

    with open(args.results_output, 'w') as f:
        json.dump(results_dict, f, indent=2)

    serialized_data = {
        'GIs': gi_data,
        'alg': args.mc_alg,
        'fold_data': dict(train_Xs=train_Xs, test_Xs=test_Xs,
                          masks=test_masks),
        'imputed_Xs': imputed_Xs,
        'models_info': models_info,
        'pvals': pvals_data,
    }

    with open(args.models_output, 'wb') as f:
        cpkl.dump(serialized_data, f)
Exemplo n.º 3
0
def main():
    """Run the experiment protocol for the algorithm named by ``args.mc_alg``.

    Loads the pickled input object from ``args.input_file``, selects the
    parameter space and experiment runner for the requested algorithm
    (building the PPI-derived matrix for KPMF, KPMF_b and NGMC), runs the
    experiment, and saves the outputs: results as JSON to
    ``args.results_output``, training curves and trials as pickles.

    Raises:
        ValueError: if a PPI-dependent algorithm is requested but ``args.ppi``
            is None.
        NotImplementedError: if ``args.mc_alg`` is not a handled algorithm.
    """
    args = parse_args()
    setup_logging(args.logfile)

    log = get_logger()

    assert (0 <= args.hidden_fraction <= 1)

    np.random.seed(args.random_seed)
    tf.set_random_seed(args.random_seed)
    log.info('*' * 100)
    log.info('[Starting MC experiment]')
    log_dict(log.info, vars(args))
    log.info('[Loading input data]')

    # NOTE: unpickling can execute arbitrary code; only use trusted input files.
    with open(args.input_file, 'rb') as f:
        obj = cpkl.load(f)

    # Set up experiments
    fit_params = None
    if args.mc_alg == 'PMF':
        param_space = pmf_param_space(args)
        run_experiment = run_pmf
    elif args.mc_alg == 'PMF_b':
        param_space = pmfb_param_space(args)
        run_experiment = run_pmfb
    elif args.mc_alg in ('KPMF', 'NGMC', 'KPMF_b'):
        # Experiments that need PPI network. Fail early with a clear message
        # instead of hitting an unbound ``ppi`` variable further down.
        if args.ppi is None:
            raise ValueError(
                '{} requires a PPI network but no PPI file was given'.format(
                    args.mc_alg))
        ppi = nx.read_edgelist(args.ppi)

        if args.mc_alg == 'KPMF':
            L = get_ppi_data(obj['rows'], ppi, mode='laplacian')
            param_space = kpmf_param_space(args)
            run_experiment = run_kpmf
            fit_params = dict(L=L)
        elif args.mc_alg == 'KPMF_b':
            L = get_ppi_data(obj['rows'], ppi, mode='laplacian')
            param_space = kpmfb_param_space(args)
            run_experiment = run_kpmfb
            fit_params = dict(L=L)
        else:
            # Only 'NGMC' remains after the membership test above.
            P = get_ppi_data(obj['rows'], ppi, mode='normalized_adjacency')
            fit_params = dict(P=P)
            param_space = ngmc_param_space(args)
            run_experiment = run_ngmc
    else:
        raise NotImplementedError(
            '{} option is invalid or not implemented'.format(args.mc_alg))

    # Run experimental protocol
    results, models, training_curves, trials = \
        run_experiment(obj,
                       param_space=param_space,
                       fit_params=fit_params,
                       val_hidden_fraction=args.val_hidden_fraction,
                       hidden_fraction=args.hidden_fraction,
                       n_repeats=args.n_repeats,
                       hyperopt_iters=args.n_hyperopt_iters,
                       seed=args.random_seed,
                       logistic=args.logistic)

    # Save results and other information
    log_results(results['summary'])
    with open(args.results_output, 'w') as f:
        json.dump(results, f, indent=2)

    with open(args.training_curve_output, 'wb') as f:
        cpkl.dump(training_curves, f)

    # TODO: the models cannot be pickled at the moment, so the trials are
    # written to the models output path as a stand-in. Saving the models
    # themselves will need to-dict / from-dict methods.
    with open(args.models_output, 'wb') as f:
        cpkl.dump(trials, f)

    with open(args.trials_output, 'wb') as f:
        cpkl.dump(trials, f)