예제 #1
0
파일: test_lin_reg.py 프로젝트: bcajes/glow
def run_linear_regression(genotype_df,
                          phenotype_df,
                          covariate_df,
                          add_intercept=True):
    """Drive ``lr._linear_regression_inner`` from in-memory pandas inputs.

    The covariates are optionally extended via ``gwas_fx._add_intercept``;
    if the resulting matrix has no elements it is replaced by a single
    all-zero column. Missing phenotype values are zero-filled, each
    phenotype column is mean-centered, passed through
    ``gwas_fx._residualize_in_place`` together with the Q factor of the
    covariate matrix, and finally multiplied by the observed-value mask.

    Args:
        genotype_df: DataFrame whose columns are handed to the inner
            routine one per row of the values column.
        phenotype_df: DataFrame of phenotypes; NaN marks a missing value.
        covariate_df: DataFrame of covariates (may be empty).
        add_intercept: When true, ``gwas_fx._add_intercept`` is applied to
            the covariate matrix.

    Returns:
        Whatever ``lr._linear_regression_inner`` returns.
    """
    phenotype_labels = phenotype_df.columns.astype('str').to_series()
    num_samples = genotype_df.shape[0]

    covariates = covariate_df.to_numpy('float64', copy=True)
    if add_intercept:
        covariates = gwas_fx._add_intercept(covariates, num_samples)
    if covariates.size == 0:
        # Nothing to regress out: fall back to one all-zero column.
        covariates = np.zeros((num_samples, 1))

    phenotypes = phenotype_df.to_numpy('float64', copy=True)
    missing = np.isnan(phenotypes)
    observed = ~missing
    phenotypes[missing] = 0
    # Center in place; the means are taken after zero-filling.
    np.subtract(phenotypes, phenotypes.mean(axis=0), out=phenotypes)

    covariate_basis, _ = np.linalg.qr(covariates)
    residuals = gwas_fx._residualize_in_place(phenotypes, covariate_basis)
    residuals = residuals * observed

    unit_scale = np.ones(residuals.shape[1])
    y_state = lr._create_YState(residuals, phenotype_df, pd.DataFrame({}),
                                observed, np.float64, None)
    degrees_of_freedom = covariates.shape[0] - covariates.shape[1] - 1

    genotype_columns = list(genotype_df.to_numpy('float64').T)
    values_pdf = pd.DataFrame({lr._VALUES_COLUMN_NAME: genotype_columns})
    observed_as_float = observed.astype('float64')

    return lr._linear_regression_inner(values_pdf, y_state, observed_as_float,
                                       unit_scale, covariate_basis,
                                       degrees_of_freedom, phenotype_labels,
                                       None, None)
예제 #2
0
def run_score_test(genotype_df,
                   phenotype_df,
                   covariate_df,
                   correction=lr.correction_none,
                   add_intercept=True):
    """Drive ``lr._logistic_regression_inner`` from in-memory pandas inputs.

    One state row per phenotype column is produced with
    ``lr._prepare_one_phenotype`` and the rows are combined through
    ``lr._pdf_to_log_reg_state``. Missing phenotype values are zero-filled
    in the array handed to the inner routine, alongside a boolean mask of
    the observed entries.

    Note:
        ``correction`` is forwarded to ``lr._prepare_one_phenotype`` only;
        the inner call always receives ``lr.correction_none`` and a
        threshold of ``0.05``.

    Args:
        genotype_df: DataFrame whose columns become the rows of the values
            column.
        phenotype_df: DataFrame of phenotypes; NaN marks a missing value.
        covariate_df: DataFrame of covariates.
        correction: Correction mode used when preparing phenotype state.
        add_intercept: When true, ``gwas_fx._add_intercept`` is applied to
            the covariate matrix.

    Returns:
        Whatever ``lr._logistic_regression_inner`` returns.
    """
    covariates = covariate_df.to_numpy(copy=True)
    if add_intercept:
        covariates = gwas_fx._add_intercept(covariates, phenotype_df.shape[0])

    phenotypes = phenotype_df.to_numpy(copy=True)
    missing = np.isnan(phenotypes)
    observed = ~missing
    phenotypes[missing] = 0

    # The per-phenotype state is built from the untouched DataFrame columns.
    prepared_rows = []
    for label in phenotype_df:
        phenotype_row = pd.Series({
            'label': label,
            'values': phenotype_df[label]
        })
        prepared_rows.append(
            lr._prepare_one_phenotype(covariates, phenotype_row, correction,
                                      add_intercept))

    state_labels = phenotype_df.columns.to_series().astype('str')
    log_reg_state = lr._pdf_to_log_reg_state(pd.DataFrame(prepared_rows),
                                             state_labels,
                                             covariates.shape[1])

    genotype_columns = list(genotype_df.to_numpy().T)
    genotype_pdf = pd.DataFrame(
        {gwas_fx._VALUES_COLUMN_NAME: genotype_columns})

    # A fresh Series, so the two callees do not share one object.
    result_labels = phenotype_df.columns.to_series().astype('str')
    return lr._logistic_regression_inner(genotype_pdf, log_reg_state,
                                         covariates, phenotypes, observed,
                                         None, lr.correction_none, 0.05,
                                         result_labels)
예제 #3
0
파일: test_lin_reg.py 프로젝트: bcajes/glow
def statsmodels_baseline(genotype_df,
                         phenotype_df,
                         covariate_df,
                         offset_dfs=None,
                         add_intercept=True):
    """Fit one ``sm.OLS`` model per (phenotype, genotype) pair.

    Each phenotype column is zero-filled where missing, mean-centered,
    passed through ``residualize`` against the covariates, masked, and
    scaled to unit residual variance. Each genotype column is passed
    through ``residualize`` as well. Effects and standard errors are scaled
    back by the phenotype scale before being reported.

    ``phenotype_df`` is not modified: the string column labels used in the
    result are derived locally.

    Args:
        genotype_df: DataFrame with one genotype per column.
        phenotype_df: DataFrame with one phenotype per column; NaN marks a
            missing value.
        covariate_df: DataFrame of covariates (may be empty).
        offset_dfs: Optional sequence indexed by genotype position. Each
            item is a DataFrame whose column at the phenotype's position is
            subtracted from the scaled phenotype before fitting.
        add_intercept: When true, ``gwas_fx._add_intercept`` is applied to
            the covariate matrix.

    Returns:
        A DataFrame with columns ``effect``, ``standard_error``, ``tvalue``,
        ``pvalue`` and ``phenotype``. Rows are ordered phenotype-major,
        genotype-minor.
    """
    # Project out covariates from genotypes and phenotypes
    C = covariate_df.to_numpy('float64')
    num_samples = C.shape[0] if C.size else genotype_df.shape[0]
    if add_intercept:
        C = gwas_fx._add_intercept(C, num_samples)
    Y = phenotype_df.to_numpy('float64')
    X = genotype_df.to_numpy('float64')
    # Keep the stringified labels local so the caller's DataFrame keeps its
    # own column index.
    phenotype_names = phenotype_df.columns.astype('str')
    dof = C.shape[0] - C.shape[1] - 1

    # Genotype residuals do not depend on the phenotype: compute them once.
    residualized_genotypes = [
        residualize(X[:, genotype_idx], C)
        for genotype_idx in range(X.shape[1])
    ]

    effects = []
    errors = []
    tvalues = []
    pvalues = []
    for phenotype_idx in range(Y.shape[1]):
        # Everything up to the offset is independent of the genotype, so it
        # is prepared once per phenotype.
        phenotype = Y[:, phenotype_idx].copy()
        phenotype_mask = ~np.isnan(phenotype)
        phenotype[~phenotype_mask] = 0
        phenotype -= phenotype.mean()
        phenotype = residualize(phenotype, C) * phenotype_mask
        phenotype_scale = np.sqrt(
            (phenotype**2).sum() / (phenotype_mask.sum() - C.shape[1]))
        phenotype /= phenotype_scale

        for genotype_idx, genotype_residual in enumerate(
                residualized_genotypes):
            if offset_dfs:
                offset = offset_dfs[genotype_idx].iloc[:, phenotype_idx] \
                    .to_numpy('float64')
                response = phenotype - offset
            else:
                # Copy so that the NaN marking below never leaks into the
                # shared per-phenotype array.
                response = phenotype.copy()
            # Missing samples are dropped by OLS via missing='drop'.
            response[~phenotype_mask] = np.nan

            genotype = pd.Series(genotype_residual, name='genotype')

            model = sm.OLS(response, genotype, missing='drop')
            # Override the residual degrees of freedom to account for the
            # covariates that were projected out beforehand.
            model.df_resid = dof
            results = model.fit()

            effects.append(results.params.genotype * phenotype_scale)
            errors.append(results.bse.genotype * phenotype_scale)
            tvalues.append(results.tvalues.genotype)
            pvalues.append(results.pvalues.genotype)
    return pd.DataFrame({
        'effect':
        effects,
        'standard_error':
        errors,
        'tvalue':
        tvalues,
        'pvalue':
        pvalues,
        'phenotype':
        phenotype_names.to_series().repeat(genotype_df.shape[1])
    })