예제 #1
0
def test_regression_transform_validates_inputs(spark):
    """Exercise the input checks of ``RidgeRegression.transform``.

    After fitting on the first five rows of the level-1 blocks, ``transform``
    is called with altered labels or covariates:

    * ``label_with_missing`` / ``covdf_with_missing`` must raise ``ValueError``;
    * labels or covariates that were shifted (``+ 0.5``) or scaled (``* 1.5``)
      must emit a ``UserWarning``.

    The last call uses the unmodified inputs.
    """
    indexdf = spark.read.parquet(f'{data_root}/groupedIDs.snappy.parquet')
    level1df = spark.read.parquet(
        f'{data_root}/level1BlockedGT.snappy.parquet').limit(5)

    group2ids = __get_sample_blocks(indexdf)
    regressor = RidgeRegression(alphas)
    model1df, cvdf = regressor.fit(level1df, labeldf, group2ids, covdf)

    def transform_with(labels, covariates):
        # Only the labels and covariates vary between the cases below.
        return regressor.transform(level1df, labels, group2ids, model1df, cvdf,
                                   covariates)

    # Inputs from the *_with_missing fixtures are rejected outright.
    with pytest.raises(ValueError):
        transform_with(label_with_missing, covdf)
    with pytest.raises(ValueError):
        transform_with(labeldf, covdf_with_missing)

    # Shifted or rescaled labels are flagged with a warning.
    with pytest.warns(UserWarning):
        transform_with(labeldf + 0.5, covdf)
    with pytest.warns(UserWarning):
        transform_with(labeldf * 1.5, covdf)

    # Shifted or rescaled covariates are flagged with a warning.
    with pytest.warns(UserWarning):
        transform_with(labeldf, covdf + 0.5)
    with pytest.warns(UserWarning):
        transform_with(labeldf, covdf * 1.5)

    # Should issue no warnings (note: the absence of warnings is not asserted)
    transform_with(labeldf, covdf)
예제 #2
0
def test_tie_break(spark):
    """Fit with repeated alpha values and expect one CV row per label column.

    The alpha array deliberately contains each value twice; the test checks
    that the cross-validation frame returned by ``fit`` still has exactly as
    many rows as ``labeldf`` has columns.
    """
    sample_index = spark.read.parquet(f'{data_root}/groupedIDs.snappy.parquet')
    blocked_gt = spark.read.parquet(f'{data_root}/level1BlockedGT.snappy.parquet').limit(5)
    sample_blocks = __get_sample_blocks(sample_index)

    repeated_alphas = np.array([0.1, 0.2, 0.1, 0.2])
    ridge = RidgeRegression(repeated_alphas)
    _model, cv_results = ridge.fit(blocked_gt, labeldf, sample_blocks)

    cv_row_count = cv_results.count()
    assert cv_row_count == len(labeldf.columns)
예제 #3
0
def test_two_level_regression_transform_loco_infer_contigs(spark):
    """Check ``transform_loco`` output against the expected LOCO predictions.

    ``transform_loco`` is called without an explicit contig list, and its
    result must equal ``level2_yhat_loco_df`` under
    ``pd.testing.assert_frame_equal``.
    """
    sample_index = spark.read.parquet(f'{data_root}/groupedIDs.snappy.parquet')
    blocked_gt = spark.read.parquet(f'{data_root}/level2BlockedGT.snappy.parquet')
    sample_blocks = __get_sample_blocks(sample_index)

    ridge = RidgeRegression(alphas)
    fitted_model, cv_results = ridge.fit(blocked_gt, labeldf, sample_blocks)
    loco_predictions = ridge.transform_loco(blocked_gt, labeldf, sample_blocks, fitted_model,
                                            cv_results)

    pd.testing.assert_frame_equal(loco_predictions, level2_yhat_loco_df)
예제 #4
0
def test_regression_generate_alphas(spark):
    """Transforming with a never-fitted, alpha-less regressor must raise.

    One ``RidgeRegression()`` (constructed without alphas) is fitted to obtain
    a model and CV frame; a second, separately constructed
    ``RidgeRegression()`` that was never fitted is then asked to ``transform``
    with those results, which is expected to raise.
    """
    indexdf = spark.read.parquet(f'{data_root}/groupedIDs.snappy.parquet')
    level1df = spark.read.parquet(f'{data_root}/level1BlockedGT.snappy.parquet')

    group2ids = __get_sample_blocks(indexdf)
    regressor_fit = RidgeRegression()
    regressor_transform = RidgeRegression()

    model1df, cvdf = regressor_fit.fit(level1df, labeldf, group2ids)
    # The return value is irrelevant here: the call itself must fail.
    with pytest.raises(Exception):
        regressor_transform.transform(level1df, labeldf, group2ids, model1df, cvdf)
예제 #5
0
def test_regression_generate_alphas(spark):
    """A regressor built without alphas must match one given generated alphas.

    One regressor is constructed with no alphas; the other receives the sorted
    values of ``generate_alphas(level1df)`` explicitly. Their fitted model
    frames, CV frames and transformed predictions must all be equal.
    """
    indexdf = spark.read.parquet(f'{data_root}/groupedIDs.snappy.parquet')
    level1df = spark.read.parquet(f'{data_root}/level1BlockedGT.snappy.parquet').limit(5)
    group2ids = __get_sample_blocks(indexdf)

    implicit_ridge = RidgeRegression()
    explicit_alphas = np.array(sorted(generate_alphas(level1df).values()))
    explicit_ridge = RidgeRegression(explicit_alphas)

    # Fitting: both regressors must produce the same model and CV frames.
    implicit_model, implicit_cv = implicit_ridge.fit(level1df, labeldf, group2ids)
    explicit_model, explicit_cv = explicit_ridge.fit(level1df, labeldf, group2ids)
    __assert_dataframes_equal(implicit_model, explicit_model)
    __assert_dataframes_equal(implicit_cv, explicit_cv)

    # Transforming: both regressors are fed the explicit regressor's results.
    implicit_yhat = implicit_ridge.transform(level1df, labeldf, group2ids, explicit_model,
                                             explicit_cv)
    explicit_yhat = explicit_ridge.transform(level1df, labeldf, group2ids, explicit_model,
                                             explicit_cv)
    assert implicit_yhat.equals(explicit_yhat)
예제 #6
0
def test_regression_fit_transform(spark):
    """``fit_transform`` must equal ``fit`` followed by ``transform``."""
    sample_index = spark.read.parquet(f'{data_root}/groupedIDs.snappy.parquet')
    blocked_gt = spark.read.parquet(f'{data_root}/level1BlockedGT.snappy.parquet').limit(5)
    sample_blocks = __get_sample_blocks(sample_index)

    ridge = RidgeRegression(alphas)

    # Two-step path: fit, then transform with the fitted model and CV results.
    fitted_model, cv_results = ridge.fit(blocked_gt, labeldf, sample_blocks)
    two_step = ridge.transform(blocked_gt, labeldf, sample_blocks, fitted_model, cv_results)

    # One-step path on the same regressor instance.
    one_step = ridge.fit_transform(blocked_gt, labeldf, sample_blocks)

    assert one_step.equals(two_step)
예제 #7
0
def test_two_level_regression_with_cov(spark):
    """Compare the covariate-aware level-2 fit with a reference calculation.

    For the ``sim100`` label, the alpha and mean r2 recorded in the CV frame
    and the predictions from ``transform`` must agree with the values
    returned by ``__calculate_y_hat`` for the same sample blocks and
    covariates.
    """
    testLabel = 'sim100'
    indexdf = spark.read.parquet(f'{data_root}/groupedIDs.snappy.parquet')
    level2df = spark.read.parquet(f'{data_root}/level2BlockedGT.snappy.parquet')
    group2ids = __get_sample_blocks(indexdf)

    # Reference values computed outside of RidgeRegression.
    expected_alpha, expected_r2, expected_y_hat = __calculate_y_hat(X2, group2ids, testLabel,
                                                                    covdf)

    regressor = RidgeRegression(alphas)
    model2df, cvdf = regressor.fit(level2df, labeldf, group2ids, covdf)
    yhatdf = regressor.transform(level2df, labeldf, group2ids, model2df, cvdf, covdf)

    # First CV row for the label under test.
    cv_row = cvdf.filter(f'label = "{testLabel}"').select('alpha', 'r2_mean').head()
    actual_alpha = cv_row.alpha
    actual_r2 = cv_row.r2_mean
    actual_y_hat = np.array(yhatdf[testLabel])

    assert actual_alpha == expected_alpha
    assert np.isclose(actual_r2, expected_r2)
    assert np.allclose(actual_y_hat, np.array(expected_y_hat))
예제 #8
0
def run(
    plink_path: str,
    traits_path: str,
    covariates_path: str,
    variants_per_block: int,
    sample_block_count: int,
    output_dir: str,
    plink_fam_sep: str = "\t",
    plink_bim_sep: str = "\t",
    alphas: Optional[list] = None,
    contigs: Optional[List[str]] = None,
):
    """Run Glow WGR

    Executes the three stages of the pipeline and writes every intermediate
    and final result below ``output_dir``:

    * Stage 1 blocks variants and samples and reduces the blocks with
      ``RidgeReducer`` (``reduced_blocks.parquet``,
      ``reduced_blocks_flat.csv.gz``).
    * Stage 2 fits ``RidgeRegression`` on the reduced blocks and writes the
      predictions with and without LOCO (``predictions.csv``,
      ``predictions_loco.csv``).
    * Stage 3 runs ``linear_regression_gwas`` on the traits minus the Stage 2
      predictions, again with and without LOCO (``gwas.csv``,
      ``gwas_loco.csv``).

    Args:
        plink_path: Path handed to Spark's ``plink`` reader.
        traits_path: CSV of traits with a ``sample_id`` column used as index.
        covariates_path: CSV of covariates with a ``sample_id`` column used
            as index.
        variants_per_block: Forwarded to ``block_variants_and_samples``.
        sample_block_count: Forwarded to ``block_variants_and_samples``.
        output_dir: Output directory. It is deleted if it already exists and
            then recreated.
        plink_fam_sep: Value for the reader's ``famDelimiter`` option.
        plink_bim_sep: Value for the reader's ``bimDelimiter`` option.
        alphas: Ridge alpha values; converted to a float array. ``None``
            results in an empty array being passed to the estimators.
        contigs: If given, only variants whose ``contigName`` is in this list
            are kept.

    Raises:
        NotImplementedError: If ``glow_version`` is not ``"0.5.0"``.
    """
    # Stages 2 and 3 rely on workarounds written against glow 0.5.0 (the
    # `transform_loco` monkey-patch and the covariate re-layout for
    # https://github.com/projectglow/glow/issues/257). Check once, up front, so
    # a version mismatch fails before the output directory is wiped and before
    # any Spark work is done.
    if glow_version != "0.5.0":
        raise NotImplementedError(
            f"Must remove adjustements for glow != 0.5.0 (found {glow_version})"
        )

    # Always start from an empty output directory.
    output_path = Path(output_dir)
    if output_path.exists():
        shutil.rmtree(output_path)
    output_path.mkdir(parents=True, exist_ok=False)

    if alphas is None:
        alphas = np.array([])
    else:
        alphas = np.array(alphas).astype(float)

    spark = spark_session()
    logger.info(
        f"Loading PLINK dataset at {plink_path} (fam sep = {plink_fam_sep}, bim sep = {plink_bim_sep}, alphas = {alphas})"
    )
    df = (
        spark.read.format("plink")
        .option("bimDelimiter", plink_bim_sep)
        .option("famDelimiter", plink_fam_sep)
        .option("includeSampleIds", True)
        .option("mergeFidIid", False)
        .load(plink_path)
    )

    # Drop variants whose mean-substituted values are all identical.
    variant_df = df.withColumn(
        "values", mean_substitute(genotype_states(F.col("genotypes")))
    ).filter(F.size(F.array_distinct("values")) > 1)
    if contigs is not None:
        variant_df = variant_df.filter(F.col("contigName").isin(contigs))

    sample_ids = get_sample_ids(variant_df)
    logger.info(
        f"Found {len(sample_ids)} samples, first 10: {sample_ids[:10]}")

    ###########
    # Stage 1 #
    ###########

    logger.info(HR)
    logger.info("Calculating variant/sample block info")
    block_df, sample_blocks = block_variants_and_samples(
        variant_df,
        sample_ids,
        variants_per_block=variants_per_block,
        sample_block_count=sample_block_count,
    )

    # Standardize every trait column (zero mean, unit population std).
    label_df = pd.read_csv(traits_path, index_col="sample_id")
    label_df = (label_df - label_df.mean()) / label_df.std(ddof=0)
    logger.info(HR)
    logger.info("Trait info:")
    logger.info(_info(label_df))

    # Standardize every covariate column the same way.
    cov_df = pd.read_csv(covariates_path, index_col="sample_id")
    cov_df = (cov_df - cov_df.mean()) / cov_df.std(ddof=0)
    logger.info(HR)
    logger.info("Covariate info:")
    logger.info(_info(cov_df))

    stack = RidgeReducer(alphas=alphas)
    reduced_block_df = stack.fit_transform(block_df, label_df, sample_blocks,
                                           cov_df)
    logger.info(HR)
    logger.info("Stage 1: Reduced block schema:")
    logger.info(_schema(reduced_block_df))

    path = output_path / "reduced_blocks.parquet"
    reduced_block_df.write.parquet(str(path), mode="overwrite")
    logger.info(f"Stage 1: Reduced blocks written to {path}")

    # Flatten to scalars for more convenient access w/o Spark
    flat_reduced_block_df = spark.read.parquet(str(path))
    path = output_path / "reduced_blocks_flat.csv.gz"
    flat_reduced_block_df = _flatten_reduced_blocks(flat_reduced_block_df)
    flat_reduced_block_df = flat_reduced_block_df.toPandas()
    flat_reduced_block_df.to_csv(path, index=False)
    logger.info(f"Stage 1: Flattened reduced blocks written to {path}")

    ###########
    # Stage 2 #
    ###########

    # Monkey-patch this in until there's a glow release beyond 0.5.0
    # (remove after glow update; guarded by the version check at the top).
    RidgeRegression.transform_loco = transform_loco
    estimator = RidgeRegression(alphas=alphas)
    model_df, cv_df = estimator.fit(reduced_block_df, label_df, sample_blocks,
                                    cov_df)
    logger.info(HR)
    logger.info("Stage 2: Model schema:")
    logger.info(_schema(model_df))
    logger.info("Stage 2: CV schema:")
    logger.info(_schema(cv_df))

    y_hat_df = estimator.transform(reduced_block_df, label_df, sample_blocks,
                                   model_df, cv_df, cov_df)

    logger.info(HR)
    logger.info("Stage 2: Prediction info:")
    logger.info(_info(y_hat_df))
    logger.info(y_hat_df.head(5))

    path = output_path / "predictions.csv"
    y_hat_df.reset_index().to_csv(path, index=False)
    logger.info(f"Stage 2: Predictions written to {path}")

    y_hat_df_loco = estimator.transform_loco(reduced_block_df, label_df,
                                             sample_blocks, model_df, cv_df,
                                             cov_df)

    path = output_path / "predictions_loco.csv"
    y_hat_df_loco.reset_index().to_csv(path, index=False)
    logger.info(f"Stage 2: LOCO Predictions written to {path}")

    ###########
    # Stage 3 #
    ###########

    # Do this to correct for the error in Glow at https://github.com/projectglow/glow/issues/257
    # (guarded by the version check at the top). The array keeps its shape,
    # but its values are re-laid out so that a row-major flattening yields the
    # original columns one after another.
    cov_arr = cov_df.to_numpy()
    cov_arr = cov_arr.T.ravel(order="C").reshape(cov_arr.shape)

    # Convert the pandas dataframe into a Spark DataFrame
    adjusted_phenotypes = reshape_for_gwas(spark, label_df - y_hat_df)

    # Run GWAS w/o LOCO (this could be for a much larger set of variants)
    wgr_gwas = (
        variant_df.withColumnRenamed("values", "callValues")
        .crossJoin(
            adjusted_phenotypes.withColumnRenamed("values", "phenotypeValues"))
        .select(
            "start",
            "names",
            "label",
            expand_struct(
                linear_regression_gwas(
                    F.col("callValues"),
                    F.col("phenotypeValues"),
                    F.lit(cov_arr),
                )
            ),
        )
    )

    logger.info(HR)
    logger.info("Stage 3: GWAS (no LOCO) schema:")
    logger.info(_schema(wgr_gwas))

    # Convert to pandas
    wgr_gwas = wgr_gwas.toPandas()
    logger.info(HR)
    logger.info("Stage 3: GWAS (no LOCO) info:")
    logger.info(_info(wgr_gwas))
    logger.info(wgr_gwas.head(5))

    path = output_path / "gwas.csv"
    wgr_gwas.to_csv(path, index=False)
    logger.info(f"Stage 3: GWAS (no LOCO) results written to {path}")

    # Run GWAS w/ LOCO. An earlier TODO asked for this to be enabled only once
    # WGR is fully released (see https://github.com/projectglow/glow/issues/256);
    # it is enabled here. Phenotypes are joined to variants on contigName.
    adjusted_phenotypes = reshape_for_gwas(spark, label_df - y_hat_df_loco)
    wgr_gwas = (
        variant_df.withColumnRenamed("values", "callValues")
        .join(
            adjusted_phenotypes.withColumnRenamed("values", "phenotypeValues"),
            ["contigName"],
        )
        .select(
            "contigName",
            "start",
            "names",
            "label",
            expand_struct(
                linear_regression_gwas(
                    F.col("callValues"),
                    F.col("phenotypeValues"),
                    F.lit(cov_arr),
                )
            ),
        )
    )

    # Convert to pandas
    wgr_gwas = wgr_gwas.toPandas()
    logger.info(HR)
    logger.info("Stage 3: GWAS (with LOCO) info:")
    logger.info(_info(wgr_gwas))
    logger.info(wgr_gwas.head(5))

    path = output_path / "gwas_loco.csv"
    wgr_gwas.to_csv(path, index=False)
    logger.info(f"Stage 3: GWAS (with LOCO) results written to {path}")
    logger.info(HR)
    logger.info("Done")