def test_tie_break(spark):
    indexdf = spark.read.parquet(f'{data_root}/groupedIDs.snappy.parquet')
    level1df = spark.read.parquet(f'{data_root}/level1BlockedGT.snappy.parquet').limit(5)

    group2ids = __get_sample_blocks(indexdf)
    # Duplicate alpha values create tied cross-validation scores; fit should still
    # resolve to a single best alpha per label.
    regressor = RidgeRegression(np.array([0.1, 0.2, 0.1, 0.2]))
    _, cvdf = regressor.fit(level1df, labeldf, group2ids)

    assert cvdf.count() == len(labeldf.columns)
def test_two_level_regression_transform_loco_infer_contigs(spark):
    indexdf = spark.read.parquet(f'{data_root}/groupedIDs.snappy.parquet')
    level2df = spark.read.parquet(f'{data_root}/level2BlockedGT.snappy.parquet')

    group2ids = __get_sample_blocks(indexdf)
    regressor = RidgeRegression(alphas)
    model1df, cvdf = regressor.fit(level2df, labeldf, group2ids)
    y_hat = regressor.transform_loco(level2df, labeldf, group2ids, model1df, cvdf)

    pd.testing.assert_frame_equal(y_hat, level2_yhat_loco_df)
def test_regression_fit_transform(spark):
    indexdf = spark.read.parquet(f'{data_root}/groupedIDs.snappy.parquet')
    level1df = spark.read.parquet(f'{data_root}/level1BlockedGT.snappy.parquet').limit(5)

    group2ids = __get_sample_blocks(indexdf)
    regressor = RidgeRegression(alphas)
    model1df, cvdf = regressor.fit(level1df, labeldf, group2ids)
    yhatdf = regressor.transform(level1df, labeldf, group2ids, model1df, cvdf)
    fit_transform_df = regressor.fit_transform(level1df, labeldf, group2ids)

    assert fit_transform_df.equals(yhatdf)
def test_two_level_regression_with_cov(spark):
    indexdf = spark.read.parquet(f'{data_root}/groupedIDs.snappy.parquet')
    level2df = spark.read.parquet(f'{data_root}/level2BlockedGT.snappy.parquet')
    testLabel = 'sim100'

    group2ids = __get_sample_blocks(indexdf)
    bestAlpha, bestr2, y_hat = __calculate_y_hat(X2, group2ids, testLabel, covdf)

    regressor = RidgeRegression(alphas)
    model2df, cvdf = regressor.fit(level2df, labeldf, group2ids, covdf)
    yhatdf = regressor.transform(level2df, labeldf, group2ids, model2df, cvdf, covdf)

    r = cvdf.filter(f'label = "{testLabel}"').select('alpha', 'r2_mean').head()
    bestAlpha_lvl, bestr2_lvl = (r.alpha, r.r2_mean)
    y_hat_lvl = np.array(yhatdf[testLabel])

    assert (bestAlpha_lvl == bestAlpha and np.isclose(bestr2_lvl, bestr2) and
            np.allclose(y_hat_lvl, np.array(y_hat)))
def test_regression_transform_missing_alphas(spark):
    indexdf = spark.read.parquet(f'{data_root}/groupedIDs.snappy.parquet')
    level1df = spark.read.parquet(f'{data_root}/level1BlockedGT.snappy.parquet')

    group2ids = __get_sample_blocks(indexdf)
    regressor_fit = RidgeRegression()
    regressor_transform = RidgeRegression()
    model1df, cvdf = regressor_fit.fit(level1df, labeldf, group2ids)

    # The transforming regressor never generated alphas during a fit of its own,
    # so transform should fail.
    with pytest.raises(Exception):
        y_hat = regressor_transform.transform(level1df, labeldf, group2ids, model1df, cvdf)
def test_regression_generate_alphas(spark):
    indexdf = spark.read.parquet(f'{data_root}/groupedIDs.snappy.parquet')
    level1df = spark.read.parquet(f'{data_root}/level1BlockedGT.snappy.parquet').limit(5)

    group2ids = __get_sample_blocks(indexdf)
    regressor_without_alphas = RidgeRegression()
    regressor_with_alphas = RidgeRegression(
        np.array(sorted(list(generate_alphas(level1df).values()))))

    model1_without_alphas, cv_without_alphas = regressor_without_alphas.fit(
        level1df, labeldf, group2ids)
    model1df, cvdf = regressor_with_alphas.fit(level1df, labeldf, group2ids)
    __assert_dataframes_equal(model1_without_alphas, model1df)
    __assert_dataframes_equal(cv_without_alphas, cvdf)

    yhat_without_alphas = regressor_without_alphas.transform(level1df, labeldf, group2ids,
                                                             model1df, cvdf)
    yhatdf = regressor_with_alphas.transform(level1df, labeldf, group2ids, model1df, cvdf)
    assert yhat_without_alphas.equals(yhatdf)
def test_regression_transform_validates_inputs(spark):
    indexdf = spark.read.parquet(f'{data_root}/groupedIDs.snappy.parquet')
    level1df = spark.read.parquet(f'{data_root}/level1BlockedGT.snappy.parquet').limit(5)

    group2ids = __get_sample_blocks(indexdf)
    regressor = RidgeRegression(alphas)
    model1df, cvdf = regressor.fit(level1df, labeldf, group2ids, covdf)

    with pytest.raises(ValueError):
        regressor.transform(level1df, label_with_missing, group2ids, model1df, cvdf, covdf)
    with pytest.raises(ValueError):
        regressor.transform(level1df, labeldf, group2ids, model1df, cvdf, covdf_with_missing)
    with pytest.warns(UserWarning):
        regressor.transform(level1df, labeldf + 0.5, group2ids, model1df, cvdf, covdf)
    with pytest.warns(UserWarning):
        regressor.transform(level1df, labeldf * 1.5, group2ids, model1df, cvdf, covdf)
    with pytest.warns(UserWarning):
        regressor.transform(level1df, labeldf, group2ids, model1df, cvdf, covdf + 0.5)
    with pytest.warns(UserWarning):
        regressor.transform(level1df, labeldf, group2ids, model1df, cvdf, covdf * 1.5)

    # Should issue no warnings
    regressor.transform(level1df, labeldf, group2ids, model1df, cvdf, covdf)
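# Every test above builds its sample-block mapping with __get_sample_blocks, which is not shown
# in this excerpt. The minimal sketch below is an assumption rather than the project's actual
# helper: it presumes the grouped-IDs parquet exposes 'sample_block' and 'sample_ids' columns
# (as the tests' fixture data suggests) and simply collects them into a plain dict.
def __get_sample_blocks(indexdf):
    # Map each sample block ID to the list of sample IDs it contains.
    return {
        r.sample_block: r.sample_ids
        for r in indexdf.select('sample_block', 'sample_ids').collect()
    }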
def run(
    plink_path: str,
    traits_path: str,
    covariates_path: str,
    variants_per_block: int,
    sample_block_count: int,
    output_dir: str,
    plink_fam_sep: str = "\t",
    plink_bim_sep: str = "\t",
    alphas: Optional[list] = None,
    contigs: List[str] = None,
):
    """Run Glow WGR"""
    output_path = Path(output_dir)
    if output_path.exists():
        shutil.rmtree(output_path)
    output_path.mkdir(parents=True, exist_ok=False)

    if alphas is None:
        alphas = np.array([])
    else:
        alphas = np.array(alphas).astype(float)

    spark = spark_session()
    logger.info(
        f"Loading PLINK dataset at {plink_path} (fam sep = {plink_fam_sep}, bim sep = {plink_bim_sep}, alphas = {alphas})"
    )
    df = (spark.read.format("plink").option(
        "bimDelimiter", plink_bim_sep).option("famDelimiter", plink_fam_sep).option(
            "includeSampleIds", True).option("mergeFidIid", False).load(plink_path))

    variant_df = df.withColumn(
        "values", mean_substitute(genotype_states(F.col("genotypes")))).filter(
            F.size(F.array_distinct("values")) > 1)
    if contigs is not None:
        variant_df = variant_df.filter(F.col("contigName").isin(contigs))

    sample_ids = get_sample_ids(variant_df)
    logger.info(f"Found {len(sample_ids)} samples, first 10: {sample_ids[:10]}")

    ###########
    # Stage 1 #
    ###########

    logger.info(HR)
    logger.info("Calculating variant/sample block info")
    block_df, sample_blocks = block_variants_and_samples(
        variant_df,
        sample_ids,
        variants_per_block=variants_per_block,
        sample_block_count=sample_block_count,
    )

    label_df = pd.read_csv(traits_path, index_col="sample_id")
    label_df = (label_df - label_df.mean()) / label_df.std(ddof=0)
    logger.info(HR)
    logger.info("Trait info:")
    logger.info(_info(label_df))

    cov_df = pd.read_csv(covariates_path, index_col="sample_id")
    cov_df = (cov_df - cov_df.mean()) / cov_df.std(ddof=0)
    logger.info(HR)
    logger.info("Covariate info:")
    logger.info(_info(cov_df))

    stack = RidgeReducer(alphas=alphas)
    reduced_block_df = stack.fit_transform(block_df, label_df, sample_blocks, cov_df)
    logger.info(HR)
    logger.info("Stage 1: Reduced block schema:")
    logger.info(_schema(reduced_block_df))

    path = output_path / "reduced_blocks.parquet"
    reduced_block_df.write.parquet(str(path), mode="overwrite")
    logger.info(f"Stage 1: Reduced blocks written to {path}")

    # Flatten to scalars for more convenient access w/o Spark
    flat_reduced_block_df = spark.read.parquet(str(path))
    path = output_path / "reduced_blocks_flat.csv.gz"
    flat_reduced_block_df = _flatten_reduced_blocks(flat_reduced_block_df)
    flat_reduced_block_df = flat_reduced_block_df.toPandas()
    flat_reduced_block_df.to_csv(path, index=False)
    # flat_reduced_block_df.write.parquet(str(path), mode='overwrite')
    logger.info(f"Stage 1: Flattened reduced blocks written to {path}")

    ###########
    # Stage 2 #
    ###########

    # Monkey-patch this in until there's a glow release beyond 0.5.0
    if glow_version != "0.5.0":
        raise NotImplementedError(
            f"Must remove adjustments for glow != 0.5.0 (found {glow_version})")
    # Remove after glow update
    RidgeRegression.transform_loco = transform_loco

    estimator = RidgeRegression(alphas=alphas)
    model_df, cv_df = estimator.fit(reduced_block_df, label_df, sample_blocks, cov_df)
    logger.info(HR)
    logger.info("Stage 2: Model schema:")
    logger.info(_schema(model_df))
    logger.info("Stage 2: CV schema:")
    logger.info(_schema(cv_df))

    y_hat_df = estimator.transform(reduced_block_df, label_df, sample_blocks, model_df, cv_df,
                                   cov_df)
    logger.info(HR)
    logger.info("Stage 2: Prediction info:")
    logger.info(_info(y_hat_df))
    logger.info(y_hat_df.head(5))

    path = output_path / "predictions.csv"
    y_hat_df.reset_index().to_csv(path, index=False)
    logger.info(f"Stage 2: Predictions written to {path}")

    y_hat_df_loco = estimator.transform_loco(reduced_block_df, label_df, sample_blocks, model_df,
                                             cv_df, cov_df)
    path = output_path / "predictions_loco.csv"
    y_hat_df_loco.reset_index().to_csv(path, index=False)
    logger.info(f"Stage 2: LOCO Predictions written to {path}")

    ###########
    # Stage 3 #
    ###########

    # Do this to correct for the error in Glow at https://github.com/projectglow/glow/issues/257
    if glow_version != "0.5.0":
        raise NotImplementedError(
            f"Must remove adjustments for glow != 0.5.0 (found {glow_version})")
    cov_arr = cov_df.to_numpy()
    cov_arr = cov_arr.T.ravel(order="C").reshape(cov_arr.shape)

    # Convert the pandas dataframe into a Spark DataFrame
    adjusted_phenotypes = reshape_for_gwas(spark, label_df - y_hat_df)

    # Run GWAS w/o LOCO (this could be for a much larger set of variants)
    wgr_gwas = (variant_df.withColumnRenamed("values", "callValues").crossJoin(
        adjusted_phenotypes.withColumnRenamed("values", "phenotypeValues")).select(
            "start",
            "names",
            "label",
            expand_struct(
                linear_regression_gwas(F.col("callValues"), F.col("phenotypeValues"),
                                       F.lit(cov_arr))),
        ))

    logger.info(HR)
    logger.info("Stage 3: GWAS (no LOCO) schema:")
    logger.info(_schema(wgr_gwas))

    # Convert to pandas
    wgr_gwas = wgr_gwas.toPandas()

    logger.info(HR)
    logger.info("Stage 3: GWAS (no LOCO) info:")
    logger.info(_info(wgr_gwas))
    logger.info(wgr_gwas.head(5))

    path = output_path / "gwas.csv"
    wgr_gwas.to_csv(path, index=False)
    logger.info(f"Stage 3: GWAS (no LOCO) results written to {path}")

    logger.info(HR)
    logger.info("Done")

    # TODO: Enable this once WGR is fully released
    # See: https://github.com/projectglow/glow/issues/256

    # Run GWAS w/ LOCO
    adjusted_phenotypes = reshape_for_gwas(spark, label_df - y_hat_df_loco)
    wgr_gwas = (variant_df.withColumnRenamed("values", "callValues").join(
        adjusted_phenotypes.withColumnRenamed("values", "phenotypeValues"),
        ["contigName"],
    ).select(
        "contigName",
        "start",
        "names",
        "label",
        expand_struct(
            linear_regression_gwas(F.col("callValues"), F.col("phenotypeValues"),
                                   F.lit(cov_arr))),
    ))

    # Convert to pandas
    wgr_gwas = wgr_gwas.toPandas()

    logger.info(HR)
    logger.info("Stage 3: GWAS (with LOCO) info:")
    logger.info(_info(wgr_gwas))
    logger.info(wgr_gwas.head(5))

    path = output_path / "gwas_loco.csv"
    wgr_gwas.to_csv(path, index=False)
    logger.info(f"Stage 3: GWAS (with LOCO) results written to {path}")

    logger.info(HR)
    logger.info("Done")
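# run() reads like a CLI entry point (typed keyword arguments, path inputs, an output directory).
# The sketch below shows one hypothetical way to expose it from the command line; the use of
# python-fire, the script name, and the example argument values are assumptions and are not part
# of the original script.
if __name__ == "__main__":
    import fire

    # Expose run() as a CLI, e.g.:
    #   python glow_wgr.py --plink-path=data/geno --traits-path=data/traits.csv \
    #       --covariates-path=data/covariates.csv --variants-per-block=1000 \
    #       --sample-block-count=10 --output-dir=results/
    fire.Fire(run)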