Example #1
def test_transform(spark):
    df = spark.read.format("vcf")\
        .load("test-data/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.vcf")
    converted = glow.transform("pipe",
                               df,
                               input_formatter="vcf",
                               output_formatter="vcf",
                               cmd='["cat"]',
                               in_vcf_header="infer")
    assert converted.count() == 1075
Example #2
def test_arg_map(spark):
    df = spark.read.format("vcf") \
        .load("test-data/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.vcf")
    args = {
        "inputFormatter": "vcf",
        "outputFormatter": "vcf",
        "cmd": '["cat"]',
        "in_vcfHeader": "infer"
    }
    converted = glow.transform("pipe", df, args)
    assert converted.count() == 1075
Example #3
    def _transform(
        self,
        input_df: pyspark.sql.DataFrame,
        contig: pyspark.sql.column.Column,
        start: pyspark.sql.column.Column,
        end: pyspark.sql.column.Column,
        ref: pyspark.sql.column.Column,
        alt: pyspark.sql.column.Column,
        id: pyspark.sql.column.Column,
    ):
        """
        Runs Ensembl VEP on a Spark DataFrame with VEP.
        The DataFrame needs to provide the following fields:
            - "contigName"
            - "start"
            - "end
            - "referenceAllele"
            - "alternateAlleles"

        Args:
            df: Spark DataFrame with contigNamem start, end, ref and alt
            contig: contig name column
            start: variant position column
            ref: reference allele column
            alt: array of alternate alleles column
            id: array of id's

        Returns:
            Spark DataFrame with single column `text` that contains json-formatted VEP output as string
        """
        import json
        import glow
        input_df = input_df.select([
            contig,
            start,
            end,
            id,
            ref,
            alt,
        ])

        vep_transformed_df = glow.transform(
            "pipe",
            input_df,
            cmd=json.dumps(self.call_args),
            inputFormatter='vcf',
            inVcfHeader='infer',
            outputFormatter='text',
        )

        return vep_transformed_df
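
Because the pipe transformer returns VEP output as raw JSON strings in a single `text` column, a typical follow-up step is to parse that column. The sketch below is illustrative only: `vep_schema` is a hypothetical subset of VEP's output fields, not the full schema.

# Illustrative parsing step; vep_schema is a hypothetical subset of VEP's JSON output.
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, LongType

vep_schema = StructType([
    StructField("input", StringType()),
    StructField("most_severe_consequence", StringType()),
    StructField("start", LongType()),
])

# Parse each JSON line into columns.
parsed_df = vep_transformed_df.select(F.from_json("text", vep_schema).alias("vep")).select("vep.*")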
Example #4
# DBTITLE 1,Filter rows for which liftover succeeded and see which rows changed.
changed_with_lifted_df = input_with_lifted_df.filter(
    "lifted is not null").filter("start != lifted.start")
display(changed_with_lifted_df)

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC Now apply the `lift_over_variants` transformer with the following options.
# MAGIC - `chain_file`: `string`
# MAGIC - `reference_file`: `string`
# MAGIC - `min_match_ratio`: `double` (optional, defaults to `0.95`; see the sketch after the next cell)

# COMMAND ----------

output_df = glow.transform('lift_over_variants',
                           input_df,
                           chain_file=chain_file,
                           reference_file=reference_file)
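
# COMMAND ----------

# MAGIC %md
# MAGIC As a sketch, the optional `min_match_ratio` can also be passed explicitly; it is shown here at its default value of `0.95`.

# COMMAND ----------

output_df_strict = glow.transform('lift_over_variants',
                                  input_df,
                                  chain_file=chain_file,
                                  reference_file=reference_file,
                                  min_match_ratio=0.95)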

# COMMAND ----------

# DBTITLE 1,View the rows for which liftover succeeded
lifted_df = output_df.filter('liftOverStatus.success = true').drop(
    'liftOverStatus')
display(
    lifted_df.select('contigName', 'start', 'end', 'referenceAllele',
                     'alternateAlleles', 'INFO_AC', 'INFO_SwappedAlleles',
                     'INFO_ReverseComplementedAlleles'))
Example #5
# Databricks notebook source
import pyspark.sql.functions as fx
from pyspark.sql.types import *
import glow
spark = glow.register(spark)
import json

# COMMAND ----------

# DBTITLE 1,Use the text input and output formatters
df = spark.createDataFrame([["foo"], ["bar"], ["baz"]], ["text"])
display(
    glow.transform('pipe',
                   df,
                   cmd=['rev'],
                   input_formatter='text',
                   output_formatter='text'))

# COMMAND ----------

# DBTITLE 1,Read 1kg chr22
df = spark.read.format("vcf").option(
    "flattenInfoFields",
    False).load("/databricks-datasets/genomics/1kg-vcfs/*.vcf.gz")
df = spark.createDataFrame(df.take(1000), df.schema).cache()

# COMMAND ----------

# DBTITLE 1,Use grep to drop INFO lines from VCF header
transformed_df = glow.transform('pipe',
                                df,
                                cmd=['grep', '-v', '#INFO'],  # reconstructed from the cell title; the original snippet is truncated here
                                input_formatter='vcf',
                                in_vcf_header='infer',
                                output_formatter='vcf')
Example #6
lmm_udf = fx.pandas_udf(lmm, returnType=DoubleType())

# COMMAND ----------

# DBTITLE 1,Prepare the input DataFrame
"""
Read in 1000genomes phase 3 chr 22 and split multiallelic sites to biallelic.

Add the phenotypes by cross joining with the genomic DataFrame.

The input to the lmm is the genotype represented as the number of alt alleles (0, 1, or 2).
In this example, we remove all sites where some samples are missing (as represented by -1).
"""

df = glow.transform( \
         "split_multiallelics", \
         spark.read.format("vcf").load("/databricks-datasets/genomics/1kg-vcfs/*chr22*.vcf.gz") \
     ) \
     .crossJoin(spark.read.format("parquet").load("/databricks-datasets/genomics/1000G/phenotypes.normalized/")) \
     .withColumn('genotype_states', fx.expr("genotype_states(genotypes)")) \
     .where(~fx.array_contains(fx.col('genotype_states'), -1))

# COMMAND ----------

# DBTITLE 1,Run the UDF and display results
by_pvalue = df.limit(1000).select("contigName", "start", "names", lmm_udf(df['genotype_states'], df['values']).alias("pValue"))\
  .na.drop(subset=["pValue"])\
  .orderBy("pValue", ascending=True)

display(by_pvalue)
Example #7
sample_ids = glow.wgr.get_sample_ids(base_variant_df)

# COMMAND ----------

# MAGIC %md
# MAGIC To prepare the data for analysis, we perform the following transformations:
# MAGIC - Split multiallelic variants with the `split_multiallelics` transformer.
# MAGIC - Calculate the number of alternate alleles for biallelic variants with `genotype_states`.
# MAGIC - Replace any missing values with the mean of the non-missing values using `mean_substitute`.
# MAGIC - Filter out monomorphic sites, i.e. sites where every sample has the same genotype value.

# COMMAND ----------

variant_df = (glow.transform(
    'split_multiallelics', base_variant_df).withColumn(
        'values',
        glow.mean_substitute(glow.genotype_states('genotypes'))).filter(
            fx.size(fx.array_distinct('values')) > 1))

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC Create the beginning block genotype matrix and sample block ID mapping with `glow.wgr.block_variants_and_samples`.
# MAGIC
# MAGIC Write the block matrix to Delta and the sample blocks to a JSON file so that we can reuse them for multiple phenotype batches (a write sketch follows the next cell).

# COMMAND ----------

block_df, sample_blocks = glow.wgr.block_variants_and_samples(
    variant_df, sample_ids, variants_per_block, sample_block_count)
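
# COMMAND ----------

# MAGIC %md
# MAGIC A minimal sketch of the write step described above; both output paths are hypothetical.

# COMMAND ----------

import json

block_matrix_path = 'dbfs:/tmp/wgr/block_matrix.delta'  # hypothetical path
sample_blocks_path = '/dbfs/tmp/wgr/sample_blocks.json'  # hypothetical path

block_df.write.format('delta').mode('overwrite').save(block_matrix_path)
with open(sample_blocks_path, 'w') as f:
    json.dump(sample_blocks, f)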
Example #8
# Databricks notebook source
# DBTITLE 1,Define path variables
import glow
spark = glow.register(spark)
vcf_path = '/databricks-datasets/genomics/variant-splitting/01_IN_altered_multiallelic.vcf'

# COMMAND ----------

# DBTITLE 1,Load a VCF into a DataFrame
original_variants_df = (spark.read.format("vcf").option(
    "includeSampleIds", False).option("flattenInfoFields",
                                      True).load(vcf_path))

# COMMAND ----------

# DBTITLE 1,Display
display(original_variants_df)

# COMMAND ----------

# DBTITLE 1,Split multi-allelic variants
spark.conf.set(
    "spark.sql.codegen.wholeStage", False
)  # the split_multiallelics transformer runs faster with whole-stage code generation disabled

split_variants_df = glow.transform("split_multiallelics", original_variants_df)

display(split_variants_df)
Example #9
# COMMAND ----------

vcf.printSchema()

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC #### split multiallelic events

# COMMAND ----------

spark.conf.set(
    "spark.sql.codegen.wholeStage", False
)  # the split_multiallelics transformer runs faster with whole-stage code generation disabled
split_vcf = glow.transform("split_multiallelics", vcf)

# COMMAND ----------

split_vcf.show()

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC #### normalize variants
# MAGIC
# MAGIC This is an important quality control / sanity check when ingesting VCFs,
# MAGIC and it is always necessary after multiallelic variants are split to biallelics.
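
# COMMAND ----------

# MAGIC %md
# MAGIC As a sketch, normalization uses the `normalize_variants` transformer; `ref_genome_path` is assumed to point to the reference FASTA.

# COMMAND ----------

normalized_vcf = glow.transform("normalize_variants",
                                split_vcf,
                                reference_genome_path=ref_genome_path)  # ref_genome_path is a hypothetical variable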
Example #10
#    --plugin dbscSNV
#    --custom ${GNOMAD_VCF},Gnomad_2.1.1,vcf,overlap
#    --plugin AncestralAllele

print(vep_cmd)
# -

df = (
    spark
    .read
    .option("flattenInfoFields", False)
    .format('vcf')
    .load(INPUT_VCF)
)

df = glow.transform("split_multiallelics", df)

df.printSchema()

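# Build a human-readable variant name; 'start' is 0-based in Glow, so start + 1 gives the 1-based position.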
df = df.withColumn("names", f.array([f.concat(
    f.col('contigName'),
    f.lit(":"),
    f.col('start') + 1,
    f.lit(":"),
    f.col('referenceAllele'),
    f.lit(">"),
    f.col('alternateAlleles')[0]
)]))

df.limit(10).toPandas()
Example #11
import glow
import pytest
from pyspark.sql.utils import IllegalArgumentException

def test_no_transform(spark):
    df = spark.read.format("vcf") \
        .load("test-data/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.vcf")
    with pytest.raises(IllegalArgumentException):
        glow.transform("dne", df)
Example #12
original_variants_df = (spark.read
  .format("vcf")
  .option("includeSampleIds", False)
  .load(vcf_path))

# COMMAND ----------

# DBTITLE 1,Display
display(original_variants_df)

# COMMAND ----------

# DBTITLE 1,Normalize variants using normalize_variants transformer with column replacement
normalized_variants_df = glow.transform(
  "normalize_variants",
  original_variants_df,
  reference_genome_path=ref_genome_path
)

display(normalized_variants_df)

# COMMAND ----------

# DBTITLE 1,Normalize variants using normalize_variants transformer without column replacement
normalized_variants_df = glow.transform(
  "normalize_variants",
  original_variants_df,
  reference_genome_path=ref_genome_path,
  replace_columns=False
)
Example #13
# MAGIC write genotype data into Delta Lake, a high-performance big data store with ACID semantics (a write sketch appears after the split step below).
# MAGIC Delta Lake organizes, indexes and compresses data, allowing for performant and reliable computation on genomics data as it grows over time.

# COMMAND ----------

vcf_view_unsplit = (spark.read.format("vcf")
   .option("flattenInfoFields", "false")
   .load(vcf_path))

# COMMAND ----------

# MAGIC %md Split multiallelic variants to biallelics

# COMMAND ----------

vcf_view = glow.transform("split_multiallelics", vcf_view_unsplit)

# COMMAND ----------

display(vcf_view.withColumn("genotypes", fx.col("genotypes")[0]))
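
# COMMAND ----------

# MAGIC %md
# MAGIC A minimal sketch of the Delta Lake write described above; the output path is hypothetical.

# COMMAND ----------

vcf_view.write.format("delta").mode("overwrite").save("dbfs:/tmp/genomics/genotypes.delta")  # hypothetical path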

# COMMAND ----------

# MAGIC %md
# MAGIC ##### Note: we compute variant-wise summary stats and Hardy-Weinberg equilibrium P values using `call_summary_stats` & `hardy_weinberg`, which are built into Glow

# COMMAND ----------

(vcf_view
  .select(
    fx.expr("*"),