Example #1
def test_transform(spark):
    df = spark.read.format("vcf")\
        .load("test-data/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.vcf")
    converted = glow.transform("pipe",
                               df,
                               input_formatter="vcf",
                               output_formatter="vcf",
                               cmd='["cat"]',
                               in_vcf_header="infer")
    assert converted.count() == 1075
Example #2
def test_arg_map(spark):
    df = spark.read.format("vcf") \
        .load("test-data/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.vcf")
    args = {
        "inputFormatter": "vcf",
        "outputFormatter": "vcf",
        "cmd": '["cat"]',
        "in_vcfHeader": "infer"
    }
    converted = glow.transform("pipe", df, args)
    assert converted.count() == 1075
Example #3
    def _transform(
        self,
        input_df: pyspark.sql.DataFrame,
        contig: pyspark.sql.column.Column,
        start: pyspark.sql.column.Column,
        end: pyspark.sql.column.Column,
        ref: pyspark.sql.column.Column,
        alt: pyspark.sql.column.Column,
        id: pyspark.sql.column.Column,
    ):
        """
        Runs Ensembl VEP on a Spark DataFrame with VEP.
        The DataFrame needs to provide the following fields:
            - "contigName"
            - "start"
            - "end
            - "referenceAllele"
            - "alternateAlleles"

        Args:
            df: Spark DataFrame with contigNamem start, end, ref and alt
            contig: contig name column
            start: variant position column
            ref: reference allele column
            alt: array of alternate alleles column
            id: array of id's

        Returns:
            Spark DataFrame with single column `text` that contains json-formatted VEP output as string
        """
        import json
        import glow
        input_df = input_df.select([
            contig,
            start,
            end,
            id,
            ref,
            alt,
        ])

        vep_transformed_df = glow.transform(
            "pipe",
            input_df,
            cmd=json.dumps(self.call_args),
            inputFormatter='vcf',
            inVcfHeader='infer',
            outputFormatter='text',
        )

        return vep_transformed_df
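
Because the pipe transformer returns VEP output as raw JSON strings in a single `text` column, a typical follow-up step is to parse that column. The sketch below is illustrative only: `vep_schema` is a hypothetical subset of VEP's output fields, not the full schema.

# Illustrative parsing step; vep_schema is a hypothetical subset of VEP's JSON output.
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, LongType

vep_schema = StructType([
    StructField("input", StringType()),
    StructField("most_severe_consequence", StringType()),
    StructField("start", LongType()),
])

# Parse each JSON line into columns.
parsed_df = vep_transformed_df.select(F.from_json("text", vep_schema).alias("vep")).select("vep.*")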
Example #4
# DBTITLE 1,Filter rows for which liftover succeeded and see which rows changed.
changed_with_lifted_df = input_with_lifted_df.filter(
    "lifted is not null").filter("start != lifted.start")
display(changed_with_lifted_df)

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC Now apply the `lift_over_variants` transformer with the following options.
# MAGIC - `chain_file`: `string`
# MAGIC - `reference_file`: `string`
# MAGIC - `min_match_ratio`: `double` (optional, defaults to `0.95`; see the sketch after the next cell)

# COMMAND ----------

output_df = glow.transform('lift_over_variants',
                           input_df,
                           chain_file=chain_file,
                           reference_file=reference_file)
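
# COMMAND ----------

# MAGIC %md
# MAGIC As a sketch, the optional `min_match_ratio` can also be passed explicitly; it is shown here at its default value of `0.95`.

# COMMAND ----------

output_df_strict = glow.transform('lift_over_variants',
                                  input_df,
                                  chain_file=chain_file,
                                  reference_file=reference_file,
                                  min_match_ratio=0.95)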

# COMMAND ----------

# DBTITLE 1,View the rows for which liftover succeeded
lifted_df = output_df.filter('liftOverStatus.success = true').drop(
    'liftOverStatus')
display(
    lifted_df.select('contigName', 'start', 'end', 'referenceAllele',
                     'alternateAlleles', 'INFO_AC', 'INFO_SwappedAlleles',
                     'INFO_ReverseComplementedAlleles'))
Example #5
# Databricks notebook source
import pyspark.sql.functions as fx
from pyspark.sql.types import *
import glow
spark = glow.register(spark)
import json

# COMMAND ----------

# DBTITLE 1,Use the text input and output formatters
df = spark.createDataFrame([["foo"], ["bar"], ["baz"]], ["text"])
display(
    glow.transform('pipe',
                   df,
                   cmd=['rev'],
                   input_formatter='text',
                   output_formatter='text'))

# COMMAND ----------

# DBTITLE 1,Read 1kg chr22
df = spark.read.format("vcf").option(
    "flattenInfoFields",
    False).load("/databricks-datasets/genomics/1kg-vcfs/*.vcf.gz")
df = spark.createDataFrame(df.take(1000), df.schema).cache()

# COMMAND ----------

# DBTITLE 1,Use grep to drop INFO lines from VCF header
transformed_df = glow.transform('pipe',
                                df,
                                cmd=['grep', '-v', '#INFO'],  # reconstructed from the cell title; the original snippet is truncated here
                                input_formatter='vcf',
                                in_vcf_header='infer',
                                output_formatter='vcf')
Example #6
lmm_udf = fx.pandas_udf(lmm, returnType=DoubleType())

# COMMAND ----------

# DBTITLE 1,Prepare the input DataFrame
"""
Read in 1000genomes phase 3 chr 22 and split multiallelic sites to biallelic.

Add the phenotypes by cross joining with the genomic DataFrame.

The input to the lmm is the genotype represented as the number of alt alleles (0, 1, or 2).
In this example, we remove all sites where some samples are missing (as represented by -1).
"""

df = glow.transform( \
         "split_multiallelics", \
         spark.read.format("vcf").load("/databricks-datasets/genomics/1kg-vcfs/*chr22*.vcf.gz") \
     ) \
     .crossJoin(spark.read.format("parquet").load("/databricks-datasets/genomics/1000G/phenotypes.normalized/")) \
     .withColumn('genotype_states', fx.expr("genotype_states(genotypes)")) \
     .where(~fx.array_contains(fx.col('genotype_states'), -1))

# COMMAND ----------

# DBTITLE 1,Run the UDF and display results
by_pvalue = df.limit(1000).select("contigName", "start", "names", lmm_udf(df['genotype_states'], df['values']).alias("pValue"))\
  .na.drop(subset=["pValue"])\
  .orderBy("pValue", ascending=True)

display(by_pvalue)
Example #7
sample_ids = glow.wgr.get_sample_ids(base_variant_df)

# COMMAND ----------

# MAGIC %md
# MAGIC To prepare the data for analysis, we perform the following transformations:
# MAGIC - Split multiallelic variants with the `split_multiallelics` transformer.
# MAGIC - Calculate the number of alternate alleles for biallelic variants with `genotype_states`.
# MAGIC - Replace any missing values with the mean of the non-missing values using `mean_substitute`.
# MAGIC - Filter out monomorphic sites, i.e. sites where every sample has the same genotype value.

# COMMAND ----------

variant_df = (glow.transform(
    'split_multiallelics', base_variant_df).withColumn(
        'values',
        glow.mean_substitute(glow.genotype_states('genotypes'))).filter(
            fx.size(fx.array_distinct('values')) > 1))

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC Create the beginning block genotype matrix and sample block ID mapping with `glow.wgr.block_variants_and_samples`.
# MAGIC
# MAGIC Write the block matrix to Delta and the sample blocks to a JSON file so that we can reuse them for multiple phenotype batches (a write sketch follows the next cell).

# COMMAND ----------

block_df, sample_blocks = glow.wgr.block_variants_and_samples(
    variant_df, sample_ids, variants_per_block, sample_block_count)
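
# COMMAND ----------

# MAGIC %md
# MAGIC A minimal sketch of the write step described above; both output paths are hypothetical.

# COMMAND ----------

import json

block_matrix_path = 'dbfs:/tmp/wgr/block_matrix.delta'  # hypothetical path
sample_blocks_path = '/dbfs/tmp/wgr/sample_blocks.json'  # hypothetical path

block_df.write.format('delta').mode('overwrite').save(block_matrix_path)
with open(sample_blocks_path, 'w') as f:
    json.dump(sample_blocks, f)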
Example #8
# Databricks notebook source
# DBTITLE 1,Define path variables
import glow
spark = glow.register(spark)
vcf_path = '/databricks-datasets/genomics/variant-splitting/01_IN_altered_multiallelic.vcf'

# COMMAND ----------

# DBTITLE 1,Load a VCF into a DataFrame
original_variants_df = (spark.read.format("vcf").option(
    "includeSampleIds", False).option("flattenInfoFields",
                                      True).load(vcf_path))

# COMMAND ----------

# DBTITLE 1,Display
display(original_variants_df)

# COMMAND ----------

# DBTITLE 1,Split multi-allelic variants
spark.conf.set(
    "spark.sql.codegen.wholeStage", False
)  # the split_multiallelics transformer runs faster with whole-stage code generation disabled

split_variants_df = glow.transform("split_multiallelics", original_variants_df)

display(split_variants_df)
Example #9
# COMMAND ----------

vcf.printSchema()

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC #### split multiallelic events

# COMMAND ----------

spark.conf.set(
    "spark.sql.codegen.wholeStage", False
)  # the split_multiallelics transformer runs faster with whole-stage code generation disabled
split_vcf = glow.transform("split_multiallelics", vcf)

# COMMAND ----------

split_vcf.show()

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC #### normalize variants
# MAGIC
# MAGIC This is an important quality control / sanity check when ingesting VCFs,
# MAGIC and it is always necessary after multiallelic variants are split to biallelics.
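
# COMMAND ----------

# MAGIC %md
# MAGIC As a sketch, normalization uses the `normalize_variants` transformer; `ref_genome_path` is assumed to point to the reference FASTA.

# COMMAND ----------

normalized_vcf = glow.transform("normalize_variants",
                                split_vcf,
                                reference_genome_path=ref_genome_path)  # ref_genome_path is a hypothetical variable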
Example #10
#    --plugin dbscSNV
#    --custom ${GNOMAD_VCF},Gnomad_2.1.1,vcf,overlap
#    --plugin AncestralAllele

print(vep_cmd)
# -

df = (
    spark
    .read
    .option("flattenInfoFields", False)
    .format('vcf')
    .load(INPUT_VCF)
)

df = glow.transform("split_multiallelics", df)

df.printSchema()

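# Build a human-readable variant name; 'start' is 0-based in Glow, so start + 1 gives the 1-based position.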
df = df.withColumn("names", f.array([f.concat(
    f.col('contigName'),
    f.lit(":"),
    f.col('start') + 1,
    f.lit(":"),
    f.col('referenceAllele'),
    f.lit(">"),
    f.col('alternateAlleles')[0]
)]))

df.limit(10).toPandas()
Example #11
import glow
import pytest
from pyspark.sql.utils import IllegalArgumentException

def test_no_transform(spark):
    df = spark.read.format("vcf") \
        .load("test-data/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.vcf")
    with pytest.raises(IllegalArgumentException):
        glow.transform("dne", df)
Example #12
original_variants_df = (spark.read
  .format("vcf")
  .option("includeSampleIds", False)
  .load(vcf_path))

# COMMAND ----------

# DBTITLE 1,Display
display(original_variants_df)

# COMMAND ----------

# DBTITLE 1,Normalize variants using normalize_variants transformer with column replacement
normalized_variants_df = glow.transform(
  "normalize_variants",
  original_variants_df,
  reference_genome_path=ref_genome_path
)

display(normalized_variants_df)

# COMMAND ----------

# DBTITLE 1,Normalize variants using normalize_variants transformer without column replacement
normalized_variants_df = glow.transform(
  "normalize_variants",
  original_variants_df,
  reference_genome_path=ref_genome_path,
  replace_columns=False
)
Example #13
# MAGIC write genotype data into Delta Lake, a high-performance big data store with ACID semantics (a write sketch appears after the split step below).
# MAGIC Delta Lake organizes, indexes and compresses data, allowing for performant and reliable computation on genomics data as it grows over time.

# COMMAND ----------

vcf_view_unsplit = (spark.read.format("vcf")
   .option("flattenInfoFields", "false")
   .load(vcf_path))

# COMMAND ----------

# MAGIC %md Split multiallelic variants to biallelics

# COMMAND ----------

vcf_view = glow.transform("split_multiallelics", vcf_view_unsplit)

# COMMAND ----------

display(vcf_view.withColumn("genotypes", fx.col("genotypes")[0]))
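
# COMMAND ----------

# MAGIC %md
# MAGIC A minimal sketch of the Delta Lake write described above; the output path is hypothetical.

# COMMAND ----------

vcf_view.write.format("delta").mode("overwrite").save("dbfs:/tmp/genomics/genotypes.delta")  # hypothetical path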

# COMMAND ----------

# MAGIC %md
# MAGIC ##### Note: we compute variant-wise summary stats and Hardy-Weinberg equilibrium P values using `call_summary_stats` & `hardy_weinberg`, which are built into Glow

# COMMAND ----------

(vcf_view
  .select(
    fx.expr("*"),