Example #1
def add_glow(doctest_namespace, spark):
    # Register Glow on the existing session and expose common names to the doctest namespace
    glow.register(spark, new_session=False)
    doctest_namespace['Row'] = Row
    doctest_namespace['spark'] = spark
    doctest_namespace['lit'] = functions.lit
    doctest_namespace['col'] = functions.col
    doctest_namespace['glow'] = glow
Example #2
def add_spark(doctest_namespace, spark):
    glow.register(spark)
    doctest_namespace['Row'] = Row
    doctest_namespace['spark'] = spark
    doctest_namespace['lit'] = functions.lit
    doctest_namespace['col'] = functions.col
    doctest_namespace['glow'] = glow
Example #3
def test_register(spark):
    glow.register(spark)
    df = spark.read.format("vcf") \
        .load("test-data/1kg_sample.vcf")
    stats = df.selectExpr("expand_struct(dp_summary_stats(genotypes))") \
            .select("min", "max") \
            .head()
    assert stats.asDict() == Row(min=1.0, max=23).asDict()
Example #4
def test_register(spark):
    glow.register(spark)
    row_one = Row(Row(str_col='foo', int_col=1, bool_col=True))
    row_two = Row(Row(str_col='bar', int_col=2, bool_col=False))
    df = spark.createDataFrame([row_one, row_two], schema=['base_col'])
    added_col_row = df.selectExpr("add_struct_fields(base_col, 'float_col', 3.14, 'rev_str_col', reverse(base_col.str_col)) as added_col") \
                      .filter("added_col.str_col = 'foo'") \
                      .head()
    assert added_col_row.added_col.rev_str_col == 'oof'
Example #5
# MAGIC #!/usr/bin/env bash
# MAGIC rm -r /opt/liftover
# MAGIC mkdir /opt/liftover
# MAGIC curl https://raw.githubusercontent.com/broadinstitute/gatk/master/scripts/funcotator/data_sources/gnomAD/b37ToHg38.over.chain --output /opt/liftover/b37ToHg38.over.chain
# MAGIC ```
# MAGIC In this demo, we perform coordinate and variant liftover from b37 to hg38.
# MAGIC
# MAGIC To perform variant liftover, you must download a reference file to each node of the cluster. Here, we use the FUSE mount to access the reference genome at
# MAGIC ```/dbfs/databricks-datasets/genomics/grch38/data/GRCh38_full_analysis_set_plus_decoy_hla.fa```

# COMMAND ----------

# DBTITLE 1,Import glow and define path variables
import glow

spark = glow.register(spark)
chain_file = '/opt/liftover/b37ToHg38.over.chain'
reference_file = '/dbfs/databricks-datasets/genomics/grch38/data/GRCh38_full_analysis_set_plus_decoy_hla.fa'
vcf_file = 'dbfs:/databricks-datasets/genomics/1kg-vcfs/ALL.chr22.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz'

# COMMAND ----------

# DBTITLE 1,First, read in a VCF from a flat file or Delta Lake table.
input_df = (spark.read.format("vcf").load(vcf_file).limit(1).cache())

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC Now apply the `lift_over_coordinates` UDF with the following parameters (a usage sketch follows this list):
# MAGIC - chromosome (`string`)
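A minimal sketch of applying the registered SQL function to `input_df`, assuming Glow's standard VCF column names (`contigName`, `start`, `end`) and an illustrative minimum-match fraction of 0.99:

from pyspark.sql.functions import expr

# Apply the liftover via a SQL expression; chain_file was defined above.
# The trailing .99 (minimum fraction of bases that must remap) is an assumed value.
liftover_expr = f"lift_over_coordinates(contigName, start, end, '{chain_file}', .99)"
lifted_df = input_df.withColumn('lifted', expr(liftover_expr))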
Example #6
    root = sys.argv[1]
    freeze = sys.argv[2][1:-1] + "/"
    pheno = sys.argv[3]
    covar = sys.argv[4]
    split = sys.argv[5]
    offsets = sys.argv[6]
    jobname = sys.argv[7]
    splitctg = sys.argv[8]
    repart = sys.argv[9]

    spark = SparkSession\
        .builder\
        .appName(jobname)\
        .getOrCreate()

    glow.register(spark, False)  # new_session=False
    spark.udf.registerJavaFunction("chartodoublearray",
                                   "org.gorpipe.spark.udfs.CharToDoubleArray",
                                   ArrayType(DoubleType()))

    rootfreeze = root + freeze

    label_df = pd.read_csv(root + pheno, sep='\t', index_col=0)
    covariate_df = None
    if len(covar) > 0:
        covariates = pd.read_csv(root + covar, sep='\t', index_col=0)
        covariate_df = covariates.fillna(covariates.mean())
        covariate_df = (covariate_df -
                        covariate_df.mean()) / covariate_df.std()
        covariate_df
Example #7
def register_glow(spark):
    glow.register(spark, new_session=False)
Example #8
spark = (
    SparkSession.builder
    .appName('desmi_inject_gnomad')
    .config("spark.jars.packages", ",".join([
        "io.projectglow:glow-spark3_2.12:1.0.0",
    ]))
    .config("spark.local.dir", os.environ.get("TMP"))
    .config("spark.master", f"local[{N_CPU},{MAX_FAILURES}]")
    .config("spark.sql.shuffle.partitions", "2001")
    .config("spark.sql.execution.arrow.enabled", "true")
    .config("spark.driver.maxResultSize", "48G")
    .config("spark.task.maxFailures", MAX_FAILURES)
    .getOrCreate()
)
glow.register(spark)
spark
# -


INPUT_VCF  = snakemake.input["vcf"]
INPUT_VCF

OUTPUT_PQ = snakemake.output["vep"]
OUTPUT_PQ

# +
FASTA=snakemake.input["fasta"]
GTF=snakemake.input["gtf"]

HUMAN_GENOME_VERSION=snakemake.params["human_genome_version"]
Example #9
def spark_session():
    # Build a session with the Glow package on the classpath and Arrow enabled,
    # then register Glow's functions against it
    spark = (
        SparkSession.builder
        .config("spark.jars.packages", "io.projectglow:glow_2.11:0.5.0")
        .config("spark.sql.execution.arrow.pyspark.enabled", "true")
        .getOrCreate()
    )
    glow.register(spark)
    return spark
Example #10
def test_new_session(spark):
    # new_session=False should return the same underlying Java session
    sess = glow.register(spark, new_session=False)
    assert sess._jsparkSession.equals(spark._jsparkSession)

    # new_session=True should return a separate session
    sess = glow.register(spark, new_session=True)
    assert not sess._jsparkSession.equals(spark._jsparkSession)