Example #1

A PySpark job that flattens a genotype-phenotype Parquet dataset into one JSON document of significant phenotypes per gene. The functions it uses (explode, struct, col, zip_with, collect_set) all come from pyspark.sql.functions; to_camel_case is a project-level helper (sketched after the snippet).
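The key function here is pyspark.sql.functions.zip_with, which merges two array columns element by element. A tiny standalone illustration on toy data (not part of the original job):

from pyspark.sql import SparkSession
from pyspark.sql.functions import struct, zip_with

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(["MP:0001", "MP:0002"], ["term one", "term two"])], ["ids", "names"]
)
# Pair the two arrays up into a single array of {id, name} structs.
df.select(
    zip_with(
        "ids", "names", lambda x, y: struct(x.alias("id"), y.alias("name"))
    ).alias("terms")
).show(truncate=False)
# terms: [{MP:0001, term one}, {MP:0002, term two}]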
    def main(self, sc: SparkContext, *args):
        """
        Takes in a SparkContext and the list of arguments generated by `app_options` and executes the PySpark job.
        """
        spark = SparkSession(sc)

        # Parsing app options
        gp_parquet_path = args[0]
        output_path = args[1]

        gp_df = spark.read.parquet(gp_parquet_path)
        explode_cols = ["procedure_stable_id", "procedure_name", "project_name"]

        for col_name in explode_cols:
            gp_df = gp_df.withColumn(col_name, explode(col_name))
        gp_df = gp_df.select(
            "marker_accession_id",
            "pipeline_stable_id",
            "procedure_stable_id",
            "procedure_name",
            "parameter_stable_id",
            "parameter_name",
            "allele_accession_id",
            "allele_name",
            "allele_symbol",
            "zygosity",
            "phenotyping_center",
            "sex",
            "project_name",
            "p_value",
            "life_stage_name",
            "effect_size",
            "mp_term_id",
            "mp_term_name",
            "top_level_mp_term_id",
            "top_level_mp_term_name",
        )

        gp_df = gp_df.withColumn(
            "phenotype",
            struct(col("mp_term_id").alias("id"), col("mp_term_name").alias("name")),
        )

        gp_df = gp_df.withColumn(
            "topLevelPhenotype",
            zip_with(
                "top_level_mp_term_id",
                "top_level_mp_term_name",
                lambda x, y: struct(x.alias("id"), y.alias("name")),
            ),
        )

        gp_df = gp_df.drop(
            "mp_term_id",
            "mp_term_name",
            "top_level_mp_term_id",
            "top_level_mp_term_name",
        )

        gp_df = gp_df.withColumnRenamed("marker_accession_id", "geneAccessionId")
        gp_df = gp_df.withColumn("id", col("geneAccessionId"))

        for col_name in gp_df.columns:
            gp_df = gp_df.withColumnRenamed(col_name, to_camel_case(col_name))

        gp_df = gp_df.groupBy("id").agg(
            collect_set(
                struct(*[col_name for col_name in gp_df.columns if col_name != "id"])
            ).alias("significantPhenotypes")
        )

        gp_df.write.partitionBy("id").json(output_path)
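The rename loop above relies on a to_camel_case helper that the snippet does not include. A minimal sketch of what such a helper could look like (hypothetical; the project's actual implementation may differ):

def to_camel_case(snake_str: str) -> str:
    # e.g. "marker_accession_id" -> "markerAccessionId";
    # names without underscores pass through unchanged.
    first, *rest = snake_str.split("_")
    return first + "".join(part.capitalize() for part in rest)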
Example #2

The element-wise equality operator from pandas-on-Spark's data-type ops (Column here is pyspark.sql.Column and F is pyspark.sql.functions, both imported at module level). When the right operand is a Python list or tuple, the column is collected into an ordered array, compared pairwise against the literals with F.zip_with, and exploded back into a boolean Series; otherwise it falls back to a plain column comparison.
    def eq(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
        if isinstance(right, (list, tuple)):
            from pyspark.pandas.series import first_series, scol_for
            from pyspark.pandas.frame import DataFrame
            from pyspark.pandas.internal import NATURAL_ORDER_COLUMN_NAME, InternalField

            len_right = len(right)
            if len(left) != len_right:
                raise ValueError("Lengths must be equal")

            sdf = left._internal.spark_frame
            structed_scol = F.struct(
                sdf[NATURAL_ORDER_COLUMN_NAME],
                *left._internal.index_spark_columns,
                left.spark.column,
            )
            # The size of the list is expected to be small.
            collected_structed_scol = F.collect_list(structed_scol)
            # Sort the array by NATURAL_ORDER_COLUMN so that we can guarantee the order.
            collected_structed_scol = F.array_sort(collected_structed_scol)
            right_values_scol = F.array(*(F.lit(x) for x in right))
            index_scol_names = left._internal.index_spark_column_names
            scol_name = left._internal.spark_column_name_for(
                left._internal.column_labels[0])
            # Compare the values of left and right by using zip_with function.
            cond = F.zip_with(
                collected_structed_scol,
                right_values_scol,
                lambda x, y: F.struct(
                    *[
                        x[index_scol_name].alias(index_scol_name)
                        for index_scol_name in index_scol_names
                    ],
                    F.when(x[scol_name].isNull() | y.isNull(), False)
                    .otherwise(x[scol_name] == y)
                    .alias(scol_name),
                ),
            ).alias(scol_name)
            # 1. `sdf_new` here looks like the below (the first field of each set is Index):
            # +----------------------------------------------------------+
            # |0                                                         |
            # +----------------------------------------------------------+
            # |[{0, false}, {1, true}, {2, false}, {3, true}, {4, false}]|
            # +----------------------------------------------------------+
            sdf_new = sdf.select(cond)
            # 2. `sdf_new` after the explode looks like the below:
            # +----------+
            # |       col|
            # +----------+
            # |{0, false}|
            # | {1, true}|
            # |{2, false}|
            # | {3, true}|
            # |{4, false}|
            # +----------+
            sdf_new = sdf_new.select(F.explode(scol_name))
            # 3. Here, the final `sdf_new` looks like the below:
            # +-----------------+-----+
            # |__index_level_0__|    0|
            # +-----------------+-----+
            # |                0|false|
            # |                1| true|
            # |                2|false|
            # |                3| true|
            # |                4|false|
            # +-----------------+-----+
            sdf_new = sdf_new.select("col.*")

            index_spark_columns = [
                scol_for(sdf_new, index_scol_name)
                for index_scol_name in index_scol_names
            ]
            data_spark_columns = [scol_for(sdf_new, scol_name)]

            internal = left._internal.copy(
                spark_frame=sdf_new,
                index_spark_columns=index_spark_columns,
                data_spark_columns=data_spark_columns,
                index_fields=[
                    InternalField.from_struct_field(index_field)
                    for index_field in sdf_new.select(
                        index_spark_columns).schema.fields
                ],
                data_fields=[
                    InternalField.from_struct_field(
                        sdf_new.select(data_spark_columns).schema.fields[0])
                ],
            )
            return first_series(DataFrame(internal))
        else:
            from pyspark.pandas.base import column_op

            return column_op(Column.__eq__)(left, right)
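From the user's side, this branch is what runs when a pandas-on-Spark Series is compared against a plain Python list of the same length. A small usage sketch, assuming an active Spark session:

import pyspark.pandas as ps

psser = ps.Series([1, 2, 3, 4, 5])
# Comparing against a list takes the collect_list / zip_with / explode
# path above and returns an element-wise boolean Series.
print((psser == [1, 0, 3, 0, 5]).tolist())  # [True, False, True, False, True]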
Example #3

A PySpark job that joins images, products, statistical result ids, tested parameters, and phenotype associations onto a gene core dataset, then derives a gene search index; the search collection is written to MongoDB and the full bundle to Parquet.
    def main(self, sc: SparkContext, *args: Any):
        # Drop statistical results from the gene bundle
        # Create an experimental data collection with the observations
        observations_parquet_path = args[0]
        genotype_phenotype_parquet_path = args[1]
        impc_images_parquet_path = args[2]
        product_parquet_path = args[3]

        stats_results_parquet_path = args[4]
        stats_results_raw_data_parquet_path = f"{stats_results_parquet_path}_raw_data"
        gene_core_parquet_path = args[5]
        output_path = args[6]
        spark = SparkSession(sc)

        observations_df = spark.read.parquet(observations_parquet_path)
        genotype_phenotype_df = spark.read.parquet(
            genotype_phenotype_parquet_path)
        impc_images_df = spark.read.parquet(impc_images_parquet_path)
        product_df = spark.read.parquet(product_parquet_path)
        gene_df: DataFrame = spark.read.parquet(gene_core_parquet_path)
        gene_df = gene_df.drop("datasets_raw_data")
        stats_results_df = spark.read.parquet(stats_results_parquet_path)

        impc_images_df = impc_images_df.withColumnRenamed(
            "gene_accession_id", "mgi_accession_id")

        images_by_gene_df = impc_images_df.groupBy("mgi_accession_id").agg(
            collect_set(
                struct(*[
                    col_name for col_name in impc_images_df.columns
                    if col_name != "mgi_accession_id"
                ])).alias("gene_images"))
        gene_df = gene_df.join(images_by_gene_df, "mgi_accession_id",
                               "left_outer")

        products_by_gene = product_df.groupBy("mgi_accession_id").agg(
            collect_set(
                struct(*[
                    col_name for col_name in product_df.columns
                    if col_name not in ["mgi_accession_id"] +
                    EXCLUDE_PRODUCT_COLUMNS
                ])).alias("gene_products"))
        gene_df = gene_df.join(products_by_gene, "mgi_accession_id",
                               "left_outer")

        stats_results_by_gene = stats_results_df.groupBy(
            "marker_accession_id").agg(
                collect_set("doc_id").alias("statistical_result_ids"))
        gene_df = gene_df.join(
            stats_results_by_gene,
            col("mgi_accession_id") == col("marker_accession_id"),
            "left_outer",
        )

        parameters_by_gene = observations_df.select(
            "gene_accession_id",
            "pipeline_stable_id",
            "pipeline_name",
            "procedure_stable_id",
            "procedure_name",
            "parameter_stable_id",
            "parameter_name",
        ).distinct()

        parameters_by_gene = parameters_by_gene.groupBy(
            "gene_accession_id").agg(
                collect_set(
                    struct(
                        "pipeline_stable_id",
                        "pipeline_name",
                        "procedure_stable_id",
                        "procedure_name",
                        "parameter_stable_id",
                        "parameter_name",
                    )).alias("tested_parameters"))
        parameters_by_gene = parameters_by_gene.withColumnRenamed(
            "gene_accession_id", "mgi_accession_id")
        gene_df = gene_df.join(parameters_by_gene, "mgi_accession_id",
                               "left_outer")

        gene_df = gene_df.withColumn("_id", col("mgi_accession_id"))
        genotype_phenotype_df = genotype_phenotype_df.withColumnRenamed(
            "marker_accession_id", "mgi_accession_id")
        gp_by_gene_df = genotype_phenotype_df.groupBy("mgi_accession_id").agg(
            collect_set(
                struct(*[
                    col_name for col_name in genotype_phenotype_df.columns
                    if col_name != "mgi_accession_id"
                ])).alias("gene_phenotype_associations"))

        gene_vs_phenotypes_df = gene_df.join(gp_by_gene_df, "mgi_accession_id",
                                             "left_outer")
        # self.write_to_mongo(
        #     gene_vs_phenotypes_df,
        #     "org.mousephenotype.api.models.GeneBundle",
        #     "gene_bundles",
        # )

        # Create search_index
        gp_mp_term_structured = genotype_phenotype_df.withColumn(
            "significant_mp_term",
            struct(
                "mp_term_id",
                "mp_term_name",
                zip_with(
                    "intermediate_mp_term_id",
                    "intermediate_mp_term_name",
                    lambda x, y: struct(x.alias("mp_term_id"),
                                        y.alias("mp_term_name")),
                ).alias("intermediate_ancestors"),
                zip_with(
                    "top_level_mp_term_id",
                    "top_level_mp_term_name",
                    lambda x, y: struct(x.alias("mp_term_id"),
                                        y.alias("mp_term_name")),
                ).alias("top_level_ancestors"),
            ).alias("significant_mp_term"),
        )
        gp_mp_term_structured = gp_mp_term_structured.select(
            "mgi_accession_id", "significant_mp_term")
        gp_mp_term_structured_gene_df = gp_mp_term_structured.groupBy(
            "mgi_accession_id").agg(
                collect_set("significant_mp_term").alias(
                    "significant_mp_terms"))

        gene_search_df = gene_df.join(gp_mp_term_structured_gene_df,
                                      "mgi_accession_id", "left_outer")
        gene_search_df = gene_search_df.select(
            col("mgi_accession_id").alias("_id"),
            "mgi_accession_id",
            "marker_name",
            "human_gene_symbol",
            "marker_synonym",
            "assignment_status",
            "crispr_allele_production_status",
            "es_cell_production_status",
            "mouse_production_status",
            "phenotype_status",
            "phenotyping_data_available",
            "tested_parameters",
            col("significant_top_level_mp_terms").alias(
                "significant_phenotype_system"),
            col("not_significant_top_level_mp_terms").alias(
                "non_significant_phenotype_system"),
            "significant_mp_terms",
        )
        self.write_to_mongo(
            gene_search_df,
            "org.mousephenotype.api.models.Gene",
            "gene_search",
        )
        # self.write_to_mongo(
        #     observations_df,
        #     "org.mousephenotype.api.models.Observation",
        #     "experimental_data",
        # )
        stats_results_df = stats_results_df.withColumnRenamed(
            "doc_id", "statistical_result_id")
        stats_results_df = stats_results_df.withColumn(
            "_id", col("statistical_result_id"))
        # self.write_to_mongo(
        #     stats_results_df,
        #     "org.mousephenotype.api.models.StatisticalResult",
        #     "statistical_results",
        # )
        gene_vs_phenotypes_df.write.parquet(output_path)
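The write_to_mongo helper invoked above is defined elsewhere in the job's base class and is not shown here. A rough, hypothetical sketch of what it might do, assuming the MongoDB Spark connector is on the classpath and the connection URI comes from Spark configuration (the real helper may differ):

from pyspark.sql import DataFrame
from pyspark.sql.functions import lit

def write_to_mongo(df: DataFrame, class_name: str, collection_name: str) -> None:
    # Spring Data style documents carry a "_class" discriminator field,
    # which would explain the Java class names passed in above.
    df.withColumn("_class", lit(class_name)).write.format(
        "com.mongodb.spark.sql.DefaultSource"
    ).mode("overwrite").option("collection", collection_name).save()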