Example #1
from pyspark.sql import functions as F


def scd_analyze(df, merge_on=None, state_col='_state', updated_col='_updated'):
    """Summarize adds, deletes, updates and per-column change counts per date."""
    add_ids = '##add_ids'
    del_ids = '##del_ids'
    upd_ids = '##upd_ids'

    c = set(df.columns).difference({state_col, updated_col})
    colnames = [x for x in df.columns if x in c]

    on = merge_on or colnames
    on = on if isinstance(on, (list, tuple)) else [on]
    on = [c for c in on if c in colnames]

    s = on + [state_col, updated_col]
    cols = [x for x in df.columns if x not in s]

    # Keys added (state 0) per update date.
    a = df.filter(f'{state_col} = 0') \
        .groupby(updated_col) \
        .agg(F.collect_set(F.concat(*on)).alias(add_ids)) \
        .select(updated_col, add_ids)

    # Keys deleted (state 1) per update date.
    d = df.filter(f'{state_col} = 1') \
        .groupby(updated_col) \
        .agg(F.collect_set(F.concat(*on)).alias(del_ids)) \
        .select(updated_col, del_ids)

    res = a.join(d, on=updated_col, how='outer')
    res = res.select(updated_col,
                     F.coalesce(add_ids, F.array([])).alias(add_ids),
                     F.coalesce(del_ids, F.array([])).alias(del_ids))

    if cols:
        # For every non-key column, count distinct values per key and date;
        # more than one distinct value marks a change.
        agg_funcs = [(F.countDistinct(x) - F.lit(1)).alias(x) for x in cols]
        cnt = df.groupby(*on, updated_col).agg(*agg_funcs)

        agg_names = [F.lit(x) for x in cols]
        agg_sums = [F.sum(x) for x in cols]
        cnt = cnt.groupby(updated_col).agg(
            F.map_from_arrays(F.array(*agg_names),
                              F.array(*agg_sums)).alias('changes'))

        res = res.join(cnt, on=updated_col)
    else:
        res = res.withColumn('changes', F.lit(None))

    # A key both added and deleted on the same date counts as an update.
    res = res.select('*', F.array_intersect(add_ids, del_ids).alias(upd_ids))
    res = res.select(
        F.col(updated_col).alias('updated'),
        F.size(upd_ids).alias('upd'),
        F.size(F.array_except(add_ids, upd_ids)).alias('add'),
        F.size(F.array_except(del_ids, upd_ids)).alias('del'), 'changes')

    return res.orderBy('updated')
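
A minimal smoke test (hypothetical data; assumes an active SparkSession):

# Hypothetical usage of scd_analyze; column names are illustrative.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [
        ('k1', 'a', 0, '2021-01-01'),  # k1 added on day one
        ('k2', 'b', 0, '2021-01-01'),  # k2 added on day one
        ('k2', 'c', 0, '2021-01-02'),  # k2 re-added with a new value ...
        ('k2', 'b', 1, '2021-01-02'),  # ... old row deleted: an update
    ],
    ['id', 'value', '_state', '_updated'],
)
scd_analyze(df, merge_on='id').show()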
Example #2
    def _get_significance_fields_by_gene(self, stats_results_df):
        significant_mp_term = stats_results_df.select(
            "gene_accession_id", "top_level_mp_term_name", "significant"
        )

        significant_mp_term = significant_mp_term.withColumn(
            "top_level_mp_term_name", functions.explode("top_level_mp_term_name")
        )

        # functions.collect_set drops nulls, so the otherwise(lit(None))
        # branches below contribute nothing to either set.
        significant_mp_term = significant_mp_term.groupBy("gene_accession_id").agg(
            functions.collect_set(
                functions.when(
                    functions.col("significant") == True,
                    functions.col("top_level_mp_term_name"),
                ).otherwise(functions.lit(None))
            ).alias("significant_top_level_mp_terms"),
            functions.collect_set(
                functions.when(
                    functions.col("significant") == False,
                    functions.col("top_level_mp_term_name"),
                ).otherwise(functions.lit(None))
            ).alias("not_significant_top_level_mp_terms"),
        )
        # A term significant in any result wins over its appearance in the
        # not-significant set.
        significant_mp_term = significant_mp_term.withColumn(
            "not_significant_top_level_mp_terms",
            functions.array_except(
                "not_significant_top_level_mp_terms", "significant_top_level_mp_terms"
            ),
        )
        significant_mp_term = significant_mp_term.withColumnRenamed(
            "gene_accession_id", "mgi_accession_id"
        )
        return significant_mp_term
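
A hypothetical way to exercise it on a toy frame; the body never touches
self, so it can be called unbound (GeneLoader stands in for the unnamed
enclosing class):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
stats_results_df = spark.createDataFrame(
    [
        ("MGI:1", ["growth/size/body region phenotype"], True),
        ("MGI:1", ["behavior/neurological phenotype"], False),
    ],
    ["gene_accession_id", "top_level_mp_term_name", "significant"],
)
GeneLoader._get_significance_fields_by_gene(None, stats_results_df).show()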
Example #3
    def _init_csv_df(self) -> DataFrame:
        # Placeholder structs (all-empty and all-null) to strip from the
        # per-record field array via array_except below.
        NULL_FIELD_EXCLUDE_ARRAY = F.array(
            F.struct(
                F.lit("").alias("field"),
                F.lit("").alias("value"),
                F.lit("").alias("units"),
            ),
            F.struct(
                F.lit(None).alias("field"),
                F.lit(None).alias("value"),
                F.lit(None).alias("units"),
            ),
        )

        # Convert the FIT file to CSV before reading it back (helper defined
        # elsewhere in this class).
        assert self._process_fit2csv()

        base_df = (self._spark.read.csv(
            f"{self.csv_path}", header=True).withColumnRenamed(
                "Type", "record_type").where(F.col("record_type") == "Data"))
        all_fields = F.array(
            self._get_fit_field_structs(base_df)).alias("all_fields")
        fields = F.array_except(all_fields,
                                NULL_FIELD_EXCLUDE_ARRAY).alias("fields")
        return base_df.select(
            *[F.col(col).alias(col.lower()) for col in RECORD_ATTRS], fields)
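
The exclusion works because array_except compares struct elements as whole
values; a standalone sketch of just that behaviour, with hypothetical data:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.range(1).select(
    F.array(
        F.struct(F.lit("heart_rate").alias("field"),
                 F.lit("142").alias("value"),
                 F.lit("bpm").alias("units")),
        F.struct(F.lit("").alias("field"),
                 F.lit("").alias("value"),
                 F.lit("").alias("units")),
    ).alias("all_fields"))
exclude = F.array(
    F.struct(F.lit("").alias("field"),
             F.lit("").alias("value"),
             F.lit("").alias("units")))
# Only the real field survives; the empty placeholder struct is removed.
df.select(F.array_except("all_fields", exclude).alias("fields")).show(truncate=False)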
Example #4
def to_user_product_pairs(products_by_customer_df: DataFrame,
                          indexed_user_items: DataFrame) -> DataFrame:
    """
    create item positive and negative pairs.

    Args:
        products_by_customer_df (DataFrame):
        +-----------+--------------------+
        |customer_id|            products|
        +-----------+--------------------+
        |   10007421|[12537, 27265, 32...|
        |   10036418|[46420, 34635, 27...|
        |   10058663|[42536, 42984, 37...|
        |   10083340|[34617, 45656, 42...|
        |   10108034|[43418, 46650, 42...|
        +-----------+--------------------+
        products_index (DataFrame): product id and product index mapping.

    Returns:
        DataFrame:
        root
            |-- <partition_column> customer_id: string (nullable = true)
            |-- positives: array (nullable = true)
            |    |-- element: integer (containsNull = false)

    """
    products_collection = indexed_user_items.agg(
        F.collect_set("product_id_index").alias("full_product_index"))
    positive_and_neg = (products_by_customer_df.crossJoin(
        F.broadcast(products_collection)).withColumn(
            "negative_ids",
            F.array_except("full_product_index", "positives_ids")).select(
                F.col("customer_id"),
                F.col("customer_id_index"),
                F.col("cross_bin_number"),
                F.col("positives_ids").alias("positives"),
                F.expr("slice(shuffle(negative_ids), 1, size(positives_ids))").
                alias("negatives"),
            ))
    return positive_and_neg
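
A hypothetical invocation with toy frames (the column set mirrors what the
body selects):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
products_by_customer_df = spark.createDataFrame(
    [("10007421", 0, 1, [12537, 27265])],
    ["customer_id", "customer_id_index", "cross_bin_number", "positives_ids"],
)
indexed_user_items = spark.createDataFrame(
    [(12537,), (27265,), (46420,), (34635,)], ["product_id_index"])
# negatives is a random two-element sample drawn from {46420, 34635}.
to_user_product_pairs(products_by_customer_df, indexed_user_items).show()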
Example #5
from pyspark.sql import functions as F


def to_train_triplet(train_df, test_df, products_index):
    positives_df = train_df.join(
        test_df.select('customer_id', F.col('products').alias('tests')),
        on=['customer_id'],
    ).select(
        'customer_id',
        'products',
        F.concat('products', 'tests').alias('positives'),
    )

    index_set = products_index.select(
        F.collect_set('product_id_index').alias('idx_set'))

    train_triplet = positives_df.crossJoin(index_set).withColumn(
        'all_negatives',
        F.shuffle(F.array_except(F.col('idx_set'), F.col('positives'))),
    ).select(
        F.col('customer_id'),
        F.col('products'),
        F.expr("slice(all_negatives, 1, size(products))").alias('negatives'),
    )

    return train_triplet
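
A hypothetical smoke test (every product index absent from positives is a
negative candidate):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
train_df = spark.createDataFrame([('c1', [1, 2])], ['customer_id', 'products'])
test_df = spark.createDataFrame([('c1', [3])], ['customer_id', 'products'])
products_index = spark.createDataFrame([(i,) for i in range(1, 7)],
                                       ['product_id_index'])
# positives = [1, 2, 3]; negatives = two random picks from [4, 5, 6].
to_train_triplet(train_df, test_df, products_index).show(truncate=False)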
Example #6
from pyspark.sql import functions as F


def to_train_triplet(train_df, test_df, products_index):
    positives_df = train_df.join(
        test_df.select("customer_id",
                       F.col("products").alias("tests")),
        on=["customer_id"],
    ).select(
        "customer_id",
        "products",
        F.concat("products", "tests").alias("positives"),
    )

    index_set = products_index.select(
        F.collect_set("product_id_index").alias("idx_set"))

    train_triplet = (positives_df.crossJoin(index_set).withColumn(
        "all_negatives",
        F.shuffle(F.array_except(F.col("idx_set"), F.col("positives"))),
    ).select(
        F.col("customer_id"),
        F.col("products"),
        F.expr("slice(all_negatives, 1, size(products))").alias("negatives"),
    ))

    return train_triplet
Example #7
from pyspark.sql.functions import array, array_except, col, size


def get_timeseries(spark, dates, path_prefix='/user/s1840495/WebInsight/'):
    """
    Loads the pages of the first date and, for every later date, counts the
    newly appeared internal and external out-links per page, yielding a
    per-page time series of link behaviour.
    """
    print("Time-series using: " + str(dates) + ".")

    diffInternalOutLinkColumns = []
    diffExternalOutLinkColumns = []

    timeseries = None
    for date in dates:
        path = CreatePath(date[0], date[1], date[2], path_prefix)
        date_str = '{0}-{1:02d}-{2:02d}'.format(date[0], date[1], date[2])
        if timeseries is None:
            # This is the first one, let's initialize the start.
            timeseries = spark.read.parquet(path) \
                .where(col('url').isNotNull()) \
                .where(col('fetch_contentLength') > 0) \
                .where(col('fetchMon').isNotNull()) \
                .where(col('fetch_semanticVector').isNotNull()) \
                .select(
                    'url',
                    'fetchMon',
                    'fetchDay',
                    col('internalOutLinks.targetUrl').alias('internalOutLinks'),
                    col('externalOutLinks.targetUrl').alias('externalOutLinks')
                    ) \
                .where(col('fetchMon') == date[1]) \
                .where(col('fetchDay') == date[2]) \
                .drop('fetchMon') \
                .drop('fetchDay')
            timeseries.show()

            continue

        # We have one point in the future, let's create a data point.
        new_df = spark.read.parquet(path) \
            .where(col('fetchMon').isNotNull()) \
            .select(
                'url',
                'fetchMon',
                'fetchDay',
                col('internalOutLinks.targetUrl').alias('newInternalOutLinks'),
                col('externalOutLinks.targetUrl').alias('newExternalOutLinks')
                ) \
            .where(col('fetchMon') == date[1]) \
            .where(col('fetchDay') == date[2]) \
            .drop('fetchMon') \
            .drop('fetchDay')
        print("---: " + date_str + " :----")
        new_df.show()

        print("Starting join!")

        diffInternalOutLinkColumn = date_str + '_diffInternalOutLinks'
        diffInternalOutLinkColumns.append(diffInternalOutLinkColumn)
        diffExternalOutLinkColumn = date_str + '_diffExternalOutLinks'
        diffExternalOutLinkColumns.append(diffExternalOutLinkColumn)

        # Join the two dataframes. The inner join drops pages that are missing
        # from the new snapshot; the commented-out variant below would instead
        # keep the old out-link information for such pages.
        timeseries = timeseries.join(new_df, on='url', how='inner') \
                        .withColumn(diffExternalOutLinkColumn, size(array_except("newExternalOutLinks", "externalOutLinks"))) \
                        .withColumn(diffInternalOutLinkColumn, size(array_except("newInternalOutLinks", "internalOutLinks"))) \
                        .withColumn('externalOutLinks', col('newExternalOutLinks')) \
                        .drop('newExternalOutLinks') \
                        .withColumn('internalOutLinks', col('newInternalOutLinks')) \
                        .drop('newInternalOutLinks')
        # .withColumn('externalOutLinks', when(col('newExternalOutLinks').isNull(), col('externalOutLinks')).otherwise(col('newExternalOutLinks'))) \
        # .withColumn('internalOutLinks', when(col('newInternalOutLinks').isNull(), col('internalOutLinks')).otherwise(col('newInternalOutLinks'))) \
        # .drop('newExternalOutLinks') \
        # .drop('newInternalOutLinks')
        print("Timeseries count: " + str(timeseries.count()))
        timeseries.show()

    # Cleanup. Merge columns together to arrays.
    timeseries = timeseries.select(
        'url',
        array(*diffInternalOutLinkColumns).alias("diffInternalOutLinks"),
        array(*diffExternalOutLinkColumns).alias("diffExternalOutLinks"))

    print("Finished creation of timeseries.")
    return timeseries
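
A hypothetical call; each date is a (year, month, day) tuple, matching how
CreatePath and the fetchMon/fetchDay filters consume it, and the parquet
data under path_prefix is assumed to exist:

dates = [(2020, 3, 2), (2020, 3, 9), (2020, 3, 16)]
timeseries = get_timeseries(spark, dates)
timeseries.show(truncate=False)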
Example #8
    def _get_datasets_by_gene(
        self, stats_results_df, observations_df, ontology_metadata_df, compress=True
    ):
        significance_cols = [
            "female_ko_effect_p_value",
            "male_ko_effect_p_value",
            "genotype_effect_p_value",
            "male_pvalue_low_vs_normal_high",
            "male_pvalue_low_normal_vs_high",
            "female_pvalue_low_vs_normal_high",
            "female_pvalue_low_normal_vs_high",
            "genotype_pvalue_low_normal_vs_high",
            "genotype_pvalue_low_vs_normal_high",
            "p_value",
            "effect_size",
            "male_effect_size",
            "female_effect_size",
            "male_effect_size_low_vs_normal_high",
            "male_effect_size_low_normal_vs_high",
            "genotype_effect_size_low_vs_normal_high",
            "genotype_effect_size_low_normal_vs_high",
            "female_effect_size_low_vs_normal_high",
            "female_effect_size_low_normal_vs_high",
            "significant",
            "full_mp_term",
            "metadata_group",
            "male_mutant_count",
            "female_mutant_count",
            "statistical_method",
            "mp_term_id",
            "top_level_mp_term_id",
            "top_level_mp_term_name",
            "sex",
        ]

        data_set_cols = [
            "allele_accession_id",
            "allele_symbol",
            "gene_symbol",
            "gene_accession_id",
            "parameter_stable_id",
            "parameter_name",
            "procedure_stable_id",
            "procedure_name",
            "pipeline_name",
            "pipeline_stable_id",
            "zygosity",
            "phenotyping_center",
            "life_stage_name",
        ]

        stats_results_df = stats_results_df.select(*(data_set_cols + significance_cols))
        # Pick the p-value that matches the statistical method and, where
        # applicable, the sex being tested.
        stats_results_df = stats_results_df.withColumn(
            "selected_p_value",
            functions.when(
                functions.col("statistical_method").isin(
                    ["Manual", "Supplied as data"]
                ),
                functions.col("p_value"),
            )
            .when(
                functions.col("statistical_method").contains("Reference Range Plus"),
                functions.when(
                    functions.col("sex") == "male",
                    functions.least(
                        functions.col("male_pvalue_low_vs_normal_high"),
                        functions.col("male_pvalue_low_normal_vs_high"),
                    ),
                )
                .when(
                    functions.col("sex") == "female",
                    functions.least(
                        functions.col("female_pvalue_low_vs_normal_high"),
                        functions.col("female_pvalue_low_normal_vs_high"),
                    ),
                )
                .otherwise(
                    functions.least(
                        functions.col("genotype_pvalue_low_normal_vs_high"),
                        functions.col("genotype_pvalue_low_vs_normal_high"),
                    )
                ),
            )
            .otherwise(
                functions.when(
                    functions.col("sex") == "male",
                    functions.col("male_ko_effect_p_value"),
                )
                .when(
                    functions.col("sex") == "female",
                    functions.col("female_ko_effect_p_value"),
                )
                .otherwise(
                    functions.when(
                        functions.col("statistical_method").contains(
                            "Fisher Exact Test framework"
                        ),
                        functions.col("p_value"),
                    ).otherwise(functions.col("genotype_effect_p_value"))
                )
            ),
        )
        stats_results_df = stats_results_df.withColumn(
            "selected_p_value", functions.col("selected_p_value").cast(DoubleType())
        )
        # Analogous selection for the effect size.
        stats_results_df = stats_results_df.withColumn(
            "selected_effect_size",
            functions.when(
                functions.col("statistical_method").isin(
                    ["Manual", "Supplied as data"]
                ),
                functions.lit(1.0),
            )
            .when(
                ~functions.col("statistical_method").contains("Reference Range Plus"),
                functions.when(
                    functions.col("sex") == "male", functions.col("male_effect_size")
                )
                .when(
                    functions.col("sex") == "female",
                    functions.col("female_effect_size"),
                )
                .otherwise(functions.col("effect_size")),
            )
            .otherwise(
                functions.when(
                    functions.col("sex") == "male",
                    functions.when(
                        functions.col("male_effect_size_low_vs_normal_high")
                        <= functions.col("male_effect_size_low_normal_vs_high"),
                        functions.col("genotype_effect_size_low_vs_normal_high"),
                    ).otherwise(
                        functions.col("genotype_effect_size_low_normal_vs_high")
                    ),
                )
                .when(
                    functions.col("sex") == "female",
                    functions.when(
                        functions.col("female_effect_size_low_vs_normal_high")
                        <= functions.col("female_effect_size_low_normal_vs_high"),
                        functions.col("genotype_effect_size_low_vs_normal_high"),
                    ).otherwise(
                        functions.col("genotype_effect_size_low_normal_vs_high")
                    ),
                )
                .otherwise(functions.col("effect_size"))
            ),
        )
        stats_results_df = stats_results_df.withColumn(
            "selected_phenotype_term", functions.col("mp_term_id")
        )
        observations_df = observations_df.select(*data_set_cols).distinct()
        datasets_df = observations_df.join(
            stats_results_df, data_set_cols, "left_outer"
        )
        datasets_df = datasets_df.groupBy(data_set_cols).agg(
            functions.collect_set(
                functions.struct(
                    *[
                        "selected_p_value",
                        "selected_effect_size",
                        "selected_phenotype_term",
                        "metadata_group",
                        "male_mutant_count",
                        "female_mutant_count",
                        "significant",
                        "top_level_mp_term_id",
                        "top_level_mp_term_name",
                    ]
                )
            ).alias("stats_data")
        )
        datasets_df = datasets_df.withColumn(
            "successful_stats_data",
            functions.expr(
                "filter(stats_data, stat -> stat.selected_p_value IS NOT NULL)"
            ),
        )
        datasets_df = datasets_df.withColumn(
            "stats_data",
            functions.when(
                functions.size("successful_stats_data") > 0,
                functions.sort_array("successful_stats_data").getItem(0),
            ).otherwise(functions.sort_array("stats_data").getItem(0)),
        )
        datasets_df = datasets_df.select(*data_set_cols, "stats_data.*")
        datasets_df = datasets_df.withColumnRenamed("selected_p_value", "p_value")
        datasets_df = datasets_df.withColumnRenamed(
            "selected_effect_size", "effect_size"
        )
        datasets_df = datasets_df.withColumnRenamed(
            "selected_phenotype_term", "phenotype_term_id"
        )
        datasets_df = datasets_df.withColumnRenamed(
            "top_level_mp_term_id", "top_level_phenotype_term_id"
        )

        # Strip null entries from the term-name array.
        datasets_df = datasets_df.withColumn(
            "top_level_mp_term_name",
            functions.array_except(
                functions.col("top_level_mp_term_name"),
                functions.array(functions.lit(None).cast("string")),
            ),
        )

        datasets_df = datasets_df.withColumnRenamed(
            "top_level_mp_term_name", "top_level_phenotype_term_name"
        )
        datasets_df = datasets_df.join(
            ontology_metadata_df, "phenotype_term_id", "left_outer"
        )
        datasets_df = datasets_df.withColumn(
            "significance",
            functions.when(
                functions.col("significant") == True, functions.lit("Significant")
            )
            .when(
                functions.col("p_value").isNotNull(), functions.lit("Not significant")
            )
            .otherwise(functions.lit("N/A")),
        )
        mgi_datasets_df = datasets_df.groupBy("gene_accession_id").agg(
            functions.collect_set(
                functions.struct(
                    *(
                        data_set_cols
                        + [
                            "significance",
                            "p_value",
                            "effect_size",
                            "metadata_group",
                            "male_mutant_count",
                            "female_mutant_count",
                            "phenotype_term_id",
                            "phenotype_term_name",
                            "top_level_phenotype_term_id",
                            "top_level_phenotype_term_name",
                        ]
                    )
                )
            ).alias("datasets_raw_data")
        )

        mgi_datasets_df = mgi_datasets_df.withColumnRenamed(
            "gene_accession_id", "mgi_accession_id"
        )

        if compress:
            to_json_udf = functions.udf(
                lambda row: None
                if row is None
                else json.dumps([item.asDict() for item in row]),
                StringType(),
            )
            mgi_datasets_df = mgi_datasets_df.withColumn(
                "datasets_raw_data", to_json_udf("datasets_raw_data")
            )
            compress_and_encode = functions.udf(self._compress_and_encode, StringType())
            mgi_datasets_df = mgi_datasets_df.withColumn(
                "datasets_raw_data", compress_and_encode("datasets_raw_data")
            )
        return mgi_datasets_df
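
The null-stripping idiom applied to top_level_mp_term_name above also stands
on its own: array_except against an array holding a single typed NULL drops
null entries (and, as a side effect, de-duplicates). A minimal sketch with
hypothetical data:

from pyspark.sql import SparkSession, functions

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(["growth", None, "behaviour"],)], ["terms"])
df.select(
    functions.array_except(
        "terms", functions.array(functions.lit(None).cast("string"))
    ).alias("terms")
).show(truncate=False)  # -> [growth, behaviour]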