Example #1
import pyspark.sql.functions as F
import pyspark.sql.types as T


def gen_summary(df, output_prefix=""):
    summary = {}

    string_cols = []
    boolean_cols = []
    numeric_cols = []
    other_cols = []

    for field in df.schema.fields:
        if isinstance(field.dataType, T.StringType):
            string_cols.append(field.name)
        elif isinstance(field.dataType, T.BooleanType):
            boolean_cols.append(field.name)
        elif isinstance(field.dataType, T.NumericType):  # covers all numeric Spark types
            numeric_cols.append(field.name)
        else:
            other_cols.append(field.name)

    counts = cardinalities(df, string_cols)
    uniques = likely_unique(counts)
    categoricals = unique_values(df, likely_categoricals(counts))

    for span in [2, 3, 4, 6, 12]:
        thecube = df.cube(
            "Churn",
            F.ceil(df.tenure / span).alias("%d_month_spans" % span), "gender",
            "Partner", "SeniorCitizen", "Contract", "PaperlessBilling",
            "PaymentMethod",
            F.ceil(F.log2(F.col("MonthlyCharges")) *
                   10).alias("log_charges")).count()
        therollup = df.rollup(
            "Churn",
            F.ceil(df.tenure / span).alias("%d_month_spans" % span),
            "SeniorCitizen", "Contract", "PaperlessBilling", "PaymentMethod",
            F.ceil(F.log2(F.col("MonthlyCharges")) *
                   10).alias("log_charges")).agg(
                       F.sum(F.col("TotalCharges")).alias("sum_charges"))
        thecube.write.mode("overwrite").parquet("%scube-%d.parquet" %
                                                (output_prefix, span))
        therollup.write.mode("overwrite").parquet("%srollup-%d.parquet" %
                                                  (output_prefix, span))

    encoding_struct = {
        "categorical": categoricals,
        "numeric": numeric_cols + boolean_cols,
        "unique": uniques
    }

    summary["schema"] = df.schema.jsonValue()
    summary["ecdfs"] = approx_ecdf(df, numeric_cols)
    summary["true_percentage"] = percent_true(df, boolean_cols)
    summary["encoding"] = encoding_struct
    summary["distinct_customers"] = df.select(df.customerID).distinct().count()

    return summary
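Example #1 depends on several helpers defined elsewhere in its module (cardinalities, likely_unique, likely_categoricals, unique_values, approx_ecdf, percent_true). The real implementations are not shown here; the following is only a minimal sketch of what two of them could look like, to make the example easier to follow:

import pyspark.sql.functions as F

def cardinalities(df, string_cols):
    # approximate distinct-value count for each string column
    if not string_cols:
        return {}
    row = df.agg(*[F.approx_count_distinct(c).alias(c) for c in string_cols]).collect()[0]
    return row.asDict()

def percent_true(df, boolean_cols):
    # fraction of true values per boolean column (nulls treated as false)
    total = df.count()
    if not boolean_cols or total == 0:
        return {}
    row = df.agg(*[F.sum(F.col(c).cast("int")).alias(c) for c in boolean_cols]).collect()[0]
    return {c: (row[c] or 0) / total for c in boolean_cols}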
Example #2
import pyspark
import pyspark.sql.functions as F


def _simple_entropy(df: pyspark.sql.dataframe.DataFrame, column_name: str) -> float:
    count = df.count()
    testdf = df.select(column_name).groupby(column_name).agg(
        (F.count(column_name) / count).alias("p"))
    result = testdf.groupby().agg(-F.sum(F.col("p") * F.log2("p"))).collect()[0][0]
    if result is None:  # empty or all-null column: report zero entropy
        return 0.0
    return result
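A quick way to see what Example #2 computes (assuming an active SparkSession named spark; not part of the original code): two equally frequent values should give exactly one bit of entropy.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a",), ("a",), ("b",), ("b",)], ["label"])
print(_simple_entropy(df, "label"))  # -> 1.0 (two equally likely values = 1 bit)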
Example #3
    def distributional_coverage(self):
        """Calculate distributional coverage for recommendations across all users.
        The metric definition is based on formula (21) in the following reference:

        :Citation:

            G. Shani and A. Gunawardana, Evaluating Recommendation Systems,
            Recommender Systems Handbook pp. 257-297, 2010.

        Returns:
            float: distributional coverage
        """
        # In reco_df, how many times each col_item is recommended
        df_itemcnt_reco = self.reco_df.groupBy(self.col_item).count()

        # the total number of recommendations
        count_row_reco = self.reco_df.count()
        df_entropy = df_itemcnt_reco.withColumn(
            "p(i)",
            F.col("count") / count_row_reco).withColumn(
                "entropy(i)",
                F.col("p(i)") * F.log2(F.col("p(i)")))
        # distributional coverage
        d_coverage = -df_entropy.agg(F.sum("entropy(i)")).collect()[0][0]

        return d_coverage
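The quantity computed in Example #3 is the Shannon entropy of the recommendation distribution over items, H = -sum_i p(i) * log2 p(i), with p(i) = (recommendations of item i) / (total recommendations). A small worked check of the arithmetic, not part of the original code: 4 recommendations split as item A twice and items B and C once each give 1.5 bits.

import math

p = [2 / 4, 1 / 4, 1 / 4]                     # item A twice, B and C once each
print(-sum(pi * math.log2(pi) for pi in p))   # -> 1.5 bits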
Example #4
    def historical_item_novelty(self):
        """Calculate novelty for each item. Novelty is computed as the minus logarithm of
        (number of interactions with item / total number of interactions). The definition of the metric
        is based on the following reference using the choice model (eqs. 1 and 6):

        :Citation:

            P. Castells, S. Vargas, and J. Wang, Novelty and diversity metrics for recommender systems:
            choice, discovery and relevance, ECIR 2011

        The novelty of an item can be defined relative to a set of observed events on the set of all items.
        These can be events of user choice (item "is picked" by a random user) or user discovery
        (item "is known" to a random user). The above definition of novelty reflects a factor of item popularity.
        High novelty values correspond to long-tail items in the density function that few users have interacted
        with; low novelty values correspond to popular head items.

        Returns:
            pyspark.sql.dataframe.DataFrame: A dataframe with the following columns: col_item, item_novelty.
        """
        if self.df_item_novelty is None:
            n_records = self.train_df.count()
            self.df_item_novelty = (self.train_df.groupBy(
                self.col_item).count().withColumn(
                    "item_novelty",
                    -F.log2(F.col("count") / n_records)).select(
                        self.col_item, "item_novelty").orderBy(self.col_item))
        return self.df_item_novelty
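A quick numeric check of the novelty formula above (not part of the original code): an item that accounts for 2 of 16 training interactions gets novelty -log2(2/16) = 3 bits, while a head item seen in half of all interactions gets only 1 bit.

import math

print(-math.log2(2 / 16))  # rare item -> 3.0
print(-math.log2(8 / 16))  # head item -> 1.0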
Example #5
import pyspark.sql.functions as F
from pyspark.sql import Window
from pyspark.sql.types import DoubleType


def preprocessing(spark_df):
    smart_feature_columns = [column for column in spark_df.columns if 'smart' in column]


    window_spec_7 = Window.partitionBy('model', 'serial_number').orderBy(
        F.datediff(F.col('dt'), F.lit('2017-07-01'))).rangeBetween(-7, 0)
    prefix_window7 = 'window_7_'
    for smart_col in smart_feature_columns:
        spark_df = spark_df.withColumn(smart_col, F.col(smart_col).cast(DoubleType()))
        if smart_col in ['smart_1_normalized','smart_5raw','smart_7_normalized','smart_194raw','smart_199raw',
                         'smart_190raw','smart_191raw','smart_193raw','smart_195_normalized','smart_195raw']:
            spark_df = spark_df.withColumn(prefix_window7 + 'range_' + smart_col,
                                         F.max(F.col(smart_col)).over(window_spec_7) - F.min(F.col(smart_col)).over(
                                             window_spec_7))
            spark_df = spark_df.withColumn(prefix_window7 + 'std_' + smart_col,
                                         F.stddev(F.col(smart_col)).over(window_spec_7))
        #if smart_col in ['smart_187raw','smart_188raw','smart_197raw','smart_198raw']:
        #    spark_df=spark_df.withColumn(smart_col,F.when(F.col(smart_col)>0,1).otherwise(0))
        #if smart_col in ['smart_187_normalized','smart_188_normalized','smart_197_normalized','smart_198_normalized']:
        #    spark_df=spark_df.withColumn(smart_col,F.when(F.col(smart_col)<100,1).otherwise(0))
        if smart_col in ['smart_4raw','smart_5raw','smart_191raw',
                         'smart_187raw','smart_197raw','smart_198raw',
                         'smart_199raw','window_7_range_smart_199raw']:
            spark_df = spark_df.withColumn(smart_col, F.log2(F.col(smart_col) + F.lit(1.)))

    spark_df = spark_df.withColumn('smart_199raw', F.col('smart_199raw') * F.col('window_7_range_smart_199raw'))

    spark_df = spark_df.withColumn('anomaly_sum',
                                   F.col('smart_4raw') / 12 + F.col('smart_5raw') / 16 + F.col('smart_191raw') / 18
                                   + F.col('smart_198raw') / 18 + F.col('smart_197raw') / 18 + F.col('smart_187raw') / 15)

    return spark_df
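The F.log2(col + 1) transform in Example #5 keeps zero counters at zero while compressing heavy-tailed raw SMART counters. A small standalone check on hypothetical toy data (not part of the original code):

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame([(0.0,), (1.0,), (1023.0,)], ["smart_5raw"])
toy.withColumn("smart_5raw_log", F.log2(F.col("smart_5raw") + F.lit(1.0))).show()
# 0 -> 0.0, 1 -> 1.0, 1023 -> 10.0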
Example #6
    def evaluate(self,
                 req: List[privacy.Auxiliary],
                 N=2,
                 similarity="general",
                 mode="best-guess",
                 with_movie=True,
                 tol=15):
        """De-anonymisation evaluator

        Given a list of Auxiliary requests and a number of sampled customers, evaluate
        de-anonymisation performance. There are two modes:
        - 'best-guess': returns true positive rate for a fixed threshold. 
        - 'entropic': returns the entropy of the probability distribution.
        """
        scoring = self.get_scoring(similarity, with_movie)
        aux = self.generate_auxiliary_data(req, N)
        scores = self.compute_score(aux, similarity, with_movie, tol)

        if mode == "best-guess":
            match = scoring.matching_set(scores, 0.5)
            return 100 * match.filter("custId_1 == custId_2").count() / N
        elif mode == "entropic":
            probas = scoring.output(scores, mode="entropic")
            withEntropy = probas.groupBy("custId_1").agg((-F.sum(
                F.col("probas") * F.log2(F.col("probas")))).alias("entropy"))
            return withEntropy.groupBy().avg('entropy').collect()
        else:
            raise "Invalid argument."
Example #7
from pyspark.sql.functions import col, count, log2, sum  # Spark's sum, shadowing the builtin as the code below expects


def _entropy_todo(column, df):
    """
    Returns the Spark columns to compute in order to obtain the entropy of the given column.

    :param column: column to compute the entropy of
    :type column: str/int
    :param df: input dataframe
    :type df: DataFrame
    :return: PySpark columns representing what to compute.
    """
    # group on that column
    todo = df.groupBy(column)

    # count instances of each group
    todo = todo.agg(count("*").alias("_entropy_ci"))
    # ignore nans/null for computing entropy
    todo = todo.filter(~col(column).isNull())
    todo = todo.select(
        sum(col("_entropy_ci") * log2("_entropy_ci")).alias("_sumcilogci"),
        sum("_entropy_ci").alias("_total"))
    todo = todo.select(
        log2(col("_total")) - col("_sumcilogci") / col("_total"))
    return todo
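Example #7 avoids materialising per-group probabilities by using the identity H = log2(N) - (sum_i c_i*log2(c_i)) / N, where the c_i are group counts and N = sum_i c_i. A quick pure-Python verification (not part of the original code):

import math

counts = [2, 1, 1]                      # c_i: group sizes
total = sum(counts)
direct = -sum((c / total) * math.log2(c / total) for c in counts)
identity = math.log2(total) - sum(c * math.log2(c) for c in counts) / total
assert abs(direct - identity) < 1e-12   # both give 1.5 bits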
Example #8
    def evaluate_all(self,
                     req: List[privacy.Auxiliary],
                     N=100,
                     similarity="general",
                     mode="best-guess",
                     with_movie=True,
                     tol=15):
        scoring = self.get_scoring(similarity, with_movie)
        aux = self.generate_auxiliary_data(req, N)
        scores = self.compute_score(aux, similarity, with_movie, tol)
        custIds = aux.custId.unique()

        if mode == "best-guess":  # {aux, custId, score, excentricity }
            match = scoring.matching_set(scores,
                                         0.0).toPandas().set_index("custId_1")
            return [{
                "id": custId,
                "aux": aux.set_index("custId").loc[custId],
                "matchedId": int(match.loc[custId]["custId_2"]),
                "score": match.loc[custId]["value_1"],
                "eccentricity": match.loc[custId]["eccentricity"],
            } for custId in custIds]
        elif mode == "entropic":
            scores.cache()
            probas = scoring.output(scores, mode="entropic")
            match = scoring.matching_set(scores,
                                         0.0).toPandas().set_index("custId_1")

            withEntropy = probas.groupBy("custId_1").agg(
                (-F.sum(F.col("probas") * F.log2(F.col("probas")))
                 ).alias("entropy")).toPandas().set_index("custId_1")

            return [{
                "id": custId,
                "aux": aux.set_index("custId").loc[custId],
                "matchedId": int(match.loc[custId]["custId_2"]),
                "score": match.loc[custId]["value_1"],
                "eccentricity": match.loc[custId]["eccentricity"],
                "entropy": withEntropy.loc[custId]
            } for custId in custIds]
        else:
            raise "Invalid argument."
Example #9
from typing import List, Optional

import pyspark
import pyspark.sql.functions as F
from pyspark.sql import Window


def _weighted_entropy(
    countdf: pyspark.sql.dataframe.DataFrame,
    total_count: int,
    split_columns: Optional[List[str]],
    target_column_name: str,
    weighted: bool = True,
) -> float:
    """Entropy calculation across many splits, optionally weighted by split size."""
    split_columns_plus_target = split_columns[:]
    split_columns_plus_target.append(target_column_name)
    groupdf = countdf.groupby(split_columns_plus_target).agg(F.sum("count").alias("group_count"))

    w = Window.partitionBy(split_columns)
    groupdf = groupdf.withColumn("p", F.col("group_count") / F.sum(groupdf["group_count"]).over(w)).withColumn(
        "weight", F.sum(groupdf["group_count"] / total_count).over(w)
    )

    entropydf = groupdf.groupby(split_columns).agg(
        (-F.sum(F.col("p") * F.log2("p"))).alias("entropy"), (F.sum(F.col("group_count") / total_count)).alias("weight")
    )

    if weighted:
        result = entropydf.groupby().agg(F.sum(F.col("entropy") * F.col("weight"))).collect()[0][0]
    else:
        result = entropydf.groupby().sum("entropy").collect()[0][0]

    return result
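A hypothetical invocation of Example #9 (the DataFrame and the 'region'/'label' columns are illustrative names only): countdf is expected to already carry per-combination counts in a "count" column, typically the output of groupby().count() over the split and target columns.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
train_df = spark.createDataFrame(
    [("north", "yes"), ("north", "no"), ("south", "yes"), ("south", "yes")],
    ["region", "label"])

countdf = train_df.groupby("region", "label").count()
weighted_h = _weighted_entropy(countdf, train_df.count(), ["region"], "label")
print(weighted_h)  # 0.5: the 'north' split contributes 1 bit with weight 0.5, 'south' contributes 0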
Example #10
import pyspark.sql.functions as F


def compile_log2(t, expr, scope, **kwargs):
    op = expr.op()

    src_column = t.translate(op.arg, scope)
    return F.log2(src_column)
Example #11
import math

import histbook.expr


def tocolumns(df, expr):
    import pyspark.sql.functions as fcns

    if isinstance(expr, histbook.expr.Const):
        return fcns.lit(expr.value)

    elif isinstance(expr, (histbook.expr.Name, histbook.expr.Predicate)):
        return df[expr.value]

    elif isinstance(expr, histbook.expr.Call):
        if expr.fcn == "abs" or expr.fcn == "fabs":
            return fcns.abs(tocolumns(df, expr.args[0]))
        elif expr.fcn == "max" or expr.fcn == "fmax":
            return fcns.greatest(*[tocolumns(df, x) for x in expr.args])
        elif expr.fcn == "min" or expr.fcn == "fmin":
            return fcns.least(*[tocolumns(df, x) for x in expr.args])
        elif expr.fcn == "arccos":
            return fcns.acos(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arccosh":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "arcsin":
            return fcns.asin(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arcsinh":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "arctan2":
            return fcns.atan2(tocolumns(df, expr.args[0]),
                              tocolumns(df, expr.args[1]))
        elif expr.fcn == "arctan":
            return fcns.atan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arctanh":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "ceil":
            return fcns.ceil(tocolumns(df, expr.args[0]))
        elif expr.fcn == "copysign":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "cos":
            return fcns.cos(tocolumns(df, expr.args[0]))
        elif expr.fcn == "cosh":
            return fcns.cosh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "rad2deg":
            return tocolumns(df, expr.args[0]) * (180.0 / math.pi)
        elif expr.fcn == "erfc":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "erf":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "exp":
            return fcns.exp(tocolumns(df, expr.args[0]))
        elif expr.fcn == "expm1":
            return fcns.expm1(tocolumns(df, expr.args[0]))
        elif expr.fcn == "factorial":
            return fcns.factorial(tocolumns(df, expr.args[0]))
        elif expr.fcn == "floor":
            return fcns.floor(tocolumns(df, expr.args[0]))
        elif expr.fcn == "fmod":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "gamma":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "hypot":
            return fcns.hypot(tocolumns(df, expr.args[0]),
                              tocolumns(df, expr.args[1]))
        elif expr.fcn == "isinf":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "isnan":
            return fcns.isnan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "lgamma":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "log10":
            return fcns.log10(tocolumns(df, expr.args[0]))
        elif expr.fcn == "log1p":
            return fcns.log1p(tocolumns(df, expr.args[0]))
        elif expr.fcn == "log":
            return fcns.log(tocolumns(df, expr.args[0]))
        elif expr.fcn == "pow":
            return fcns.pow(tocolumns(df, expr.args[0]),
                            tocolumns(df, expr.args[1]))
        elif expr.fcn == "deg2rad":
            return tocolumns(df, expr.args[0]) * (math.pi / 180.0)
        elif expr.fcn == "sinh":
            return fcns.sinh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sin":
            return fcns.sin(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sqrt":
            return fcns.sqrt(tocolumns(df, expr.args[0]))
        elif expr.fcn == "tanh":
            return fcns.tanh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "tan":
            return fcns.tan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "trunc":
            raise NotImplementedError(
                expr.fcn)  # FIXME (fcns.trunc is for dates)
        elif expr.fcn == "xor":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "conjugate":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "exp2":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "heaviside":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "isfinite":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "left_shift" and isinstance(expr.args[1],
                                                     histbook.expr.Const):
            return fcns.shiftLeft(tocolumns(df, expr.args[0]),
                                  expr.args[1].value)
        elif expr.fcn == "log2":
            return fcns.log2(tocolumns(df, expr.args[0]))
        elif expr.fcn == "logaddexp2":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "logaddexp":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "mod" or expr.fcn == "fmod":
            return tocolumns(df, expr.args[0]) % tocolumns(df, expr.args[1])
        elif expr.fcn == "right_shift" and isinstance(expr.args[1],
                                                      histbook.expr.Const):
            return fcns.shiftRight(tocolumns(df, expr.args[0]),
                                   expr.args[1].value)
        elif expr.fcn == "rint":
            return fcns.rint(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sign":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "where":
            return fcns.when(tocolumns(df, expr.args[0]),
                             tocolumns(df, expr.args[1])).otherwise(
                                 tocolumns(df, expr.args[2]))
        elif expr.fcn == "numpy.equal":
            return tocolumns(df, expr.args[0]) == tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.not_equal":
            return tocolumns(df, expr.args[0]) != tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.less":
            return tocolumns(df, expr.args[0]) < tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.less_equal":
            return tocolumns(df, expr.args[0]) <= tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.isin":
            return tocolumns(df, expr.args[0]) in tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_not":
            return ~tocolumns(df, expr.args[0])
        elif expr.fcn == "numpy.add":
            return tocolumns(df, expr.args[0]) + tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.subtract":
            return tocolumns(df, expr.args[0]) - tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.multiply":
            return tocolumns(df, expr.args[0]) * tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.true_divide":
            return tocolumns(df, expr.args[0]) / tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_or":
            return tocolumns(df, expr.args[0]) | tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_and":
            return tocolumns(df, expr.args[0]) & tocolumns(df, expr.args[1])
        else:
            raise NotImplementedError(expr.fcn)

    else:
        raise AssertionError(expr)
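Several of the FIXME branches in Example #11 could plausibly be filled in with simple Spark expressions built from functions that do exist; a few hedged sketches (not from the original library):

import pyspark.sql.functions as fcns

def exp2_col(c):
    return fcns.pow(fcns.lit(2.0), c)            # exp2(x) = 2**x

def arctanh_col(c):
    return 0.5 * fcns.log((1 + c) / (1 - c))     # artanh(x) = 0.5*ln((1+x)/(1-x))

def sign_col(c):
    return fcns.signum(c)                        # numpy.sign maps to Spark's signum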
Example #12
    def udaf(self, data):
        """
        Apply median polish to groupBy keys and return value for each sample
        within that grouping.

        This is a hacked/workaround user-defined aggregate function (UDAF) that
        passes the grouped data to
        python to do median polish and return the result back
        to the dataframe.

        :returns: spark dataframe
        """
        # register the medianpolish as a UDF
        medpol = udf(probe_summarization, ArrayType(ArrayType(StringType())))
        # repartition by our grouping keys
        if self.group_keys not in [['TRANSCRIPT_CLUSTER'], ['PROBESET']]:
            raise Exception("Invalid grouping keys.")
        data = data.withColumnRenamed('NORMALIZED_INTENSITY_VALUE', 'VALUE')
        data = data.repartition(self.repartition_number, self.group_keys)

        # log 2 values
        data = data.withColumn('VALUE', log2(data['VALUE']).alias('VALUE'))

        # group the data while concatenating rest of columns into one value
        # so we can pass it to collect, one value(list) per row and a list of
        # lists for the whole grouping, so that we can give it to our UDF as
        # one item which returns back one item (array or arrays)
        data = data.withColumn(
            'data', concat_ws(',', 'SAMPLE', 'PROBE', 'VALUE')) \
            .groupBy(self.group_keys) \
            .agg(collect_list('data')
                 .alias('data')) \
            .withColumn('data', medpol('data'))

        def gen_cols(other_cols):
            """
            Create a list for select().
            select() can take one list, or *args. generating the grouping
            keys as columns and adding other column selections to the same
            list.

            :param other_cols: list of other column selections
            :type other_cols: list

            :returns: single list of columns, expressions, etc. for select()
            """
            cols = [col(s) for s in self.group_keys]
            cols += other_cols
            return cols

        # unpack the first level of nesting vertically, so each array in the
        # array is a new row (per sample)
        data = data.select(
            gen_cols([explode(data['data']).alias("SAMPLEVALUE")]))

        # unpack the final nesting laterally, into two new columns
        data = data.select(
            gen_cols([
                data['SAMPLEVALUE'].getItem(0).alias('SAMPLE'),
                data['SAMPLEVALUE'].getItem(1).alias("VALUE")
            ]))

        data = data.repartition(int(self.num_samples))
        return data
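Example #12 assumes a probe_summarization function that is registered as the UDF but is not shown in the snippet. The stand-in below only illustrates the contract it must satisfy (a per-sample placeholder instead of a real Tukey median polish):

from collections import defaultdict

def probe_summarization(rows):
    # rows: list of "SAMPLE,PROBE,VALUE" strings for one group;
    # must return [[sample, value], ...] as strings (ArrayType(ArrayType(StringType())))
    per_sample = defaultdict(list)
    for row in rows:
        sample, probe, value = row.split(",")
        per_sample[sample].append(float(value))
    # the real implementation summarizes with median polish; a plain median is only a placeholder
    return [[sample, str(sorted(values)[len(values) // 2])]
            for sample, values in per_sample.items()]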