Example #1
def KolmogorovSmirnovTest(sdf, colname, dist='normal', *params):
    """Performs a Kolmogorov-Smirnov test comparing the distribution of values
    in a column to a named canonical distribution.
    """
    check_columns(sdf, colname)
    # Supported distributions
    _distributions = [
        'Beta', 'Cauchy', 'ChiSquared', 'Exponential', 'F', 'Gamma', 'Gumbel',
        'Laplace', 'Levy', 'Logistic', 'LogNormal', 'Nakagami', 'Normal',
        'Pareto', 'T', 'Triangular', 'Uniform', 'Weibull'
    ]
    _distlower = list(map(lambda v: v.lower(), _distributions))
    try:
        # Case-insensitive lookup of the canonical distribution name
        dist = _distributions[_distlower.index(dist.lower())]
        # the actual name for the Uniform distribution is UniformReal
        if dist == 'Uniform':
            dist += 'Real'
    except ValueError:
        # If we cannot find a distribution, fall back to Normal
        dist = 'Normal'
        params = (0., 1.)
    jvm = sdf._sc._jvm
    # Maps the DataFrame column into a numeric RDD and converts it to a Java RDD
    rdd = sdf.notHandy().select(colname).rdd.map(lambda t: t[0])
    jrdd = _py2java(sdf._sc, rdd)
    # Gets the Java class of the corresponding distribution and instantiates it
    java_class = getattr(
        jvm,
        'org.apache.commons.math3.distribution.{}Distribution'.format(dist))
    java_obj = java_class(*params)
    # Loads the KS test class and performs the test
    ks = jvm.org.apache.spark.mllib.stat.test.KolmogorovSmirnovTest
    res = ks.testOneSample(jrdd.rdd(), java_obj)
    return KolmogorovSmirnovTestResult(res)
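A minimal usage sketch (hypothetical: hdf stands for a HandyFrame with a numeric column 'price', and the result wrapper is assumed to expose the usual statistic and pValue fields of Spark's KS test result):

# Hypothetical usage: tests whether 'price' follows a Normal(0, 1) distribution.
# The distribution name is case-insensitive; the extra positional arguments are
# the distribution's parameters (here: mean and standard deviation).
result = KolmogorovSmirnovTest(hdf, 'price', 'normal', 0., 1.)
print(result.statistic, result.pValue)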
Example #2
def mutual_info(sdf, colnames):
    """Computes the matrix of pairwise mutual information (in bits)
    between the given columns.
    """
    check_columns(sdf, colnames)
    n = len(colnames)
    # Marginal probability distribution of each column
    probs = []
    for i in range(n):
        probs.append(distribution(sdf, colnames[i]))
    res = np.zeros(shape=(n, n))
    for i in range(n):
        for j in range(i, n):
            # Assembles the two columns into a vector to get their joint distribution
            tdf = VectorAssembler(inputCols=[colnames[i], colnames[j]],
                                  outputCol='__vectors').transform(sdf)
            tdf = distribution(tdf, '__vectors')
            # Splits the vector back into one column per feature
            tdf = disassemble(dense_to_array(tdf, '__col', '__features'),
                              '__features')
            # Joins the joint distribution with both marginals
            tdf = tdf.join(probs[i].toDF('__features_0', '__p0'),
                           on='__features_0')
            tdf = tdf.join(probs[j].toDF('__features_1', '__p1'),
                           on='__features_1')
            # MI = sum over (x, y) of p(x, y) * log2(p(x, y) / (p(x) * p(y)))
            mi = tdf.select(
                F.sum(
                    F.expr(
                        'log2(__probability / (__p0 * __p1)) * __probability'))
            ).take(1)[0][0]
            res[i, j] = mi
            res[j, i] = mi
    return pd.DataFrame(res, index=colnames, columns=colnames)
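A usage sketch, assuming hypothetical numeric columns 'a' and 'b' on a HandyFrame hdf; the result is a symmetric pandas DataFrame of mutual information values in bits:

# Hypothetical usage: pairwise mutual information matrix
mi = mutual_info(hdf, ['a', 'b'])
print(mi.loc['a', 'b'])  # equals mi.loc['b', 'a'] by symmetry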
Example #3
def __getitem__(self, *args):
    if isinstance(args[0], tuple):
        # Unpacks tuple-style indexing, e.g. obj[name, other]
        args = args[0]
    item = args[0]
    check_columns(self._df, item)
    # Stores the column name and returns the accessor itself for chaining
    self._colname = item
    return self
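Because the method returns self, bracket indexing can be chained with whatever statistical methods the accessor exposes. A minimal sketch, assuming a hypothetical accessor instance acc built over a DataFrame with a column 'price':

# Hypothetical usage: stores 'price' in acc._colname and returns the accessor,
# so a follow-up method call can be chained onto the selection
acc['price']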
Example #4
def entropy(sdf, colnames):
    """Computes the Shannon entropy (in bits) of each of the given columns."""
    colnames = ensure_list(colnames)
    check_columns(sdf, colnames)
    entropy = []
    for colname in colnames:
        # H = -sum(p * log2(p)) over the column's probability distribution
        entropy.append(
            distribution(sdf, colname)
            .select(F.sum(F.expr('-log2(__probability) * __probability')))
            .take(1)[0][0])
    return pd.Series(entropy, index=colnames)
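A usage sketch with hypothetical columns 'a' and 'b':

# Hypothetical usage: Shannon entropy (in bits) per column, as a pandas Series
ent = entropy(hdf, ['a', 'b'])
print(ent['a'])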
Example #5
def StatisticalSummaryValues(sdf, colnames):
    """Builds a Java StatisticalSummaryValues object for each column
    """
    colnames = ensure_list(colnames)
    check_columns(sdf, colnames)

    jvm = sdf._sc._jvm
    summ = sdf.notHandy().select(colnames).describe().toPandas().set_index(
        'summary')
    ssvs = {}
    for colname in colnames:
        # describe() yields count, mean, stddev, min, max (as strings)
        values = list(map(float, summ[colname].values))
        # The Java constructor expects (mean, variance, n, max, min, sum):
        # variance is the squared standard deviation, and sum is approximated
        # as count * mean
        values = (values[1], values[2] ** 2, int(values[0]),
                  values[4], values[3], values[0] * values[1])
        java_class = jvm.org.apache.commons.math3.stat.descriptive.StatisticalSummaryValues
        ssvs.update({colname: java_class(*values)})
    return ssvs
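A usage sketch; the returned values are py4j proxies for Apache Commons Math's StatisticalSummaryValues, so its standard getters are available ('price' is hypothetical):

# Hypothetical usage: per-column summary statistics backed by Java objects
ssvs = StatisticalSummaryValues(hdf, ['price'])
summary = ssvs['price']
print(summary.getMean(), summary.getVariance(), summary.getN())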
Example #6
def mahalanobis(sdf, colnames):
    """Computes Mahalanobis distance from origin and compares to critical values
    using Chi-Squared distribution to identify possible outliers.
    """
    check_columns(sdf, colnames)
    # Builds pipeline to assemble feature columns and scale them
    assembler = VectorAssembler(inputCols=colnames, outputCol='__features')
    scaler = StandardScaler(inputCol='__features',
                            outputCol='__scaled',
                            withMean=True)
    pipeline = Pipeline(stages=[assembler, scaler])
    features = pipeline.fit(sdf).transform(sdf)

    # Computes correlation between features and inverts it
    # Since we scaled the features, we can assume they have unit variance
    # and therefore, correlation and covariance matrices are the same!
    mat = Correlation.corr(features, '__scaled').head()[0].toArray()
    inv_mat = inv(mat)

    # Computes critical value
    critical_value = chi2.ppf(0.999, len(colnames))

    # Builds a Pandas UDF to compute the Mahalanobis distance from the origin:
    # sqrt((V - 0)' * inv_M * (V - 0))
    # Falls back to a plain UDF if PyArrow is not available
    try:
        import pyarrow

        @F.pandas_udf('double')
        def pudf_mult(v):
            return v.apply(lambda v: np.sqrt(np.dot(np.dot(v, inv_mat), v)))
    except ImportError:

        @F.udf('double')
        def pudf_mult(v):
            # A plain UDF receives one array per row, not a pandas Series
            return float(np.sqrt(np.dot(np.dot(v, inv_mat), v)))

    # Converts the feature vector into an array column
    features = dense_to_array(features, '__scaled', '__array_scaled')
    # Computes the Mahalanobis distance and flags rows above the critical value as outliers
    distance = (features.withColumn(
        '__mahalanobis', pudf_mult('__array_scaled')).withColumn(
            '__outlier',
            F.col('__mahalanobis') > critical_value).drop(
                '__features', '__scaled', '__array_scaled'))
    return distance
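A usage sketch with hypothetical columns 'x' and 'y'; rows whose distance exceeds the 99.9% Chi-Squared critical value come back flagged in the '__outlier' column:

# Hypothetical usage: flags multivariate outliers by Mahalanobis distance
flagged = mahalanobis(hdf, ['x', 'y'])
flagged.filter('__outlier').show()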
Example #7
def distribution(sdf, colname):
    """Returns a DataFrame with the probability distribution of the values
    in the given column (columns: __col and __probability).
    """
    check_columns(sdf, colname)
    # Counts occurrences of each value and divides by the total number of rows
    rdd = sdf.notHandy().select(colname).rdd.map(lambda row: (row[0], 1))
    n = rdd.count()
    return rdd.reduceByKey(add).map(
        lambda t: Row(__col=t[0], __probability=t[1] / n)).toDF()
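A usage sketch with a hypothetical column 'category':

# Hypothetical usage: per-value probabilities, one row per distinct value,
# with columns __col and __probability
distribution(hdf, 'category').show()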