Example #1
 def test_merge_stats(self):
     stats1 = StatCounter([1.0, 2.0, 3.0, 4.0])
     stats2 = StatCounter([1.0, 2.0, 3.0, 4.0])
     stats = stats1.mergeStats(stats2)
     self.assertEqual(stats.count(), 8)
     self.assertEqual(stats.max(), 4.0)
     self.assertEqual(stats.mean(), 2.5)
     self.assertEqual(stats.min(), 1.0)
     self.assertAlmostEqual(stats.stdev(), 1.118033988749895)
     self.assertAlmostEqual(stats.sampleStdev(), 1.1952286093343936)
     self.assertEqual(stats.sum(), 20.0)
     self.assertAlmostEqual(stats.variance(), 1.25)
     self.assertAlmostEqual(stats.sampleVariance(), 1.4285714285714286)
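As a cross-check (not part of the original test), the population and sample variance values asserted above follow directly from the eight merged values:

    # Illustrative only: verify the asserted values with the standard library.
    import statistics

    data = [1.0, 2.0, 3.0, 4.0] * 2       # the two merged StatCounters hold identical values
    statistics.pvariance(data)            # 1.25               -> stats.variance()
    statistics.variance(data)             # 1.4285714285714286 -> stats.sampleVariance()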
Example #2
    def stats(self):
        """
        Return a L{StatCounter} object that captures the mean, variance
        and count of the RDD's elements in one operation.
        """
        def redFunc(left_counter, right_counter):
            return left_counter.mergeStats(right_counter)

        return self.mapPartitions(lambda i: [StatCounter(i)]).reduce(redFunc)
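The reduce above combines one StatCounter per partition; a minimal standalone sketch of that combination step, using pyspark.statcounter.StatCounter directly outside of an RDD, looks like this:

    from pyspark.statcounter import StatCounter

    # Each StatCounter plays the role of a per-partition counter built by mapPartitions.
    part1 = StatCounter([1.0, 2.0])
    part2 = StatCounter([3.0, 4.0, 5.0])
    combined = part1.mergeStats(part2)    # the same operation redFunc performs
    combined.count(), combined.mean(), combined.variance()    # (5, 3.0, 2.0)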
Example #3
 def merge(self, frame):
     """
     Add another DataFrame to the accumulated stats for each column.
     Parameters
     ----------
     frame: pandas DataFrame we will update our stats counter with.
     """
     for column_name, _ in self._column_stats.items():
         data_arr = frame[[column_name]].values
         count, min_max_tup, mean, _, _, _ = \
             scistats.describe(data_arr)
         stats_counter = StatCounter()
         stats_counter.n = count
         stats_counter.mu = mean
         stats_counter.m2 = np.sum((data_arr - mean) ** 2)
         stats_counter.minValue, stats_counter.maxValue = min_max_tup
         self._column_stats[column_name] = self._column_stats[
             column_name].mergeStats(stats_counter)
     return self
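A hypothetical, self-contained version of the per-column bookkeeping above: the StatCounter fields n, mu, m2, minValue and maxValue are filled from a pandas column so that a later mergeStats call can combine them. The frame and column name here are made up for illustration:

    import numpy as np
    import pandas as pd
    from scipy import stats as scistats
    from pyspark.statcounter import StatCounter

    frame = pd.DataFrame({"x": [1.0, 2.0, 3.0, 4.0]})       # illustrative data
    data_arr = frame[["x"]].values
    count, (min_val, max_val), mean, _, _, _ = scistats.describe(data_arr)
    counter = StatCounter()
    counter.n = int(count)
    counter.mu = float(mean)                                 # column mean
    counter.m2 = float(np.sum((data_arr - mean) ** 2))       # sum of squared deviations
    counter.minValue, counter.maxValue = float(min_val), float(max_val)
    counter.mean(), counter.variance()                       # (2.5, 1.25)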
Example #4
    def test_variance_when_size_zero(self):
        # SPARK-38854: Test case to improve test coverage when
        # StatCounter argument is empty list or None
        arguments = [[], None]

        for arg in arguments:
            stats = StatCounter(arg)
            self.assertTrue(math.isnan(stats.variance()))
            self.assertTrue(math.isnan(stats.sampleVariance()))
            self.assertEqual(stats.count(), 0)
            self.assertTrue(math.isinf(stats.max()))
            self.assertTrue(math.isinf(stats.min()))
            self.assertEqual(stats.mean(), 0.0)
Example #5
 def merge(self, frame):
     """
     Add another DataFrame to the accumulated stats for each column.
     Parameters
     ----------
     frame: pandas DataFrame we will update our stats counter with.
     """
     for column_name, counter in self._column_stats.items():
         data_arr = frame[[column_name]].values
         count, min_max_tup, mean, unbiased_var, skew, kurt = \
             scistats.describe(data_arr)
         stats_counter = StatCounter()
         stats_counter.n = count
         stats_counter.mu = mean
         # TODO(juliet): look up the paper they base their streaming stats alg on,
         # write docs for statcounter class in spark
         # line below will likely need to be modified to match the alg
         stats_counter.m2 = np.sum((data_arr - mean) ** 2)
         stats_counter.minValue, stats_counter.maxValue = min_max_tup
         self._column_stats[column_name] = self._column_stats[
             column_name].mergeStats(stats_counter)
     return self
Example #6
class NAStatCounter:
    def __init__(self):
        self.stats = StatCounter()
        self.missing = long(0)

    def add(self, x):
        if x is None:
            self.missing += 1
        else:
            self.stats.merge(x)

        return self

    def mergeStats(self, other):

        self.stats.mergeStats(other.stats)
        self.missing += other.missing

        return self

    def __repr__(self):
        return "stats: {0}, NaN: {1}".format(self.stats, self.missing)
Example #7
    def __init__(self, dataframes=[], columns=[]):
        """
        Creates a stats counter for the provided data frames
        computing the stats for all of the columns in columns.
        Parameters
        ----------
        dataframes: list of dataframes, containing the values to compute stats
                on.
        columns: list of strs, list of columns to compute the stats on.
        """
        self._column_stats = dict(
            (column_name, StatCounter()) for column_name in columns)

        for df in dataframes:
            self.merge(df)
Example #8
class NAStatCounter:

    def __init__(self):
        self.stats = StatCounter()
        self.missing = long(0)

    def add(self, x):
        if x is None:
            self.missing += 1
        else:
            self.stats.merge(x)

        return self

    def mergeStats(self, other):

        self.stats.mergeStats(other.stats)
        self.missing += other.missing

        return self

    def __repr__(self):
        return "stats: {0}, NaN: {1}".format(self.stats, self.missing)
Example #9
    def __init__(self, dataframes, columns):
        """
        Creates a stats counter for the provided data frames
        computing the stats for all of the columns in columns.
        Parameters
        ----------
        dataframes: list of dataframes, containing the values to compute stats
                on.
        columns: list of strs, list of columns to compute the stats on.
        """
        assert (not isinstance(columns, basestring)), "columns should be a " \
                                                      "list of strs,  " \
                                                      "not a str!"
        assert isinstance(columns, list), "columns should be a list!"

        self._columns = columns
        self._counters = dict((column, StatCounter()) for column in columns)

        for df in dataframes:
            self.merge(df)
Example #10
 def test_merge_stats(self):
     stats1 = StatCounter([1.0, 2.0, 3.0, 4.0])
     stats2 = StatCounter([1.0, 2.0, 3.0, 4.0])
     stats = stats1.mergeStats(stats2)
     self.assertEqual(stats.count(), 8)
     self.assertEqual(stats.max(), 4.0)
     self.assertEqual(stats.mean(), 2.5)
     self.assertEqual(stats.min(), 1.0)
     self.assertAlmostEqual(stats.stdev(), 1.118033988749895)
     self.assertAlmostEqual(stats.sampleStdev(), 1.1952286093343936)
     self.assertEqual(stats.sum(), 20.0)
     self.assertAlmostEqual(stats.variance(), 1.25)
     self.assertAlmostEqual(stats.sampleVariance(), 1.4285714285714286)
     execution_statements = [
         StatCounter([1.0, 2.0]).mergeStats(StatCounter(range(1, 301))),
         StatCounter(range(1, 301)).mergeStats(StatCounter([1.0, 2.0])),
     ]
     for stats in execution_statements:
         self.assertEqual(stats.count(), 302)
         self.assertEqual(stats.max(), 300.0)
         self.assertEqual(stats.min(), 1.0)
         self.assertAlmostEqual(stats.mean(), 149.51324503311)
         self.assertAlmostEqual(stats.variance(), 7596.302804701549)
         self.assertAlmostEqual(stats.sampleVariance(), 7621.539691095905)
Example #11
 def merge(self, frame):
     """
     Add another DataFrame to the accumulated stats for each column.
     Parameters
     ----------
     frame: pandas DataFrame we will update our stats counter with.
     """
     for column_name, _ in self._column_stats.items():
         data_arr = frame[[column_name]].values
         count, min_max_tup, mean, _, _, _ = \
             scistats.describe(data_arr)
         stats_counter = StatCounter()
         stats_counter.n = count
         stats_counter.mu = mean
         stats_counter.m2 = np.sum((data_arr - mean)**2)
         stats_counter.minValue, stats_counter.maxValue = min_max_tup
         self._column_stats[column_name] = self._column_stats[
             column_name].mergeStats(stats_counter)
     return self
Example #12
 def merge(self, frame):
     """
     Add another DataFrame to the accumulated stats for each column.
     Parameters
     ----------
     frame: pandas DataFrame we will update our stats counter with.
     """
     for column_name, counter in self._column_stats.items():
         data_arr = frame[[column_name]].values
         count, min_max_tup, mean, unbiased_var, skew, kurt = \
             scistats.describe(data_arr)
         stats_counter = StatCounter()
         stats_counter.n = count
         stats_counter.mu = mean
         # TODO(juliet): look up the paper they base their streaming stats alg on,
         # write docs for statcounter class in spark
         # line below will likely need to be modified to match the alg
         stats_counter.m2 = np.sum((data_arr - mean)**2)
         stats_counter.minValue, stats_counter.maxValue = min_max_tup
         self._column_stats[column_name] = self._column_stats[
             column_name].mergeStats(stats_counter)
     return self
Example #13
 def stats(d):
     s = StatCounter()
     return s.merge((d.days * 24 * 3600) + d.seconds)
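For illustration, the helper above folds a timedelta, expressed in seconds, into a fresh StatCounter (StatCounter.merge returns the counter itself, so the result can be queried directly):

    from datetime import timedelta

    stats(timedelta(days=1, seconds=30)).mean()    # 86430.0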
Example #14
def get_lr_curves(
    spark,
    features_df,
    cluster_ids,
    kernel_bandwidth,
    num_pdf_points,
    random_seed=None,
):
    """ Compute the likelihood ratio curves for clustered clients.

    The workflow in this function is as follows:

     * Access the DataFrame including cluster numbers and features.
     * Load same similarity function that will be used in TAAR module.
     * Iterate through each cluster and compute in-cluster similarity.
     * Iterate through each cluster and compute out-cluster similarity.
     * Compute the kernel density estimate (KDE) per similarity score.
     * Linearly down-sample both PDFs to 1000 points.

    :param spark: the SparkSession object.
    :param features_df: the DataFrame containing the user features (e.g. the
                        ones coming from |get_donors|).
    :param cluster_ids: the list of cluster ids (e.g. the one coming from |get_donors|).
    :param kernel_bandwidth: the kernel bandwidth used to estimate the kernel densities.
    :param num_pdf_points: the number of points to sample for the LR-curves.
    :param random_seed: the provided random seed (fixed in tests).
    :return: A list in the following format
        [(idx, (lr-numerator-for-idx, lr-denominator-for-idx)), (...), ...]
    """

    # Instantiate holder lists for inter- and intra-cluster scores.
    same_cluster_scores_rdd = spark.sparkContext.emptyRDD()
    different_clusters_scores_rdd = spark.sparkContext.emptyRDD()

    random_split_kwargs = {"seed": random_seed} if random_seed else {}

    for cluster_number in cluster_ids:
        # Pick the features for users belonging to the current cluster.
        current_cluster_df = features_df.where(
            col("prediction") == cluster_number)
        # Pick the features for users belonging to all the other clusters.
        other_clusters_df = features_df.where(
            col("prediction") != cluster_number)

        logger.debug("Computing scores for cluster",
                     extra={"cluster_id": cluster_number})

        # Compares the similarity score between pairs of clients in the same cluster.
        cluster_half_1, cluster_half_2 = current_cluster_df.rdd.randomSplit(
            [0.5, 0.5], **random_split_kwargs)
        pair_rdd = generate_non_cartesian_pairs(cluster_half_1, cluster_half_2)
        intra_scores_rdd = pair_rdd.map(lambda r: similarity_function(*r))
        same_cluster_scores_rdd = same_cluster_scores_rdd.union(
            intra_scores_rdd)

        # Compares the similarity score between pairs of clients in different clusters.
        pair_rdd = generate_non_cartesian_pairs(current_cluster_df.rdd,
                                                other_clusters_df.rdd)
        inter_scores_rdd = pair_rdd.map(lambda r: similarity_function(*r))
        different_clusters_scores_rdd = different_clusters_scores_rdd.union(
            inter_scores_rdd)

    # Determine a range of observed similarity values linearly spaced.
    all_scores_rdd = same_cluster_scores_rdd.union(
        different_clusters_scores_rdd)
    stats = all_scores_rdd.aggregate(StatCounter(), StatCounter.merge,
                                     StatCounter.mergeStats)
    min_similarity = stats.minValue
    max_similarity = stats.maxValue
    lr_index = np.arange(
        min_similarity,
        max_similarity,
        float(abs(min_similarity - max_similarity)) / num_pdf_points,
    )

    # Kernel density estimate for the inter-cluster comparison scores.
    kd_dc = KernelDensity()
    kd_dc.setSample(different_clusters_scores_rdd)
    kd_dc.setBandwidth(kernel_bandwidth)
    denominator_density = kd_dc.estimate(lr_index)

    # Kernel density estimate for the intra-cluster comparison scores.
    kd_sc = KernelDensity()
    kd_sc.setSample(same_cluster_scores_rdd)
    kd_sc.setBandwidth(kernel_bandwidth)
    numerator_density = kd_sc.estimate(lr_index)

    # Structure this in the correct output format.
    return list(
        zip(lr_index, list(zip(numerator_density, denominator_density))))
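The aggregate call above is the standard StatCounter folding pattern; a minimal standalone sketch of the same pattern, assuming an existing SparkSession named spark (as in the function above) and made-up scores:

    from pyspark.statcounter import StatCounter

    scores_rdd = spark.sparkContext.parallelize([0.1, 0.4, 0.35, 0.8])    # illustrative scores
    stats = scores_rdd.aggregate(StatCounter(), StatCounter.merge, StatCounter.mergeStats)
    stats.minValue, stats.maxValue    # (0.1, 0.8)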
Example #15
 def chooseBandwidthList(self):
     stddev = StatCounter(self.samples).stdev()
     return 1.06 * stddev * math.pow(len(self.samples), -.2)
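This is the classic normal-reference ("Silverman") rule of thumb for a Gaussian kernel bandwidth, h = 1.06 * stddev * n ** (-1/5). A worked instance with made-up numbers:

    import math

    # 100 samples with a standard deviation of 2.0 give a bandwidth of roughly 0.84.
    1.06 * 2.0 * math.pow(100, -0.2)    # ~0.844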
Example #16
 def __init__(self):
     self.stats = StatCounter()
     self.missing = long(0)
Example #17
 def __init__(self):
     self.stats = StatCounter()
     self.missing = long(0)
Example #18
 def test_merge(self):
     stats = StatCounter([1.0, 2.0, 3.0, 4.0])
     stats.merge(5.0)
     self.assertEqual(stats.count(), 5)
     self.assertEqual(stats.max(), 5.0)
     self.assertEqual(stats.mean(), 3.0)
     self.assertEqual(stats.min(), 1.0)
     self.assertAlmostEqual(stats.stdev(), 1.414213562373095)
     self.assertAlmostEqual(stats.sampleStdev(), 1.5811388300841898)
     self.assertEqual(stats.sum(), 15.0)
     self.assertAlmostEqual(stats.variance(), 2.0)
     self.assertAlmostEqual(stats.sampleVariance(), 2.5)