예제 #1
0
def sample_stats(views, is_spam, biases, num_samples):
    """Draw one biased sample and report how well it estimates prevalence.

    Args:
        views: Array of per-record weights. It is passed to
            ``ml_sampler.biased_sample`` as ``weights``, indexed by the
            sampled indices, and summed to normalize the estimates.
        is_spam: Array aligned with ``views`` that marks positive records
            (presumably 0/1 or boolean -- confirm against callers).
        biases: Per-record bias values, forwarded unchanged as ``biases``
            to ``ml_sampler.biased_sample``.
        num_samples: Number of samples to request.

    Returns: a dictionary with the following entries
        'true_prevalence' - View-weighted percentage of positive records
            computed over the full population.
        'est_prevalence' - The estimate of that percentage derived from
            the sample.
        'sampled_positive_percent' - The percentage of sampled items that
            are positive (unweighted).
        'lower_bound' - First element of the estimated confidence interval,
            expressed as a percentage.
        'upper_bound' - Second element of the estimated confidence
            interval, expressed as a percentage.
        'bound_width' - ``np.ptp`` of the confidence interval.
        'coverage' - Truthy when the interval brackets 'true_prevalence'.
    """
    picked, pick_prob = ml_sampler.biased_sample(
        biases=biases, weights=views, num_samples=num_samples)

    picked_views = views[picked]
    picked_is_spam = is_spam[picked]

    # Both the point estimate and its interval take the same three inputs.
    estimator_inputs = (picked_views, pick_prob, picked_is_spam)
    est_pos_volume = ml_sampler.estimator(*estimator_inputs)
    interval = ml_sampler.estimated_confidence_interval(*estimator_inputs)

    # Everything below is reported as a percentage of the total view volume.
    total_views = views.sum()
    est_prevalence = est_pos_volume / total_views * 100.0
    interval = interval / total_views * 100.0
    true_prevalence = (views * is_spam).sum() / total_views * 100.0

    # Signed distance of each bound from the truth: the interval covers the
    #  true value when the lower offset is <= 0 and the upper offset is >= 0.
    offsets = interval - true_prevalence
    brackets_truth = offsets[0] <= 0 and offsets[1] >= 0

    # If this exceeds the prevalence, positive examples were over-sampled.
    sampled_positive_percent = picked_is_spam.mean() * 100.0

    stats = {}
    stats['true_prevalence'] = true_prevalence
    stats['est_prevalence'] = est_prevalence
    stats['sampled_positive_percent'] = sampled_positive_percent
    stats['lower_bound'] = interval[0]
    stats['upper_bound'] = interval[1]
    stats['bound_width'] = np.ptp(interval)
    stats['coverage'] = brackets_truth
    return stats
예제 #2
0
    def test_pdf(self):
        """Check the estimate when sampling is biased by the pdf reciprocal.

        The bias vector comes from
        ``ml_sampler.interpolated_pdf_reciprocal(self.scores)``. The estimated
        positive volume is divided by the total importance and handed to
        ``self.equal_assert``.
        """
        reciprocal_bias = ml_sampler.interpolated_pdf_reciprocal(self.scores)

        picked, pick_prob = ml_sampler.biased_sample(
            biases=reciprocal_bias,
            weights=self.importance,
            num_samples=self.num_samples,
        )

        picked_importance = self.importance[picked]
        picked_positive = self.is_positive[picked]

        estimate = ml_sampler.estimator(
            picked_importance, pick_prob, picked_positive)
        # Normalize the estimated volume into a fraction of total importance.
        estimate /= self.importance.sum()

        self.equal_assert(estimate)
예제 #3
0
    def test_sample(self):
        """Check the estimate when every record gets the same bias of 1.

        The estimated positive volume is divided by the total importance and
        handed to ``self.equal_assert``.
        """
        uniform_bias = np.ones(self.size)

        picked, pick_prob = ml_sampler.biased_sample(
            biases=uniform_bias,
            weights=self.importance,
            num_samples=self.num_samples,
        )

        picked_importance = self.importance[picked]
        picked_positive = self.is_positive[picked]

        estimate = ml_sampler.estimator(
            picked_importance, pick_prob, picked_positive)
        # Normalize the estimated volume into a fraction of total importance.
        estimate /= self.importance.sum()

        self.equal_assert(estimate)
예제 #4
0
def sample_stats(views, is_spam, scores, num_samples, bias_func=None):
    """Draw one biased sample and report the prevalence it implies.

    Args:
        views: Array of per-record weights. It is passed to
            ``ml_sampler.biased_sample`` as ``weights``, indexed by the
            sampled indices, and summed to normalize the estimate.
        is_spam: Array aligned with ``views`` that marks positive records
            (presumably 0/1 or boolean -- confirm against callers).
        scores: Array of model scores used to derive the sampling biases.
        num_samples: Number of samples to take.
        bias_func: Optional function applied to ``scores`` to produce the
            biases. When None, ``scores`` is used as the biases unchanged.

    Returns: a dictionary with the following entries
        'prevalence' - The estimate of the percentage of items that are
            positive in the overall population.
        'sampled_positive_percent' - The percentage of sampled items that
            are positive (unweighted).
    """
    # No bias function means the raw scores act as the biases.
    biases = scores if bias_func is None else bias_func(scores)

    picked, pick_prob = ml_sampler.biased_sample(
        biases=biases, weights=views, num_samples=num_samples)

    picked_views = views[picked]
    picked_is_spam = is_spam[picked]

    est_pos_volume = ml_sampler.estimator(
        picked_views, pick_prob, picked_is_spam)

    stats = {}
    # Estimated positive volume as a percentage of the total view volume.
    stats['prevalence'] = est_pos_volume / views.sum() * 100.0
    # If this exceeds the prevalence, positive examples were over-sampled.
    stats['sampled_positive_percent'] = picked_is_spam.mean() * 100.0
    return stats
예제 #5
0
    def test_bin_weights(self):
        """Check the estimate when biases come from per-bin weights.

        Ten evenly spaced values spanning the score range are used as bins,
        and ten weights running from 1 to 10 are given to
        ``ml_sampler.bin_weights_corrected``. The estimated positive volume is
        divided by the total importance and handed to ``self.equal_assert``.
        """
        score_bins = np.linspace(self.scores.min(), self.scores.max(), 10)
        per_bin_weight = np.linspace(1, 10, 10)

        binned_bias = ml_sampler.bin_weights_corrected(
            self.scores, score_bins, bin_weights=per_bin_weight)

        picked, pick_prob = ml_sampler.biased_sample(
            biases=binned_bias,
            weights=self.importance,
            num_samples=self.num_samples,
        )

        picked_importance = self.importance[picked]
        picked_positive = self.is_positive[picked]

        estimate = ml_sampler.estimator(
            picked_importance, pick_prob, picked_positive)
        # Normalize the estimated volume into a fraction of total importance.
        estimate /= self.importance.sum()

        self.equal_assert(estimate)