def sample_stats(views, is_spam, biases, num_samples):
    """Draw a biased sample and summarise the resulting prevalence estimate.

    Args:
        views: numpy.array of per-record weights. Used both as the sampling
            weights and as the denominator of every prevalence figure.
        is_spam: numpy.array, aligned with ``views``, indicating if the
            record is a positive class.
        biases: Per-record biases handed unchanged to
            ``ml_sampler.biased_sample``.
        num_samples: Number of samples to take.

    Returns:
        A dictionary with the following entries (all percentages are on a
        0-100 scale):
        'true_prevalence' - View-weighted percentage of positive records,
            computed from the full population.
        'est_prevalence' - ``ml_sampler.estimator`` output for the sample,
            divided by the total views and expressed as a percentage.
        'sampled_positive_percent' - Percentage of the sampled items that
            are positive.
        'lower_bound' - Element 0 of the rescaled confidence interval.
        'upper_bound' - Element 1 of the rescaled confidence interval.
        'bound_width' - ``np.ptp`` of the rescaled confidence interval.
        'coverage' - Whether 'true_prevalence' lies within
            ['lower_bound', 'upper_bound'] (inclusive).
    """
    index, p_sample = ml_sampler.biased_sample(
        biases=biases,
        weights=views,
        num_samples=num_samples,
    )

    sample_weights = views[index]
    sample_is_spam = is_spam[index]

    est_pos_volume = ml_sampler.estimator(
        sample_weights,
        p_sample,
        sample_is_spam,
    )
    confidence_interval = ml_sampler.estimated_confidence_interval(
        sample_weights,
        p_sample,
        sample_is_spam,
    )

    # Every figure below is normalised by the same population total, so
    # compute it once instead of re-summing the array for each one.
    total_views = views.sum()

    # H-H Estimator of prevalence
    est_prevalence = est_pos_volume / total_views * 100.0
    confidence_interval = confidence_interval / total_views * 100.0
    true_prevalence = (views * is_spam).sum() / total_views * 100.0

    lower_bound = confidence_interval[0]
    upper_bound = confidence_interval[1]

    # Percent of sampled entries that are positive. If this is greater
    # than prevalence then we have over-sampled positive examples.
    sampled_positive_percent = sample_is_spam.mean() * 100.0

    return {
        'true_prevalence': true_prevalence,
        'est_prevalence': est_prevalence,
        'sampled_positive_percent': sampled_positive_percent,
        'lower_bound': lower_bound,
        'upper_bound': upper_bound,
        'bound_width': np.ptp(confidence_interval),
        # The interval "covers" when the true value falls inside it.
        'coverage': lower_bound <= true_prevalence <= upper_bound,
    }
def test_pdf(self):
    """Check the estimate when biases come from interpolated_pdf_reciprocal.

    Biases are ``ml_sampler.interpolated_pdf_reciprocal(self.scores)``; the
    estimator output is normalised by the total importance and handed to
    ``self.equal_assert``.
    """
    pdf_biases = ml_sampler.interpolated_pdf_reciprocal(self.scores)
    chosen, chosen_prob = ml_sampler.biased_sample(
        biases=pdf_biases,
        weights=self.importance,
        num_samples=self.num_samples,
    )

    positive_volume = ml_sampler.estimator(
        self.importance[chosen],
        chosen_prob,
        self.is_positive[chosen],
    )

    # Normalise the estimated positive volume by the population total.
    total_importance = self.importance.sum()
    self.equal_assert(positive_volume / total_importance)
def test_sample(self):
    """Check the estimate when every record gets the same bias of one.

    Biases are ``np.ones(self.size)``; the estimator output is normalised by
    the total importance and handed to ``self.equal_assert``.
    """
    uniform_biases = np.ones(self.size)
    chosen, chosen_prob = ml_sampler.biased_sample(
        biases=uniform_biases,
        weights=self.importance,
        num_samples=self.num_samples,
    )

    positive_volume = ml_sampler.estimator(
        self.importance[chosen],
        chosen_prob,
        self.is_positive[chosen],
    )

    # Normalise the estimated positive volume by the population total.
    total_importance = self.importance.sum()
    self.equal_assert(positive_volume / total_importance)
def sample_stats(views, is_spam, scores, num_samples, bias_func=None):
    """Take one biased sample and report summary statistics for it.

    Args:
        views: numpy.array of per-record weights; used as the sampling
            weights and as the denominator of the prevalence estimate.
        is_spam: numpy.array of values indicating if the record is a
            positive class.
        scores: numpy.array of values indicating the model score.
        num_samples: Number of samples to take.
        bias_func: Function to apply to @scores to obtain the sampling
            biases. When None, the scores are used as the biases unchanged.

    Returns:
        a dictionary with the following entries
        'prevalence' - The estimate of the percentage of items that are
            positive in the overall population.
        'sampled_positive_percent' - The percentage of sampled items that
            are positive.
    """
    # No bias function means the raw scores act as the biases.
    sampling_biases = scores if bias_func is None else bias_func(scores)

    picked, pick_prob = ml_sampler.biased_sample(
        biases=sampling_biases,
        weights=views,
        num_samples=num_samples,
    )

    picked_views = views[picked]
    picked_is_spam = is_spam[picked]
    positive_volume = ml_sampler.estimator(
        picked_views,
        pick_prob,
        picked_is_spam,
    )

    return {
        # H-T Estimator of prevalence
        'prevalence': positive_volume / views.sum() * 100.0,
        # Percent of sampled entries that are positive. If this is greater
        # than prevalence then we have over-sampled positive examples.
        'sampled_positive_percent': picked_is_spam.mean() * 100.0,
    }
def test_bin_weights(self):
    """Check the estimate when biases come from bin_weights_corrected.

    Ten evenly spaced bin edges span the observed score range, paired with
    bin weights running linearly from 1 to 10. The estimator output is
    normalised by the total importance and handed to ``self.equal_assert``.
    """
    bin_edges = np.linspace(self.scores.min(), self.scores.max(), 10)
    per_bin_weights = np.linspace(1, 10, 10)
    binned_biases = ml_sampler.bin_weights_corrected(
        self.scores,
        bin_edges,
        bin_weights=per_bin_weights,
    )

    chosen, chosen_prob = ml_sampler.biased_sample(
        biases=binned_biases,
        weights=self.importance,
        num_samples=self.num_samples,
    )

    positive_volume = ml_sampler.estimator(
        self.importance[chosen],
        chosen_prob,
        self.is_positive[chosen],
    )

    # Normalise the estimated positive volume by the population total.
    total_importance = self.importance.sum()
    self.equal_assert(positive_volume / total_importance)