def test_sorts_points(self):
    """Check that assigning percentiles leaves the points sorted by value.

    The points are created with descending values (10 down to 2), so the
    list starts out unsorted. After ``assign_percentiles_to_points`` has
    run, the same list must satisfy ``is_sorted`` when keyed on ``value``.
    """
    descending = [MockPoint(n) for n in range(10, 1, -1)]
    cutoffs = get_percentiles_for_points(descending)

    # Precondition: computing the percentile values must not already have
    # put the list in order, otherwise the final assertion proves nothing.
    self.assertFalse(is_sorted(descending, key=lambda pt: pt.value))

    assign_percentiles_to_points(descending, cutoffs)
    self.assertTrue(is_sorted(descending, key=lambda pt: pt.value))
def test_adds_percentiles(self):
    """Check that every point has a rank after percentiles are assigned.

    Before ``assign_percentiles_to_points`` is called, every point's
    ``rank`` is expected to be ``None``; afterwards none of them may be
    ``None``.
    """
    pts = [MockPoint(n) for n in range(10, 1, -1)]
    pvs = get_percentiles_for_points(pts)

    # all() inspects every point. A reduce() without an initial value would
    # use the first point itself as the accumulator, so that point's rank
    # would never be checked.
    self.assertTrue(all(pt.rank is None for pt in pts))

    assign_percentiles_to_points(pts, pvs)

    self.assertTrue(all(pt.rank is not None for pt in pts))
def test_uses_given_percentiles(self):
    """Check that every assigned rank is ``None`` or one of n/1000, n in 1..999."""
    # NOTE(review): the allowed ranks are only used for checking here; they
    # are never passed to get_percentiles_for_points. Presumably that
    # function defaults to the same set -- confirm, given the test's name.
    allowed_ranks = {n / 1000 for n in range(1, 1000)}

    def is_allowed(rank):
        # A point may legitimately be left without a rank.
        return rank is None or rank in allowed_ranks

    pts = [MockPoint(n) for n in range(100, 3, -4)]
    pvs = get_percentiles_for_points(pts)
    assign_percentiles_to_points(pts, pvs)

    # all() inspects every point. A reduce() without an initial value would
    # use the first point itself as the accumulator, so that point's rank
    # would never be validated.
    self.assertTrue(all(is_allowed(pt.rank) for pt in pts))
def values_less_than_their_percentile(self):
    """Check that each point's value fits under the cutoff of its assigned rank.

    For every point that received a rank, the value stored for that rank
    in the computed percentile values must be greater than or equal to
    the point's own value.
    """
    # NOTE(review): unlike the sibling tests, this method has no ``test_``
    # prefix, so unittest's default discovery will presumably skip it --
    # confirm whether that is intentional before renaming.

    # create points for our values (1-100)
    pts = [MockPoint(v) for v in range(1, 101)]
    pvs = get_percentiles_for_points(pts)

    # turn the (rank, cutoff value) pairs into a dictionary for fast lookups
    percentile_values = dict(pvs)

    # Pass the computed percentile values explicitly, as every other caller
    # of assign_percentiles_to_points does.
    assign_percentiles_to_points(pts, pvs)

    # now, for each point, we can look up the cutoff for the percentile it
    # was assigned, and check that this value is at least as large as the
    # value of the point (i.e. confirm that the point fits inside the bucket
    # it was assigned to)
    for pt in pts:
        with self.subTest(value=pt.value):
            value = pt.value
            rank = pt.rank
            if rank is not None:  # because this can happen...
                percentile_value = percentile_values[rank]
                self.assertLessEqual(value, percentile_value)
def handle(self, *args, **options):
    """Create random data points for one indicator and year, with percentiles.

    Reads these keys from ``options``:

    * ``indicator`` -- name of the Health_Indicator to fetch or create.
    * ``year`` -- passed on to ``_create_data_set``.
    * ``count`` -- maximum number of points; when falsy, the number of
      ``US_County`` rows is used instead.
    * ``mean`` -- passed on to ``_create_data_points``.
    * ``sigma`` -- standard deviation; when falsy, ``mean / 5`` is used.

    Progress messages are written to ``self.stdout``. The generated data
    points and one ``Percentile`` row per (rank, value) pair are saved
    with ``bulk_create``.
    """
    indicator_name = options['indicator']
    year = options['year']
    point_limit = options['count'] or US_County.objects.count()
    mean = options['mean']
    spread = options['sigma'] or mean / 5

    self.stdout.write(f"Creating random data for {indicator_name}, {year}")

    # https://docs.djangoproject.com/en/2.1/ref/models/querysets/#get-or-create
    # Only the model instance is needed, not the "created" flag.
    indicator, _ = Health_Indicator.objects.get_or_create(name=indicator_name)
    data_set = _create_data_set(indicator, year)
    self.stdout.write(
        f"Using indicator w/ id {indicator.id}, data set w/ id {data_set.id}"
    )

    random.seed()
    points = _create_data_points(data_set, point_limit, mean, spread)
    self.stdout.write(f"Created {len(points)} new data points")

    self.stdout.write("Adding percentiles...")
    cutoffs = get_percentiles_for_points(points)
    assign_percentiles_to_points(points, cutoffs)
    percentile_rows = [
        Percentile(rank=rank, value=cutoff, data_set=data_set)
        for rank, cutoff in cutoffs
    ]

    # https://docs.djangoproject.com/en/2.1/ref/models/querysets/#bulk-create
    # it mentions several caveats, but it seems sufficient for this use case
    self.stdout.write("Saving data points")
    Data_Point.objects.bulk_create(points)
    Percentile.objects.bulk_create(percentile_rows)