def test_contains_works_with_AgeBin_and_string(self):
    """contains() must accept the candidate bin both as a string and as an AgeBin."""
    outer = AgeBin(10, 99)
    inner_text = "[10, 15)"

    # candidate passed in its string form
    self.assertTrue(outer.contains(inner_text))

    # same candidate, parsed into an AgeBin first
    inner_bin = AgeBin.from_string(inner_text)
    self.assertTrue(outer.contains(inner_bin))
def test_upsample_agebin_raises_if_data_does_not_contain_requested_agebin(self):
    """Each requested bin below must make upsample_agebin raise NotUpsampleable."""
    requested_bins = (
        '[0:10)',   # overlap lower age
        '[10:18)',  # overlap upper age
        '[4:16)',   # overlap both lower and upper ages (requesting too large an age range on both sides)
    )
    for bin_text in requested_bins:
        requested = AgeBin.from_string(bin_text)
        with self.assertRaises(NotUpsampleable):
            self.grouped_data.apply(
                upsample_agebin,
                age_bin=requested,
                aggregated_cols=self.aggregation_columns,
                weighted_cols=self.weighted_columns,
                weighting_col=self.weighting_column)
def test_upsample_agebin_raises_if_data_not_edge_aligned(self):
    """Each misaligned bin below must make upsample_agebin raise NotUpsampleable."""
    misaligned_bins = (
        '[6:10)',  # not lower-edge aligned
        '[5:14)',  # not upper-edge aligned
        '[6:14)',  # not lower or upper edge aligned
    )
    for bin_text in misaligned_bins:
        requested = AgeBin.from_string(bin_text)
        with self.assertRaises(NotUpsampleable):
            self.grouped_data.apply(
                upsample_agebin,
                age_bin=requested,
                aggregated_cols=self.aggregation_columns,
                weighted_cols=self.weighted_columns,
                weighting_col=self.weighting_column)
def test_equality_comparison(self):
    """== is true for identical bounds and false when either bound differs."""
    reference = AgeBin(0, 99)

    twin = AgeBin(0, 99)
    self.assertTrue(reference == twin)

    # first a different end, then a different start
    for start, end in ((0, 98), (1, 99)):
        different = AgeBin(start, end)
        self.assertFalse(reference == different)
def test_instantiation(self):
    """The constructor stores start, end and delimiter (default or explicit)."""
    # no delimiter given: the class default is expected
    plain = AgeBin(15, 49)
    self.assertEqual(plain.start, 15)
    self.assertEqual(plain.end, 49)
    self.assertEqual(plain.delimiter, AgeBin.DEFAULT_DELIMITER)

    # explicit delimiter is kept as given
    custom_delimiter = ', '
    custom = AgeBin(0, 99, delimiter=custom_delimiter)
    self.assertEqual(custom.start, 0)
    self.assertEqual(custom.end, 99)
    self.assertEqual(custom.delimiter, ', ')
def test_merge_raises_if_not_consecutive_ages(self):
    """merge() raises AgeBin.NotMergeable unless the other bin follows directly."""
    base = AgeBin(10, 15)
    distant = AgeBin(100, 200)

    # bins with a gap between them cannot be merged in either direction
    with self.assertRaises(AgeBin.NotMergeable):
        base.merge(other_bin=distant)
    with self.assertRaises(AgeBin.NotMergeable):
        distant.merge(other_bin=base)

    preceding = AgeBin(5, 10)

    # wrong order
    with self.assertRaises(AgeBin.NotMergeable):
        base.merge(other_bin=preceding)

    # right order
    combined = AgeBin(5, 15)
    self.assertEqual(preceding.merge(base), combined)
def test_merge_works_with_AgeBin_and_string(self):
    """merge() must accept the other bin both as a string and as an AgeBin."""
    combined = AgeBin(10, 99)
    first = AgeBin(10, 15)
    second_text = "[15:99)"

    # other bin given as a string
    self.assertEqual(first.merge(second_text), combined)

    # other bin given as a parsed AgeBin
    second_bin = AgeBin.from_string(second_text)
    self.assertEqual(first.merge(second_bin), combined)
def upsample_agebin(grouped_data, age_bin, aggregated_cols, weighted_cols, weighting_col):
    """
    Upsample a pandas DataFrame object containing an 'AgeBin' column to the requested age_bin.

    Intended to be supplied as a dataframe groupby argument (to run on each group). It is ok for data
    outside the requested upsample range to be in this dataframe; it will simply be excluded in the result.

    Example usage:
        age_stratified_dataframe.groupby(['Year', 'Gender']).apply(
            upsample_agebin, age_bin=AgeBin(15, 49), aggregated_cols=[...],
            weighted_cols=[...], weighting_col=...)

    :param grouped_data: the pandas DataFrame for one group, see above. Must have an 'AgeBin' column.
    :param age_bin: an AgeBin object representing inclusive lower and exclusive upper bounds.
    :param aggregated_cols: columns in the grouped data/dataframe to sum over the rows inside age_bin
    :param weighted_cols: columns in the grouped data/dataframe to do weighted sums of
    :param weighting_col: column whose values (normalized by their sum over the rows inside age_bin)
        are the weights applied to weighted_cols. It does not need to be listed in aggregated_cols.
    :return: A pandas DataFrame object with one row containing the requested AgeBin-upsampled result
    :raises NotUpsampleable: if AgeBin.can_upsample_bins reports that the 'AgeBin' values of
        grouped_data cannot be used to build age_bin.
    """
    # Verify we can do the requested upsample; this requires EXACT stitching of 'AgeBin' values to
    # contain age_bin, though data outside the requested range will be ignored here and in the upsampling.
    if not AgeBin.can_upsample_bins(grouped_data['AgeBin'], age_bin):
        raise NotUpsampleable('Cannot upsample to age bin: %s . Data is missing.' % age_bin)

    # filter out data rows that are out of our requested age range, e.g. [50:55) is not in range of [15:49)
    in_range = [age_bin.contains(ab) for ab in grouped_data['AgeBin']]
    filtered_df = grouped_data.loc[in_range]

    # grab row 0 and keep it as our base result; apply upsampled AgeBin
    result = grouped_data[0:1].reset_index(drop=True)
    result['AgeBin'] = str(age_bin)

    # aggregated data items: plain sums over the in-range rows
    for channel in aggregated_cols:
        result[channel] = np.sum(filtered_df[channel])

    # The weight total is computed directly rather than captured as a side effect of the loop above,
    # so the weighted sums also work when weighting_col is not one of the aggregated columns.
    weights = filtered_df[weighting_col]
    total_weight = np.sum(weights)

    # weighted sum items: model and reference data
    fraction = weights / total_weight
    for channel in weighted_cols:
        result[channel] = np.sum(fraction * filtered_df[channel])
    return result
def test_from_string_works_properly(self):
    """from_string() recovers start, end and a multi-character delimiter."""
    text = '[0:;:99)'
    parsed = AgeBin.from_string(text)

    self.assertEqual(parsed.start, 0)
    self.assertEqual(parsed.end, 99)
    self.assertEqual(parsed.delimiter, ':;:')
def test_can_upample_bins_raises_if_target_bin_not_contained_by_bins(self):
    """can_upsample_bins() is falsy for each target below, given bins covering 0 to 15."""
    available = [AgeBin(0, 5), AgeBin(5, 10), AgeBin(10, 15)]
    target_bounds = (
        (0, 16),
        (-1, 15),
        (15, 99),
        (16, 99),
        (-5, 0),
        (-5, 1),
    )
    for start, end in target_bounds:
        self.assertFalse(
            AgeBin.can_upsample_bins(bins=available, target_bin=AgeBin(start, end)))
def test_merge_bins_works_with_AgeBin_and_string(self):
    """merge_bins() accepts a list mixing string bins and AgeBin instances."""
    mixed_bins = ['[0:49)', AgeBin(49, 99)]
    combined = AgeBin.merge_bins(bins=mixed_bins)
    self.assertEqual(combined, AgeBin(0, 99))
def test_upsample_agebin_works(self):
    """Upsample the fixture to three bins and compare against hand-written frames."""

    def normalized(frame):
        # put rows in a deterministic order with a fresh 0..n-1 index
        by_gender = frame.sort_values('Gender')
        renumbered = by_gender.reset_index(drop=True)
        return renumbered.sort_index()

    def upsampled(bin_text):
        # run upsample_agebin on every group of the fixture for the given bin
        requested = AgeBin.from_string(bin_text)
        per_group = self.grouped_data.apply(
            upsample_agebin,
            age_bin=requested,
            aggregated_cols=self.aggregation_columns,
            weighted_cols=self.weighted_columns,
            weighting_col=self.weighting_column)
        return normalized(per_group)

    def expected_frame(bin_text, male, female):
        # male/female are (Prevalence, Sim_Prevalence, Count) triples
        records = []
        for gender, (prevalence, sim_prevalence, count) in (('Male', male), ('Female', female)):
            records.append({
                'Gender': gender,
                'AgeBin': bin_text,
                'Prevalence': prevalence,
                'Sim_Prevalence': sim_prevalence,
                'Count': count,
            })
        return normalized(pd.DataFrame(records))

    # these two bins are compared with DataFrame.equals, i.e. exactly
    exact_cases = (
        ('[5:10)', (0.1, 0.4, 5), (0.3, 0.2, 20)),
        ('[10:15)', (0.2, 0.3, 15), (0.4, 0.1, 20)),
    )
    for bin_text, male, female in exact_cases:
        actual = upsampled(bin_text)
        wanted = expected_frame(bin_text, male, female)
        self.assertTrue(actual.equals(wanted))

    wide_bin = '[5:15)'
    actual = upsampled(wide_bin)
    wanted = expected_frame(wide_bin, (0.175, 0.325, 20), (0.35, 0.15, 40))

    value_columns = ['Prevalence', 'Sim_Prevalence', 'Count']
    label_columns = ['Gender', 'AgeBin']

    # checking that numerical values are REALLY close; off a bit due to division in algorithm
    values_match = np.allclose(actual[value_columns], wanted[value_columns], atol=1e-16, rtol=0)
    self.assertTrue(values_match)

    # checking non-numerical values are EXACT
    self.assertTrue(actual[label_columns].equals(wanted[label_columns]))
def test_merge_bins_works_with_unsorted_AgeBins(self):
    """merge_bins() gives the combined bin even when the input is not in ascending order."""
    out_of_order = [AgeBin(49, 99), AgeBin(0, 49)]
    combined = AgeBin.merge_bins(bins=out_of_order)
    self.assertEqual(combined, AgeBin(0, 99))
def test_contains_works_properly(self):
    """Exercise contains() on bins outside, straddling, equal to and inside a reference bin."""
    outer = AgeBin(10, 99)

    # testing a variety of edge cases, both 'contained' and not 'contained'
    not_contained = ((0, 9), (0, 10), (0, 11), (98, 200), (99, 200), (100, 200))
    for start, end in not_contained:
        candidate = AgeBin(start, end)
        self.assertFalse(outer.contains(candidate))

    # identical bounds: each bin contains the other
    same = AgeBin(10, 99)
    self.assertTrue(outer.contains(same))
    self.assertTrue(same.contains(outer))

    for start, end in ((10, 15), (15, 30)):
        candidate = AgeBin(start, end)
        self.assertTrue(outer.contains(candidate))

    tail = AgeBin(90, 99)
    self.assertTrue(outer.contains(tail))
    # and check the inverse case...
    self.assertFalse(tail.contains(outer))
def test_merge_bins_works_properly(self):
    """merge_bins() collapses three consecutive bins into one spanning 0 to 15."""
    consecutive = [AgeBin(0, 5), AgeBin(5, 10), AgeBin(10, 15)]
    combined = AgeBin.merge_bins(bins=consecutive)
    self.assertEqual(combined, AgeBin(0, 15))
def test_merge_sets_proper_delimiter(self):
    """The merged bin carries the delimiter of the bin merge() was called on."""
    custom_delimiter = '###'
    left = AgeBin(5, 10, delimiter=custom_delimiter)
    right = AgeBin(10, 15)

    combined = left.merge(right)

    # sanity check: the two inputs really do use different delimiters
    self.assertNotEqual(left.delimiter, right.delimiter)
    self.assertEqual(combined.delimiter, '###')
def test_merge_bins_raises_if_unmergeable(self):
    """merge_bins() raises AgeBin.NotMergeable when the bins leave a gap (5 to 6 here)."""
    gapped = [AgeBin(0, 5), AgeBin(6, 10)]
    with self.assertRaises(AgeBin.NotMergeable):
        AgeBin.merge_bins(bins=gapped)
def test_can_upsample_bins_works_properly(self):
    """can_upsample_bins() is truthy for every target that lines up with the given bins."""
    available = [AgeBin(0, 5), AgeBin(5, 10), AgeBin(10, 15)]
    target_bounds = (
        (0, 15),
        (0, 10),
        (5, 10),
        (5, 15),
        (0, 5),
        (10, 15),
    )
    for start, end in target_bounds:
        self.assertTrue(
            AgeBin.can_upsample_bins(bins=available, target_bin=AgeBin(start, end)))
def test_can_upsample_bins_works_if_no_bins_are_provided(self):
    """With an empty list of bins, can_upsample_bins() is falsy rather than raising."""
    no_bins = []
    target = AgeBin(0, 5)
    self.assertFalse(AgeBin.can_upsample_bins(bins=no_bins, target_bin=target))
def test_can_upsample_bins_raises_if_target_bin_edges_do_not_line_up(self):
    """can_upsample_bins() is falsy when a target edge falls inside one of the given bins."""
    available = [AgeBin(0, 5), AgeBin(5, 10), AgeBin(10, 15)]

    # baseline: a target whose edges match the given bins is accepted
    self.assertTrue(AgeBin.can_upsample_bins(bins=available, target_bin=AgeBin(0, 15)))

    # a variety of misalignments relative to stated bins
    misaligned_bounds = ((0, 14), (5, 11), (4, 10), (1, 15))
    for start, end in misaligned_bounds:
        self.assertFalse(
            AgeBin.can_upsample_bins(bins=available, target_bin=AgeBin(start, end)))
def test_test_can_upsample_bins_works_with_AgeBin_and_string(self):
    """can_upsample_bins() accepts a list mixing AgeBin instances and string bins."""
    mixed_bins = [AgeBin(0, 5), '[5:::10)', AgeBin(10, 15)]
    target = AgeBin(5, 15)
    self.assertTrue(AgeBin.can_upsample_bins(bins=mixed_bins, target_bin=target))