def test_group_targets(self):
    """Run scheme_A3 on the engine_B fixture and check the weighted sums.

    Two properties of the resulting weight column are asserted:

    * within every age group, the weights summed per ``profile_gender``
      value are close to ``[1.645, 1.855]``;
    * the weights summed per age group are all close to ``3.5``.
    """
    data = pd.read_csv(self.path_data_B)
    # Use a context manager so the meta file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(self.path_meta_B) as meta_file:
        meta = json.load(meta_file)
    weight = '_'.join(['weights', self.scheme_name_A3])

    # Run weights for scheme_A3
    engine_B = WeightEngine(data=data, meta=meta)
    engine_B.add_scheme(scheme=self.scheme_A3, key='identity', verbose=False)
    engine_B.run()
    # Look the scheme up through the same attribute that names the weight
    # column above, so the two cannot drift apart.
    data_A3 = engine_B.dataframe(self.scheme_name_A3)

    # check identical weighted column frequencies
    df = data_A3.pivot_table(values=[weight],
                             index=['profile_gender'],
                             columns=['age_group'],
                             aggfunc='sum')
    expected_per_gender = numpy.array([1.645, 1.855])
    for column in df.columns.tolist():
        self.assertTrue(
            numpy.allclose(df[column].values, expected_per_gender))

    # check weighted group frequencies have equal proportions
    values = data_A3.pivot_table(values=[weight],
                                 index=['age_group'],
                                 aggfunc='sum').values
    self.assertTrue(numpy.allclose(values, 3.5))
def test_wdf_structure(self):
    """Check the shape of the weighted dataframe for 'complex_filter'.

    A Rim scheme with two groups, each using a compound filter on ``Wave``
    and ``religion``, is run on the Example Data (A) file. The dataframe
    returned by the engine must expose exactly the expected set of columns
    (order is not significant) and contain 596 rows.
    """
    frame = pd.read_csv(self.path_data_exA)
    weight_engine = WeightEngine(frame)

    gender_props = [45.6, 54.4]
    locality_props = [10, 15, 20, 25, 30]
    # Codes are the 1-based positions of the proportions in each list.
    rim_targets = [
        {'gender': dict(enumerate(gender_props, start=1))},
        {'locality': dict(enumerate(locality_props, start=1))},
    ]

    rim = Rim('complex_filter')
    group_definitions = (
        ('W1, male', 'Wave==1 & religion==1'),
        ('W2, female', 'Wave==2 & religion==2'),
    )
    for group_name, group_filter in group_definitions:
        rim.add_group(name=group_name,
                      filter_def=group_filter,
                      targets=rim_targets)

    weight_engine.add_scheme(rim, key='unique_id', verbose=False)
    weight_engine.run()
    weighted = weight_engine.dataframe('complex_filter')

    actual_columns = weighted.columns.tolist()
    actual_columns.sort()
    expected_columns = [
        'unique_id', 'gender', 'locality', 'weights_complex_filter',
        'Wave', 'religion'
    ]
    expected_columns.sort()
    self.assertEqual(actual_columns, expected_columns)

    row_count = len(weighted.index)
    self.assertTrue(row_count == 596)
class TestEngine(unittest.TestCase):
    """Tests for WeightEngine driven by the Rim schemes built in setUp."""

    @staticmethod
    def _two_column_targets(column1, column2):
        """Return the targets list for 'column1' and 'column2'.

        Each argument is a list of proportions; it is converted to a
        ``{code: proportion}`` dict whose codes count up from 1 in list
        order.
        """
        return [
            {'column1': dict(enumerate(column1, start=1))},
            {'column2': dict(enumerate(column2, start=1))},
        ]

    def setUp(self):
        """Build fixture paths, engine_A/engine_B and the Rim schemes."""
        # Simple engine without meta - engine_A
        # Fixture paths are relative to the current working directory.
        self.path = './tests/'

        name_data_A = 'engine_A'
        self.path_data_A = '{}{}_data.csv'.format(self.path, name_data_A)

        name_data_B = 'engine_B'
        self.path_meta_B = '{}{}_meta.json'.format(self.path, name_data_B)
        self.path_data_B = '{}{}_data.csv'.format(self.path, name_data_B)

        name_data_exA = 'Example Data (A)'
        self.path_meta_exA = '{}{}.json'.format(self.path, name_data_exA)
        self.path_data_exA = '{}{}.csv'.format(self.path, name_data_exA)

        # Setup engine_A
        data = pd.read_csv(self.path_data_A)
        self.engine_A = WeightEngine(data=data)

        self.scheme_name_A1 = 'scheme_name_A1'
        self.scheme_name_A2 = 'scheme_name_A2'
        self.scheme_name_A3 = 'scheme_name_A3'

        # Setup schemes to use in tests
        self.scheme_A1 = Rim(self.scheme_name_A1)
        self.scheme_A1.target_cols = ['column1', 'column2']
        self.scheme_A1.add_group(
            name='Senior Type 1',
            filter_def='column3==1',
            targets=self._two_column_targets(
                [32.00, 31.00, 37.00],
                [23.13, 14.32, 4.78, 4.70, 2.65, 2.61, 3.47, 31.04, 13.3]))
        # NOTE(review): this group filters on column3==1, the same as
        # 'Senior Type 1' - confirm that column3==2 was not intended.
        self.scheme_A1.add_group(
            name='Senior Type 2',
            filter_def='column3==1',
            targets=self._two_column_targets(
                [33.40, 33.40, 33.20],
                [11.11, 11.11, 11.11, 11.11, 11.11, 11.11, 11.11, 11.11,
                 11.12]))
        self.scheme_A1.add_group(
            name='Senior Type 3',
            filter_def='column3==3',
            targets=self._two_column_targets(
                [33.2, 29.7, 37.1],
                [23.13, 14.32, 4.78, 4.70, 2.65, 2.61, 3.47, 31.04, 13.3]))
        self.scheme_A1.add_group(
            name='Senior Type 4',
            filter_def='column3==4',
            targets=self._two_column_targets(
                [33.2, 29.7, 37.1],
                [23.13, 14.32, 4.78, 4.70, 2.65, 2.61, 3.47, 32.34, 12.00]))

        self.scheme_A2 = Rim(self.scheme_name_A2)
        self.scheme_A2.target_cols = ['column1', 'column2']
        self.scheme_A2.add_group(
            name='Senior Type 1',
            filter_def='column3==1',
            targets=self._two_column_targets(
                [37.00, 32.00, 31.00],
                [13.3, 23.13, 14.32, 4.78, 4.70, 2.65, 2.61, 3.47, 31.04]))
        # NOTE(review): same filter as 'Senior Type 1' (column3==1) - confirm.
        self.scheme_A2.add_group(
            name='Senior Type 2',
            filter_def='column3==1',
            targets=self._two_column_targets(
                [33.2, 33.40, 33.40],
                [11.11, 11.11, 11.11, 11.11, 11.11, 11.11, 11.11, 11.11,
                 11.12]))
        self.scheme_A2.add_group(
            name='Senior Type 3',
            filter_def='column3==3',
            targets=self._two_column_targets(
                [37.1, 33.2, 29.7],
                [13.3, 23.13, 14.32, 4.78, 4.70, 2.65, 2.61, 3.47, 31.04]))
        self.scheme_A2.add_group(
            name='Senior Type 4',
            filter_def='column3==4',
            targets=self._two_column_targets(
                [37.1, 33.2, 29.7],
                [12.00, 23.13, 14.32, 4.78, 4.70, 2.65, 2.61, 3.47, 32.34]))

        self.scheme_A3 = Rim(self.scheme_name_A3)
        self.scheme_A3.target_cols = ['profile_gender']
        self.scheme_A3.targets = [{'profile_gender': {1: 47, 2: 53}}]
        # Every age group shares the same gender targets.
        for group_name, group_filter in (('11-19', 'age_group==2'),
                                         ('31-39', 'age_group==4'),
                                         ('41-49', 'age_group==5'),
                                         ('51-59', 'age_group==6')):
            self.scheme_A3.add_group(name=group_name,
                                     filter_def=group_filter,
                                     targets=self.scheme_A3.targets)
        self.scheme_A3.group_targets({
            '11-19': 25,
            '31-39': 25,
            '41-49': 25,
            '51-59': 25
        })

        # Complex engine with meta - engine_B
        data = pd.read_csv(self.path_data_B)
        # Close the meta file deterministically rather than leaking the handle.
        with open(self.path_meta_B) as meta_file:
            meta = json.load(meta_file)

        self.scheme_name_B1 = 'scheme_name_B1'
        # Kept on the instance (it used to be an unused local) so it sits
        # next to engine_A; the tests below still build their own engines.
        self.engine_B = WeightEngine(data=data, meta=meta)

        # Setup schemes to use in tests
        self.scheme_B1 = Rim(self.scheme_name_B1)
        self.scheme_B1.target_cols = ['profile_gender', 'age_group']
        # NOTE: no targets are set on scheme_B1 here.

    def test_constructor(self):
        """A fresh engine holds a dataframe, drops NA and has no schemes."""
        data = pd.read_csv(self.path_data_B)
        with open(self.path_meta_B) as meta_file:
            meta = json.load(meta_file)
        engine_B = WeightEngine(data=data, meta=meta)

        self.assertIsNotNone(engine_B._df)
        self.assertTrue(engine_B.dropna)
        self.assertEqual(engine_B.schemes, {})
        self.assertIsInstance(engine_B.schemes, dict)

    def test_add_scheme_and_dataframe(self):
        """Schemes are registered by name and run() changes their weights."""
        # A list of scheme names used in setUp used for comparison
        scheme_names = [self.scheme_name_A1, self.scheme_name_A2]

        self.engine_A.add_scheme(scheme=self.scheme_A2,
                                 key='identity',
                                 verbose=False)

        # Should now contain a dict with scheme_name_A2 as the first key.
        # next(iter(...)) works on Python 2 and 3; dict.keys()[0] fails on
        # Python 3 because dict views are not subscriptable.
        self.assertEqual(next(iter(self.engine_A.schemes)),
                         self.scheme_name_A2)

        self.engine_A.add_scheme(scheme=self.scheme_A1,
                                 key='identity',
                                 verbose=False)

        # Should now contain a dict with scheme_name_A2 and scheme_name_A1
        # as keys
        for key in self.engine_A.schemes:
            self.assertIn(key, scheme_names)
            self.assertIn('identity', self.engine_A.schemes[key]['key'])

        # Sets weights_scheme_name_A1 and weights_scheme_name_A2 to ones.
        # numpy is used directly: the pd.np alias no longer exists in
        # current pandas releases.
        row_count = len(self.engine_A._df)
        self.engine_A._df[self.scheme_A1._weight_name()] = numpy.ones(
            row_count)
        self.engine_A._df[self.scheme_A2._weight_name()] = numpy.ones(
            row_count)

        for key in self.engine_A.schemes:
            weight_column = 'weights_' + key
            self.assertTrue((self.engine_A._df[weight_column] == 1).all())
            self.engine_A.run(schemes=[key])
            # Re-read the column after run() so the check does not depend
            # on a previously fetched Series being mutated in place.
            self.assertFalse((self.engine_A._df[weight_column] == 1).all())

    def test_add_scheme_no_key(self):
        """The registered scheme entry carries a non-None 'key'.

        NOTE(review): despite the name, a key is passed explicitly here.
        """
        self.engine_A.add_scheme(scheme=self.scheme_A1,
                                 key='identity',
                                 verbose=False)
        self.assertIsNotNone(self.engine_A.schemes[self.scheme_name_A1]['key'])

    # This test used to be switched off by a bare ``return`` as its first
    # statement, which reported it as passing. Skipping makes the disabled
    # state visible in test reports.
    # NOTE(review): the reason it was disabled is not recorded - confirm
    # before re-enabling.
    @unittest.skip('disabled: body was previously bypassed by an early return')
    def test_weight_lazy(self):
        """weight() should add the weight columns for both schemes."""
        self.engine_A.add_scheme(scheme=self.scheme_A2,
                                 key='identity',
                                 verbose=False)
        self.engine_A.add_scheme(scheme=self.scheme_A1,
                                 key='identity',
                                 verbose=False)
        self.assertNotIn('weights_scheme_name_A2', self.engine_A._df.columns)
        self.engine_A.weight()
        self.assertIn('weights_%s' % self.scheme_name_A1,
                      self.engine_A._df.columns)
        self.assertIn('weights_%s' % self.scheme_name_A2,
                      self.engine_A._df.columns)

    def test_group_targets(self):
        """Run scheme_A3 on the engine_B data and check the weighted sums."""
        data = pd.read_csv(self.path_data_B)
        with open(self.path_meta_B) as meta_file:
            meta = json.load(meta_file)
        weight = '_'.join(['weights', self.scheme_name_A3])

        # Run weights for scheme_A3
        engine_B = WeightEngine(data=data, meta=meta)
        engine_B.add_scheme(scheme=self.scheme_A3,
                            key='identity',
                            verbose=False)
        engine_B.run()
        # Same attribute that names the weight column above.
        data_A3 = engine_B.dataframe(self.scheme_name_A3)

        # check identical weighted column frequencies
        df = data_A3.pivot_table(values=[weight],
                                 index=['profile_gender'],
                                 columns=['age_group'],
                                 aggfunc='sum')
        expected_per_gender = numpy.array([1.645, 1.855])
        for column in df.columns.tolist():
            self.assertTrue(
                numpy.allclose(df[column].values, expected_per_gender))

        # check weighted group frequencies have equal proportions
        values = data_A3.pivot_table(values=[weight],
                                     index=['age_group'],
                                     aggfunc='sum').values
        self.assertTrue(numpy.allclose(values, 3.5))

    # NOTE(review): 'vaidate' looks like a typo for 'validate'; the name is
    # kept so existing test ids keep working.
    def test_vaidate_targets(self):
        """Check the summary frame returned by Rim.validate()."""
        data = pd.read_csv(self.path_data_exA)
        engine = WeightEngine(data)

        targets_gender = [45.6, 54.4]
        targets_locality = [10, 15, 20, 25, 30]
        # Codes are the 1-based positions of the proportions in each list.
        weight_targets = [
            {'gender': dict(enumerate(targets_gender, start=1))},
            {'locality': dict(enumerate(targets_locality, start=1))},
        ]

        scheme = Rim('missing_data')
        scheme.set_targets(weight_targets)
        engine.add_scheme(scheme, key='unique_id', verbose=False)

        validate_df = scheme.validate()
        # assertEqual reports both sides on failure, unlike assertTrue(a == b).
        self.assertEqual(validate_df.columns.tolist(),
                         ['missing', 'mean', 'mode', 'median'])
        self.assertEqual(validate_df.index.tolist(), ['gender', 'locality'])
        self.assertEqual(validate_df.values.tolist(),
                         [[0.0, 2.0, 2.0, 2.0], [177.0, 2.0, 1.0, 2.0]])

    def test_wdf_structure(self):
        """Check columns and row count of the 'complex_filter' dataframe."""
        data = pd.read_csv(self.path_data_exA)
        engine = WeightEngine(data)

        targets_gender = [45.6, 54.4]
        targets_locality = [10, 15, 20, 25, 30]
        weight_targets = [
            {'gender': dict(enumerate(targets_gender, start=1))},
            {'locality': dict(enumerate(targets_locality, start=1))},
        ]

        scheme = Rim('complex_filter')
        scheme.add_group(name='W1, male',
                         filter_def='Wave==1 & religion==1',
                         targets=weight_targets)
        scheme.add_group(name='W2, female',
                         filter_def='Wave==2 & religion==2',
                         targets=weight_targets)
        engine.add_scheme(scheme, key='unique_id', verbose=False)
        engine.run()

        wdf = engine.dataframe('complex_filter')
        # Compare as sorted lists: the test is about which columns are
        # present, not the order in which they come back.
        self.assertEqual(
            sorted(wdf.columns.tolist()),
            sorted([
                'unique_id', 'gender', 'locality', 'weights_complex_filter',
                'Wave', 'religion'
            ]))
        self.assertEqual(len(wdf.index), 596)