def test_fillNAs(self):
    """Check that fillNAs swaps NaN for a large negative value and keeps the rest.

    The frame is inspected after the call without using any return value, so
    the test expects fillNAs to modify the frame it is handed.
    """
    frame = pd.DataFrame({'A': [11, 22, np.nan, 44]})
    extractor = CategoricalFeatureExtraction()
    extractor.fillNAs(frame, ['A'])

    filled = frame['A']
    # Rows 0 and 1 hold real values and must come back untouched.
    for row, original in ((0, 11), (1, 22)):
        self.assertEqual(filled[row], original,
                         'Index {0} should be unchanged!'.format(row))
    # Row 2 started as NaN; only an upper bound on the replacement is asserted.
    self.assertTrue(filled[2] <= -99999999,
                    'Index 2 should be a large negative number!')
    self.assertEqual(filled[3], 44, 'Index 3 should be unchanged!')
def test_convertColumns_thresholdShouldDetermineIfColumnIsOneHotEncoded(self):
    """With one_hot_threshold=3, 'type' is one-hot encoded while 'animal' is kept.

    Uses the fixtures self.df1, self.df2 and self.type_onehot_columns, which
    are defined outside this method (presumably in setUp -- confirm).
    """
    CategoricalFeatureExtraction.convertColumns(
        [self.df1, self.df2], ['animal', 'type'], one_hot_threshold=3)

    # Expected outcome: 'animal' stays as a single (ordinal) column, while
    # 'type' is expanded into its one-hot columns and the source column removed.
    for df in [self.df1, self.df2]:
        for onehot_col in self.type_onehot_columns:
            # assertIn reports the actual column index on failure, unlike
            # assertTrue(x in y) which only says "False is not true".
            self.assertIn(onehot_col, df.columns,
                          'one-hot column \'{0}\' is missing!'.format(onehot_col))
        self.assertNotIn('type', df.columns,
                         'type column should have been deleted')
        self.assertIn('animal', df.columns,
                      'animal column should not have been deleted!')
def test_convertColumnsToOneHot(self):
    """convertColumnsToOneHot must replace 'animal' with its one-hot columns.

    Uses the fixtures self.df1, self.df2 and self.animal_onehot_columns, which
    are defined outside this method (presumably in setUp -- confirm). Columns
    that were not requested ('age') must survive the conversion.
    """
    cfe = CategoricalFeatureExtraction()
    cfe.convertColumnsToOneHot([self.df1, self.df2], ['animal'])

    for df in [self.df1, self.df2]:
        for onehot_col in self.animal_onehot_columns:
            # assertIn/assertNotIn include the real column index in the failure
            # output, which assertTrue(x in y) does not.
            self.assertIn(onehot_col, df.columns,
                          'one-hot column \'{0}\' is missing!'.format(onehot_col))
        self.assertNotIn('animal', df.columns,
                         'animal column should have been deleted!')
        self.assertIn('age', df.columns,
                      'age column should not have been deleted!')
def test_convertColumnsToOrdinal(self):
    """convertColumnsToOrdinal must keep all columns and recode 'age' values.

    Uses the fixtures self.df1 and self.df2, defined outside this method
    (presumably in setUp -- confirm). The assertions imply a single mapping
    shared by both frames: 11 -> 0, 22 -> 1, 33 -> 2.
    """
    cfe = CategoricalFeatureExtraction()
    cfe.convertColumnsToOrdinal([self.df1, self.df2], ['age'])

    # Ordinal conversion recodes values; it must not drop any column.
    for df in [self.df1, self.df2]:
        self.assertIn('animal', df.columns,
                      'animal column should not have been deleted!')
        self.assertIn('age', df.columns,
                      'age column should not have been deleted!')

    # (frame, row label, expected ordinal code, failure message)
    expected_codes = (
        (self.df1, 0, 0, '11 should map to 0!'),
        (self.df1, 1, 1, '22 should map to 1!'),
        (self.df1, 2, 0, '11 should map to 0!'),
        (self.df1, 3, 0, '11 should map to 0!'),
        (self.df2, 0, 0, '11 should map to 0!'),
        (self.df2, 1, 2, '33 should map to 2!'),
        (self.df2, 2, 2, '33 should map to 2!'),
    )
    for df, row, code, message in expected_codes:
        self.assertEqual(df['age'][row], code, message)
def test_removeUnsharedColumns(self):
    """Only the column present in every frame ('A') may survive the call.

    The four frames share nothing but 'A'. They are inspected after the call
    without using a return value, so the test expects removeUnsharedColumns to
    modify the frames it is given.
    """
    extractor = CategoricalFeatureExtraction()
    first = pd.DataFrame({'A': [1, 2, 3], 'B': [2, 3, 4], 'C': [3, 4, 5]})
    second = pd.DataFrame({'A': [1, 2, 3], 'B': [2, 3, 4]})
    third = pd.DataFrame({'A': [1, 2, 3], 'C': [2, 3, 4]})
    fourth = pd.DataFrame({'A': [1, 2, 3], 'D': [2, 3, 4]})
    extractor.removeUnsharedColumns([first, second, third, fourth])

    # Kept separate from the list passed above so the checks always look at
    # the original frame objects.
    frames = (first, second, third, fourth)

    # Pass 1: every frame must be down to exactly one column.
    count_messages = (
        '{0} columns found for df1 instead of 1!',
        '{0} columns found for df2! instead of 1',
        '{0} columns found for df3! instead of 1',
        '{0} columns found for df4! instead of 1',
    )
    for frame, template in zip(frames, count_messages):
        n_columns = len(frame.columns)
        self.assertEqual(n_columns, 1, template.format(n_columns))

    # Pass 2: that one remaining column must be 'A'.
    for position, frame in enumerate(frames, start=1):
        self.assertEqual(frame.columns[0], 'A',
                         "column for df{0} didn't contain 'A'!".format(position))