def test_imputeFeature_value(self):
    """Check that method='value' leaves methodValue (-5) at row 2 of column 'b'."""
    frame = self.dfs['nan2'].copy()  # work on a copy so the shared fixture stays intact
    utdata.imputeFeature(data=frame, feature='b', method='value',
                         methodValue=-5)
    imputed = frame.loc[2, 'b']
    self.assertAlmostEqual(imputed, -5)
def test_imputeFeature_regress(self):
    """Check that method='linear' (feature 'a' excluded) yields 1 and 2 at rows 1 and 2 of 'b'."""
    frame = self.dfs['nan3'].copy()  # work on a copy so the shared fixture stays intact
    utdata.imputeFeature(data=frame, feature='b', method='linear',
                         methodExclude=['a'])
    # (row label, expected value in column 'b'), checked in this order
    for row, expected in ((1, 1), (2, 2)):
        self.assertAlmostEqual(frame.loc[row, 'b'], expected)
def test_imputeFeature_trivial_mean_median_mode(self):
    """Check that 'mean', 'median' and 'mode' each turn fixture 'nan1' into fixture 'full'.

    Every method runs in its own subTest, so a failure of one method is
    reported without hiding the others.
    """
    methods = ('mean', 'median', 'mode')
    for method in methods:
        with self.subTest(method=method):
            frame = self.dfs['nan1'].copy()
            utdata.imputeFeature(data=frame, feature='a', method=method)
            # dtypes are deliberately not compared, only values and labels
            pd.testing.assert_frame_equal(frame, self.dfs['full'],
                                          check_dtype=False)
def _featuresPipeline(data: pd.DataFrame,
                      sibSpCutoff: Union[None, int] = 1,
                      parchCutoff: Union[None, int] = 1,
                      ageImputeMethod: str = 'mean',
                      syntheticFeatures: bool = False) -> pd.DataFrame:
    """
    Run the complete feature-cleaning pipeline on a copy of the input.

    Steps, in order: impute and one-hot encode Embarked and Sex, optionally
    derive synthetic features, drop Cabin/Ticket/Name, clip and impute
    Fare and Age, and finally cap SibSp and Parch.

    Args
        data: DataFrame with features as columns; it is copied, not modified
        sibSpCutoff: SibSp values above this level are set to it;
            None disables the cap
        parchCutoff: Parch values above this level are set to it;
            None disables the cap
        ageImputeMethod: Method forwarded to utdata.imputeFeature for the
            Age feature, e.g. 'mean', 'median', 'mode'
        syntheticFeatures: Add the indicator columns CabinNan and AgeNan and
            one-hot encoded Title columns (TitleMr and TitleMrs are dropped)

    Returns
        New DataFrame with transformed features as columns

    Raises
        AssertionError: if Sex lacks 'male' or 'female', if Embarked lacks
            'S', 'C' or 'Q', or if any NaN is left after imputation
    """
    cleaned = data.copy()  # type: pd.DataFrame

    # -- Embarked, Sex
    # Every category has to occur at least once; the dummy columns that are
    # dropped / renamed below only exist for categories present in the data.
    for sexLabel in ('male', 'female'):
        assert sexLabel in cleaned.Sex.values
    for port in ('S', 'C', 'Q'):
        assert port in cleaned.Embarked.values
    utdata.imputeFeature(cleaned, feature='Embarked', method='mode',
                         verbose=False)
    cleaned = pd.get_dummies(cleaned, columns=['Embarked', 'Sex'],
                             prefix_sep='', dtype=utdata.CATEGORICAL_TYPE)
    cleaned.drop(columns=['EmbarkedQ', 'Sexmale'], inplace=True)
    cleaned.rename(columns={'Sexfemale': 'Female'}, inplace=True)

    # -- Cabin, Name, Ticket
    if syntheticFeatures:
        # Missing-value indicators, taken before Age is imputed and Cabin
        # is dropped.
        for flag, source in (('CabinNan', 'Cabin'), ('AgeNan', 'Age')):
            cleaned[flag] = cleaned[source].isna().astype(
                utdata.CATEGORICAL_TYPE)

        titles = cleaned.Name.apply(utdata.getTitle)
        # Map the French forms onto their English counterparts
        titles[titles == 'Mlle'] = 'Miss'
        titles[titles == 'Mme'] = 'Mrs'
        # Everything outside the four common titles is lumped into 'Rare'
        commonTitles = ['Mr', 'Miss', 'Mrs', 'Master']
        titles[~titles.isin(commonTitles)] = 'Rare'

        cleaned['Title'] = titles
        cleaned = pd.get_dummies(cleaned, columns=['Title'], prefix_sep='',
                                 dtype=utdata.CATEGORICAL_TYPE)
        cleaned.drop(columns=['TitleMr', 'TitleMrs'], inplace=True)
    cleaned.drop(columns=['Cabin', 'Ticket', 'Name'], inplace=True)

    # -- Fare, Age
    # Both features are clipped first, then imputed (Fare before Age).
    for feature in ('Fare', 'Age'):
        utdata.clipFeature(cleaned, feature, nStd=3)
    utdata.imputeFeature(cleaned, feature='Fare', method='mean',
                         verbose=False)
    utdata.imputeFeature(cleaned, feature='Age', method=ageImputeMethod,
                         methodValue=-100,
                         methodExclude=['Survived', 'EmbarkedS', 'EmbarkedC'],
                         verbose=False)
    # No NaN may survive; the per-column counts become the failure message
    missing = cleaned.isna().sum()
    assert missing.sum() == 0, missing

    # -- SibSp, Parch
    for feature, cutoff in (('SibSp', sibSpCutoff), ('Parch', parchCutoff)):
        if cutoff is None:
            continue
        cleaned.loc[cleaned[feature] > cutoff, feature] = cutoff

    # NOTE: Survived is not cast to a categorical type here.
    return cleaned
def test_imputeFeature_median(self):
    """Check that method='median' leaves 0.0 at row 2 of column 'b'."""
    frame = self.dfs['nan2'].copy()  # work on a copy so the shared fixture stays intact
    utdata.imputeFeature(data=frame, feature='b', method='median')
    imputed = frame.loc[2, 'b']
    self.assertAlmostEqual(imputed, 0.)