def cleaning_and_feature_pipeline():
    """Build the cleaning and feature-engineering pipeline.

    Returns:
        A ``Pipeline`` whose steps, in order: add the ``Title`` and
        ``FamilyCount`` columns, drop columns the model does not use,
        encode categorical columns numerically, median-impute missing
        values, bin ``Age`` and min-max scale the columns.
    """

    def _title(data):
        return parse_titles(data.Name)

    def _family_count(data):
        return data.Parch + data.SibSp

    def _median_impute(col):
        # The estimator expects a 2-D array, hence the single-column reshape.
        # NOTE(review): ``Imputer`` with ``missing_values='NaN'``/``axis`` is
        # the legacy scikit-learn API -- confirm the pinned version has it.
        imputer = Imputer(missing_values='NaN', strategy='median', axis=0)
        return imputer.fit_transform(col.values.reshape(-1, 1))

    def _min_max_scale(col):
        return MinMaxScaler().fit_transform(col.values.reshape(-1, 1))

    unused_columns = [
        'PassengerId', 'Name', 'Cabin', 'Parch', 'SibSp', 'Ticket',
    ]

    steps = [
        # Engineered features.
        AddColumnPipe(_title, 'Title'),
        AddColumnPipe(_family_count, 'FamilyCount'),
        # Columns that do not affect the model.
        DropColumnPipe(unused_columns),
        # Categorical -> numeric encoding.
        CategoryToNumericPipe(['Title', 'Sex', 'Embarked']),
        # Missing data is replaced by the column median.
        MapColumnPipe(_median_impute, columns=['Age', 'Fare', 'Embarked']),
        # Age is grouped into 3 bins.
        VariableToBinPipe(bins=3, columns=['Age']),
        # Normalization.
        MapColumnPipe(_min_max_scale),
    ]
    return Pipeline(steps)
def test_flush_dataIsNone_error(self):
    """Flushing ``None`` through a pipeline raises ``AssertionError``."""
    identity_only = Pipeline([identity])
    with self.assertRaises(AssertionError):
        identity_only.flush(None)
def train_pipeline():
    """Build the training pipeline.

    Returns:
        A ``Pipeline`` that engineers an ``ExtraArea`` feature, removes
        outliers, log-transforms selected columns, drops unused columns,
        fills missing values, encodes categories numerically and min-max
        scales the columns. Several ``PeekPipe`` steps print or inspect the
        intermediate data along the way.
    """

    def _extra_area(data):
        return (data['GarageArea'] + data['GrLivArea'] + data['1stFlrSF']
                + data['2ndFlrSF'] + data['TotalBsmtSF'])

    def _inspect_lot_frontage(data):
        return inspect_features(data, ['LotFrontage'])

    def _fill_absent(col):
        return col.fillna(-1)

    def _print_remaining_nulls(data):
        null_counts = data.isnull().sum()
        print(null_counts[null_counts > 0])

    def _mean_impute(col):
        # The estimator expects a 2-D array, hence the single-column reshape.
        # NOTE(review): ``Imputer`` with ``missing_values='NaN'``/``axis`` is
        # the legacy scikit-learn API -- confirm the pinned version has it.
        imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
        return imputer.fit_transform(col.values.reshape(-1, 1))

    def _min_max_scale(col):
        return MinMaxScaler().fit_transform(col.values.reshape(-1, 1))

    def _print_shape(data):
        print(data.shape)

    dropped_columns = [
        'Id', 'MSSubClass', 'ExterCond', 'Exterior2nd', 'Utilities',
        'GarageCond', 'GarageQual', 'GarageType', 'RoofMatl', 'RoofStyle',
        'Heating', 'HeatingQC', 'Street', 'MiscFeature', 'MiscVal',
        'BsmtFinType2', 'BsmtHalfBath', 'BsmtFullBath', 'FullBath',
        'HalfBath', 'BsmtExposure', 'Fence', 'SaleType', 'Alley',
        'LandSlope', 'LandContour', 'Electrical', '3SsnPorch',
        'EnclosedPorch', 'PavedDrive', 'Condition2', 'LowQualFinSF',
        'Foundation', 'TotRmsAbvGrd', 'PoolQC', 'BedroomAbvGr', 'LotConfig',
    ]

    # For these columns a NaN indicates the feature is absent.
    absent_when_nan = [
        'LotFrontage', 'MasVnrType', 'MasVnrArea', 'BsmtFinType1',
        'FireplaceQu', 'BsmtCond', 'BsmtQual', 'GarageFinish',
    ]

    steps = [
        # --- Feature engineering ---
        AddColumnPipe(_extra_area, 'ExtraArea'),
        RemoveOutliersPipe(['LotArea', 'ExtraArea']),
        MapColumnPipe(np.log, columns=['LotArea', 'ExtraArea', 'LotFrontage']),
        PeekPipe(_inspect_lot_frontage),
        # --- Feature picking ---
        DropColumnPipe(dropped_columns),
        # --- Missing values ---
        MapColumnPipe(_fill_absent, columns=absent_when_nan),
        # Garage build year can be filled with the house build year.
        fill_garage_blt,
        # Turn features into numeric values.
        CategoryToNumericPipe(['LotArea', 'LotFrontage'], excludes=True),
        # Impute whatever NaNs remain with the column mean.
        PeekPipe(_print_remaining_nulls),
        MapColumnPipe(_mean_impute),
        # --- Normalization ---
        MapColumnPipe(_min_max_scale),
        PeekPipe(_print_shape),
    ]
    return Pipeline(steps)
def test_flush_initial_value(self):
    """Flushing through ``identity`` then ``add_testpipe`` yields 1..10."""
    chained = Pipeline([identity, add_testpipe])
    expected = list(range(1, 11))
    self.assertEqual(chained.flush(self.data), expected)
def test_flush_pipes_and_functions(self):
    """A pipe object and a plain function can be mixed in one pipeline."""
    mixed = Pipeline([IdentityPipe(), identity])
    flushed = mixed.flush(self.data)
    self.assertEqual(flushed, self.data)
def test_flush_functions(self):
    """A pipeline made of the ``identity`` function returns its input."""
    function_only = Pipeline([identity])
    flushed = function_only.flush(self.data)
    self.assertEqual(flushed, self.data)
def test_flush_pipes(self):
    """A pipeline made of a single ``IdentityPipe`` returns its input."""
    pipe_only = Pipeline([IdentityPipe()])
    flushed = pipe_only.flush(self.data)
    self.assertEqual(flushed, self.data)
def fe_pipeline():
    """Build the feature-engineering pipeline.

    Returns:
        A ``Pipeline`` with a single step that maps ``normalize`` over the
        columns.
    """
    # NOTE: a UMAP projection step (n_components=2, metric='correlation',
    # verbose=1) used to follow the normalization and is currently disabled.
    steps = [MapColumnPipe(normalize)]
    return Pipeline(steps)