# Drop identifier-style columns (see `features_to_drop`) before modeling.
house_train1 = utils.drop_features(house_train, features_to_drop)
house_train1.info()

# Pipeline for categorical features: impute missing values with the most
# frequent category, then one-hot encode; categories unseen at fit time are
# ignored at transform time instead of raising.
# NOTE(review): `sparse=False` was renamed `sparse_output` in scikit-learn 1.2
# and removed in 1.4 — confirm the installed scikit-learn version.
categorical_pipeline = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer(strategy="most_frequent")),
    ('ohe', preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

# Pipeline for numerical features: mean imputation (SimpleImputer default)
# followed by standardization to zero mean / unit variance.
numerical_pipeline = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer()),
    ('scaler', preprocessing.StandardScaler())
])

# Route each column group through its matching preprocessing pipeline.
cat_features = utils.get_non_continuous_features(house_train1)
num_features = utils.get_continuous_features(house_train1)
preprocess_pipeline = compose.ColumnTransformer([
    ('cat', categorical_pipeline, cat_features),
    ('num', numerical_pipeline, num_features)
])

# End-to-end pipeline: preprocessing -> zero-variance filter -> Lasso-based
# feature selection -> PCA -> KNN regressor.
# BUG FIX: the original source never closed this Pipeline([...]) call,
# leaving the list and call unterminated.
complete_pipeline = pipeline.Pipeline([
    ('preprocess', preprocess_pipeline),
    ('zv_filter', feature_selection.VarianceThreshold()),
    ('feature_selector', feature_selection.SelectFromModel(linear_model.Lasso())),
    ('pca', decomposition.PCA()),
    ('regressor', neighbors.KNeighborsRegressor())
])
# Load the train/test splits from disk.
# NOTE(review): hard-coded Windows drive path — consider making this
# configurable (CLI arg or env var).
path = 'E://'
house_train = pd.read_csv(os.path.join(path, "house_train.csv"))
# BUG FIX: bare expressions like `house_train.shape` are no-ops in a script
# (they only echo in a REPL/notebook); print them so the inspection is visible.
print(house_train.shape)
house_train.info()
house_test = pd.read_csv(os.path.join(path, "house_test.csv"))
print(house_test.shape)
house_test.info()

# Stack train and test row-wise so subsequent type casts apply to both.
house = pd.concat((house_train, house_test), axis=0)
print(house.shape)
house.info()

print(utils.get_continuous_features(house))
print(utils.get_non_continuous_features(house))

# EDA: distributions of build/sale year and their relationship to SalePrice.
sns.countplot(x='YearBuilt', data=house_train)
sns.jointplot(x="SalePrice", y="YearBuilt", data=house_train)
# BUG FIX: FacetGrid's `size` argument was renamed `height` in seaborn 0.9
# and removed in later releases; `size=8` raises TypeError on modern seaborn.
sns.FacetGrid(house_train, hue="YearBuilt", height=8).map(sns.kdeplot, "SalePrice").add_legend()

sns.countplot(x='YrSold', data=house_train)
sns.jointplot(x="SalePrice", y="YrSold", data=house_train)
sns.FacetGrid(house_train, hue="YrSold", height=8).map(sns.kdeplot, "SalePrice").add_legend()

# MSSubClass is a numerically-coded category; cast it so downstream feature
# detection treats it as categorical rather than continuous.
features_to_cast = ['MSSubClass']
utils.cast_to_cat(house, features_to_cast)

# Id is a row identifier with no predictive value.
features_to_drop = ['Id']