def run(self):
    # Create the output folder for each target of this task
    for target in self.output().values():
        utils.create_folder(target.path)

    # Read in X and y
    X_train = utils.load_data(
        self.input()['select_features']['X_train_filtered'].path)
    y_train = utils.load_data(self.input()['prepare_features']['y'].path)

    if not self.model['estimators']:
        raise Exception("Please provide a list of estimators to train!")

    # Iterate over the configured estimators, running a grid search to tune each one's hyper-parameters
    for model in self.model['estimators']:
        LOGGER.info('{}: Tuning model - {}'.format(repr(self), model["estimator"]))
        grid_search = self.do_grid_search(model, X_train, y_train)
        self.best_model_per_model_type(model, grid_search)

    # Log the best score per estimator type
    for best_model in self.best_estimator_per_model:
        LOGGER.info('{}: BEST {} - {}'.format(
            repr(self), best_model["model"]["estimator_type"],
            str(best_model["best_score"])))

    # Save the best models as this task's model package
    utils.save_data(self.best_estimator_per_model,
                    self.output()['model_package'].path)
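# --- Illustrative sketch, not part of the original pipeline ---
# The run() above relies on a model config with an "estimators" list and on helper methods such as
# do_grid_search() that are defined elsewhere on the task. The sketch below shows one way such a
# helper could be implemented with scikit-learn's GridSearchCV; the config keys ("estimator",
# "estimator_type", "parameter_values") are assumptions inferred from how the dicts are indexed in
# this section, not the project's confirmed schema.
from sklearn.model_selection import GridSearchCV

def do_grid_search(self, model, X_train, y_train):
    # Example of an entry in self.model["estimators"] under the assumed schema:
    # {"estimator": "sklearn.ensemble.RandomForestClassifier",
    #  "estimator_type": "RandomForestClassifier",
    #  "parameter_values": {"n_estimators": [100, 300], "max_depth": [3, 5, None]}}
    estimator = utils.import_object(model["estimator"])()

    # Cross-validated exhaustive search over the configured hyper-parameter grid
    grid_search = GridSearchCV(estimator,
                               param_grid=model["parameter_values"],
                               scoring="accuracy",
                               cv=5,
                               n_jobs=-1)
    grid_search.fit(X_train, y_train)
    return grid_search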
def run(self):
    # Create the output folder for each target of this task
    for target in self.output().values():
        utils.create_folder(target.path)

    final_model_package = {}

    if len(self.model["estimators"]) > 1:
        # Read the best models found during cross-validation
        best_models = utils.load_data(
            self.input()["cv"]["model_package"].path)

        # Read in X and y
        X_train = utils.load_data(
            self.input()['select_features']['X_train_filtered'].path)
        y_train = utils.load_data(
            self.input()['prepare_features']['y'].path)

        # Combine the tuned estimators into a soft-voting ensemble
        estimators = []
        for model in best_models:
            estimators.append(
                (model["model"]["estimator_type"], model["best_model"]))
        eclf = VotingClassifier(estimators=estimators, voting="soft")

        LOGGER.info('{}: Fitting ensemble model'.format(repr(self)))
        eclf.fit(X_train, y_train)

        # Package model
        final_model_package["final_model"] = eclf

    # Save the ensemble model package (empty if only a single estimator was configured)
    utils.save_data(final_model_package,
                    self.output()["ensemble_model"].path)
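# --- Illustrative sketch, not part of the original pipeline ---
# "soft" voting averages the predicted class probabilities of the base estimators, so every model
# passed to VotingClassifier above must implement predict_proba. A minimal, self-contained example
# of the same pattern (the estimator names and demo data are made up for illustration only):
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression

X_demo, y_demo = make_classification(n_samples=200, n_features=10, random_state=0)
demo_clf = VotingClassifier(
    estimators=[("lr", LogisticRegression(max_iter=1000)),
                ("rf", RandomForestClassifier(n_estimators=100))],
    voting="soft")
demo_clf.fit(X_demo, y_demo)
probas = demo_clf.predict_proba(X_demo)  # averaged class probabilities across the base estimators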
def run(self):
    # Create the output folder for each target of this task
    for target in self.output().values():
        utils.create_folder(target.path)

    # Load data
    training_data = utils.load_data("data/train.csv")
    test_data = utils.load_data("data/test.csv")

    # Combine train and test for the data transformation. This may not be the best strategy in many
    # real-world applications, since it leaks information from the test set into the imputation, but
    # for the purpose of improving the Kaggle score we include test here for a better imputation.
    combined = training_data.append(test_data, ignore_index=True)
    combined.drop(columns=["Survived"], inplace=True)

    # Fit and transform raw features
    LOGGER.info('{}: Transforming raw features'.format(repr(self)))
    transformer = TitanicFeatureTransformer()
    combined_transformed = transformer.fit_transform(combined)

    # Split back into train and test features
    X_train = combined_transformed[:len(training_data)]
    X_test = combined_transformed[len(training_data):]
    y = training_data["Survived"]

    # Save
    LOGGER.info('{}: Saving transformer and transformed features. {} rows of train data, '
                '{} rows of test data, {} columns.'.format(repr(self), str(len(X_train)),
                                                           str(len(X_test)), str(X_train.shape[1])))
    utils.save_data(transformer, self.output()["transformer"].path)
    utils.save_data(X_train, self.output()["X_train"].path)
    utils.save_data(X_test, self.output()["X_test"].path)
    utils.save_data(y, self.output()["y"].path)
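# --- Illustrative sketch, not part of the original pipeline ---
# TitanicFeatureTransformer is not shown in this section; the run() above only relies on it exposing
# the scikit-learn fit/transform API plus a get_column_order() helper (used later by the
# feature-selection step). A minimal skeleton of such a transformer, with a made-up subset of the
# raw Titanic columns, might look like this:
from sklearn.base import BaseEstimator, TransformerMixin

class ExampleTitanicTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._columns = ["Pclass", "Age", "Fare"]  # illustrative subset of raw columns

    def fit(self, X, y=None):
        # Learn imputation statistics from the combined train/test frame
        self._age_median = X["Age"].median()
        self._fare_median = X["Fare"].median()
        return self

    def transform(self, X):
        out = X[self._columns].copy()
        out["Age"] = out["Age"].fillna(self._age_median)
        out["Fare"] = out["Fare"].fillna(self._fare_median)
        return out.values

    def get_column_order(self):
        # Column names in the order of the transformed feature matrix
        return self._columns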
def run(self):
    # Create the output folder for each target of this task
    for target in self.output().values():
        utils.create_folder(target.path)

    # Load data
    X_train = utils.load_data(self.input()["prepare_features"]["X_train"].path)
    X_test = utils.load_data(self.input()["prepare_features"]["X_test"].path)
    y = utils.load_data(self.input()["prepare_features"]["y"].path)
    transformer = utils.load_data(self.input()["prepare_features"]["transformer"].path)

    # Feature selection: fit the configured estimator and keep only the important features
    LOGGER.info('{}: Selecting features'.format(repr(self)))
    feature_selection_clf = utils.import_object(self.model["feature_selection"]["estimator"])()
    feature_selection_clf.set_params(**self.model["feature_selection"]["parameter_values"])
    feature_selection_model = SelectFromModel(feature_selection_clf)
    feature_selection_model.fit(X_train, y)
    X_train_filtered = feature_selection_model.transform(X_train)
    X_test_filtered = feature_selection_model.transform(X_test)

    # Save
    LOGGER.info('{}: Saving feature selection model and selected features. {} columns of data '
                'selected.'.format(repr(self), str(X_train_filtered.shape[1])))
    columns = transformer.get_column_order()
    selected_mask = feature_selection_model.get_support()  # boolean mask of the columns kept
    columns_selected = list(compress(columns, list(selected_mask)))
    LOGGER.info('{}: Selected columns: {}'.format(repr(self), ",".join(columns_selected)))

    utils.save_data(X_train_filtered, self.output()["X_train_filtered"].path)
    utils.save_data(X_test_filtered, self.output()["X_test_filtered"].path)
    utils.save_data(feature_selection_model, self.output()["feature_selection"].path)
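# --- Illustrative sketch, not part of the original pipeline ---
# The feature-selection step builds its estimator from the model config via utils.import_object, so
# the relevant slice of self.model presumably looks something like the dict below. The estimator
# path and parameter values are made up for illustration; only the "feature_selection", "estimator"
# and "parameter_values" keys are grounded in how the config is indexed above.
example_model_config = {
    "feature_selection": {
        "estimator": "sklearn.ensemble.RandomForestClassifier",
        "parameter_values": {"n_estimators": 100, "random_state": 0},
    },
}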
def run(self):
    # Create the output folder for each target of this task
    for target in self.output().values():
        utils.create_folder(target.path)

    # Read prediction data
    pred_data = utils.load_data("data/test.csv")
    predict_folder = os.path.dirname(self.output()["predictions"].path)

    # Read models and transformed prediction features
    best_individual_models = utils.load_data(
        self.input()["cv"]["model_package"].path)
    final_model = utils.load_data(
        self.input()["ensemble_clf"]["ensemble_model"].path)
    X_test_filtered = utils.load_data(
        self.input()["select_features"]["X_test_filtered"].path)

    # Write one submission file per tuned individual model
    for m in best_individual_models:
        clf = m["best_model"]
        prediction_df = self.make_prediction(clf, X_test_filtered,
                                             pred_data["PassengerId"])
        utils.save_data(
            prediction_df,
            os.path.join(
                predict_folder,
                m["model"]["estimator_type"] + "_" + str(m["best_score"]) + ".csv"))

    # Write the ensemble submission file if an ensemble was trained
    if len(self.model["estimators"]) > 1:
        eclf = final_model["final_model"]
        prediction_df = self.make_prediction(eclf, X_test_filtered,
                                             pred_data["PassengerId"])
        utils.save_data(
            prediction_df,
            os.path.join(predict_folder, "EnsembleClassifier.csv"))

    # Write an empty marker to the predictions target so the task's output exists
    utils.save_data("", self.output()["predictions"].path)
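# --- Illustrative sketch, not part of the original pipeline ---
# make_prediction() is a helper on the task that is not shown in this section. Given how it is
# called above, a minimal sketch that produces a Kaggle-style Titanic submission frame
# (PassengerId, Survived) could look like this; the exact column handling is an assumption.
import pandas as pd

def make_prediction(self, clf, X, passenger_ids):
    # Predict the Survived label for each test row and pair it with its PassengerId
    predictions = clf.predict(X)
    return pd.DataFrame({"PassengerId": passenger_ids.values,
                         "Survived": predictions.astype(int)})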
def run(self):
    # Create the output folder for each target of this task
    for target in self.output().values():
        utils.create_folder(target.path)

    # Record the build description as this run's log entry
    utils.save_data(self.model["build_description"],
                    self.output()["log_name"].path)