def transform(self, X, y=None):
    if isinstance(X, dict):
        # Single-row prediction case: X is a dict of {column_name: value}
        for col, col_dict in self.column_ranges.items():
            if col in X:
                X[col] = scale_val(val=X[col], min_val=col_dict['min_val'],
                                   total_range=col_dict['inner_range'],
                                   truncate_large_values=self.truncate_large_values)
    else:
        # Batch case: X is a DataFrame
        if len(self.cols_to_ignore) > 0:
            X = utils.safely_drop_columns(X, self.cols_to_ignore)
            # X = X.drop(self.cols_to_ignore, axis=1)

        for col, col_dict in self.column_ranges.items():
            if col in X.columns:
                min_val = col_dict['min_val']
                inner_range = col_dict['inner_range']
                X[col] = X[col].apply(lambda x: scale_val(x, min_val, inner_range, self.truncate_large_values))

    return X
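# A minimal, hypothetical sketch of the `scale_val` helper that transform() calls
# above. The real helper is defined elsewhere in this codebase, and its exact
# behavior and signature may differ. This sketch assumes plain min/max scaling
# against the fitted column range, with optional truncation of values that fall
# far outside the range seen during fit. The name `_scale_val_sketch` is
# illustrative only and is not used by the pipeline.
def _scale_val_sketch(val, min_val, total_range, truncate_large_values=False):
    # Guard against a zero-width range (constant column) to avoid dividing by zero
    if total_range == 0:
        return 0.0
    scaled = (val - min_val) / float(total_range)
    if truncate_large_values:
        # Clip to a generous window around [0, 1] rather than clamping hard, so
        # mildly out-of-range values at predict time remain informative
        scaled = max(-1.0, min(2.0, scaled))
    return scaled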
def train(self, raw_training_data, user_input_func=None, optimize_entire_pipeline=False,
          optimize_final_model=None, write_gs_param_results_to_file=True,
          perform_feature_selection=None, verbose=True, X_test=None, y_test=None,
          print_training_summary_to_viewer=True, ml_for_analytics=True, only_analytics=False,
          compute_power=3, take_log_of_y=None, model_names=None, perform_feature_scaling=True,
          ensembler=None):

    self.user_input_func = user_input_func
    self.optimize_final_model = optimize_final_model
    self.optimize_entire_pipeline = optimize_entire_pipeline
    self.write_gs_param_results_to_file = write_gs_param_results_to_file
    self.compute_power = compute_power
    self.ml_for_analytics = ml_for_analytics
    self.only_analytics = only_analytics
    self.X_test = X_test
    self.y_test = y_test
    self.print_training_summary_to_viewer = print_training_summary_to_viewer
    if self.type_of_estimator == 'regressor':
        self.take_log_of_y = take_log_of_y
    else:
        # Taking the log of y only makes sense for regression problems. Set this explicitly
        # so the check further down does not raise an AttributeError for classifiers.
        self.take_log_of_y = None
    self.model_names = model_names
    self.perform_feature_scaling = perform_feature_scaling
    self.ensembler = ensembler

    if verbose:
        print("Welcome to auto_ml! We're about to go through and make sense of your data using machine learning")

    # We accept input as either a DataFrame, or as a list of dictionaries. Internally, we use DataFrames. So if the user gave us a list, convert it to a DataFrame here.
    if isinstance(raw_training_data, list):
        X_df = pd.DataFrame(raw_training_data)
        del raw_training_data
    else:
        X_df = raw_training_data

    # Feature selection is only worth the overhead on wider datasets, unless the user explicitly asks for it
    if len(X_df.columns) < 50 and perform_feature_selection is not True:
        perform_feature_selection = False
    else:
        perform_feature_selection = True

    self.perform_feature_selection = perform_feature_selection

    # To keep this as light in memory as possible, immediately remove any columns that the user has already told us should be ignored
    if len(self.cols_to_ignore) > 0:
        X_df = utils.safely_drop_columns(X_df, self.cols_to_ignore)

    X_df, y = self._prepare_for_training(X_df)

    self.X_df = X_df
    self.y = y

    if self.take_log_of_y:
        y = [math.log(val) for val in y]
        self.took_log_of_y = True

    if verbose:
        print('Successfully performed basic preparations and y-value cleaning')

    if model_names is not None:
        estimator_names = model_names
    else:
        estimator_names = self._get_estimator_names()

    if self.type_of_estimator == 'classifier':
        # Brier score only applies to binary classification, so fall back to accuracy for multiclass problems
        if len(set(y)) > 2:
            scoring = accuracy_score
        else:
            scoring = utils_scoring.brier_score_loss_wrapper
        self._scorer = scoring
    else:
        scoring = utils_scoring.rmse_scoring
        self._scorer = scoring

    if verbose:
        print('Created estimator_names and scoring')

    self.perform_grid_search_by_model_names(estimator_names, scoring, X_df, y)

    # If we ran GridSearchCV, we will have to pick the best model
    # If we did not, the best trained pipeline will already be saved in self.trained_pipeline
    if self.fit_grid_search and len(self.grid_search_pipelines) > 1:
        # Once we have trained all the pipelines, select the best one based on its performance on (top priority first):
        # 1. Holdout data
        # 2. CV data
        # First, sort all of the tuples that hold our scores in their first position(s), and our actual trained pipeline in their final position
        # Since a more positive score is better, we want to make sure that the first item in our sorted list is the highest score, thus, reverse=True
        sorted_gs_pipeline_results = sorted(self.grid_search_pipelines, key=lambda x: x[0], reverse=True)

        # Next, grab the thing at position 0 in our sorted list, which is itself a list of the score(s), and the pipeline itself
        best_result_list = sorted_gs_pipeline_results[0]
        # Our best grid search result is the thing at the end of that list.
        best_trained_gs = best_result_list[-1]
        # And the pipeline is the best estimator within that grid search object.
        self.trained_pipeline = best_trained_gs.best_estimator_

    # Delete values that we no longer need that are just taking up space.
    del self.X_test
    del self.y_test
    del self.grid_search_pipelines
    del X_df
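# Example usage of train(), for illustration only. The enclosing class is assumed
# here to be exposed as something like `Predictor` with a `type_of_estimator`
# constructor argument; the exact class name, constructor arguments, and available
# model names in this project may differ.
#
#     import pandas as pd
#
#     df_train = pd.read_csv('train.csv')  # hypothetical training file with an output column
#     ml_predictor = Predictor(type_of_estimator='regressor')  # assumed constructor
#     ml_predictor.train(df_train, model_names=['GradientBoostingRegressor'],
#                        perform_feature_scaling=True, verbose=True)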
def _prepare_for_training(self, X):
    # We accept input as either a DataFrame, or as a list of dictionaries. Internally, we use DataFrames. So if the user gave us a list, convert it to a DataFrame here.
    if isinstance(X, list):
        X_df = pd.DataFrame(X)
        del X
    else:
        X_df = X

    # To keep this as light in memory as possible, immediately remove any columns that the user has already told us should be ignored
    if len(self.cols_to_ignore) > 0:
        X_df = utils.safely_drop_columns(X_df, self.cols_to_ignore)

    # Having duplicate columns can really screw things up later. Remove them here, with user logging to tell them what we're doing
    X_df = utils.drop_duplicate_columns(X_df)

    # If we're writing training results to file, create the new empty file name here
    if self.write_gs_param_results_to_file:
        self.gs_param_file_name = 'most_recent_pipeline_grid_search_result.csv'
        try:
            os.remove(self.gs_param_file_name)
        except OSError:
            pass

    # bad_rows = X_df[pd.isnull(X_df[self.output_column])]
    # if bad_rows.shape[0] > 0:
    #     print('We encountered a number of missing values for this output column')
    #     print('Specifically, here is the output column:')
    #     print(self.output_column)
    #     print('And here is the number of missing (nan, None, etc.) values for this column:')
    #     print(bad_rows.shape[0])
    #     print('We will remove these values, and continue with training on the cleaned dataset')
    #     X_df = X_df.dropna(subset=[self.output_column])

    # Remove the output column from the dataset, and store it into the y variable
    y = list(X_df.pop(self.output_column))

    # Drop all rows that have an empty value for our output column
    # User logging so they can adjust if they pass in a bunch of bad values:
    X_df, y = utils.drop_missing_y_vals(X_df, y, self.output_column)

    # If this is a classifier, try to turn all the y values into proper ints
    # Some classifiers play more nicely if you give them category labels as ints rather than strings, so we'll make our jobs easier here if we can.
    if self.type_of_estimator == 'classifier':
        # The entire column must be turned into ints. If any value fails, don't convert anything in the column
        try:
            y_ints = []
            for val in y:
                y_ints.append(int(val))
            y = y_ints
        except (ValueError, TypeError):
            pass
    else:
        # If this is a regressor, turn all the values into floats if possible, and remove this row if they cannot be turned into floats
        indices_to_delete = []
        y_floats = []
        bad_vals = []
        for idx, val in enumerate(y):
            try:
                float_val = utils_data_cleaning.clean_val(val)
                y_floats.append(float_val)
            except ValueError:
                indices_to_delete.append(idx)
                bad_vals.append(val)

        y = y_floats

        # Even more verbose logging here since these values are not just missing, they're strings for a regression problem
        if len(indices_to_delete) > 0:
            print('The y values given included some bad values that the machine learning algorithms will not be able to train on.')
            print('The rows at these indices have been deleted because their y value could not be turned into a float:')
            print(indices_to_delete)
            print('These were the bad values')
            print(bad_vals)
            # Drop the same rows from X_df, using positional lookup into the index
            X_df = X_df.drop(X_df.index[indices_to_delete])

    return X_df, y
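# A hypothetical sketch of the `utils.drop_missing_y_vals` helper that
# _prepare_for_training() relies on above. The real implementation lives in this
# project's utils module and may behave differently (e.g. different logging). This
# version simply drops every row whose y value is missing (None/NaN) and keeps
# X and y aligned. The name `_drop_missing_y_vals_sketch` is illustrative only.
def _drop_missing_y_vals_sketch(X_df, y, output_column):
    import pandas as pd

    # Positions (not index labels) of missing y values; pd.isnull catches both None and NaN
    missing_positions = [idx for idx, val in enumerate(y) if pd.isnull(val)]
    if len(missing_positions) > 0:
        print('Dropping %d rows with a missing value for the output column "%s"' % (len(missing_positions), output_column))
        # Assumes X_df has a unique index, so positional lookup maps cleanly onto .drop()
        X_df = X_df.drop(X_df.index[missing_positions])
        missing_set = set(missing_positions)
        y = [val for idx, val in enumerate(y) if idx not in missing_set]
    return X_df, y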