def train_and_pickle_best_model(target, X, y, val_X, val_y):
    print('AutoML Search for good model for {}'.format(target))
    pipeline_optimizer = TPOTRegressor(
        generations=10, population_size=150, cv=3,
        random_state=0xDEADBEEF, verbosity=3, scoring='r2',
        n_jobs=-1, early_stop=5,
        periodic_checkpoint_folder='tpot_checkpoint')
    pipeline_optimizer.fit(X, y)
    new_preds = pipeline_optimizer.predict(val_X)
    mae = mean_absolute_error(val_y, new_preds)
    rmse = sqrt(mean_squared_error(val_y, new_preds))
    r2 = r2_score(val_y, new_preds)
    print("TPOT mae:", mae)
    print("TPOT rmse:", rmse)
    print("TPOT R^2 score:", r2)
    pipeline_optimizer.export(
        'models/tpot_exported_pipeline_{}.py'.format(target))
    dump(pipeline_optimizer.fitted_pipeline_,
         'models/{}-best-model-automl.joblib'.format(target))
    return r2, mae, rmse
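# A minimal sketch (not part of the original snippet) of reloading the
# artifact saved by train_and_pickle_best_model() above. It assumes only the
# 'models/{target}-best-model-automl.joblib' naming pattern used in that
# function; `new_X` is a hypothetical feature matrix.
from joblib import load

def predict_with_saved_model(target, new_X):
    # The dumped object is the fitted sklearn pipeline itself, so it exposes
    # the usual predict() interface and TPOT is not needed at inference time.
    pipeline = load('models/{}-best-model-automl.joblib'.format(target))
    return pipeline.predict(new_X)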
def tpot_regression(x_calib, y_calib, x_prod, y_prod, results_direct,
                    cv_folds, error_metric, num_jobs, gens, pop,
                    mins, mins_per_pipeline, verbose,
                    early_stop_generations, tpot_config_dict,
                    model_name='tpot_best'):
    checkpoint_folder = results_direct + 'checkpoint_folder/'
    if not Path(checkpoint_folder).is_dir():
        os.mkdir(checkpoint_folder)

    ml_model = TPOTRegressor(generations=gens, population_size=pop,
                             scoring=error_metric, max_time_mins=mins,
                             cv=cv_folds, verbosity=verbose,
                             n_jobs=num_jobs,
                             early_stop=early_stop_generations,
                             max_eval_time_mins=mins_per_pipeline,
                             config_dict=tpot_config_dict,
                             periodic_checkpoint_folder=checkpoint_folder)
    ml_model.fit(x_calib, y_calib)

    # save entire pipeline
    ml_model.export(results_direct + model_name + '.py')
    joblib.dump(ml_model.fitted_pipeline_,
                results_direct + model_name + '.sav')

    # for cross-validation errors, see the exported model .py file

    # production - results and errors
    y_prod_predict = ml_model.predict(x_prod)
    np.save(results_direct + model_name + '_prod_predicted.npy',
            y_prod_predict)

    df_prod_errors = pd.DataFrame(index=['Mean Squared Error',
                                         'Median Absolute Error',
                                         'Correlation Coefficient', 'R2'])
    df_prod_errors['TPOT Best'] = [
        mean_squared_error(y_prod, y_prod_predict),
        median_absolute_error(y_prod, y_prod_predict),
        np.corrcoef(y_prod, y_prod_predict)[0][-1],
        r2_score(y_prod, y_prod_predict)
    ]
    df_prod_errors.to_csv(results_direct + model_name + '_prod_errors.csv')
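# A hypothetical invocation of tpot_regression() above on synthetic data;
# every literal here (directory name, budget values, the built-in
# 'TPOT light' reduced search space) is an illustrative assumption, not a
# value from the original code.
import os
import numpy as np

os.makedirs('results/', exist_ok=True)
rng = np.random.default_rng(0)
x = rng.normal(size=(200, 5))
y = x @ rng.normal(size=5) + rng.normal(scale=0.1, size=200)

tpot_regression(x[:150], y[:150], x[150:], y[150:],
                results_direct='results/', cv_folds=3,
                error_metric='neg_mean_absolute_error', num_jobs=-1,
                gens=5, pop=20, mins=10, mins_per_pipeline=2,
                verbose=2, early_stop_generations=3,
                tpot_config_dict='TPOT light')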
def build_regressor(data, name):
    X, y = data
    config = make_tpot_pmml_config(regressor_config_dict)
    del config["sklearn.neighbors.KNeighborsRegressor"]
    regressor = TPOTRegressor(generations=3, population_size=3,
                              random_state=13, config_dict=config,
                              verbosity=2)
    regressor.fit(X, y)
    pipeline = make_pmml_pipeline(regressor.fitted_pipeline_,
                                  active_fields=X.columns.values,
                                  target_fields=[y.name])
    print(repr(pipeline))
    store_pkl(pipeline, name)
    result = DataFrame(regressor.predict(X), columns=[y.name])
    store_csv(result, name)
def tpot_fit_pred(X_train, y_train, X_test, id_test,
                  name_dataset, id_name, target_name):
    tp = TPOTRegressor(verbosity=2)
    start_time = timer(None)
    tp.fit(X_train, y_train)
    tp.export('tpot_pipeline_dont_overfit.py')
    time = timer(start_time)
    preds = tp.predict(X_test)

    time_out = open(name_dataset + '_' + 'tpot', "w")
    time_out.write(time)
    time_out.close()

    submission = pd.DataFrame({id_name: id_test, target_name: preds})
    submission.to_csv('submission_' + name_dataset + '_' + 'tpot.csv',
                      index=False)
def model_selection_and_HPO(dataframe, target="job_performance",
                            test_size=0.25, r_seed=123):
    """
    Pass in the dataframe that has gone through feature selection.
    Uses the TPOTRegressor from TPOT to perform model selection and
    hyperparameter optimization. As this modeling involves some
    stochasticity, it may produce different results on every run; the
    longer it runs, the more similar the final models will look.
    Finally outputs a .py file with the selected model and its
    hyperparameters, which we can import.
    """
    from sklearn.model_selection import train_test_split
    import timeit
    from tpot import TPOTRegressor

    # train test split
    X_train, X_test, y_train, y_test = train_test_split(
        dataframe.loc[:, dataframe.columns != target].values,
        dataframe[target].values.ravel(),
        test_size=test_size,
        random_state=r_seed)
    y_train = y_train.ravel()
    y_test = y_test.ravel()

    # model selection and hyperparameter optimization with TPOTRegressor
    tpot_regressor = TPOTRegressor(generations=20, population_size=50,
                                   cv=10, random_state=r_seed,
                                   verbosity=2, memory='auto')

    start_time = timeit.default_timer()
    tpot_regressor.fit(X_train, y_train)
    y_pred = tpot_regressor.predict(X_test)
    end_time = timeit.default_timer()

    print(f"Total runtime for the Employee dataset: {end_time - start_time}s")
    print("TPOT Score: {}".format(tpot_regressor.score(X_test, y_test)))
    tpot_regressor.export('tpot_exported_pipeline.py')
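# For orientation: the 'tpot_exported_pipeline.py' file written by export()
# is a standalone script whose exact contents vary by TPOT version, but it
# is roughly of this shape — a plain sklearn pipeline plus placeholder data
# loading that you edit by hand ('PATH/TO/DATA/FILE' and 'COLUMN_SEPARATOR'
# are TPOT's own placeholders, not real values):
#
#   import pandas as pd
#   from sklearn.model_selection import train_test_split
#   from sklearn.pipeline import make_pipeline
#
#   tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')
#   features = tpot_data.drop('target', axis=1)
#   training_features, testing_features, training_target, testing_target = \
#       train_test_split(features, tpot_data['target'])
#   exported_pipeline = make_pipeline(...)  # the winning pipeline
#   exported_pipeline.fit(training_features, training_target)
#   results = exported_pipeline.predict(testing_features)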
def build_regressor(data, feature_pipeline, generations, population_size,
                    name):
    X, y = data
    Xt = feature_pipeline.fit_transform(X)
    Xt = Xt.astype(float)
    config = make_tpot_pmml_config(regressor_config_dict)
    config = filter_config(config)
    del config["sklearn.neighbors.KNeighborsRegressor"]
    regressor = TPOTRegressor(generations=generations,
                              population_size=population_size,
                              random_state=13, config_dict=config,
                              verbosity=2)
    regressor.fit(Xt, y)
    pipeline = Pipeline(steps=feature_pipeline.steps +
                        regressor.fitted_pipeline_.steps)
    pipeline = make_pmml_pipeline(pipeline,
                                  active_fields=X.columns.values,
                                  target_fields=[y.name])
    print(repr(pipeline))
    store_pkl(pipeline, name)
    result = DataFrame(regressor.predict(Xt), columns=[y.name])
    store_csv(result, name)
def callback(self, channel, method, properties, body):
    with self.lock:
        (symbol, X_train, X_test, y_train, y_test,
         folds_index) = decode_data(body)
        channel.basic_ack(delivery_tag=method.delivery_tag)
    logger.info("data received %s %d", symbol, folds_index)

    tpot = TPOTRegressor(memory='auto', generations=100,
                         population_size=100, n_jobs=-1,
                         max_time_mins=20, max_eval_time_mins=20,
                         config_dict='TPOT light')
    try:
        tpot.fit(X_train, y_train)
    except Exception as e:
        logger.error(e)
        data = (None, None, None, None)
        with self.lock:
            channel.basic_publish(exchange='',
                                  routing_key='tpot_pipelines',
                                  body=encode_data(data))
        return

    test_prediction = tpot.predict(X_test)
    test_prediction_error = abs((y_test - test_prediction) * 100 / y_test)
    score = tpot.score(X_test, y_test)
    logger.info("sending result of %s %s", symbol, folds_index)
    try:
        data = (tpot.fitted_pipeline_, score, folds_index, symbol)
        with self.lock:
            channel.basic_publish(exchange='',
                                  routing_key='tpot_pipelines',
                                  body=encode_data(data))
    except Exception:
        import pdb
        pdb.set_trace()
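# encode_data() and decode_data() are project helpers that are not shown
# here; a plausible minimal implementation (an assumption for illustration,
# not the project's actual code) is a pickle round-trip, which would also
# handle the fitted_pipeline_ object published back onto the queue:
import pickle

def encode_data(obj):
    return pickle.dumps(obj)

def decode_data(body):
    return pickle.loads(body)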
tpotReg1 = TPOTRegressor(generations=50, population_size=50,
                         max_time_mins=5, scoring='r2',
                         verbosity=3, n_jobs=4)
tpotReg2 = TPOTRegressor(generations=50, population_size=50,
                         max_time_mins=5, scoring='r2',
                         verbosity=3, n_jobs=4)

tpotReg1.fit(X_train, y_train1)
tpotReg2.fit(X_train, y_train2)

y_pred1 = tpotReg1.predict(X_test)
y_pred2 = tpotReg2.predict(X_test)
y_pred = np.hstack((y_pred1.reshape(-1, 1), y_pred2.reshape(-1, 1)))

printMetrics(y_true=y_test, y_pred=y_pred)
val_metrics = getMetrics(y_true=y_test, y_pred=y_pred)

y_pred1 = tpotReg1.predict(X_train)
y_pred2 = tpotReg2.predict(X_train)  # fixed: originally tpotReg1, which duplicated the first target's predictions
y_pred = np.hstack((y_pred1.reshape(-1, 1), y_pred2.reshape(-1, 1)))

printMetrics(y_true=y_train, y_pred=y_pred)
metrics = getMetrics(y_true=y_train, y_pred=y_pred)

tpotReg1.export('tpot_pipeline1.py')
tpotReg2.export('tpot_pipeline2.py')
y_train = y_train.ravel()
y_test = y_test.ravel()

## TPOT Model Performance
tpot_regressor_pipeline_selector = TPOTRegressor(
    generations=20,
    population_size=50,
    offspring_size=None,
    cv=10,
    random_state=42,
    verbosity=2,
    memory="auto",
    warm_start=True,
    use_dask=False,
    periodic_checkpoint_folder=PERIODIC_CHECKPOINT_FOLDER,
)
tpot_regressor_pipeline_selector.fit(X_train, y_train)
y_pred = tpot_regressor_pipeline_selector.predict(X_test)


def save_best_pipeline(selected_pipeline, filename):
    selected_pipeline.export(
        os.path.join(PERIODIC_CHECKPOINT_FOLDER, f"{filename}.py"))


# tpot_regressor_pipeline_selector.export(os.path.join(PERIODIC_CHECKPOINT_FOLDER, 'tpot_exported_pipeline.py'))
save_best_pipeline(tpot_regressor_pipeline_selector, "tpot_exported_pipeline")
def scoring(y_real, y_predicted):
    return sum(y_predicted)[-1] / (len(y_predicted) - 1)


for i in range(10):
    print('#' * 80)
    print(f'# GENERATION {i + 1}')
    print('#' * 80)

    x = np.array(walker.state_history[:-1])
    y = np.array([
        list(a) + [r]
        for a, r in zip(walker.action_history, walker.reward_history)
    ])
    walker.save_history(f'sillywalker{i + 1}')

    model = TPOTRegressor(generations=5, population_size=20,
                          scoring=scoring, verbosity=2,
                          config_dict=regressor_config_dict_light)
    model.fit(x, y)

    for _ in range(10):
        while not walker.done:
            s = walker.state
            prediction = model.predict(np.array([s]))[0]
            print(prediction)
            action = Action(*prediction[:-1])
            walker.step(action)
        walker.reset()
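# The bare (y_true, y_pred) callable above relies on TPOT accepting plain
# functions as `scoring`; a sklearn scorer object built with make_scorer is
# the more explicit spelling. A minimal sketch with an assumed toy metric:
import numpy as np
from sklearn.metrics import make_scorer
from tpot import TPOTRegressor

def neg_mean_abs_err(y_true, y_pred):
    # toy metric for illustration; negated so that higher is better
    return -float(np.mean(np.abs(np.asarray(y_true) - np.asarray(y_pred))))

custom_scorer = make_scorer(neg_mean_abs_err, greater_is_better=True)
model = TPOTRegressor(generations=5, population_size=20,
                      scoring=custom_scorer, verbosity=2)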
print(TDmodel_gmd_F.score(gmd_X_td_F, gmd_y_td_F))

# PS R2
gmd_df_ps_F = PS_data_F
gmd_df_ps_F = gmd_df_ps_F.dropna()
gmd_X_ps_F = gmd_df_ps_F.drop(['ageAtScan1', 'goassessDxpmr7', 'sex'], axis=1)
for column in gmd_X_ps_F:
    # assign the result back: pd.to_numeric returns a new Series
    gmd_X_ps_F[column] = pd.to_numeric(gmd_X_ps_F[column], errors='coerce')
gmd_y_ps_F = gmd_df_ps_F.ageAtScan1
gmd_y_ps_F = pd.to_numeric(gmd_y_ps_F, errors='coerce')
print(TDmodel_gmd_F.score(gmd_X_ps_F, gmd_y_ps_F))

# Create new columns in dataframe
# --- TD
# 1) real and predicted
gmd_df_td_F['pred_age'] = TDmodel_gmd_F.predict(gmd_X_td_F)
real_age_td_F = gmd_df_td_F.ageAtScan1
pred_age_td_F = TDmodel_gmd_F.predict(gmd_X_td_F)
gmd_df_td_F['diff_real_pred_age'] = real_age_td_F - pred_age_td_F
gmd_df_td_F['real_over18'] = gmd_df_td_F.ageAtScan1 >= 216
gmd_df_td_F['pred_over18'] = gmd_df_td_F.pred_age >= 216

# 2) age group indicators (ages in months)
gmd_df_td_F['8_9'] = ((gmd_df_td_F.ageAtScan1 >= 96) &
                      (gmd_df_td_F.ageAtScan1 < 120))
gmd_df_td_F['10_11'] = ((gmd_df_td_F.ageAtScan1 >= 120) &
                        (gmd_df_td_F.ageAtScan1 < 144))
gmd_df_td_F['12_13'] = ((gmd_df_td_F.ageAtScan1 >= 144) &
                        (gmd_df_td_F.ageAtScan1 < 168))
gmd_df_td_F['14_15'] = ((gmd_df_td_F.ageAtScan1 >= 168) &
                        (gmd_df_td_F.ageAtScan1 < 192))
gmd_df_td_F['16_17'] = ((gmd_df_td_F.ageAtScan1 >= 192) &
                        (gmd_df_td_F.ageAtScan1 < 216))
def model_dev(train_set, matchups, spreads):
    """
    Create the testing set for the algo creation
    """
    # Create a sample set to pass into the machine learning algorithm
    X = train_set[['rush_attempt_diff', 'turn_diff', 'yards_diff',
                   'third_diff', 'sack_diff', 'sack_ydiff',
                   'poss_diff', 'p_attempt_diff']].copy()
    # X = df[['poss_diff', 'third_diff', 'turn_diff', 'pass_diff', 'rush_diff']].copy()

    # Create results vector (a home win = 1, a home loss or tie = 0)
    train_set.rename(columns={'result_spread': 'class'}, inplace=True)
    y = train_set['class']  # np.array(np.where(df['home_score'] > df['away_score'], 1, 0))

    """
    Train, test, and predict the algorithm
    """
    # Scale the sample data
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)

    # Delete the dataframe to clear memory
    del train_set

    # Split out training and testing data sets
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.25, random_state=0)

    # alphas = [0.1, 0.3, 0.9, 1.0, 1.3, 1.9, 2.0, 2.3, 2.9]
    # for alpha in alphas:
    #     reg = linear_model.Ridge(alpha=alpha)
    #     reg.fit(X_train, y_train)
    #     print('alpha = ', alpha, ', score = ', reg.score(X_test, y_test))
    # input()

    pipeline_optimizer = TPOTRegressor(generations=5, population_size=10,
                                       random_state=42, cv=5,
                                       verbosity=2, n_jobs=3)  # , scoring='f1'
    pipeline_optimizer.fit(X_train, y_train)
    print(pipeline_optimizer.score(X_test, y_test))
    pipeline_optimizer.export('NFL_ML_TPOT_Regressor.py')

    # Remove the 'week', 'home_team', and 'away_team' columns from matchups
    # as they are not used in the algorithm
    matchups.drop(['week', 'home_team', 'away_team'], axis=1, inplace=True)

    """
    for feat in range(1, len(matchups.columns)):
        for c in C_vec:
            # Create the classifier and check the score
            # clf = LogisticRegression()
            clf = linear_model.LogisticRegression(C=c, random_state=42)
            selector = RFE(clf)
            selector = selector.fit(X_train, y_train)

            # Calculate probabilities using the predict_proba method for logistic regression
            probabilities = selector.predict_proba(scaler.transform(matchups))

            # Vectorize the spread_conversion function and apply the function
            # to the probabilities result vector
            vfunc = np.vectorize(spread_conversion)
            predicted_spreads = np.apply_along_axis(vfunc, 0, probabilities[:, 0])

            # If the actual line for the home team is lower than the predicted
            # line then you would take the away team, otherwise take the home team
            bet_vector = np.array(np.where(predicted_spreads > spreads, 0, 1))

            # Create the actual result vector where a tie counts as a loss for the home team
            game_result = np.array(np.where(home_score.ix[:, 0] + predicted_spreads[:] > away_score.ix[:, 0], 1, 0))

            # Check to see where the bet_vector equals the actual game result with the spread included
            result = np.array(np.where(bet_vector == game_result, 1, 0))
            prob_result = float(np.sum(result)) / len(result)
            # print('Number of features =', feat, 'C =', c, ' Percent correct =', prob_result)

            if prob_result > prob_val:
                prob_val = prob_result
                C_val = c
                feat_val = feat
                print('Score =', selector.score(X_test, y_test))

    # print(prob_val, C_val, feat)
    clf = linear_model.LogisticRegression(C=C_val, random_state=42)
    clf = clf.fit(X_train, y_train)
    probabilities = clf.predict_proba(scaler.transform(matchups))
    vfunc = np.vectorize(spread_conversion)
    predicted_spreads = np.apply_along_axis(vfunc, 0, probabilities[:, 0])
    """

    predicted_spreads = pd.DataFrame(
        pipeline_optimizer.predict(scaler.transform(matchups)),
        columns=['results'])
    bet_vector = np.array(np.where(predicted_spreads > spreads, 0, 1))

    print(spreads)
    print(predicted_spreads)
    print(bet_vector)
y = df.pop('progression')
X = df
# y.head()

# split training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# specify model (the TPOT assignment below overrides the linear baseline)
regr = linear_model.LinearRegression()
regr = TPOTRegressor(generations=5, population_size=50,
                     verbosity=2, n_jobs=-1)
# regr = linear_model.Ridge()
# regr = linear_model.Lasso()

# train the model using all data
# regr.fit(X, y)

# Train the model using the training sets
regr.fit(X_train, y_train)

# Explained variance score: 1 is perfect prediction
regr.score(X, y)
regr.score(X_train, y_train)
regr.score(X_test, y_test)

# Generate predictions, then append to df, then write to Excel
results = X_test
y_pred = regr.predict(X_test)
results['progression'] = y_test
results['pred_progression'] = y_pred
results.to_excel(r'diabetes.xls', header=True, index=True)
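# A side-by-side check of the commented-out linear baselines on the same
# data; this is an illustrative addition (not part of the original script)
# and assumes X and y from the snippet above.
from sklearn import linear_model
from sklearn.model_selection import cross_val_score

for baseline in (linear_model.LinearRegression(),
                 linear_model.Ridge(),
                 linear_model.Lasso()):
    scores = cross_val_score(baseline, X, y, cv=5, scoring='r2')
    print(type(baseline).__name__, 'mean CV R2:', scores.mean())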
class TPOTGaussianAdsorptionDiscoverer(AdsorptionDiscovererBase):
    '''
    This discoverer uses a Gaussian selection method with a TPOT model to
    select new sampling points.

    ...sorry for the awful code. This is a hack-job and I know it.
    '''
    # The width of the Gaussian selection curve
    stdev = 0.1

    def _train(self):
        '''
        Calculate the residuals of the current training batch, then retrain
        on everything
        '''
        # Instantiate the preprocessor and TPOT if we haven't done so already
        if not hasattr(self, 'preprocessor'):
            self._train_preprocessor()
        if not hasattr(self, 'tpot'):
            self.tpot = TPOTRegressor(generations=2,
                                      population_size=32,
                                      offspring_size=32,
                                      verbosity=2,
                                      scoring='neg_median_absolute_error',
                                      n_jobs=16,
                                      warm_start=True)
            features = self.preprocessor.transform(self.training_batch)
            energies = [doc['energy'] for doc in self.training_batch]
            self.tpot.fit(features, energies)

        # Calculate and save the residuals of this next batch
        features = self.preprocessor.transform(self.training_batch)
        tpot_predictions = self.tpot.predict(features)
        dft_energies = np.array([doc['energy'] for doc in self.training_batch])
        residuals = tpot_predictions - dft_energies
        self.residuals.extend(list(residuals))

        # Retrain
        self.training_set.extend(self.training_batch)
        self.__train_tpot()

    def _train_preprocessor(self):
        '''
        Trains the preprocessing pipeline and assigns it to the
        `preprocessor` attribute.
        '''
        # Open the cached preprocessor
        try:
            cache_name = 'caches/preprocessor.pkl'
            with open(cache_name, 'rb') as file_handle:
                self.preprocessor = pickle.load(file_handle)

        # If there is no cache, then remake it
        except FileNotFoundError:
            inner_fingerprinter = fingerprinters.InnerShellFingerprinter()
            outer_fingerprinter = fingerprinters.OuterShellFingerprinter()
            fingerprinter = fingerprinters.StackedFingerprinter(
                inner_fingerprinter, outer_fingerprinter)
            scaler = StandardScaler()
            pca = PCA()
            preprocessing_pipeline = Pipeline([
                ('fingerprinter', fingerprinter),
                ('scaler', scaler),
                ('pca', pca)
            ])
            preprocessing_pipeline.fit(self.training_batch)
            self.preprocessor = preprocessing_pipeline

            # Cache it for next time
            with open(cache_name, 'wb') as file_handle:
                pickle.dump(preprocessing_pipeline, file_handle)

    def __train_tpot(self):
        '''
        Train TPOT using the `training_set` attached to the class
        '''
        # Cache the current point for (manual) warm-starts, because there's a
        # solid chance that TPOT might cause a segmentation fault.
        cache_name = 'caches/%.3i_discovery_cache.pkl' % self.next_batch_number
        with open(cache_name, 'wb') as file_handle:
            cache = {'training_set': self.training_set,
                     'sampling_space': self.sampling_space,
                     'residuals': self.residuals,
                     'regret_history': self.regret_history,
                     'next_batch_number': self.next_batch_number,
                     'training_batch': self.training_batch}
            pickle.dump(cache, file_handle)

        # Instantiate the preprocessor and TPOT if we haven't done so already
        if not hasattr(self, 'preprocessor'):
            self._train_preprocessor()
        if not hasattr(self, 'tpot'):
            self.tpot = TPOTRegressor(generations=2,
                                      population_size=32,
                                      offspring_size=32,
                                      verbosity=2,
                                      scoring='neg_median_absolute_error',
                                      n_jobs=16,
                                      warm_start=True)

        # [Re-]train
        features = self.preprocessor.transform(self.training_set)
        energies = [doc['energy'] for doc in self.training_set]
        self.tpot.fit(features, energies)
        self.next_batch_number += 1

        # Try to address some memory issues by collecting garbage
        _ = gc.collect()  # noqa: F841

    def _choose_next_batch(self):
        '''
        Choose the next batch "randomly", where the probability of selecting
        sites is weighted using a combination of a Gaussian distribution and
        TPOT's prediction of their distance from the optimal energy.
        Snippets were stolen from the GASpy_feedback module.
        '''
        # Use the energies to calculate probabilities of selecting each site
        features = self.preprocessor.transform(self.sampling_space)
        energies = self.tpot.predict(features)
        gaussian_distribution = norm(loc=self.optimal_value, scale=self.stdev)
        probability_densities = [gaussian_distribution.pdf(energy)
                                 for energy in energies]

        # Perform a weighted shuffling of the sampling space such that sites
        # with better energies are more likely to be early in the list
        self.sampling_space = self.weighted_shuffle(self.sampling_space,
                                                    probability_densities)
        self._pop_next_batch()  # fixed: the original was missing the call parentheses

    @staticmethod
    def weighted_shuffle(sequence, weights):
        '''
        This function will shuffle a sequence using weights to increase the
        chances of putting higher-weighted elements earlier in the list.
        Credit goes to Nicky Van Foreest, whose function I based this off of.

        Args:
            sequence    A sequence of elements that you want shuffled
            weights     A sequence that is the same length as the `sequence`
                        that contains the corresponding probability weights
                        for selecting/choosing each element in `sequence`
        Returns:
            shuffled_list   A list whose elements are identical to those in
                            the `sequence` argument, but randomly shuffled
                            such that the elements with higher weights are
                            more likely to be in the front/start of the list.
        '''
        shuffled_list = np.empty_like(sequence)

        # Pack the elements in the sequences and their respective weights
        pairings = list(zip(sequence, weights))
        for i in range(len(pairings)):

            # Randomly choose one of the elements, and get the corresponding index
            cumulative_weights = np.cumsum([weight for _, weight in pairings])
            rand = random.random() * cumulative_weights[-1]
            j = bisect_right(cumulative_weights, rand)

            # Pop the element out so we don't re-select
            try:
                shuffled_list[i], _ = pairings.pop(j)

            # Hack a quick fix to some errors I don't feel like solving
            except IndexError:
                try:
                    shuffled_list[i], _ = pairings.pop(-1)
                except IndexError:
                    break

        return shuffled_list.tolist()
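# A small standalone check of weighted_shuffle() above (an illustrative
# addition, not part of the original module): with one weight much larger
# than the rest, that element should usually land first.
items = ['a', 'b', 'c', 'd']
weights = [0.1, 0.1, 0.1, 10.0]
print(TPOTGaussianAdsorptionDiscoverer.weighted_shuffle(items, weights))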
# Data Extraction
df = data_extract_e('e_20190609_15.pkl')

# Data Transformation and Engineering
df = feature_eng(df)
df = extract_queues(df)
dept_encoder, queue_encoder = fit_labels(df)
df = feature_transform(df, queue_encoder, dept_encoder)

# Training/Test Split
x, y = data_filter(df)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                    random_state=2468)

# Using TPOT AutoML
tpot = TPOTRegressor(n_jobs=-1, verbosity=1,
                     config_dict=xgb_config.xgb_config_dict)
tpot = tpot.fit(x_train, y_train)

y_pred = tpot.predict(x_train)
print('XGB TPOT training R2 score: ', r2_score(y_train, y_pred))
print('XGB TPOT training negative MSE: ', tpot.score(x_train, y_train))

y_pred = tpot.predict(x_test)
print('XGB TPOT test R2 score: ', r2_score(y_test, y_pred))
print('XGB TPOT test negative MSE: ', tpot.score(x_test, y_test))

tpot.export('xgb_tpot.py')
auto_classifier.fit(X_train, y_train)

# In[ ]:

# print("The cross-validation MSE")
# print(auto_classifier.score(X_valid, y_valid))

# In[ ]:

# Now do the prediction
test_result = auto_classifier.predict(test[feature_names].values)

sub = pd.DataFrame()
sub['id'] = test['id']
sub['trip_duration'] = np.exp(test_result)
sub.to_csv('NYCTaxi_TpotModels.csv', index=False)
sub.head()

# In[ ]:

# Export the model
auto_classifier.export('NYCTaxi_pipeline.py')

# That is it for now. You can run locally with larger numbers of generations,
# a bigger population, etc. to get a better result. Because of Kaggle's time
# limits I could not choose parameters that take longer to run.
test = combi[train.shape[0]:]
test.drop('Item_Outlet_Sales', axis=1, inplace=True)

## removing id variables
tpot_train = train.drop(['Outlet_Identifier', 'Item_Type',
                         'Item_Identifier'], axis=1)
tpot_test = test.drop(['Outlet_Identifier', 'Item_Type',
                       'Item_Identifier'], axis=1)
target = tpot_train['Item_Outlet_Sales']
tpot_train.drop('Item_Outlet_Sales', axis=1, inplace=True)

# finally building model using tpot library
from tpot import TPOTRegressor
X_train, X_test, y_train, y_test = train_test_split(tpot_train, target,
                                                    train_size=0.75,
                                                    test_size=0.25)

tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export(data + 'tpot_boston_pipeline.py')

## predicting using tpot optimised pipeline
tpot_pred = tpot.predict(tpot_test)
sub1 = pd.DataFrame(data=tpot_pred)
# sub1.index = np.arange(0, len(test) + 1)
sub1 = sub1.rename(columns={0: 'Item_Outlet_Sales'})  # the column label is the integer 0, not the string '0'
sub1['Item_Identifier'] = test['Item_Identifier']
sub1['Outlet_Identifier'] = test['Outlet_Identifier']
sub1.columns = ['Item_Outlet_Sales', 'Item_Identifier', 'Outlet_Identifier']
sub1 = sub1[['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales']]
sub1.to_csv('tpot.csv', index=False)
plt.show()

# %% tpot
testSL = to_supervised(test, n_input, n_outputs)
trainSL = to_supervised(train, n_input, n_outputs)
testSL[0].shape = (testSL[0].shape[0],
                   testSL[0].shape[1] * testSL[0].shape[2])
trainSL[0].shape = (trainSL[0].shape[0],
                    trainSL[0].shape[1] * trainSL[0].shape[2])
(X_train, y_train) = trainSL
(X_test, y_test) = testSL

tpot = TPOTRegressor(generations=20, population_size=100,
                     verbosity=2, random_state=42)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export("tpot_boston_pipeline.py")

# %% Plot
predictions = tpot.predict(X_test)
plt.plot(np.squeeze(predictions), label="Predictions")
plt.plot(np.array(test)[-1 * predictions.shape[0]:][:, 0], label="dlGDP_csa")
plt.title("dlGDP forecasts")
plt.ylabel("dlGDP_csa")
plt.xlabel("Quarter")
plt.legend(loc="upper left")
plt.show()
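# The in-place .shape assignments above flatten each (timesteps, features)
# window into a single row for TPOT; an equivalent and arguably clearer
# spelling uses reshape. A standalone illustration with dummy data:
import numpy as np

X3d = np.arange(24).reshape(4, 3, 2)   # (samples, timesteps, features)
X2d = X3d.reshape(X3d.shape[0], -1)    # (4, 6), same layout as the shape trick
print(X2d.shape)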
# y_test_pred = reg.predict(X_test).squeeze()
# print(reg.evaluate(X, y))
# print(reg.evaluate(X_test, y_test))

# TPOT
from tpot import TPOTRegressor

tpot = TPOTRegressor(
    # scoring=None,
    use_dask=True,
    generations=5,
    population_size=50,
    n_jobs=4,
    verbosity=2,
    random_state=42,
)
tpot.fit(X, y)
y_pred = tpot.predict(X)
y_test_pred = tpot.predict(X_test)
print(tpot.score(X, y))
print(tpot.score(X_test, y_test))

y_pred_ss = pd.Series(y_pred, index=train_data.index[idx_good])
strategy_train = train_data[selected_factor_names + ['hmo_s1']].assign(y_pred=y_pred_ss)
y_test_pred_ss = pd.Series(y_test_pred, index=test_data.index[idx_test_good])
strategy_test = test_data[selected_factor_names + ['hmo_s1']].assign(y_pred=y_test_pred_ss)
test_date0 = test_date[0]
strategy = pd.concat([strategy_train, strategy_test])
# strategy = strategy_train.copy()
y_train_all = train['y']
del train['ID']
del train['y']
id_test = test['ID']
del test['ID']
print('train:', train.shape, ', test:', test.shape)

random_state = 42
X_train, X_val, y_train, y_val = train_test_split(train, y_train_all,
                                                  test_size=0.2,
                                                  random_state=random_state)

pipeline_optimizer = TPOTRegressor(generations=5, population_size=100,
                                   offspring_size=None, scoring='r2',
                                   cv=5, subsample=0.95, n_jobs=1,
                                   random_state=random_state, verbosity=2)
pipeline_optimizer.fit(X_train.values, y_train.values)
print(pipeline_optimizer.score(X_val.values, y_val.values))
pipeline_optimizer.export('./tpot_exported_models/tpot_exported_pipeline.py')

predict_y = pipeline_optimizer.predict(test.values)
df_sub = pd.DataFrame({'ID': id_test, 'y': predict_y})
df_sub.to_csv('tpot_pipeline_result.csv', index=False)
class TPOT(BaseModel):
    '''
    This is our wrapper for fingerprinting sites and then using TPOT to
    predict adsorption energies from those fingerprints.
    '''

    def __init__(self):
        '''
        Instantiate the preprocessing pipeline and the TPOT model
        '''
        # Instantiate the fingerprinter
        inner_fingerprinter = fingerprinters.InnerShellFingerprinter()
        outer_fingerprinter = fingerprinters.OuterShellFingerprinter()
        fingerprinter = fingerprinters.StackedFingerprinter(
            inner_fingerprinter, outer_fingerprinter)
        scaler = StandardScaler()
        pca = PCA()
        preprocessing_pipeline = Pipeline([('fingerprinter', fingerprinter),
                                           ('scaler', scaler),
                                           ('pca', pca)])
        self.preprocessor = preprocessing_pipeline

        # Instantiate TPOT
        self.tpot = TPOTRegressor(generations=2,
                                  population_size=32,
                                  offspring_size=32,
                                  verbosity=2,
                                  scoring='neg_median_absolute_error',
                                  n_jobs=16,
                                  warm_start=True)

    def train(self, docs, energies):
        '''
        Trains both the preprocessor and TPOT in series

        Args:
            docs        List of dictionaries from
                        `gaspy.gasdb.get_adsorption_docs`
            energies    List of floats containing the adsorption energies
                        of `docs`
        '''
        features = self.preprocessor.fit_transform(docs)
        self.tpot.fit(features, energies)

        # Try to address some memory issues by collecting garbage
        _ = gc.collect()  # noqa: F841

    def predict(self, docs):
        '''
        Use the whole fingerprinting and TPOT pipeline to make adsorption
        energy predictions

        Args:
            docs    List of dictionaries from
                    `gaspy.gasdb.get_adsorption_docs`
        Returns:
            predictions     `np.array` of TPOT's predictions of each doc
            uncertainties   `np.array` that contains the "uncertainty
                            prediction" for each site. In this case, it'll
                            just be TPOT's RMSE
        '''
        # Point predictions
        features = self.preprocessor.transform(docs)
        try:
            predictions = np.array(self.tpot.predict(features))
        # In case we need to make a prediction from a loaded state
        except AttributeError:
            predictions = np.array(self.tpot.fitted_pipeline_.predict(features))

        # "Uncertainties" will just be the RMSE
        residuals = np.array([prediction - doc['energy']
                              for prediction, doc in zip(predictions, docs)])
        rmse = np.sqrt((residuals**2).mean())
        uncertainties = np.array([rmse for _ in predictions])

        return predictions, uncertainties

    def save(self):
        '''
        Saves the state of the model into some pickles
        '''
        with open(self._fingerprinter_cache, 'wb') as file_handle:
            pickle.dump(self.preprocessor, file_handle)
        with open(self._pipeline_cache, 'wb') as file_handle:
            pickle.dump(self.tpot.fitted_pipeline_, file_handle)

    def load(self):
        '''
        Loads a previous state of the model from some pickles
        '''
        with open(self._fingerprinter_cache, 'rb') as file_handle:
            self.preprocessor = pickle.load(file_handle)
        with open(self._pipeline_cache, 'rb') as file_handle:
            self.tpot.fitted_pipeline_ = pickle.load(file_handle)

    @property
    def _fingerprinter_cache(self):
        return 'fingerprinter.pkl'

    @property
    def _pipeline_cache(self):
        return 'tpot_pipeline.pkl'
class simpleEstimator:

    def __init__(self):
        print('Initializing')
        self.application_file = 'application_train.csv'
        self.application_test = 'application_test.csv'
        self.additional_data = [
            'bureau_preprocessed.csv'
        ]  # 0.617 after I added this score
        self.outfile = 'submission.csv'

    def submit(self):
        ofile = open(self.outfile, 'w')
        ofile.write('SK_ID_CURR,TARGET\n')
        print('Preparing submission')
        df = pd.read_csv(self.application_test, quotechar='"')
        for additional in self.additional_data:
            dfadd = pd.read_csv(additional)
            df = pd.merge(df, dfadd, on='SK_ID_CURR', how='left')
        tmat = df.values
        index = tmat[:, 0]
        x = tmat[:, 1:]
        for i in range(0, x.shape[0]):
            for j in range(0, x.shape[1]):
                x[i, j] = self.strtonum(x[i, j])
        y = self.predict(x)
        for i in range(0, len(y)):
            oline = str(index[i]) + ',' + str(max(min(y[i], 1), 0)) + '\n'
            ofile.write(oline)
        ofile.close()

    def prepare(self):
        # print(df.shape)
        df = pd.read_csv(self.application_file, quotechar='"')
        for additional in self.additional_data:
            dfadd = pd.read_csv(additional)
            df = pd.merge(df, dfadd, on='SK_ID_CURR', how='left')
        # print(df.shape)
        tmat = df.values
        self.x = tmat[:, 2:]
        self.y = np.array(tmat[:, 1], dtype=np.float64)
        # self.y = np.array(y.astype('float'), dtype=np.float64)
        index = tmat[:, 0]
        for i in range(0, self.x.shape[0]):
            for j in range(0, self.x.shape[1]):
                self.x[i, j] = self.strtonum(self.x[i, j])
        # for i in range(len(self.y)):
        #     self.y[i] = self.strtonum(self.y[i])

    def gridsearch(self, parameters):
        svc = GradientBoostingRegressor()
        self.clf = GridSearchCV(svc, parameters, verbose=1, n_jobs=4)
        self.clf.fit(self.x, self.y)

    def train(self, params):
        # print('Training the model')
        # self.clf = RandomForestRegressor()  # 0.625
        # self.clf = RandomForestClassifier()  # 0.5
        # self.clf = GradientBoostingRegressor()  # 0.730
        # self.clf = BaggingRegressor(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)  # MemoryError
        # self.clf = AdaBoostRegressor()  # 0.674
        # clf = KNeighborsRegressor()  # MemoryError
        # self.clf = MLPRegressor(hidden_layer_sizes=(5,))  # MemoryError
        # self.clf = GradientBoostingRegressor(**params)
        # self.clf = autosklearn.regression.AutoSklearnRegressor(time_left_for_this_task=75000, per_run_time_limit=7500)
        self.clf = TPOTRegressor(generations=5, population_size=50,
                                 verbosity=2, n_jobs=3)
        self.clf.fit(self.x, self.y)
        self.clf.export('tpot_best_pipeline.py')

    def predict(self, x):
        return self.clf.predict(x)

    def test(self):
        # print('Testing the model')
        ypred = self.predict(self.x)
        rmse = self.get_rmse(ypred, self.y)
        print('OOS RMSE: ' + str(rmse))
        return rmse

    def strtonum(self, st):
        try:
            f = float(st)
            if math.isnan(f):
                return -10000
            return f
        except:
            sta = [ord(x) for x in st]
            cs = 0
            for i in sta:
                cs += i
            return cs

    def get_rmse(self, a, b):
        mse = 0
        for i in range(0, len(a)):
            mse += (a[i] - b[i])**2
        mse = math.sqrt(mse / len(a))
        return mse
finaltrainset = train_df[usable_columns].values
finaltestset = test_df[usable_columns].values

from tpot import TPOTRegressor

auto_classifier = TPOTRegressor(generations=3, population_size=8, verbosity=2)

from sklearn.model_selection import train_test_split

# Split training data to train and validate
X_train, X_valid, y_train, y_valid = train_test_split(finaltrainset, y_train,
                                                      train_size=0.75,
                                                      test_size=0.25)

auto_classifier.fit(X_train, y_train)

cv_score = auto_classifier.score(X_valid, y_valid)
print("The cross-validation accuracy")
print(cv_score)

# we need access to the pipeline to get the probabilities
test_result = auto_classifier.predict(finaltestset)
sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = test_result
sub.to_csv(base_output_path + 'tpot_analysis_{}.csv'.format(cv_score),
           index=False)

auto_classifier.export(base_output_path + 'tpot_pipeline.py')
auto_tpot = TPOTRegressor(mutation_rate=0.9,
                          crossover_rate=0.1,
                          scoring="neg_mean_squared_error",
                          cv=5,
                          n_jobs=1,
                          max_time_mins=5,
                          verbosity=2,
                          config_dict=tpot_config)

auto_tpot.fit(features=X_train, target=y_train)

auto_tpot.fitted_pipeline_
auto_tpot.pareto_front_fitted_pipelines_
auto_tpot.evaluated_individuals_

y_hat = auto_tpot.predict(features=X_test)

# H2O AUTOML
import h2o
from h2o.automl import H2OAutoML

# Start h2o cluster
h2o.init(max_mem_size="8G")

# Upload to h2o
df_train_h2o = h2o.H2OFrame(
    pd.concat([X_train, pd.DataFrame({"target": y_train})], axis=1))
df_test_h2o = h2o.H2OFrame(X_test)
features = X_train.columns.values.tolist()
x_train, x_test, y_train, y_test = train_test_split(train_new, train_class,
                                                    train_size=0.75,
                                                    test_size=0.25)

# Instantiate tpot instance
tpot = TPOTRegressor(verbosity=3, generations=10, population_size=50)

# call fit function
tpot.fit(x_train, y_train)

# call the score function on cv data
print('TPOT score: {}'.format(tpot.score(x_test, y_test)))

# Predict temps for each month for next 5 years
submission = tpot.predict(test)

# create dataframe of results for each month/years
final = pd.DataFrame({'year': test[:, 0],
                      'month': test[:, 1],
                      'Pred': submission})

# export pipeline
export_filename = 'BTC Pipeline.py'
tpot.export(export_filename)

# export predicted values
final_filename = 'btc_pred.csv'
final.to_csv(final_filename, index=False)
from tpot import TPOTRegressor

X_train, X_test, y_train, y_test = train_test_split(tpot_train, target,
                                                    train_size=0.75,
                                                    test_size=0.25)

tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_boston_pipeline.py')

## predicting using tpot optimised pipeline
tpot_pred = tpot.predict(tpot_test)
sub1 = pd.DataFrame(data=tpot_pred)
# sub1.index = np.arange(0, len(test) + 1)
sub1 = sub1.rename(columns={0: 'Item_Outlet_Sales'})  # the column label is the integer 0, not the string '0'
sub1['Item_Identifier'] = test['Item_Identifier']
sub1['Outlet_Identifier'] = test['Outlet_Identifier']
sub1.columns = ['Item_Outlet_Sales', 'Item_Identifier', 'Outlet_Identifier']
sub1 = sub1[['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales']]
sub1.to_csv('tpot.csv', index=False)
numeric_df = pd.DataFrame(X)
numeric_df.index = all_df.index
combined_df = process_categorical(numeric_df, all_df, categorical_features)
X = combined_df.to_numpy()  # as_matrix() was removed in pandas 1.0

from sklearn.decomposition import PCA

test_n = df.shape[0]
pca = PCA()
pca.fit(X[:test_n, :], price)
X = pca.transform(X)

X_train = X[:test_n, :]
X_train, X_val, y_train, y_val = ms.train_test_split(X_train, price,
                                                     test_size=0.3,
                                                     random_state=0)
X_test = X[test_n:, :]

# housing = load_boston()
# X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target,
#                                                     train_size=0.75, test_size=0.25)

tpot = TPOTRegressor(generations=5, population_size=20, verbosity=2)
tpot.fit(X_train, y_train)
y_predicted = tpot.predict(X_test)

sdf['SalePrice'] = y_predicted
sdf.to_csv('submission.csv')
# tpot.export('tpot_kaggle_housing_pipeline.py')
class TPOTAdaptor(DFMLAdaptor, LoggableMixin):
    """
    A dataframe adaptor for the TPOT classifiers and regressors.

    Args:
        tpot_kwargs: All kwargs accepted by a TPOTRegressor/TPOTClassifier
            or TPOTBase object. Note that, for example, you can limit the
            models that TPOT explores by setting config_dict directly. If
            you want to use only random forest:

            config_dict = {
                'sklearn.ensemble.RandomForestRegressor': {
                    'n_estimators': [100],
                    'max_features': np.arange(0.05, 1.01, 0.05),
                    'min_samples_split': range(2, 21),
                    'min_samples_leaf': range(1, 21),
                    'bootstrap': [True, False]
                },
            }

        logger (Logger, bool): A custom logger object to use for logging.
            Alternatively, if set to True, the default automatminer logger
            will be used. If set to False, then no logging will occur.

    Attributes:
        The following attributes are set during fitting.

        mode (str): Either AMM_REG_NAME (regression) or AMM_CLF_NAME
            (classification)
        features (list): The feature labels used to develop the ml model.
        ml_data (dict): The raw ml data used for training.
        best_pipeline (sklearn.Pipeline): The best fitted pipeline found.
        best_models (OrderedDict): The best model names and their scores.
        backend (TPOTBase): The TPOT object interface used for ML training.
        is_fit (bool): If True, the adaptor and backend are fit to a dataset.
        models (OrderedDict): The raw sklearn-style models output by TPOT.
        fitted_target (str): The target name in the df used for training.
    """

    def __init__(self, logger=True, **tpot_kwargs):
        tpot_kwargs['cv'] = tpot_kwargs.get('cv', 5)
        tpot_kwargs['n_jobs'] = tpot_kwargs.get('n_jobs', -1)
        tpot_kwargs['verbosity'] = tpot_kwargs.get('verbosity', 2)

        self.mode = None
        self._backend = None
        self.tpot_kwargs = tpot_kwargs
        self.fitted_target = None
        self._features = None
        self.models = None
        self._logger = self.get_logger(logger)
        self.is_fit = False
        self.random_state = tpot_kwargs.get('random_state', None)
        self._ml_data = None
        self.greater_score_is_better = None

    @log_progress(AMM_LOG_FIT_STR)
    @set_fitted
    def fit(self, df, target, **fit_kwargs):
        """
        Train a TPOTRegressor or TPOTClassifier by fitting on a dataframe.

        Args:
            df (pandas.DataFrame): The df to be used for training.
            target (str): The key used to identify the machine learning
                target.
            **fit_kwargs: Keyword arguments to be passed to the TPOT
                backend. These arguments must be valid arguments to the
                TPOTBase class.
        Returns:
            TPOTAdaptor (self)
        """
        # Prevent goofy pandas casting by casting to native
        y = df[target].values.tolist()
        X = df.drop(columns=target).values.tolist()

        # Determine learning type based on whether classification or regression
        self.mode = regression_or_classification(df[target])

        if self.mode == AMM_CLF_NAME:
            self.tpot_kwargs['config_dict'] = self.tpot_kwargs.get(
                'config_dict', TPOT_CLASSIFIER_CONFIG)
            if "scoring" not in self.tpot_kwargs:
                self.tpot_kwargs["scoring"] = "balanced_accuracy"
            self._backend = TPOTClassifier(**self.tpot_kwargs)
        elif self.mode == AMM_REG_NAME:
            self.tpot_kwargs['config_dict'] = self.tpot_kwargs.get(
                'config_dict', TPOT_REGRESSOR_CONFIG)
            if "scoring" not in self.tpot_kwargs:
                self.tpot_kwargs["scoring"] = "neg_mean_absolute_error"
            self._backend = TPOTRegressor(**self.tpot_kwargs)
        else:
            raise ValueError("Learning type {} not recognized as a valid mode "
                             "for {}".format(self.mode,
                                             self.__class__.__name__))
        self._features = df.drop(columns=target).columns.tolist()
        self._ml_data = {"X": X, "y": y}
        self.fitted_target = target
        self._backend = self._backend.fit(X, y, **fit_kwargs)
        return self

    @property
    @check_fitted
    def best_models(self):
        """
        The best models found by TPOT, in order of descending performance.

        If you want a pipeline you can use to make predictions, use
        best_pipeline instead.

        Performance is evaluated based on the TPOT scoring. This can be
        changed by passing a "scoring" kwarg into the __init__ method.

        Returns:
            best_models_and_scores (dict): Keys are names of models. Values
                are the best internal cv scores of that model with the best
                hyperparameter combination found.
        """
        self.greater_score_is_better = is_greater_better(
            self.backend.scoring_function)

        # Get list of evaluated model names, cast to set and back
        # to get unique model names, instantiate ordered model dictionary
        evaluated_models = [
            key.split('(')[0]
            for key in self.backend.evaluated_individuals_.keys()
        ]
        model_names = list(set(evaluated_models))
        models = OrderedDict({model: [] for model in model_names})

        # This makes a dict of model names mapped to all runs of that model
        for key, val in self.backend.evaluated_individuals_.items():
            models[key.split('(')[0]].append(val)

        # For each base model type, sort the runs by best score
        for model_name in model_names:
            models[model_name].sort(key=lambda x: x['internal_cv_score'],
                                    reverse=self.greater_score_is_better)

        # Gets a simplified dict of the model to only its best run
        # Sort the best individual models by type to best models overall
        best_models = OrderedDict(
            sorted({model: models[model][0] for model in models}.items(),
                   key=lambda x: x[1]['internal_cv_score'],
                   reverse=self.greater_score_is_better))

        # Mapping of top models to just their score
        scores = {model: best_models[model]['internal_cv_score']
                  for model in best_models}

        # Sorted dict of top models just mapped to their top scores
        best_models_and_scores = OrderedDict(
            sorted(scores.items(), key=lambda x: x[1],
                   reverse=self.greater_score_is_better))
        self.models = models
        return best_models_and_scores

    @log_progress(AMM_LOG_PREDICT_STR)
    @check_fitted
    def predict(self, df, target):
        """
        Predict the target property of materials given a df of features.

        The predictions are appended to the dataframe in a column called:
            "{target} predicted"

        Args:
            df (pandas.DataFrame): Contains all features needed for ML (i.e.,
                all features contained in the training dataframe).
            target (str): The property to be predicted. Should match the
                target used for fitting. May or may not be present in the
                argument dataframe.
        Returns:
            (pandas.DataFrame): The argument dataframe plus a column
                containing the predictions of the target.
        """
        if target != self.fitted_target:
            raise AutomatminerError("Argument dataframe target {} is different "
                                    "from the fitted dataframe target! {}"
                                    "".format(target, self.fitted_target))
        elif not all([f in df.columns for f in self._features]):
            not_in_model = [f for f in self._features if f not in df.columns]
            not_in_df = [f for f in df.columns if f not in self._features]
            # note: the original passed (not_in_df, not_in_model) here, which
            # swapped the two lists in the error message; fixed below
            raise AutomatminerError("Features used to build model are different"
                                    " from df columns! Features located in "
                                    "model not located in df: \n{} \n Features "
                                    "located in df not in model: \n{}"
                                    "".format(not_in_model, not_in_df))
        else:
            X = df[self._features].values  # rectify feature order
            y_pred = self._backend.predict(X)
            df[target + " predicted"] = y_pred
            return df

    @property
    @check_fitted
    def best_pipeline(self):
        return self._backend.fitted_pipeline_

    @property
    @check_fitted
    def features(self):
        return self._features

    @property
    @check_fitted
    def ml_data(self):
        return self._ml_data

    @property
    @check_fitted
    def backend(self):
        return self._backend
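# A hypothetical end-to-end sketch of the adaptor above on synthetic data.
# The import path and all hyperparameter values are assumptions for
# illustration, not taken from the original module.
import numpy as np
import pandas as pd
from automatminer.automl.adaptors import TPOTAdaptor  # assumed import path

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.rand(50, 4), columns=['f1', 'f2', 'f3', 'target'])

adaptor = TPOTAdaptor(generations=2, population_size=10)
adaptor.fit(df, 'target')
print(adaptor.best_models)               # ranked models with their CV scores
pred_df = adaptor.predict(df, 'target')  # adds a 'target predicted' column
print(pred_df['target predicted'].head())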
    test_size=0.25, random_state=seed)

tpot = TPOTRegressor(generations=gen, population_size=50,
                     verbosity=2, n_jobs=cores)
tpot.fit(X_train, y_train.reshape(-1, ))

features_readable = list()
for t in range(len(test_features[test_no])):
    features_readable.append(d[test_features[test_no][t]])

x = linspace(n1 + 1, n1 + sample_n, sample_n)

fig, ax = plt.subplots()
fig.set_size_inches(22, 13)
plt.plot(x, tpot.predict(X_test)[n1:n1 + sample_n])
plt.plot(x, y_test[n1:n1 + sample_n])
ax.set(xlabel='sample no', ylabel='FO flow m3/h',
       title='Training number:' + str(test_no) +
             '\nFeatures: \n ' + str(features_readable))
ax.grid()
fig.savefig("results/test_no_" + str(test_no) + ".png")

# In[218]:

# Train linear models
#
model = TPOTRegressor(population_size=100,
                      n_jobs=4,
                      verbosity=2,
                      cv=3,
                      early_stop=3)
model.fit(X_train, y_train.values)

# In[5]:

def rmsle_metric(y_test, y_pred):
    assert len(y_test) == len(y_pred)
    y_test = np.exp(y_test) - 1
    y_pred = np.exp(y_pred) - 1
    rmsle = np.sqrt(np.mean((np.log(1 + y_pred) - np.log(1 + y_test))**2))
    return rmsle

y_pred = model.predict(X_test)
print(rmsle_metric(y_test, y_pred))

# In[6]:

from sklearn.externals import joblib  # on modern scikit-learn, use `import joblib` instead
joblib.dump(model.fitted_pipeline_, 'PCA_y_log_TPOT_1_475.pkl')
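# A quick sanity check of rmsle_metric() above (an illustrative addition):
# because the function inverts an exp(y) - 1 transform before comparing, it
# implicitly assumes the targets were log1p-transformed upstream, and a
# perfect prediction should score ~0.
import numpy as np

y_log = np.log1p(np.array([100.0, 200.0, 300.0]))
print(rmsle_metric(y_log, y_log))  # -> 0.0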
# the optimization process if there is no improvement
# verbosity - integer, default=0,
#     how much information TPOT communicates while it's running;
#     verbosity=2 means TPOT will print more information and provide a
#     progress bar; verbosity=3 means TPOT will print everything and
#     provide a progress bar
# max_time_mins - integer or None, default=None,
#     it defines how many minutes TPOT has to optimize the pipeline

# Start a timer
import time
start = time.time()

tpot.fit(X_train, y_train)
tpot.export('TPOT_RF_Pers_E_Shopping.py')

results = tpot.predict(X_test)
y_pred_GP = results

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_GP))
# print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_GP))
print('Root Mean Squared Error:',
      np.sqrt(metrics.mean_squared_error(y_test, y_pred_GP)))

# New score
errors = abs(y_test - y_pred_GP)
mape = 100 * (errors / y_test)

# Calculate and display accuracy_tpot
accuracy_tpot = 100 - np.mean(mape)
print('Accuracy_tpot:', round(accuracy_tpot, 2), '%.')
print('Improvement of Accuracy with TPOT_Regression of: {:0.2f}%.'.format(