y_pred = model.predict(X_test) # predict y_pred = scaler.inverse_transform(y_pred) # inverse StandardScaler y_pred_pd = pd.DataFrame(data=y_pred, columns=['Toughness']) y_pred_pd = y_pred_pd.reset_index() y_pred_pd.to_csv('n96084094_HW4_1.csv', index=False) # save the data # In[7]: # Fully connected (Dense) from keras import models from keras import layers model = models.Sequential() # set model model.add(layers.Dense(16, activation='relu', input_shape=(64, ))) # 16 output model.add(layers.Dense(16, activation='relu')) # 16 output model.add(layers.Dense(1, activation='linear')) # 1 output, y = a(wx + b), a = 1 # In[8]: model.compile( optimizer='rmsprop', loss='mean_squared_error', # regression problems metrics=['mse' ]) # regression problems, MSE, MAE, MAPE, Cosine, not accuracy # In[9]: history = model.fit(X_train,
result = pd.Series(cv_model, index=alphas) result.plot() print(result.min()) plt.title('lasso with alphas') plt.show() ''' # ridge ------------------------------------------ alphas = [10,20,30,40,50,60,70,80,90,100,110,130] # 80 cv_model = [rmse_cv(Ridge(alpha=n), more_train, y_train0).mean() for n in alphas] result = pd.Series(cv_model, index = alphas) print(result.min()) result.plot() plt.title('ridge with alphas') plt.show() ''' ''' # DNN ----------------------------------------------- from keras.models import Sequential from keras.layers.core import Dense,Activation,Dropout from keras.utils import np_utils model = Sequential() model.add(Dense(output_dim=1000, input_dim=len(X_train.columns), activation='relu')) model.add(Dense(output_dim=200, input_dim=1000, activation='relu')) model.add(Dense(output_dim=1, input_dim=200, activation='relu')) model.compile(loss = 'mean_squared_error', optimizer = 'adam') model.fit(X_train.as_matrix(), y_train.as_matrix(), nb_epoch = 200, batch_size = 100) pred=model.predict(X_test.as_matrix()).reshape(len(X_test)) ''' ''' # gbrt ----------------------------------------- num = [150,200,250] #best 200
regressor = DecisionTreeRegressor() regressor.fit(X, y) final_prediction = regressor.predict(test) #Run support vector regression from sklearn.svm import SVR regressor = SVR(kernel='linear') regressor.fit(X, y) final_prediction = regressor.predict(test) #run linear regression from sklearn.linear_model import LinearRegression regressor = LinearRegression() regressor.fit(X, y) final_prediction = regressor.predict(test) """ #Run ANN import keras from keras.models import Sequential from keras.layers import Dense from keras.models import model_from_json import os regressor = Sequential() regressor.add(Dense(units = 61, kernel_initializer = 'uniform', activation = 'sigmoid', input_dim = 121)) # Adding the second hidden layer regressor.add(Dense(units= 61, kernel_initializer ='uniform', activation = 'sigmoid')) #here we dont use ant activation function for regression problem regressor.add(Dense(units = 1, kernel_initializer = 'uniform')) X=np.array(X) y=np.array(y)
def _flat_predictions(clf, xtrain, xtest):
    """Return the train and test predictions of ``clf`` flattened to 1-D."""
    pred_train = clf.predict(xtrain)
    pred_test = clf.predict(xtest)
    return pred_train.ravel(), pred_test.ravel()


def _fit_lstm(xtrain, ytrain, epochs, batch_size):
    """Build and train a single-layer LSTM regressor and plot its loss curves."""
    clf = Sequential()
    clf.add(LSTM(200, input_shape=(xtrain.shape[1], xtrain.shape[2])))
    clf.add(Dense(1))
    clf.compile(loss='mae', optimizer='adam')
    # NOTE(review): the "validation" data is the training data itself, so the
    # curve labelled 'test' below is not an out-of-sample loss.
    history = clf.fit(xtrain, ytrain, epochs=epochs, batch_size=batch_size,
                      validation_data=(xtrain, ytrain), verbose=1,
                      shuffle=True)
    pyplot.plot(history.history['loss'], label='train')
    pyplot.plot(history.history['val_loss'], label='test')
    pyplot.legend()
    pyplot.show()
    return clf


def create_model_output(model_type, input_values_combined, output_values_range, count, clf, no_epochs):
    """Fit (or reuse) a model of the requested type and record its performance.

    The data is split with ``splitPreProcess`` using the module-level
    ``test_window`` and ``lstm_history``. A row of summary statistics is
    appended to the module-level ``final_performance`` list, which is then
    printed and written to ``df_performance.csv``.

    Parameters
    ----------
    model_type : str
        One of ``'regression'``, ``'neural net'``, ``'RF'``, ``'lstm'`` or
        ``'lstm_3D'``.
    input_values_combined, output_values_range
        Inputs and targets, forwarded unchanged to ``splitPreProcess``.
    count : int
        Call counter. For ``'lstm'`` a new network is trained only when
        ``count == 0``; otherwise the ``clf`` argument is reused. For
        ``'lstm_3D'`` a new network is trained whenever ``count >= 0``.
    clf
        Previously fitted model, used only in the reuse cases above.
    no_epochs : int
        Number of training epochs for ``'lstm_3D'`` (``'lstm'`` always
        trains for 10 epochs).

    Returns
    -------
    tuple
        ``(pred_train, pred_test, ytrain, ytest, clf)``.

    Raises
    ------
    ValueError
        If ``model_type`` is unknown, or if a fitted model has to be reused
        but ``clf`` is ``None``.
    """
    if model_type not in ('regression', 'neural net', 'RF', 'lstm', 'lstm_3D'):
        # Previously this surfaced later as an UnboundLocalError.
        raise ValueError('Unknown model_type: %r' % (model_type,))

    xtrain, xtest, ytrain, ytest = splitPreProcess(
        input_values_combined, output_values_range, test_window, lstm_history)

    if model_type == 'lstm_3D':
        xtrain, xtest, ytrain, ytest = create3D(
            xtrain, xtest, ytrain, ytest, lstm_history)
        if count >= 0:
            clf = _fit_lstm(xtrain, ytrain, epochs=no_epochs, batch_size=10)
        elif clf is None:
            raise ValueError("A fitted 'clf' is required when count < 0")
        pred_train, pred_test = _flat_predictions(clf, xtrain, xtest)
    else:
        if model_type == 'regression':
            _, clf = ML_linear(xtrain, ytrain)
        elif model_type == 'neural net':
            _, _, _, clf = ML_Optimizer_NN(
                xtrain, ytrain, xtest, ytest, range_size, range_alpha)
        elif model_type == 'RF':
            clf = RandomForestRegressor(max_depth=10, random_state=0,
                                        n_estimators=50)
            clf.fit(xtrain, ytrain)
            print(clf.feature_importances_)
        else:  # 'lstm'
            # The LSTM expects (samples, timesteps, features); use one step.
            xtrain = xtrain.reshape((xtrain.shape[0], 1, xtrain.shape[1]))
            xtest = xtest.reshape((xtest.shape[0], 1, xtest.shape[1]))
            if count == 0:
                clf = _fit_lstm(xtrain, ytrain, epochs=10, batch_size=5)
            elif clf is None:
                # The old code unpickled an unbound local here and always
                # crashed; the caller-supplied model is the one to reuse.
                raise ValueError("A fitted 'clf' is required when count != 0")
        # The first ``lstm_history`` test rows are dropped from both the
        # inputs and the targets.
        pred_train, pred_test = _flat_predictions(
            clf, xtrain, xtest[lstm_history:, :])
        ytest = ytest[lstm_history:]

    corr_coefficient_train = np.corrcoef(pred_train, ytrain)
    corr_coefficient_test = np.corrcoef(pred_test, ytest)
    final_performance.append([
        model_type,
        np.sum(abs(pred_test - ytest)),
        corr_coefficient_test[0, 1],
        np.sum(abs(pred_train - ytrain)),
        corr_coefficient_train[0, 1],
    ])
    print(final_performance)
    df_performance = pd.DataFrame(final_performance)
    df_performance.to_csv('df_performance.csv', index=False, header=True)
    return pred_train, pred_test, ytrain, ytest, clf
def get_baseline(self, cv_mode=True, test_mode=False):
    """
    Computes a loss baseline for the ML-algorithm based on its default hyperparameter configuration (either cross
    validation loss or test loss after full training)
    :param cv_mode: bool
        Flag that indicates, whether to perform cross validation or simple validation
    :param test_mode: bool
        Flag that indicates, whether to compute the loss on the test set or not
    :return: baseline: float
        Loss of the baseline HP-configuration (mean over the folds in cross validation mode).
    :raises Exception: if both cv_mode and test_mode are set, if the ML-algorithm is unknown, or if the Keras
        target format / learning rate schedule is unknown.
    """
    # Reject the unsupported combination up front instead of inside the loop
    if cv_mode and test_mode:
        raise Exception('Cross validation is not implemented for test mode.')

    # Estimators that share the plain fit / predict interface; the lambdas keep construction lazy, so only the
    # selected class has to be importable
    default_estimators = {
        'RandomForestRegressor': lambda: RandomForestRegressor(random_state=0),
        'RandomForestClassifier': lambda: RandomForestClassifier(random_state=0),
        'SVR': lambda: SVR(cache_size=500),
        'SVC': lambda: SVC(random_state=0, cache_size=500),
        'AdaBoostRegressor': lambda: AdaBoostRegressor(random_state=0),
        'AdaBoostClassifier': lambda: AdaBoostClassifier(random_state=0),
        'DecisionTreeRegressor': lambda: DecisionTreeRegressor(random_state=0),
        'DecisionTreeClassifier': lambda: DecisionTreeClassifier(random_state=0),
        'LinearRegression': lambda: LinearRegression(),
        'KNNRegressor': lambda: KNeighborsRegressor(),
        'KNNClassifier': lambda: KNeighborsClassifier(),
        'LogisticRegression': lambda: LogisticRegression(),
        'NaiveBayes': lambda: GaussianNB(),
        'MLPRegressor': lambda: MLPRegressor(random_state=0),
        'MLPClassifier': lambda: MLPClassifier(random_state=0),
        'ElasticNet': lambda: ElasticNet(random_state=0),
        'XGBoostRegressor': lambda: XGBRegressor(random_state=0),
        'XGBoostClassifier': lambda: XGBClassifier(random_state=0),
    }

    # Assemble the (x_train, x_val, y_train, y_val) evaluation rounds
    if cv_mode:
        if self.is_time_series:
            # Use TimeSeriesSplit for time series data
            kf = TimeSeriesSplit(n_splits=5)
        else:
            # Create K-Folds cross validator for all other data types
            kf = KFold(n_splits=5, shuffle=self.shuffle)
        rounds = (
            (self.x_train.iloc[train_index], self.x_train.iloc[val_index],
             self.y_train.iloc[train_index], self.y_train.iloc[val_index])
            for train_index, val_index in kf.split(X=self.x_train)
        )
    elif test_mode:
        # Training on full training set and evaluation on test set
        rounds = [(self.x_train, self.x_test, self.y_train, self.y_test)]
    else:
        # Separate a validation set, but do not perform cross validation
        rounds = [train_test_split(self.x_train, self.y_train, test_size=0.2,
                                   shuffle=self.shuffle, random_state=0)]

    cv_baselines = []

    for x_train_cv, x_val_cv, y_train_cv, y_val_cv in rounds:

        if self.ml_algorithm in default_estimators:
            model = default_estimators[self.ml_algorithm]()
            model.fit(x_train_cv, y_train_cv)
            y_pred = model.predict(x_val_cv)

        elif self.ml_algorithm == 'KerasRegressor' or self.ml_algorithm == 'KerasClassifier':
            # Use the warmstart configuration to create a baseline for Keras models
            epochs = 100

            # Initialize the neural network
            model = keras.Sequential()

            # Add input layer
            model.add(keras.layers.InputLayer(input_shape=len(x_train_cv.keys())))

            # Add first hidden layer
            if warmstart_keras['hidden_layer1_size'] > 0:
                model.add(
                    keras.layers.Dense(warmstart_keras['hidden_layer1_size'],
                                       activation=warmstart_keras['hidden_layer1_activation']))
                model.add(keras.layers.Dropout(warmstart_keras['dropout1']))

            # Add second hidden layer
            if warmstart_keras['hidden_layer2_size'] > 0:
                model.add(
                    keras.layers.Dense(warmstart_keras['hidden_layer2_size'],
                                       activation=warmstart_keras['hidden_layer2_activation']))
                model.add(keras.layers.Dropout(warmstart_keras['dropout2']))

            # Add output layer
            if self.ml_algorithm == 'KerasRegressor':
                model.add(keras.layers.Dense(1, activation='linear'))

                # Select optimizer and compile the model
                adam = keras.optimizers.Adam(learning_rate=warmstart_keras['init_lr'])
                model.compile(optimizer=adam, loss='mse', metrics=['mse'])

            else:
                # Determine the number of different classes depending on the data format
                if isinstance(y_train_cv, pd.Series):
                    num_classes = int(max(y_train_cv) - min(y_train_cv) + 1)
                elif isinstance(y_train_cv, pd.DataFrame):
                    num_classes = len(y_train_cv.keys())
                else:
                    raise Exception('Unknown data format!')

                adam = keras.optimizers.Adam(learning_rate=warmstart_keras['init_lr'])

                # Binary classification
                if num_classes <= 2:
                    # 'Sigmoid is equivalent to a 2-element Softmax, where the second element is assumed to be zero'
                    # https://keras.io/api/layers/activations/#sigmoid-function
                    model.add(keras.layers.Dense(1, activation='sigmoid'))
                    model.compile(optimizer=adam, loss=keras.losses.BinaryCrossentropy(), metrics=['accuracy'])

                # Multiclass classification
                else:
                    # Use softmax activation for multiclass clf. -> 'Softmax converts a real vector to a vector of
                    # categorical probabilities.[...]the result could be interpreted as a probability distribution.'
                    # https://keras.io/api/layers/activations/#softmax-function
                    model.add(keras.layers.Dense(num_classes, activation='softmax'))
                    model.compile(optimizer=adam, loss=keras.losses.CategoricalCrossentropy(),
                                  metrics=[keras.metrics.CategoricalAccuracy()])

            # Learning rate schedule
            if warmstart_keras["lr_schedule"] == "cosine":
                schedule = functools.partial(cosine, initial_lr=warmstart_keras["init_lr"], T_max=epochs)
            elif warmstart_keras["lr_schedule"] == "exponential":
                schedule = functools.partial(exponential, initial_lr=warmstart_keras["init_lr"], T_max=epochs)
            elif warmstart_keras["lr_schedule"] == "constant":
                schedule = functools.partial(fix, initial_lr=warmstart_keras["init_lr"])
            else:
                raise Exception('Unknown learning rate schedule!')

            # Determine the learning rate for this iteration and pass it as callback
            lr = keras.callbacks.LearningRateScheduler(schedule)

            # Early stopping callback
            early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=10,
                                                           verbose=1, mode='auto', restore_best_weights=True)
            callbacks_list = [lr, early_stopping]

            # Train the model
            model.fit(x_train_cv, y_train_cv, epochs=epochs, batch_size=warmstart_keras['batch_size'],
                      validation_data=(x_val_cv, y_val_cv), callbacks=callbacks_list, verbose=0)

            # Make the prediction
            y_pred = model.predict(x_val_cv)

            if self.ml_algorithm == 'KerasClassifier':
                # Binary classification: round to the nearest integer
                if num_classes <= 2:
                    y_pred = np.rint(y_pred)
                # Multiclass classification: one-hot encode the class with the maximum probability in each row
                else:
                    winners = np.argmax(y_pred, axis=1)
                    y_pred = np.zeros_like(y_pred)
                    y_pred[np.arange(winners.shape[0]), winners] = 1
            # KerasRegressor
            else:
                y_pred = np.reshape(y_pred, (-1,))

        elif self.ml_algorithm == 'LGBMRegressor' or self.ml_algorithm == 'LGBMClassifier':
            # Create lgb datasets
            train_data = lgb.Dataset(x_train_cv, label=y_train_cv)
            valid_data = lgb.Dataset(x_val_cv, label=y_val_cv)

            # Specify the ML task and the random seed
            if self.ml_algorithm == 'LGBMRegressor':
                # Regression task
                params = {'objective': 'regression', 'seed': 0}
            else:
                # Determine the number of classes
                num_classes = int(max(y_train_cv) - min(y_train_cv) + 1)

                # Binary classification task
                if num_classes <= 2:
                    params = {'objective': 'binary', 'seed': 0}
                # Multiclass classification task
                else:
                    params = {'objective': 'multiclass',  # uses Softmax objective function
                              'num_class': num_classes,
                              'seed': 0}

            lgb_clf = lgb.train(params=params, train_set=train_data, valid_sets=[valid_data], verbose_eval=False)

            # Make the prediction
            y_pred = lgb_clf.predict(data=x_val_cv)

            # Classification task
            if self.ml_algorithm == 'LGBMClassifier':
                # Binary classification: round to the nearest integer
                if num_classes <= 2:
                    y_pred = np.rint(y_pred)
                # Multiclass classification: class index with the highest probability, as a float column vector
                else:
                    y_pred = np.argmax(y_pred, axis=1).astype(float).reshape(-1, 1)

        else:
            raise Exception('Unknown ML-algorithm!')

        # Add remaining ML-algorithms here

        cv_baselines.append(self.metric(y_val_cv, y_pred))

    if cv_mode:
        # Compute the average cross validation loss
        baseline = np.mean(cv_baselines)
    else:
        baseline = cv_baselines[0]

    return baseline
def train(self):
    """Extract features, fit the model selected by ``self._model_type`` and
    install the matching prediction callables on the instance.

    Reads the training data from ``self.feature_extraction()``. Sets
    ``self._feature_size``, ``self._features``, ``self._model`` and, depending
    on the model type, ``self._predict``, ``self._staged_predict`` and
    ``self._have_feat_importance``. ``self._data_balance`` is set to ``True``
    when some class is more than six times over- or under-represented
    relative to a uniform class distribution.

    The prediction callables keep the existing calling convention: they take
    one ``(fitted_model, pred_x)`` tuple.

    Fixes relative to the previous version:
      * ``class_weight`` is no longer passed to the two regressors, which do
        not accept that argument;
      * ``sample_weight`` is subsampled together with the bootstrap rows;
      * ExtraTreesClassifier and CNN models are stored in ``self._model``;
      * the syntax is valid on both Python 2 and Python 3.
    """

    def predict_with(method_name):
        # Adapt ``self.<method_name>(x=...)`` to the tuple calling convention;
        # the method is looked up at call time, like the original lambdas.
        return lambda args: getattr(self, method_name)(x=args[1])

    def single_stage(args):
        # Models without staged prediction expose one "stage".
        return [self._predict(args)]

    start = timeit.default_timer()
    train_x, train_y, feature_list = self.feature_extraction()
    self._feature_size = [train_x.shape[1], 1]
    self._features = feature_list
    if not self._silent:
        print("Train has %d instances " % (len(train_x)))

    # Compare each class frequency with the uniform expectation.
    counts = Counter(train_y)
    expectation_ratio = 1 / float(len(counts.keys()))
    n_samples = len(train_y)
    for value in counts.values():
        tmp = float(expectation_ratio) / (float(value) / float(n_samples))
        if tmp > 6 or tmp < (1.0 / 6.0):
            self._data_balance = True

    extra_fit_args = dict()
    if self._weight_col is not None:
        extra_fit_args['sample_weight'] = train_x[self._weight_col].values
        del train_x[self._weight_col]

    if 0 < self._bootstrap < 1.0:
        if self._bootstrap_seed is not None:
            if not self._silent:
                print("Setting bootstrap seed to %d" % self._bootstrap_seed)
            np.random.seed(self._bootstrap_seed)
            random.seed(self._bootstrap_seed)
        bootstrap_len = int(math.floor(self._bootstrap * len(train_x)))
        bootstrap_ix = random.sample(range(len(train_x)), bootstrap_len)
        # The old ``reset_index()`` calls discarded their result and were
        # no-ops, so they are dropped.
        train_x = train_x.iloc[bootstrap_ix]
        train_y = train_y.iloc[bootstrap_ix]
        if 'sample_weight' in extra_fit_args:
            # Keep the weights aligned with the sampled rows.
            extra_fit_args['sample_weight'] = \
                extra_fit_args['sample_weight'][bootstrap_ix]

    # NOTE(review): ``model`` is always None here, so the "fit_more" branches
    # below are unreachable and the "pred_at" branches cannot trigger after a
    # fresh fit - presumably a model-loading step used to precede this.
    # Confirm before removing them.
    model = None

    if self._model_type == "RandomForestRegressor":
        if model is None:
            model = RandomForestRegressor(**self._fit_args)
            model.fit(X=train_x, y=train_y, **extra_fit_args)
        self._model = model
        self._predict = predict_with('continuous_predict')
        self._have_feat_importance = True

    elif self._model_type == "RandomForestClassifier":
        if model is None:
            model = RandomForestClassifier(**self._fit_args)
            model.fit(X=train_x, y=train_y, **extra_fit_args)
        self._model = model
        self._predict = predict_with('pred_proba')
        self._staged_predict = single_stage
        self._have_feat_importance = True

    elif self._model_type == "ExtraTreesRegressor":
        if model is None:
            model = ExtraTreesRegressor(**self._fit_args)
            model.fit(X=train_x, y=train_y, **extra_fit_args)
        self._model = model
        self._predict = predict_with('continuous_predict')
        self._have_feat_importance = True

    elif self._model_type == "ExtraTreesClassifier":
        if model is None:
            if self._data_balance is True:
                self._fit_args["class_weight"] = "balanced"
            model = ExtraTreesClassifier(**self._fit_args)
            model.fit(X=train_x, y=train_y, **extra_fit_args)
        self._model = model
        self._predict = predict_with('pred_proba')
        self._staged_predict = single_stage
        self._have_feat_importance = True

    elif self._model_type == "GradientBoostingRegressor":
        if model is None:
            model = GradientBoostingRegressor(**self._fit_args)
            model.fit(X=train_x, y=train_y, **extra_fit_args)
            self._model = model
        elif self._load_type == "fit_more":
            model.warm_start = True
            model.n_estimators += self._fit_args['n_estimators']
            model.fit(X=train_x, y=train_y)
            self._model = model
        self._predict = predict_with('continuous_predict')
        self._staged_predict = predict_with('staged_pred_continuous')
        if self._load_type == "pred_at" and \
                self._fit_args['n_estimators'] < model.n_estimators:
            if not self._silent:
                print("Predict using %d trees" % self._fit_args['n_estimators'])
            self._predict = lambda args: self.staged_pred_continuous_at_n(
                x=args[1], n=self._fit_args['n_estimators'])

    elif self._model_type == "GradientBoostingClassifier":
        if model is None:
            model = GradientBoostingClassifier(**self._fit_args)
            model.fit(X=train_x, y=train_y, **extra_fit_args)
            self._model = model
        elif self._load_type == "fit_more":
            model.warm_start = True
            model.n_estimators += self._fit_args['n_estimators']
            model.fit(X=train_x, y=train_y)
            self._model = model
        self._staged_predict = predict_with('staged_pred_proba')
        self._predict = predict_with('pred_proba')
        if self._load_type == "pred_at" and \
                self._fit_args['n_estimators'] < model.n_estimators:
            if not self._silent:
                print("Predict using %d trees" % self._fit_args['n_estimators'])
            self._predict = lambda args: self.staged_pred_proba_at_n(
                x=args[1], n=self._fit_args['n_estimators'])

    elif self._model_type == "LogisticRegression":
        if model is None:
            if self._data_balance is True:
                self._fit_args["class_weight"] = "balanced"
            model = LogisticRegression(**self._fit_args)
            model.fit(X=train_x, y=train_y)
        self._model = model
        self._predict = predict_with('pred_proba')
        self._staged_predict = single_stage

    elif self._model_type == "SVC":
        if model is None:
            if self._data_balance is True:
                self._fit_args["class_weight"] = "balanced"
            model = sklearn.svm.SVC(**self._fit_args)
            model.fit(X=train_x, y=train_y)
        self._model = model
        self._predict = predict_with('pred_proba')
        self._staged_predict = single_stage

    elif self._model_type == "CNN":
        if model is None:
            train_data = load_pd_df(self._input_dir + '/train.csv')
            indices, max_len = self.process_date_list(
                train_data['Date'].map(
                    lambda x: datetime.datetime.strptime(x, '%Y-%m-%d')))
            self._feature_size = [train_x.shape[1], max_len]
            NB_FILTER = [64, 128]
            NB_Size = [4, 3, 3]
            FULLY_CONNECTED_UNIT = 256
            # NOTE(review): this mixes Keras 1 (``border_mode``,
            # ``W_constraint``) and Keras 2 (``kernel_regularizer``,
            # ``epochs``) argument names, and ``input_shape`` includes the
            # sample axis - verify against the installed Keras version.
            model = Sequential()
            model.add(
                Conv2D(NB_FILTER[0], (train_x.shape[1], NB_Size[0]),
                       input_shape=train_x.shape,
                       border_mode='valid',
                       activation='relu'))
            model.add(MaxPooling2D(pool_size=(1, 3)))
            model.add(
                Conv2D(NB_FILTER[1], (1, NB_Size[1]), border_mode='valid'))
            model.add(MaxPooling2D(pool_size=(1, 3)))
            model.add(Flatten())
            model.add(
                Dense(FULLY_CONNECTED_UNIT,
                      activation='relu',
                      W_constraint=maxnorm(3),
                      kernel_regularizer=regularizers.l2(0.01)))
            model.add(Dense(2, activation='softmax'))
            model.compile(loss='categorical_crossentropy',
                          optimizer=Adamax(),
                          metrics=['accuracy'])
            model.fit(train_x, train_y, batch_size=16, epochs=50, verbose=1)
        # Previously the trained network was discarded.
        self._model = model

    elif self._model_type == "LSTM":
        if model is None:
            train_data = load_pd_df(self._input_dir + '/train.csv')
            indices, max_len = self.process_date_list(
                train_data['Date'].map(
                    lambda x: datetime.datetime.strptime(x, '%Y-%m-%d')))
            self._feature_size = [train_x.shape[1], max_len]
            # Weights inversely proportional to the class frequencies; the
            # counts are the ones computed before any bootstrap sampling.
            class_weight = {
                1: np.divide(float(n_samples),
                             float((len(counts) * counts[1]))),
                0: np.divide(float(n_samples),
                             float((len(counts) * counts[0])))
            }
            model = CNN_LSTM(
                (self._feature_size[0], 4),
                (None, self._feature_size[0], self._feature_size[1], 1))
            model.fit_generator(
                generator=self.generator(train_x, train_y, indices, max_len),
                epochs=20,
                class_weight=class_weight,
                # ``//`` keeps this an integer on Python 3 as well.
                steps_per_epoch=train_x.shape[0] // self._batch_size)
        self._model = model

    elif self._model_type == "Pipeline":
        if model is None:
            model = Pipeline([
                ('pre_process',
                 get_class(self._fit_args['pre_process']['name'])(
                     self._fit_args['pre_process']['args'])),
                ('model', get_class(self._fit_args['model']['name'])(
                    self._fit_args['model']['args']))
            ])
            model.fit(X=train_x, y=train_y)
        self._model = model
        self._predict = predict_with('pred_proba')
        self._staged_predict = single_stage

    if not self._silent:
        stop = timeit.default_timer()
        print("Train time: %d s" % (stop - start))
def train_data(X_train, X_test, y_train, y_test, prompt):
    """Fit the regressor selected by ``prompt`` and score it on the test set.

    Parameters
    ----------
    X_train, X_test
        Training and test features, passed to ``fit`` / ``predict``.
    y_train, y_test
        Training and test targets. For the SVR variants they are indexed as
        ``y[:, i]`` for ``i`` in 0..2, one model fit per target column.
    prompt : str
        Model key: ``"RF"``, ``"KNN"``, ``"DT"``, ``"LR"``, ``"LSVR"``,
        ``"RBFSVR"``, ``"PSVR"`` or ``"ANN"``.

    Returns
    -------
    tuple
        ``(y_pred, regressor, model, result, coffs, intercepts)`` where
        ``regressor`` is the display name and ``result`` the R2 score rounded
        to three decimals (the mean over the three targets for SVR).
        ``coffs`` and ``intercepts`` are per-target lists for SVR and ``None``
        otherwise. For SVR, ``model`` is the estimator as fitted on the last
        target column.

    Raises
    ------
    ValueError
        If ``prompt`` is not a known model key. This used to be
        ``assert False``, which is stripped under ``python -O``.
    """
    # Display name and lazy constructor for each scikit-learn model.
    sklearn_models = {
        "RF": ("Random Forest", lambda: RandomForestRegressor(max_depth=20)),
        "KNN": ("KNN", lambda: KNeighborsRegressor(n_neighbors=5)),
        "DT": ("Decision Tree", lambda: DecisionTreeRegressor(max_depth=10)),
        "LR": ("Linear Regression", lambda: LinearRegression()),
        "LSVR": ("Linear SVR", lambda: SVR(kernel='linear')),
        "RBFSVR": ("RBF Kernel SVR", lambda: SVR(kernel='rbf')),
        "PSVR": ("Polynomial SVR", lambda: SVR(kernel='poly')),
    }

    if prompt in sklearn_models:
        regressor, build = sklearn_models[prompt]
        model = build()
    elif prompt == "ANN":
        regressor = "ANN"
        model = Sequential()
        model.add(Dense(input_dim=4, units=6, activation='tanh'))
        model.add(Dense(units=4, activation='tanh'))
        model.add(Dense(units=4, activation='tanh'))
        model.add(Dense(units=3, activation='relu'))
        model.compile(loss="mse", metrics=['mae'], optimizer='adam')
    else:
        print("Please enter a valid regression model!")
        raise ValueError(f"Unknown regression model: {prompt!r}")

    print(f"\nWORKING FOR {regressor.upper()} MODEL")

    coffs, intercepts = None, None
    if "SVR" in prompt:
        # SVR is single-output, so the same estimator is refitted per target.
        n_targets = 3
        y_pred = []
        r2s = []
        coffs = []
        intercepts = []
        for i in range(n_targets):
            model.fit(X_train, y_train[:, i])
            target_pred = model.predict(X_test)  # predict once, reuse below
            y_pred.append(target_pred)
            r2s.append(round(r2_score(y_test[:, i], target_pred), 3))
            if prompt == "LSVR":
                coffs.append(np.round(model.coef_, 3))
            else:
                # The non-linear kernels record dual coefficients instead.
                coffs.append(np.round(model.dual_coef_, 3))
            intercepts.append(np.round(model.intercept_[0], 3))
        y_pred = np.array(y_pred).T
        result = round(np.array(r2s).mean(), 3)
    else:
        if prompt == "ANN":
            model.fit(X_train, y_train, validation_split=0.1, epochs=100,
                      verbose=0)
        else:
            model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        result = round(r2_score(y_test, y_pred), 3)
    print(f"R2 Score: {result}")

    return y_pred, regressor, model, result, coffs, intercepts
def get_baseline_loss(self):
    """
    Trains the selected ML-algorithm with its default hyperparameters on the training set and evaluates it on
    the validation set (baseline).
    :return: baseline_loss: float
        Value of self.metric for the validation targets and the baseline predictions
    :raises Exception: if self.ml_algorithm is not one of the supported algorithms
    """
    algorithm = self.ml_algorithm

    # Default-configured estimators sharing the plain fit / predict interface (built lazily)
    simple_estimators = {
        'RandomForestRegressor': lambda: RandomForestRegressor(random_state=0),
        'SVR': lambda: SVR(),
        'AdaBoostRegressor': lambda: AdaBoostRegressor(random_state=0),
        'DecisionTreeRegressor': lambda: DecisionTreeRegressor(random_state=0),
        'LinearRegression': lambda: LinearRegression(),
        'KNNRegressor': lambda: KNeighborsRegressor(),
        'XGBoostRegressor': lambda: XGBRegressor(random_state=0),
    }

    if algorithm in simple_estimators:
        estimator = simple_estimators[algorithm]()
        estimator.fit(self.x_train, self.y_train)

    elif algorithm == 'KerasRegressor':
        # >>> What are default parameters for a keras model?
        # Baseline regression model from: https://www.tensorflow.org/tutorials/keras/regression#full_model
        estimator = keras.Sequential()
        n_features = len(self.x_train.keys())
        estimator.add(keras.layers.InputLayer(input_shape=n_features))
        for _ in range(2):
            estimator.add(keras.layers.Dense(64, activation='relu'))
        estimator.add(keras.layers.Dense(1))

        estimator.compile(loss='mse', optimizer=keras.optimizers.Adam(0.001))
        estimator.fit(self.x_train, self.y_train, epochs=100,
                      validation_data=(self.x_val, self.y_val), verbose=0)

    else:
        raise Exception('Unknown ML-algorithm!')

    # Add remaining ML-algorithms here

    predictions = estimator.predict(self.x_val)
    return self.metric(self.y_val, predictions)
def create_new(cls, algorithm, par):
    """
    Create new untrained model.

    Parameters:
    ----------
    algorithm: str (rf/pca_rf/ann/cnn)
        which algorithm will be used
    par: dict
        For RF:
        - n_estimators: suggest using 1000
        - max_features: suggest using 25
        For ANN:
        - neurons: suggest using [200]
        - l2_lambda: suggest using 0
        For CNN:
        - conv_nodes: suggest using [32, 128]
        - dense_nodes: suggest using [512, 512]
        - dropout_p: suggest using [0.1, 0.5]

    Returns:
    -------
    The result of ``cls(model, algorithm)``.

    Raises:
    ------
    ValueError
        If `algorithm` is not one of the supported names (previously this
        surfaced as an UnboundLocalError).
    """
    if algorithm not in ('rf', 'pca_rf', 'ann', 'cnn'):
        raise ValueError('Unknown algorithm: %r' % (algorithm,))

    seed = 369
    if (algorithm == 'rf') or (algorithm == 'pca_rf'):
        model = RandomForestRegressor(n_estimators=par['n_estimators'],
                                      max_features=par['max_features'],
                                      min_samples_leaf=1,
                                      random_state=seed,
                                      n_jobs=-1)
    elif algorithm == 'ann':
        model = models.Sequential()
        model.add(
            layers.Dense(par['neurons'][0],
                         activation='relu',
                         input_shape=(441, ),
                         kernel_regularizer=regularizers.l2(
                             par['l2_lambda'])))
        # add more hidden layers if `len(neurons) > 1`
        for n in par['neurons'][1:]:
            # The regularisation strength comes from `par`; the bare name
            # `l2_lambda` used here before was undefined (NameError).
            model.add(
                layers.Dense(n,
                             activation='relu',
                             kernel_regularizer=regularizers.l2(
                                 par['l2_lambda'])))
        # add output layer
        model.add(layers.Dense(2, activation='softmax'))
        # define optimizer = RMS
        # low learning rate avoids over shoot of correction
        optimizer = optimizers.RMSprop(lr=1e-4)
        # compile model, using accuracy to fit training data
        model.compile(optimizer=optimizer,
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])
    else:  # 'cnn'
        # NOTE(review): ``Conv2D(n, 3, 3, ..., dim_ordering=...)`` follows the
        # Keras 1 calling convention; under Keras 2 the second ``3`` is read
        # as strides and ``dim_ordering`` is not accepted - confirm the
        # targeted Keras version.
        has_dropout = len(par['dropout_p']) != 0
        model = models.Sequential()
        model.add(
            Conv2D(par['conv_nodes'][0],
                   3,
                   3,
                   activation='relu',
                   input_shape=(21, 21, 1),
                   dim_ordering="tf"))
        model.add(MaxPooling2D((2, 2), dim_ordering="tf"))
        if has_dropout:
            model.add(Dropout(par['dropout_p'][0]))
        for i in par['conv_nodes'][1:]:
            model.add(
                Conv2D(i, 3, 3, activation='relu', dim_ordering="tf"))
            model.add(MaxPooling2D((2, 2), dim_ordering="tf"))
            if has_dropout:
                model.add(Dropout(par['dropout_p'][0]))
        model.add(Flatten())
        for n in par['dense_nodes']:
            model.add(Dense(n))
            model.add(Activation("relu"))
            if has_dropout:
                # Dense layers use the second dropout probability.
                model.add(Dropout(par['dropout_p'][1]))
        model.add(Dense(2))
        model.add(Activation('softmax'))
        lr = 0.1
        decay = lr / 50
        optimizer = optimizers.SGD(lr=lr, decay=decay, nesterov=True)
        model.compile(optimizer=optimizer,
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])
    return cls(model, algorithm)
def main():
    """Run the configured model end to end: load, sample, difference, fit, report.

    All inputs come from module-level configuration (``read_data_file``,
    ``read_col_info_file``, ``read_model_info_file``, ``train_num``,
    ``test_num``). The model is chosen by the ``model_name`` value returned
    from ``read_data_info``; results are handed to the ``get_*_results``
    helpers. An unrecognised model name only prints a hint.
    """
    # Split the data
    dummy_list, x_val, y_val, id_val, month_val, Train, Predict, model_name, month_name = read_data_info(
        read_data_file, read_col_info_file, read_model_info_file)

    # Work on a sample (optional)
    Train, Predict = small_sample(train_num, test_num, Train, Predict)

    # make dataset
    X_Train_df, y_Train_df, X_Predict_df, y_Predict_df = make_model_df(
        dummy_list, x_val, y_val, id_val, month_val, Train, Predict,
        month_name)

    # Differencing
    X_Train_df, y_Train_df, X_Predict_df, y_Predict_df = get_diff_df(
        X_Train_df, X_Predict_df, y_Train_df, y_Predict_df)

    # -------------- Model selection -------------------
    if model_name == 'logit':  # logistic regression
        # The current dataset is not in a form suited to logit, so the target
        # is binarised around its mean just to exercise the model.
        # Both masks are computed before any assignment: the old code set
        # above-mean values to 1 and then zeroed only values > 1, so values
        # at or below the mean were never mapped to 0.
        target = y_Train_df[y_val]
        above_mean = target > np.mean(target)
        at_or_below_mean = ~above_mean & target.notna()
        y_Train_df.loc[above_mean, y_val] = 1
        y_Train_df.loc[at_or_below_mean, y_val] = 0
        model = sm.Logit(y_Train_df, X_Train_df).fit()
        get_simple_results(model, X_Predict_df, y_val, Predict)

    elif model_name == 'MNlogit':  # multinomial logistic regression
        model = sm.MNLogit(y_Train_df, X_Train_df).fit()
        get_simple_results(model, X_Predict_df, y_val, Predict)

    elif model_name == 'OLS':  # linear regression
        model = sm.OLS(y_Train_df, X_Train_df).fit()
        get_simple_results(model, X_Predict_df, y_val, Predict)

    elif model_name == 'Random_fore':  # random forest
        model = RandomForestRegressor(max_depth=2, random_state=0).fit(
            X_Train_df, y_Train_df)
        get_model_results(model, X_Predict_df)

    # What we currently need: automated regression modelling via Auto_reg.
    # For the auto models the caller has to choose explicitly between the
    # regression variant (prediction) and the classification variant.
    elif model_name == 'Auto_classi':
        model = GoClassify(n_best=1).train(X_Train_df, y_Train_df)
        get_model_results(model, X_Predict_df)

    elif model_name == 'Auto_reg':
        model = GoRegress(n_best=1).train(X_Train_df, y_Train_df)
        get_model_results(model, X_Predict_df)

    # Neural network (deep learning)
    elif model_name == 'Neural_net':
        # Inputs could alternatively be scaled with a StandardScaler; if that
        # is applied, the predictions must be inverse-transformed afterwards
        # (see the LSTM code).

        # Initialising the ANN
        model = Sequential()

        # Adding the input layer and the first hidden layer
        model.add(
            Dense(10,
                  activation='relu',
                  kernel_initializer='normal',
                  input_dim=X_Train_df.shape[1]))

        # Adding the second hidden layer
        model.add(Dense(units=8, activation='relu'))

        # A third hidden layer and dropout can be added here if needed.

        # Adding the output layer
        # NOTE(review): a ReLU output cannot produce negative predictions and
        # 'accuracy' is not meaningful with an MSE loss - confirm both are
        # intended for this regression target.
        model.add(Dense(units=1, activation='relu'))
        model.compile(optimizer='rmsprop',
                      loss='mean_squared_error',
                      metrics=['accuracy'])
        # No callbacks are used; add them following the LSTM code if needed.
        model.fit(X_Train_df,
                  y_Train_df,
                  batch_size=10,
                  epochs=150,
                  verbose=0)
        get_neural_results(model, X_Predict_df)

    else:
        print('Please select your data model')
profit += x_test.iloc[i]['odd_home'] profit -= 1 if x_test.iloc[i]['prediction'] < 0.3 and x_test.iloc[i]['prediction'] > 0. and x_test.iloc[i]['odd_away'] > 1.4: if x_test.iloc[i]['result'] == 0: profit += x_test.iloc[i]['odd_away'] profit -= 1 print('Profit_rf: ', profit) predict_columns = ['elo', 'elo_recent', 'elo_surf', 'prob_g', 'prob_g_rec', 'lose12', 'p_gamma', 'p_gamma_rec', 'p_gamma_surf', 'p_gamma_time', 'set_score', 'match_score', 'p_gamma_rec_p5', 'p_gamma_rec_m5', 'd_dif', 'freq_home', 'freq_away', 'fatigue_home', 'fatigue_away', 'win_perc', 'set_perc', 'game_perc', '1st_lose_win', '1st_win_lose', 'p_gamma_simple', 'p_gamma_simplest', 'p_gamma_simple_surf', 'p_gamma_simplest_surf', 'age_dif'] model = Sequential() model.add(Dense(32, input_dim=29, activation='relu')) model.add(Dense(32, activation='relu')) model.add(Dense(1, activation='sigmoid')) # Compile model model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy']) model.fit(x_train[predict_columns].values, y_train.values, epochs = 400, batch_size = 10) scores = model.evaluate(x_train[predict_columns], y_train) y_pred_keras = model.predict_proba(x_test[predict_columns]) print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) x_test['prediction_keras'] = y_pred_keras