# Imports assumed from the surrounding module; slidingWindow and config
# (which provides config.target_cols) are project-local helpers defined
# elsewhere in the repo.
import operator

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras import callbacks
from keras.layers import (Activation, BatchNormalization, Dense, Dropout,
                          LSTM, PReLU, TimeDistributed)
from keras.models import Sequential


def train_predict_reg(X_train, y_train, X_val, params, weights=None):
    if weights is None:
        weights = np.ones(len(y_train))
    features = list(X_train.columns)
    X_val = X_val.copy()
    X_train = X_train.copy()

    if 'reg_skl_etr' in params["model"]:
        # trees cannot handle NaN/inf: impute with a sentinel, clip infinities
        X_train = X_train.fillna(-999.0).replace([-np.inf, np.inf], [-10000, 10000])
        X_val = X_val.fillna(-999.0).replace([-np.inf, np.inf], [-10000, 10000])
        clf = ExtraTreesRegressor(
            n_estimators=int(params['n_estimators']),
            min_samples_leaf=max(1, int(params['min_samples_leaf'])),
            max_features=params['max_features'],
            max_depth=int(params['max_depth']) if 'max_depth' in params else None,
            random_state=params['random_state'],
            n_jobs=params['n_jobs'])
        clf.fit(X_train, y_train)
        print(sorted(zip(features, clf.feature_importances_),
                     key=lambda x: x[1], reverse=True))
        y_val_prob = clf.predict(X_val)
        return clf, y_val_prob, None

    if 'reg_skl_rfr' in params["model"]:
        X_train = X_train.fillna(-999.0).replace([-np.inf, np.inf], [-10000, 10000])
        X_val = X_val.fillna(-999.0).replace([-np.inf, np.inf], [-10000, 10000])
        clf = RandomForestRegressor(
            n_estimators=int(params['n_estimators']),
            min_samples_leaf=max(1, int(params['min_samples_leaf'])),
            max_features=params['max_features'],
            max_depth=int(params['max_depth']) if 'max_depth' in params else None,
            random_state=params['random_state'],
            n_jobs=params['n_jobs'])
        clf.fit(X_train, y_train)
        print(sorted(zip(features, clf.feature_importances_),
                     key=lambda x: x[1], reverse=True))
        y_val_prob = clf.predict(X_val)
        return clf, y_val_prob, None

    if params["model"] == 'reg_keras_dnn':
        # mean-impute and standardize features; center (but do not scale) targets
        X_train = X_train.replace([np.inf, -np.inf], np.nan)
        X_val = X_val.replace([np.inf, -np.inf], np.nan)
        X_train = X_train.fillna(X_train.mean())
        X_val = X_val.fillna(X_val.mean())
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)
        y_scaler = StandardScaler(with_std=False)
        y_train = y_scaler.fit_transform(y_train)

        model = Sequential()
        # input layer
        model.add(Dropout(params["input_dropout"], input_shape=(X_train.shape[1],)))
        hidden_layers = params['hidden_layers']
        units = params["hidden_units"]
        while hidden_layers > 0:
            model.add(Dense(units, kernel_initializer='glorot_uniform'))
            if params["batch_norm"]:
                model.add(BatchNormalization())
            if params["hidden_activation"] == "prelu":
                model.add(PReLU())
            else:
                model.add(Activation(params['hidden_activation']))
            model.add(Dropout(params["hidden_dropout"]))
            hidden_layers -= 1
        # sigmoid squashing layer ahead of the output
        model.add(Dense(33, kernel_initializer='glorot_uniform', activation='sigmoid'))
        # output layer: 33 linear units, one per target column
        model.add(Dense(33, kernel_initializer='glorot_uniform', activation='linear'))
        model.compile(loss='mean_squared_error', optimizer='adam')

        X_es_train, X_es_eval, y_es_train, y_es_eval = train_test_split(
            X_train, y_train, test_size=0.1, random_state=0)
        if params['early_stopping']:
            early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=5,
                                                     verbose=2, mode='auto')
            model.fit(X_es_train, y_es_train,
                      epochs=params['nb_epoch'], batch_size=params['batch_size'],
                      callbacks=[early_stopping],
                      validation_data=(X_es_eval, y_es_eval), verbose=2)
        else:
            model.fit(X_train, y_train,
                      epochs=params['nb_epoch'], batch_size=params['batch_size'],
                      validation_split=0.1, verbose=2)
        # predict, then undo the target centering
        pred = model.predict(X_val, verbose=0)
        pred = y_scaler.inverse_transform(pred)
        return model, pred, None
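    # Note on the LSTM branch below: slidingWindow is assumed to be a
    # project-local generator yielding windows of shape
    # (sequence_length, n_features). Because the LSTM is stateful with a fixed
    # batch_input_shape, the leading `start` rows are dropped so the remaining
    # row count divides evenly into batch_size * sequence_length chunks.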
    if params["model"] == 'reg_keras_lstm':
        scaler = StandardScaler()
        X_train = pd.DataFrame(scaler.fit_transform(X_train.fillna(0).values),
                               columns=X_train.columns, index=X_train.index)
        X_val = pd.DataFrame(scaler.transform(X_val.fillna(0).values),
                             columns=X_val.columns, index=X_val.index)
        num_units = params["hidden_units"]
        sequence_length = params['sequence_length']
        input_dim = X_train.shape[1]
        output_dim = y_train.shape[1]
        batch_size = params['batch_size']
        backwards = params.get('backwards', False)
        print("SPECS:")
        print("  num_units (LSTM)", num_units)
        print("  sequence_length", sequence_length)
        print("  input_dim (X)", input_dim)
        print("  output_dim (Y)", output_dim)
        print("  batch_size", batch_size)
        print("X_train len", len(X_train))

        start = len(X_train.index) % (batch_size * sequence_length)
        X_train_window_generator = slidingWindow(X_train.iloc[start:], sequence_length)
        y_train_window_generator = slidingWindow(y_train.iloc[start:], sequence_length)

        print('Build model...')
        model = Sequential()
        model.add(LSTM(num_units,
                       batch_input_shape=(batch_size, sequence_length, input_dim),
                       return_sequences=True, stateful=True, go_backwards=backwards))
        if "2-lstm" in params:
            # optional second stacked LSTM
            model.add(TimeDistributed(Dense(num_units, activation='relu')))
            model.add(LSTM(num_units,
                           batch_input_shape=(batch_size, sequence_length, input_dim),
                           return_sequences=True, stateful=True, go_backwards=backwards))
        model.add(TimeDistributed(Dense(num_units, activation='relu')))
        model.add(Dropout(params['hidden_dropout']))
        model.add(TimeDistributed(Dense(32, activation='sigmoid')))
        model.add(TimeDistributed(Dense(output_dim, activation='linear')))
        model.compile(loss='mse', optimizer='rmsprop')

        X_seq = list(X_train_window_generator)
        Y_seq = list(y_train_window_generator)
        if backwards:
            X_seq.reverse()
            Y_seq.reverse()
        model.fit(np.asarray(X_seq), np.asarray(Y_seq), batch_size=batch_size,
                  verbose=1, epochs=params['nb_epoch'], shuffle=False)

        # predict over train + validation so the stateful LSTM sees one
        # contiguous sequence, then keep only the tail that maps to X_val
        X_merged = pd.concat([X_train, X_val])
        print(len(X_merged.index))
        start = len(X_merged.index) % (batch_size * sequence_length)
        dataX = list(slidingWindow(X_merged.iloc[start:], sequence_length))
        if backwards:
            dataX.reverse()
        Y_hat = model.predict(np.asarray(dataX), batch_size=batch_size, verbose=1)
        Y_hat1 = np.vstack(Y_hat)
        if backwards:
            Y_hat1 = Y_hat1[::-1, :]
        res = Y_hat1[-len(X_val.index):, :]
        return None, res, None
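    # Note on the XGBoost branch below: instead of one model per target, the
    # training set is replicated once per column in config.target_cols with an
    # integer 'out' feature identifying the target, so a single booster
    # predicts all targets; the flat prediction vector is reshaped back to
    # (n_samples, n_targets) at the end.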
    if params["model"] in ['reg_xgb_tree']:
        X_train_sets = []
        y_train_sets = []
        X_test_sets = []
        for ix, col in enumerate(config.target_cols):
            X_train_col = X_train.copy()
            X_test_col = X_val.copy()
            X_train_col['out'] = ix
            X_test_col['out'] = ix
            X_test_sets.append(X_test_col)
            X_train_sets.append(X_train_col)
            y_train_sets.append(y_train[col].values)
        X_train = pd.concat(X_train_sets)
        X_val = pd.concat(X_test_sets)
        y_train = np.concatenate(y_train_sets)

        X_train_xgb = X_train.fillna(-999.0)
        X_val_xgb = X_val.fillna(-999.0)
        params['num_round'] = max(params['num_round'], 10)
        params['nthread'] = params['n_jobs']
        params['seed'] = params['random_state']
        X_es_train, X_es_eval, y_es_train, y_es_eval = train_test_split(
            X_train_xgb, y_train, test_size=0.2, random_state=0)
        dtrain_base = xgb.DMatrix(X_es_train, label=y_es_train,
                                  feature_names=list(X_es_train.columns))
        dvalid_base = xgb.DMatrix(X_es_eval, label=y_es_eval,
                                  feature_names=list(X_es_eval.columns))
        dtest_base = xgb.DMatrix(X_val_xgb, feature_names=list(X_es_eval.columns))
        watchlist = [(dtrain_base, 'train'), (dvalid_base, 'valid')]
        if params['early_stopping']:
            model = xgb.train(params, dtrain_base, int(params['num_round']),
                              watchlist, early_stopping_rounds=20)
        else:
            model = xgb.train(params, dtrain_base, int(params['num_round']), watchlist)
        importance = sorted(model.get_fscore().items(), key=operator.itemgetter(1))
        print(importance)
        y_val_prob = model.predict(dtest_base)
        # un-stack the flat predictions into one column per target
        y_val_prob = y_val_prob.reshape((len(config.target_cols), -1)).T
        return model, y_val_prob, None
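# Minimal usage sketch with hypothetical data and parameter values (the
# demo_params keys mirror what the 'reg_skl_etr' branch reads; none of the
# names below are part of the repo itself): fit an ExtraTrees model on random
# features and score the held-out tail.
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    X = pd.DataFrame(rng.randn(200, 5), columns=['f%d' % i for i in range(5)])
    y = X['f0'] * 2.0 + rng.randn(200) * 0.1
    demo_params = {
        'model': 'reg_skl_etr',
        'n_estimators': 100,
        'min_samples_leaf': 1,
        'max_features': 0.8,
        'random_state': 0,
        'n_jobs': 1,
    }
    clf, y_val_pred, _ = train_predict_reg(X.iloc[:150], y.iloc[:150],
                                           X.iloc[150:], demo_params)
    rmse = np.sqrt(np.mean((y_val_pred - y.iloc[150:].values) ** 2))
    print("validation RMSE:", rmse)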