def main():
    # Load processed data
    # You could use the following script to generate well-processed train and test data sets:
    # https://www.kaggle.com/yassinealouini/predicting-red-hat-business-value/features-processing
    # I have only used the .head() of the data sets since the process takes a long time to run.
    # I have also used the act_train and act_test data sets since I don't have the processed
    # data sets loaded.
    print("Loading data...")
    tour = 89
    #nrows = 1000
    nrows = None

    # Load the data from the CSV files
    training_data = data_helper.load_training_data(tour, nrows=nrows)
    # training_data = training_data.sample(frac=0.1)
    #prediction_data = data_helper.load_testing_data(tour, nrows=nrows)

    X, y = data_helper.get_Xy(training_data)

    #-------------------------------------------------#
    # Extract the train and valid (used for validation) dataframes from the train_df
    #-------------------------------------------------#

    # Run the optimization.
    # Trials object where the history of the search will be stored.
    # For the time being, there is a bug with the following version of hyperopt.
    # You can read the error message in the log file.
    # For the curious, you can read more about it here: https://github.com/hyperopt/hyperopt/issues/234
    # => So I am commenting it.
    trials = Trials()

    # score = make_scorer(X, y)
    score = make_cv_scorer(X, y, cv=5)
    best_hyperparams = optimize(score, trials, max_evals=50)
    print("The best hyperparameters are:")
    print(best_hyperparams)
    print('trials:')
    for t in trials.trials:
        print(t)
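# The helpers `make_cv_scorer` and `optimize` used above are defined elsewhere in the
# project and are not shown in this snippet. Below is a minimal sketch of how they
# could be implemented with hyperopt and scikit-learn; the search space, the use of
# XGBClassifier, and the parameter names are assumptions for illustration, not the
# project's actual definitions.
from hyperopt import fmin, hp, tpe, STATUS_OK
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier


def make_cv_scorer(X, y, cv=5):
    """Return an objective that scores a hyperparameter dict with k-fold CV."""
    def score(params):
        model = XGBClassifier(
            max_depth=int(params['max_depth']),
            learning_rate=params['eta'],
            n_estimators=int(params['n_estimators']),
        )
        # hyperopt minimizes, so return the (positive) mean log-loss
        loss = -cross_val_score(model, X, y, cv=cv, scoring='neg_log_loss').mean()
        return {'loss': loss, 'status': STATUS_OK}
    return score


def optimize(score, trials, max_evals=50):
    """Run a TPE search over a small example space and return the best point found."""
    space = {
        'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
        'max_depth': hp.quniform('max_depth', 1, 13, 1),
        'n_estimators': hp.quniform('n_estimators', 10, 200, 1),
    }
    return fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=max_evals)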
def train(nn):
    #nrows = 30
    nrows = None
    # NOTE: `tour` is expected to be defined at module level; it is not set in this function.
    training_data = data_helper.load_training_data(tour, nrows=nrows)
    # training_data = data_helper.get_random_data(nrows=5, nfeat=3)
    # training_data = data_helper.get_random_data()

    frac = None
    #frac = 0.1
    #frac = 0.01
    #frac = 0.0001
    if frac is not None:
        training_data = training_data.sample(frac=frac)

    X, y = data_helper.get_Xy(training_data, onehot=True)

    batch_count = 50000
    batch_size = 5000
    train_model(nn, X, y, batch_size=batch_size, batch_count=batch_count)
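# `train_model` is defined elsewhere in the project. Below is a minimal sketch of the
# kind of mini-batch loop it could implement, assuming X and y are NumPy arrays and
# that `nn` exposes a Keras-style `train_on_batch` method; both are assumptions for
# illustration, not the project's actual API.
import numpy as np


def train_model(nn, X, y, batch_size=5000, batch_count=50000):
    """Train `nn` on `batch_count` randomly sampled mini-batches of size `batch_size`."""
    n = len(X)
    for i in range(batch_count):
        # sample a random mini-batch with replacement
        idx = np.random.randint(0, n, size=min(batch_size, n))
        loss = nn.train_on_batch(X[idx], y[idx])
        if i % 1000 == 0:
            print("batch {}: loss {}".format(i, loss))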
def main():
    # Set seed for reproducibility
    np.random.seed(0)

    print("Loading data...")
    tour = 90
    nrows = None

    # Load the data from the CSV files
    training_data = data_helper.load_training_data(tour, nrows=nrows)
    #training_data = pd.read_csv(data_dir + '/numerai_training_data.csv', header=0)
    #prediction_data = pd.read_csv(data_dir + '/numerai_tournament_data.csv', header=0)
    #prediction_data = data_helper.load_testing_data(tour, nrows=nrows)

    features = data_helper.get_feature_names(training_data)

    # Transform the loaded CSV data into numpy arrays
    #frac = 0.1
    #frac = 0.01
    #frac = 0.0001
    frac = None
    if frac is not None:
        training_data = training_data.sample(frac=frac)

    X, y = data_helper.get_Xy(training_data)

    xgb1 = xgb.XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27)
    #modelfit(xgb1, X, y)
    tune1(xgb1, X, y)
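# `tune1` is not shown in this snippet. Below is a minimal sketch of a first tuning
# pass in the same spirit (grid search over tree depth and min_child_weight around
# the base estimator, scored by log-loss); the exact grid and the helper's real
# behavior are assumptions for illustration.
from sklearn.model_selection import GridSearchCV


def tune1(estimator, X, y):
    """Grid-search max_depth and min_child_weight around the given base estimator."""
    param_grid = {
        'max_depth': [3, 5, 7, 9],
        'min_child_weight': [1, 3, 5],
    }
    search = GridSearchCV(estimator, param_grid, scoring='neg_log_loss', cv=5, n_jobs=1)
    search.fit(X, y)
    print("Best parameters:", search.best_params_)
    print("Best CV log-loss:", -search.best_score_)
    return search.best_estimator_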
import os
import pickle

import tensorflow as tf
#import numpy as np

import config
import helper
import seq2seq
#from tensorflow.python.layers.core import Dense

#%%
## first, load and pad data
## load all data and vocabulary
vocab_path = os.path.join(config.PROCESSED_PATH, 'vocab.p')
train_token_path = os.path.join(config.PROCESSED_PATH, 'processed_tokens.p')

vocab_to_int, int_to_vocab = helper.load_vocab(vocab_path)
config.source_vocab_size = len(vocab_to_int)
config.target_vocab_size = len(vocab_to_int)

train_enc_tokens, train_dec_tokens, test_enc_tokens, test_dec_tokens = helper.load_training_data(
    train_token_path)
bucket_ids = helper.bucket_training_data(train_enc_tokens, config.max_conv_length)
batches = helper.make_batches_of_bucket_ids(bucket_ids, config.batch_size)
## get a batch of data and pad it

#%%
## build the network
# create input placeholders
(input_data, targets, lr, keep_prob, target_sequence_length, max_target_sequence_length,
 source_sequence_length, hrnn_sequence_length) = seq2seq.model_inputs()

# get input shape
input_shape = tf.shape(input_data)
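# `seq2seq.model_inputs()` lives in the project's seq2seq module. Below is a minimal
# sketch of the placeholders it could create (TensorFlow 1.x style, matching the
# unpacking above); the shapes, dtypes, and tensor names are assumptions for
# illustration, not the module's actual definition.
def model_inputs():
    input_data = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    lr = tf.placeholder(tf.float32, name='learning_rate')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    target_sequence_length = tf.placeholder(tf.int32, [None], name='target_sequence_length')
    max_target_sequence_length = tf.reduce_max(target_sequence_length, name='max_target_len')
    source_sequence_length = tf.placeholder(tf.int32, [None], name='source_sequence_length')
    hrnn_sequence_length = tf.placeholder(tf.int32, [None], name='hrnn_sequence_length')
    return (input_data, targets, lr, keep_prob, target_sequence_length,
            max_target_sequence_length, source_sequence_length, hrnn_sequence_length)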
def main():
    # Set seed for reproducibility
    np.random.seed(0)

    print("Loading data...")
    nrows = None

    # Load the data from the CSV files
    # NOTE: `tour` is expected to be defined at module level; it is not set in this function.
    training_data = data_helper.load_training_data(tour, nrows=nrows)
    #training_data = pd.read_csv(data_dir + '/numerai_training_data.csv', header=0)
    #prediction_data = pd.read_csv(data_dir + '/numerai_tournament_data.csv', header=0)
    prediction_data = data_helper.load_testing_data(tour, nrows=nrows)

    features = data_helper.get_feature_names(training_data)

    # Transform the loaded CSV data into numpy arrays
    #frac = 0.1
    #frac = 0.01
    #frac = 0.0001
    frac = None
    if frac is not None:
        training_data = training_data.sample(frac=frac)

    X, y = data_helper.get_Xy(training_data)

    # This is your model that will learn to predict
    rfc = RandomForestClassifier(
        #max_depth=None,
        max_depth=10,
        n_estimators=20,
        max_features=1,
        min_samples_split=500,
        min_samples_leaf=50,
        n_jobs=1)
    adbr = AdaBoostClassifier()
    #clf = svm.SVC(kernel='poly', gamma=1, probability=True)
    svc_linear = clf = svm.SVC(kernel='linear', gamma=0.1, probability=True)
    #clf = svm.SVC(kernel='rbf', gamma=1, probability=True)
    #clf = Pipeline([('poly', PolynomialFeatures(degree=2, include_bias=False)),
    #                ('logistic', linear_model.LogisticRegression(fit_intercept=True))])
    lr = linear_model.LogisticRegression()
    lr1 = linear_model.LogisticRegression(C=1000)

    xgbr = XGBClassifier(n_jobs=2)
    xgbr_linear = XGBClassifier(n_jobs=2, booster='gblinear')
    xgbr2 = XGBClassifier(max_depth=3, n_estimators=200)

    xgbr_opt_param = {
        'eval_metric': 'logloss',
        'objective': 'binary:logistic',
        # Increase this number if you have more cores. Otherwise, remove it and it will
        # default to the maximum number.
        'nthread': 2,
        'booster': 'gbtree',
        'tree_method': 'exact',
        'silent': 1,
        'seed': SEED,
    }
    tmp = {
        'eta': 0.07500000000000001,
        'n_estimators': 21.0,
        'gamma': 0.75,
        'colsample_bytree': 0.8500000000000001,
        'min_child_weight': 4.0,
        'subsample': 0.75,
        'max_depth': 3,
    }
    xgbr_opt_param.update(tmp)
    xgbr_opt = XGBClassifier(**(xgboost_param_convert(xgbr_opt_param)))

    #model = (Cutter(method='qcut', nbins=100),
    #         OneHotEncoder(handle_unknown='ignore'))
    #model += (clf,)

    models = [
        #('logistic', lr),
        #('logistic-C1000', lr1),
        #('linear SVC', svc_linear),  # too slow
        #('randomforest', rfc),
        #('adaboost', adbr),
        #('xgbr-default', xgbr),
        ('xgbr-linear', xgbr_linear),
        # ('xgbr-200', xgbr2),
        # ('xgbr-opt', xgbr_opt),
    ]
    perfs = train_eval_model(models, X, y, prediction_data, features=features)
    print_perfs(models, perfs)
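# `xgboost_param_convert` is defined elsewhere in the project. Judging by how it is
# used above, it likely maps a raw hyperopt parameter dict (where integer-valued
# parameters come back as floats and the learning rate is called 'eta') into keyword
# arguments accepted by XGBClassifier. A minimal sketch under that assumption:
def xgboost_param_convert(params):
    """Convert a hyperopt-style parameter dict into XGBClassifier kwargs."""
    converted = dict(params)
    # hyperopt's quniform returns floats; these parameters should be integers
    for key in ('n_estimators', 'max_depth', 'min_child_weight', 'nthread', 'seed', 'silent'):
        if key in converted:
            converted[key] = int(converted[key])
    # the sklearn wrapper calls the learning rate 'learning_rate', not 'eta'
    if 'eta' in converted:
        converted['learning_rate'] = converted.pop('eta')
    return converted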