Example #1
def main():

    # Load processed data

    # You could use the following script to generate well-processed train and test data sets:
    # https://www.kaggle.com/yassinealouini/predicting-red-hat-business-value/features-processing
    # I have only used the .head() of the data sets since the processing takes a long time to run.
    # I have also used the act_train and act_test data sets since I don't have the processed data
    # sets loaded.

    print("Loading data...")
    tour = 89
    # nrows = 1000  # uncomment to load only a subset of rows for quick debugging
    nrows = None
    # Load the data from the CSV files
    training_data = data_helper.load_training_data(tour, nrows=nrows)

    # training_data = training_data.sample(frac=0.1)
    #prediction_data = data_helper.load_testing_data(tour, nrows=nrows)

    X, y = data_helper.get_Xy(training_data)

    #-------------------------------------------------#
    # Extract the train and valid (used for validation) dataframes from the train_df
    #-------------------------------------------------#

    # Run the optimization
    # Trials object where the history of search will be stored
    # For the time being, there is a bug with the following version of hyperopt.
    # You can read the error message in the log file.
    # For the curious, you can read more about it here: https://github.com/hyperopt/hyperopt/issues/234
    # => So I am commenting it out.
    trials = Trials()

    # score = make_scorer(X, y)
    score = make_cv_scorer(X, y, cv=5)
    best_hyperparams = optimize(score, trials, max_evals=50)
    print("The best hyperparameters are:")
    print(best_hyperparams)

    print('trials:')
    for t in trials.trials:
        print(t)
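For context, here is a minimal sketch of what the make_cv_scorer and optimize helpers could look like, assuming hyperopt's fmin/tpe and an XGBClassifier search space; the real helpers in this project may differ.

import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK
from sklearn.model_selection import cross_val_score


def make_cv_scorer(X, y, cv=5):
    """Return an objective that hyperopt minimizes: mean cross-validated log loss."""
    def score(params):
        model = xgb.XGBClassifier(
            max_depth=int(params['max_depth']),
            learning_rate=params['eta'],
            n_estimators=int(params['n_estimators']))
        loss = -cross_val_score(model, X, y, cv=cv, scoring='neg_log_loss').mean()
        return {'loss': loss, 'status': STATUS_OK}
    return score


def optimize(score, trials, max_evals=50):
    # Search space is illustrative only; tune the ranges to your data.
    space = {
        'max_depth': hp.quniform('max_depth', 2, 8, 1),
        'eta': hp.quniform('eta', 0.025, 0.3, 0.025),
        'n_estimators': hp.quniform('n_estimators', 10, 200, 10),
    }
    return fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=max_evals)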
Example #2
def train(nn):

    # nrows = 30  # uncomment to load only a subset of rows for quick debugging
    nrows = None
    training_data = data_helper.load_training_data(tour, nrows=nrows)
    # training_data = data_helper.get_random_data(nrows=5, nfeat=3)
    # training_data = data_helper.get_random_data()

    frac = None
    #frac = 0.1
    #frac = 0.01
    #frac = 0.0001
    if frac is not None:
        training_data = training_data.sample(frac=frac)

    X, y = data_helper.get_Xy(training_data, onehot=True)

    batch_count = 50000
    batch_size = 5000
    train_model(nn, X, y, batch_size=batch_size, batch_count=batch_count)
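Below is a hedged sketch of a train_model-style minibatch loop; the nn object and its train_on_batch method are assumptions, not this project's actual interface.

import numpy as np

def train_model(nn, X, y, batch_size=5000, batch_count=50000):
    # Assumes X and y are numpy arrays and nn exposes a Keras-style
    # train_on_batch(X_batch, y_batch) method that returns the batch loss.
    n = len(X)
    for i in range(batch_count):
        idx = np.random.randint(0, n, size=batch_size)  # sample with replacement
        loss = nn.train_on_batch(X[idx], y[idx])
        if i % 1000 == 0:
            print("batch %d, loss %.5f" % (i, loss))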
Example #3
def main():
    # Set seed for reproducibility
    np.random.seed(0)

    print("Loading data...")
    tour = 90
    nrows = None
    # Load the data from the CSV files
    training_data = data_helper.load_training_data(tour, nrows=nrows)
    #training_data = pd.read_csv(data_dir + '/numerai_training_data.csv', header=0)

    #prediction_data = pd.read_csv(data_dir + '/numerai_tournament_data.csv', header=0)
    #prediction_data = data_helper.load_testing_data(tour, nrows=nrows)

    features = data_helper.get_feature_names(training_data)

    # Transform the loaded CSV data into numpy arrays

    #frac = 0.1
    #frac = 0.01
    #frac = 0.0001
    frac = None
    if frac is not None:
        training_data = training_data.sample(frac=frac)

    X, y = data_helper.get_Xy(training_data)

    xgb1 = xgb.XGBClassifier(
            learning_rate=0.1,
            n_estimators=1000,
            max_depth=5,
            min_child_weight=1,
            gamma=0,
            subsample=0.8,
            colsample_bytree=0.8,
            objective='binary:logistic',
            nthread=4,
            scale_pos_weight=1,
            seed=27)
    #modelfit(xgb1, X, y)
    tune1(xgb1, X, y)
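A minimal sketch of what a tune1-style step could do, assuming scikit-learn's GridSearchCV over the tree-depth parameters; the project's actual grid and scoring may differ.

from sklearn.model_selection import GridSearchCV

def tune1(model, X, y):
    # Coarse grid over depth-related parameters; refine around the best values.
    param_grid = {
        'max_depth': [3, 5, 7],
        'min_child_weight': [1, 3, 5],
    }
    search = GridSearchCV(model, param_grid, scoring='neg_log_loss', cv=5, n_jobs=4)
    search.fit(X, y)
    print("best params:", search.best_params_)
    print("best CV log loss:", -search.best_score_)
    return search.best_estimator_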
Example #4
import os

import tensorflow as tf
#import numpy as np
import seq2seq
import pickle
#from tensorflow.python.layers.core import Dense

import config  # project-local settings (paths, vocab sizes, batch size)
import helper  # project-local data helpers (load_vocab, load_training_data, ...)

#%%

## first, load and pad data
## load all data and vocabulary
vocab_path = os.path.join(config.PROCESSED_PATH, 'vocab.p')
train_token_path = os.path.join(config.PROCESSED_PATH, 'processed_tokens.p')
vocab_to_int, int_to_vocab = helper.load_vocab(vocab_path)
config.source_vocab_size = len(vocab_to_int)
config.target_vocab_size = len(vocab_to_int)
train_enc_tokens, train_dec_tokens, test_enc_tokens, test_dec_tokens = helper.load_training_data(
    train_token_path)
bucket_ids = helper.bucket_training_data(train_enc_tokens,
                                         config.max_conv_length)
batches = helper.make_batches_of_bucket_ids(bucket_ids, config.batch_size)

## get a batch of data and pad them

#%%
## build the network

# create input placeholders
input_data, targets, lr, keep_prob, target_sequence_length, max_target_sequence_length, source_sequence_length, hrnn_sequence_length = seq2seq.model_inputs(
)

# get input shape
input_shape = tf.shape(input_data)
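For reference, a hedged sketch of what seq2seq.model_inputs could return in this TF1-style code; the exact shapes and the hrnn_sequence_length placeholder are assumptions.

def model_inputs():
    # Batch-major integer token ids for the encoder and decoder.
    input_data = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    lr = tf.placeholder(tf.float32, name='learning_rate')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    # Per-example sequence lengths, plus the longest target length in the batch.
    target_sequence_length = tf.placeholder(tf.int32, [None], name='target_sequence_length')
    max_target_sequence_length = tf.reduce_max(target_sequence_length, name='max_target_len')
    source_sequence_length = tf.placeholder(tf.int32, [None], name='source_sequence_length')
    hrnn_sequence_length = tf.placeholder(tf.int32, [None], name='hrnn_sequence_length')
    return (input_data, targets, lr, keep_prob, target_sequence_length,
            max_target_sequence_length, source_sequence_length, hrnn_sequence_length)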
Example #5
def main():
    # Set seed for reproducibility
    np.random.seed(0)

    print("Loading data...")
    nrows = None
    # Load the data from the CSV files
    training_data = data_helper.load_training_data(tour, nrows=nrows)
    #training_data = pd.read_csv(data_dir + '/numerai_training_data.csv', header=0)

    #prediction_data = pd.read_csv(data_dir + '/numerai_tournament_data.csv', header=0)
    prediction_data = data_helper.load_testing_data(tour, nrows=nrows)

    features = data_helper.get_feature_names(training_data)

    # Transform the loaded CSV data into numpy arrays

    #frac = 0.1
    #frac = 0.01
    #frac = 0.0001
    frac = None
    if frac is not None:
        training_data = training_data.sample(frac=frac)

    X, y = data_helper.get_Xy(training_data)

    # This is your model that will learn to predict
    rfc = RandomForestClassifier(
        #max_depth=None,
        max_depth=10,
        n_estimators=20,
        max_features=1,
        min_samples_split=500,
        min_samples_leaf=50,
        n_jobs=1)

    adbr = AdaBoostClassifier()

    #clf = svm.SVC(kernel='poly', gamma=1, probability=True)
    svc_linear = svm.SVC(kernel='linear', gamma=0.1, probability=True)
    #clf = svm.SVC(kernel='rbf', gamma=1, probability=True)

    #clf = Pipeline([('poly', PolynomialFeatures(degree=2, include_bias=False)),
    #    ('logistic', linear_model.LogisticRegression(fit_intercept=True))])

    lr = linear_model.LogisticRegression()
    lr1 = linear_model.LogisticRegression(C=1000)

    xgbr = XGBClassifier(n_jobs=2)
    xgbr_linear = XGBClassifier(n_jobs=2, booster='gblinear')
    xgbr2 = XGBClassifier(max_depth=3, n_estimators=200)

    xgbr_opt_param = {
        'eval_metric': 'logloss',
        'objective': 'binary:logistic',
        # Increase this number if you have more cores. Otherwise, remove it and it will default
        # to the maximum number.
        'nthread': 2,
        'booster': 'gbtree',
        'tree_method': 'exact',
        'silent': 1,
        'seed': SEED,
    }
    tmp = {
        'eta': 0.07500000000000001,
        'n_estimators': 21.0,
        'gamma': 0.75,
        'colsample_bytree': 0.8500000000000001,
        'min_child_weight': 4.0,
        'subsample': 0.75,
        'max_depth': 3
    }
    xgbr_opt_param.update(tmp)
    xgbr_opt = XGBClassifier(**(xgboost_param_convert(xgbr_opt_param)))

    #model = (Cutter(method='qcut', nbins=100),
    #        OneHotEncoder(handle_unknown='ignore'))
    #model += (clf,)
    models = [
        #('logistic', lr),
        #('logistic-C1000', lr1),
        #('linear SVC', svc_linear),  too slow
        #('randomforest', rfc),
        #('adaboost', adbr),
        #('xgbr-default', xgbr),
        ('xgbr-linear', xgbr_linear),
        # ('xgbr-200', xgbr2),
        # ('xgbr-opt', xgbr_opt),
    ]

    perfs = train_eval_model(models, X, y, prediction_data, features=features)
    print_perfs(models, perfs)
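A minimal sketch of a train_eval_model / print_perfs pair, assuming a cross-validated log-loss comparison; the real helpers also make use of prediction_data and features, which are ignored here.

from sklearn.model_selection import cross_val_score

def train_eval_model(models, X, y, prediction_data=None, features=None):
    # Score each (name, model) pair with mean cross-validated log loss.
    perfs = []
    for name, model in models:
        loss = -cross_val_score(model, X, y, cv=3, scoring='neg_log_loss').mean()
        perfs.append(loss)
    return perfs

def print_perfs(models, perfs):
    for (name, _), loss in zip(models, perfs):
        print("%-20s log loss: %.5f" % (name, loss))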