def make_KL_slip(fault, num_modes, eigenvals, V, mean_slip, max_slip,
                 lognormal=True, maxiter=5, seed=12345):
    '''
    Make slip map using num_modes
    '''
    from numpy import sqrt, exp
    from numpy.random import randn
    from numpy.random import seed as random_seed

    iterations = 0
    success = False
    while True:
        # Generate random numbers
        # Is there a seed?
        # Note: re-seeding inside the loop means every retry draws the same z
        # when a fixed seed is given, so the max_slip retries cannot change the result.
        if seed is not None:
            random_seed(seed)
        if len(fault) > num_modes:
            z = randn(num_modes)
        else:
            # if fewer faults than requested modes then use all modes
            z = randn(len(fault))

        KL_slip = mean_slip.copy()  # start with the mean slip
        # add in the terms in the K-L expansion:
        for k in range(len(z)):
            KL_slip += z[k] * sqrt(eigenvals[k]) * V[:, k]
        # exponentiate for lognormal:
        if lognormal:
            KL_slip = exp(KL_slip)

        # Check if max_slip condition is met, if so then you're done
        if KL_slip.max() <= max_slip:
            success = True
            break
        iterations += 1
        if iterations > maxiter:
            print('... ... ... improper eigenvalues, recalculating...')
            break

    return KL_slip, success
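A minimal usage sketch for the function above, with made-up inputs: the toy fault array, exponential correlation matrix, and slip values below are placeholders for illustration only, not data from the original project.

import numpy as np

# Hypothetical inputs: 50 subfaults, a toy covariance eigendecomposition,
# a uniform 1 m mean (log-)slip and a 10 m cap on peak slip.
nsub = 50
fault = np.arange(nsub)                               # stands in for the subfault list
dist = np.abs(np.subtract.outer(np.arange(nsub), np.arange(nsub)))
eigenvals, V = np.linalg.eigh(np.exp(-dist / 10.0))   # eigendecomposition of a toy covariance
order = eigenvals.argsort()[::-1]                     # largest eigenvalues first
eigenvals, V = eigenvals[order], V[:, order]
mean_slip = np.ones(nsub)

KL_slip, success = make_KL_slip(fault, num_modes=20, eigenvals=eigenvals, V=V,
                                mean_slip=mean_slip, max_slip=10.0,
                                lognormal=True, seed=12345)
print(success, KL_slip.max())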
from __future__ import print_function, division, unicode_literals

import os
from collections import Counter
import re
import numpy as np
from numpy.random import choice as random_choice, randint as random_randint, shuffle as random_shuffle, seed as random_seed, rand
from numpy import zeros as np_zeros  # pylint:disable=no-name-in-module

from keras.models import Sequential
from keras.engine.training import slice_X
from keras.layers import Activation, TimeDistributed, Dense, RepeatVector, Dropout
from keras.layers import recurrent

random_seed(123)  # Reproducibility

# Parameters for the model and dataset
NUMBER_OF_ITERATIONS = 20000
EPOCHS_PER_ITERATION = 5
RNN = recurrent.LSTM
INPUT_LAYERS = 2
OUTPUT_LAYERS = 2
AMOUNT_OF_DROPOUT = 0.3
BATCH_SIZE = 500
HIDDEN_SIZE = 700
INITIALIZATION = "he_normal"  # Gaussian initialization scaled by fan_in (He et al., 2014)
MAX_INPUT_LEN = 40
MIN_INPUT_LEN = 3
INVERTED = True
AMOUNT_OF_NOISE = 0.2 / MAX_INPUT_LEN
def work(out_csv_file, estimator, nest, njobs, nfolds, cv_grid, minimizer,
         nbuckets, mvector, imputer, clf_kwargs, int_fold):
    from numpy.random import seed as random_seed
    random_seed(1)

    from zipfile import ZipFile
    from pandas import read_csv, factorize
    from numpy import rint, clip, savetxt, stack

    if KAGGLE:
        train = read_csv("../input/train.csv")
        test = read_csv("../input/test.csv")
    else:
        train = read_csv(ZipFile("../../data/train.csv.zip", 'r').open('train.csv'))
        test = read_csv(ZipFile("../../data/test.csv.zip", 'r').open('test.csv'))

#    gmm17_train = read_csv('GMM_17_full_train.csv')
#    gmm17_test = read_csv('GMM_17_full_test.csv')
#    gmm6_train = read_csv('GMM_6_full_train.csv')
#    gmm6_test = read_csv('GMM_6_full_test.csv')
#
#    train['GMM17'] = gmm17_train['Response']
#    test['GMM17'] = gmm17_test['Response']
#    train['GMM6'] = gmm6_train['Response']
#    test['GMM6'] = gmm6_test['Response']

    # combine train and test
    all_data = train.append(test)

#    G_vectors = read_csv('../../data/G_vectors.csv')
#    #all_data = all_data.join(G_vectors.drop(['G3'], axis=1))
#    all_data = all_data.join(
#        G_vectors[['G8', 'G11', 'G12', 'G13', 'G17', 'G18', 'G19', 'G20']])

    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values='NaN', strategy='median', axis=0)
    all_data[DISCRETE] = imp.fit_transform(all_data[DISCRETE])
#    from numpy import bincount
#    for col in all_data[DISCRETE]:
#        top = bincount(all_data[col].astype(int)).argmax()
#        all_data[col] -= top
    imp = Imputer(missing_values='NaN', strategy='median', axis=0)
    all_data[CONTINUOUS] = imp.fit_transform(all_data[CONTINUOUS])

#    all_data[BOOLEANS] = all_data[BOOLEANS] + 1e6

#    from sklearn.preprocessing import StandardScaler
#    from sklearn.decomposition import PCA
#    std = StandardScaler(copy=True)
#    all_data[CONTINUOUS] = std.fit_transform(all_data[CONTINUOUS])
#    pca = PCA(whiten=False, copy=True)
#    all_data[CONTINUOUS] = pca.fit_transform(all_data[CONTINUOUS])

    # create any new variables
    all_data['Product_Info_2_char'] = all_data.Product_Info_2.str[0]
    all_data['Product_Info_2_num'] = all_data.Product_Info_2.str[1]

    # factorize categorical variables
    all_data['Product_Info_2'] = factorize(all_data['Product_Info_2'])[0]  # + 1
    all_data['Product_Info_2_char'] = factorize(all_data['Product_Info_2_char'])[0]  # + 1
    all_data['Product_Info_2_num'] = factorize(all_data['Product_Info_2_num'])[0]  # + 1

    """
    Both:     0.65576
    BmiAge:   0.65578
    MedCount: 0.65638
    None:     0.65529
    """
    all_data['BMI_Age'] = all_data['BMI'] * all_data['Ins_Age']
    med_keyword_columns = all_data.columns[all_data.columns.str.startswith('Medical_Keyword_')]
    all_data['Med_Keywords_Count'] = all_data[med_keyword_columns].sum(axis=1)

    """
    print('BOOLEANS:')
    for col in all_data[BOOLEANS]:
        print(col, all_data[col].dtype, min(all_data[col]), max(all_data[col]),
              float(sum(all_data[col] == 0)) / len(all_data[col]))
    print('DISCRETE:')
    for col in all_data[DISCRETE]:
        print(col, all_data[col].dtype, min(all_data[col]), max(all_data[col]),
              float(sum(all_data[col] == 0)) / len(all_data[col]))
    print('CONTINUOUS:')
    for col in all_data[CONTINUOUS]:
        print(col, all_data[col].dtype, min(all_data[col]), max(all_data[col]),
              float(sum(all_data[col] == 0)) / len(all_data[col]))
    print('NOMINALS:')
    for col in all_data[NOMINALS]:
        print(col, all_data[col].dtype, min(all_data[col]), max(all_data[col]),
              float(sum(all_data[col] == 0)) / len(all_data[col]))
    return
    """

    # Use -1 for any others
    if imputer is None:
        all_data.fillna(-1, inplace=True)
    else:
        all_data['Response'].fillna(-1, inplace=True)

    # fix the dtype on the label column
    all_data['Response'] = all_data['Response'].astype(int)

    # split train and test
    train = all_data[all_data['Response'] > 0].copy()
    test = all_data[all_data['Response'] < 1].copy()

    #dropped_cols = ['Id', 'Response', 'Medical_History_10', 'Medical_History_24']#, 'Medical_History_32']
    dropped_cols = ['Id', 'Response']

    train_y = train['Response'].values
    train_X = train.drop(dropped_cols, axis=1)
    test_X = test.drop(dropped_cols, axis=1)

    if imputer is not None:
        from sklearn.preprocessing import Imputer
        imp = Imputer(missing_values='NaN', strategy=imputer, axis=0)
        train_X = imp.fit_transform(train_X)
        test_X = imp.transform(test_X)

    prudential_kwargs = \
    {
        'objective': 'reg:linear',
        'learning_rate': 0.045,
        'min_child_weight': 50,
        'subsample': 0.8,
        'colsample_bytree': 0.7,
        'max_depth': 7,
        'n_estimators': nest,
        'nthread': njobs,
        'seed': 0,
        'n_buckets': nbuckets,
        'initial_params': mvector,
        'minimizer': minimizer,
        'scoring': NegQWKappaScorer
    }
    if estimator == 'PrudentialRegressorCVO2FO' or estimator == 'PrudentialRegressorCVO2':
        prudential_kwargs['int_fold'] = int_fold
        pass

    # override kwargs with any changes
    for k, v in clf_kwargs.items():
        prudential_kwargs[k] = v
    clf = globals()[estimator](**prudential_kwargs)
    print(estimator, clf.get_params())

    if nfolds > 1:
        param_grid = {
            'n_estimators': [700],
            'max_depth': [6],
            'colsample_bytree': [0.67],
            'subsample': [0.9],
            'min_child_weight': [240],
            #'initial_params': [[-0.71238755, -1.4970176, -1.73800531, -1.13361266, -0.82986203, -0.06473039, 0.69008725, 0.94815881]]
        }
        for k, v in cv_grid.items():
            param_grid[k] = v

        from sklearn.metrics import make_scorer
        MIN, MAX = (1, 8)
        qwkappa = make_scorer(Kappa, weights='quadratic',
                              min_rating=MIN, max_rating=MAX)

        from sklearn.cross_validation import StratifiedKFold
        from sklearn.grid_search import GridSearchCV
        grid = GridSearchCV(estimator=clf,
                            param_grid=param_grid,
                            cv=StratifiedKFold(train_y, n_folds=nfolds),
                            scoring=qwkappa, n_jobs=1,
                            verbose=1,
                            refit=False)
        grid.fit(train_X, train_y)
        print('grid scores:')
        for item in grid.grid_scores_:
            print(' {:s}'.format(item))
        print('best score: {:.5f}'.format(grid.best_score_))
        print('best params:', grid.best_params_)
        pass

    else:
        clf.fit(train_X, train_y)

        final_test_preds = clf.predict(test_X)
        final_test_preds = rint(clip(final_test_preds, 1, 8))

        savetxt(out_csv_file,
                stack(zip(test['Id'].values, final_test_preds), axis=1).T,
                delimiter=',',
                fmt=['%d', '%d'],
                header='"Id","Response"', comments='')

        importance = clf.xgb.booster().get_fscore()
        import operator
        print(sorted(importance.items()), "\n")
        importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True)
        print(importance, "\n")
        features = [k for k, _ in importance]
        print(len(features), features)
    return
def randn_data(self, seed, shape):
    """
    Build a block of testing data from numpy.random.randn.
    """
    random_seed(seed)
    return randn(*shape)
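A standalone sketch of the reproducibility idea the helper above relies on: seeding NumPy's global generator before drawing makes the draws repeatable. The names here are illustrative, not part of the original test suite.

from numpy.random import randn, seed as random_seed

random_seed(42)
first = randn(2, 3)
random_seed(42)
second = randn(2, 3)
print((first == second).all())  # True: same seed, same block of data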
def fit(self, X, Y, X_val, Y_val, epochs=100):
    """Fit the model to data matrix X and target(s) y.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_channels, n_eeg_samples)
        The input data.
    Y : array-like, shape (n_samples,)
        The target values (class labels).
    X_val : array-like, shape (n_samples, n_channels, n_eeg_samples)
        The validation input data.
    Y_val : array-like, shape (n_samples,)
        The validation target values (class labels).
    epochs : int
        Number of training epochs.

    Returns
    -------
    self : returns a trained RecCnn model.
    """
    # for reproducibility
    # note: it still won't be fully reproducible if you use GPUs, for more see:
    # https://github.com/keras-team/keras/issues/2479#issuecomment-213987747
    random_seed(self.seed)
    set_random_seed(self.seed)

    # calculate the class weights of the dataset:
    # the negative class (0) gets weight 1, the positive class (1) gets the
    # ratio of negatives to positives
    w = len(np.extract(Y == 0, Y))
    w_0 = 1 / (len(np.extract(Y == 0, Y)) / w)
    w_1 = 1 / (len(np.extract(Y == 1, Y)) / w)
    class_weight = {0: w_0, 1: w_1}

    # Save best model only, based on the training loss.
    # Saves model to a topology-specific file in the working directory.
    saveBestModel = ModelCheckpoint(self.path + '.h5', monitor='loss', verbose=1,
                                    save_best_only=True, mode='auto')

    # Log the training metrics in a topology-specific file in the working directory
    csv_logger = CSVLogger(self.path + '_log.csv', append=True, separator=';')

    if self.recurrent:
        self.model = self.build_model_rcnn(X, depth=self.conv_depth,
                                           num_features=self.num_features)
    else:
        self.model = self.build_model_cnn(X, depth=self.conv_depth,
                                          num_features=self.num_features)

    if self.save_model:
        self.model.fit(x=X, y=Y, batch_size=64, epochs=epochs, verbose=1,
                       class_weight=class_weight, shuffle=True,
                       validation_data=(X_val, Y_val),
                       callbacks=[saveBestModel, csv_logger])
    else:
        self.model.fit(x=X, y=Y, batch_size=64, epochs=epochs, verbose=1,
                       class_weight=class_weight, shuffle=True,
                       validation_data=(X_val, Y_val))
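A quick numeric check of what that class-weight formula produces, using made-up label counts (not from the original data): with 90 negatives and 10 positives, class 0 gets weight 1.0 and class 1 gets 9.0.

import numpy as np

Y = np.array([0] * 90 + [1] * 10)
w = len(np.extract(Y == 0, Y))              # 90 negatives
w_0 = 1 / (len(np.extract(Y == 0, Y)) / w)  # 90 / 90 -> 1.0
w_1 = 1 / (len(np.extract(Y == 1, Y)) / w)  # 90 / 10 -> 9.0
print({0: w_0, 1: w_1})                     # {0: 1.0, 1: 9.0}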
import sys
import os
import struct
import argparse
import collections
from os import listdir
from os.path import isfile, join

from numpy.random import seed as random_seed
from numpy.random import shuffle as random_shuffle

import tensorflow as tf
from tensorflow.core.example import example_pb2

random_seed(123)

# for separating the sentences in the .bin files
SENTENCE_START = '<s>'
SENTENCE_END = '</s>'

VOCAB_SIZE = 60000
CHUNK_SIZE = 1000


def ParseStory(story_file):
    lines = []
    with open(story_file, "r") as f:
        for line in f:
            if line.strip() != '':
                lines.append(line.strip())
from __future__ import print_function, division, unicode_literals

import os
from collections import Counter
import re
import numpy as np
from numpy.random import choice as random_choice, randint as random_randint, shuffle as random_shuffle, seed as random_seed, rand
from numpy import zeros as np_zeros  # pylint:disable=no-name-in-module

from keras.models import Sequential, slice_X
from keras.layers.core import Activation, TimeDistributedDense, RepeatVector, Dropout
from keras.layers import recurrent

random_seed(123)  # Reproducibility

# Parameters for the model and dataset
NUMBER_OF_ITERATIONS = 20000
EPOCHS_PER_ITERATION = 5
RNN = recurrent.LSTM
INPUT_LAYERS = 2
OUTPUT_LAYERS = 2
AMOUNT_OF_DROPOUT = 0.3
BATCH_SIZE = 500
HIDDEN_SIZE = 700
INITIALIZATION = "he_normal"  # Gaussian initialization scaled by fan_in (He et al., 2014)
MAX_INPUT_LEN = 40
MIN_INPUT_LEN = 3
INVERTED = True
AMOUNT_OF_NOISE = 0.2 / MAX_INPUT_LEN
def work(estimator, nest, njobs, nfolds, cv_grid, clf_kwargs, do_hyperopt):
    from numpy.random import seed as random_seed
    random_seed(1)

    from pandas import read_csv
    train = read_csv('../../data/example_data.csv')

    train = train.drop(TO_DROP, axis=1)

    for col in FACTORIZABLE:
        from pandas import factorize
        train[col] = factorize(train[col])[0]
        pass

    train = ParseDates(train, ['TRANSACTION_DATE', 'CUSTOMER_FIRST_ORDER_DATE'])

    symbols = {}
    for col in [
            'PRICE_METHOD', 'ORDER_SOURCE', 'CUSTOMER_ACCOUNT_TYPE',
            'CUSTOMER_MANAGED_LEVEL', 'CUSTOMER_TYPE2', 'CUSTOMER_TYPE1',
            'CUSTOMER_ZIP', 'CUSTOMER_NUMBER'
    ]:
        uniq = set(train[col])
        symbols[col] = list(uniq)
        pass

    grouped = train.groupby(['PRODUCT_NUMBER', 'CUSTOMER_SEGMENT1'])

    samples = []
    for k, df in grouped:
        #print('{' + '"{}", \'{}\''.format(k[0], 'B' if k[1] else 'A') + '},')
        sample = Fractions(df, symbols, [
            'PRICE_METHOD', 'ORDER_SOURCE', 'CUSTOMER_ACCOUNT_TYPE',
            'CUSTOMER_MANAGED_LEVEL', 'CUSTOMER_TYPE2', 'CUSTOMER_TYPE1',
            #'CUSTOMER_ZIP',
            'CUSTOMER_NUMBER'
        ])

        ATTRIBUTES2 = ['PRODUCT_CLASS_ID1',
                       'BRAND',               # binary
                       'PRODUCT_SALES_UNIT',  # binary
                       'PRODUCT_UNIT_OF_MEASURE',
                       'SPECIAL_PART'
                       ]
        sample = sample.append(df.iloc[0][ATTRIBUTES2])

        ########################
        boxes_sold = df['TOTAL_BOXES_SOLD']

        pcost1 = df['PRODUCT_COST1'].abs()
        pcost1_per_item = pcost1 / boxes_sold
        pcost1_mean = pcost1_per_item.mean()
        pcost1_std = pcost1_per_item.std()
        sample.set_value('PCOST1_REL_STD', pcost1_std / pcost1_mean)
        sample.set_value('PCOST1_REL_MAX', pcost1_per_item.max() / pcost1_mean)
        sample.set_value('PCOST1_REL_MIN', pcost1_per_item.min() / pcost1_mean)

        price = df['PRODUCT_PRICE'].abs()
        price_mean = price.mean()
        price_std = price.std()
        sample.set_value('PRICE_REL_STD', price_std / price_mean)
        sample.set_value('PRICE_REL_MAX', price.max() / price_mean)
        sample.set_value('PRICE_REL_MIN', price.min() / price_mean)

        if sample['PRODUCT_UNIT_OF_MEASURE'] < 2:
            commision = price / pcost1_per_item
        else:
            commision = df['GROSS_SALES'].abs() / pcost1
        commision_mean = commision.mean()
        commision_std = commision.std()
        sample.set_value('COMMN_MEAN', commision_mean)
        sample.set_value('COMMN_REL_STD', commision_std / commision_mean)
        sample.set_value('COMMN_REL_MAX', commision.max() / commision_mean)
        sample.set_value('COMMN_REL_MIN', commision.min() / commision_mean)

#        tx_days = df['TRANSACTION_DATE_1'].combine(
#            df[['TRANSACTION_DATE_2', 'TRANSACTION_DATE_3']],
#            func=lambda y, m_d: (y - 1970) * 365 + m_d['TRANSACTION_DATE_2'] * 30 + m_d['TRANSACTION_DATE_3'] - 1)
#        sample.set_value('FIRST_TX', tx_days.min())
#        august2014 = (2014 - 1970) * 365 + 8 * 30
#        sample.set_value('LAST_365D_TX', (tx_days > august2014).sum())

#        monthly = df['TRANSACTION_DATE_2'].value_counts(normalize=True)
#        monthly = monthly.reindex([i + 1 for i in range(12)], fill_value=0.)
#        sample.set_value('TX_Q1', monthly[[1, 2, 3]].sum())
#        sample.set_value('TX_Q2', monthly[[4, 5, 6]].sum())
#        sample.set_value('TX_Q3', monthly[[7, 8, 9]].sum())
#        sample.set_value('TX_Q4', monthly[[10, 11, 12]].sum())
#        sample = sample.append(monthly.rename(lambda i: 'TX_M_' + str(i)))
#
#        tx_days = df['TRANSACTION_DATE_1'].combine(
#            df[['TRANSACTION_DATE_2', 'TRANSACTION_DATE_3']],
#            func=lambda y, m_d: y * 365 + m_d['TRANSACTION_DATE_2'] * 30 + m_d['TRANSACTION_DATE_3'])
#        tx_days.sort()
#        delta_tx_days = tx_days.diff()
#        means_delta_tx_days = delta_tx_days.mean()
#        sample.set_value('DTX_DAYS_MEAN', means_delta_tx_days)
#        sample.set_value('DTX_DAYS_REL_STD', delta_tx_days.std() / means_delta_tx_days)
        ########################

#        # most frequent customer
#        custcounts = df['CUSTOMER_NUMBER'].value_counts()
#        topcust = custcounts.index[0]
#        sample.set_value('TOP_CUST', topcust)

#        # most frequent zip
#        zipcounts = df['CUSTOMER_ZIP'].value_counts()
#        topzip = zipcounts.index[0]
#        sample.set_value('TOP_ZIP', topzip)

#        # number of unique transactions
#        sample.set_value('NTRANS', len(df))

#        # number of unique customers
#        custcounts = df['CUSTOMER_NUMBER'].value_counts()
#        sample.set_value('NCUST', len(custcounts))

        #sample = sample.append(df.iloc[0][['SPECIAL_PART']])
        samples.append(sample)
        pass

    from pandas import DataFrame
    train_df = DataFrame.from_records(samples)

    train_y = train_df['SPECIAL_PART'].values
    train_X = train_df.drop(['SPECIAL_PART'], axis=1)
    train_keys = [k for k, _ in grouped]

    from numpy import digitize
    train_y = digitize(train_y, [0.5])

    avnet_kwargs = \
    {
        #'objective': 'reg:logistic',
        'objective': 'rank:pairwise',
        'learning_rate': 0.045,
        'min_child_weight': 50,
        'subsample': 1.0,
        'colsample_bytree': 1.0,
        'max_depth': 7,
        'n_estimators': nest,
        'nthread': njobs,
        'seed': 0,
        #'cache_opt': 1,
        'missing': float('nan')
        #'scoring': NegQWKappaScorer
    }
    # override kwargs with any changes
    for k, v in clf_kwargs.items():
        avnet_kwargs[k] = v
        pass

    # create model instance
    from xgb_sklearn import XGBClassifier
    if estimator == 'XGBClassifier':
        clf = XGBClassifier(**avnet_kwargs)
        pass
    else:
        clf = globals()[estimator](**avnet_kwargs)
        pass

    from sklearn.metrics import make_scorer
    tco_scorer = make_scorer(AvnetScorer)

    if do_hyperopt:
        def objective(space):
            param_grid = {'objective': ['binary:logistic']}
            #param_grid = {'objective': ['binary:logitraw']}
            #param_grid = {'objective': ['rank:pairwise']}
            #param_grid = {'objective': ['rank:pairwise'], 'booster_type': ['gblinear']}

            for k, v in space.items():
                if k in ['n_estimators', 'max_depth', 'min_child_weight', 'num_pairwise']:
                    v = int(v)
                    pass
                param_grid[k] = [v]
                pass

            from sklearn.cross_validation import StratifiedKFold, LeaveOneOut
            from sklearn.grid_search import GridSearchCV
            from sklearn.cross_validation import _PartitionIterator

            class CustomLOO(_PartitionIterator):
                def __init__(self, train_keys):
                    ids = set(t[0] for t in train_keys)
                    self.n_folds = len(ids)
                    self.n = len(train_keys)

                    from numpy import zeros, array
                    test_folds = zeros(len(train_keys))
                    for i, k in enumerate(ids):
                        mask = [t[0] == k for t in train_keys]
                        test_folds[array(mask)] = i
                        pass
                    self.test_folds = test_folds
                    pass

                #def _iter_test_indices(self):
                #    return range(self.n_folds)
                def _iter_test_masks(self):
                    for i in range(self.n_folds):
                        yield self.test_folds == i

                def __len__(self):
                    return self.n_folds
                pass

            grid = GridSearchCV(estimator=clf,
                                param_grid=param_grid,
                                #cv=StratifiedKFold(train_y, n_folds=nfolds),
                                #cv=LeaveOneOut(91),
                                cv=CustomLOO(train_keys),
                                scoring=tco_scorer,
                                n_jobs=1,
                                #verbose=2,
                                refit=False)
            grid.fit(train_X, train_y)
            print('best score: {:.5f} best params: {}'.format(grid.best_score_, grid.best_params_))

            return -grid.best_score_

        from sys import path as sys_path
        sys_path.insert(0, './hyperopt')
        from hyperopt import fmin, tpe, hp

        # cheatsheet:
        # https://github.com/hyperopt/hyperopt/wiki/FMin#21-parameter-expressions
        space = {
            'n_estimators': hp.quniform("x_n_estimators", 2, 32, 1),
            'max_depth': hp.quniform("x_max_depth", 1, 24, 1),
            'min_child_weight': hp.quniform('x_min_child', 1, 16, 1),
            #'gamma': hp.uniform('x_gamma', 0.0, 2.0),
            'scale_pos_weight': hp.uniform('x_scale_pos_weight', 0.2, 1.0),
            #'num_pairsample': hp.quniform('x_num_pairsample', 1, 20, 1),
            #'learning_rate': hp.uniform('x_learning_rate', 0.03, 0.06),
            'subsample': hp.uniform('x_subsample', 0.8, 1.0),
            'colsample_bytree': hp.uniform('x_colsample_bytree', 0.3, 1.0)
        }
        print(clf)
        print(space)
        best = fmin(fn=objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=1000,
                    )
        print(best)
        pass

    return

    """
    best score: 956593.40659 best params: {'colsample_bytree': 0.6964853661142929, 'min_child_weight': 2, 'n_estimators': 160, 'subsample': 0.9904670890953792, 'objective': 'rank:pairwise', 'max_depth': 8, 'gamma': 0.663344866861138}
    {'x_gamma': 0.66334486686113803, 'x_min_child': 2.0, 'x_max_depth': 8.0, 'x_subsample': 0.99046708909537917, 'x_colsample_bytree': 0.6964853661142929, 'x_n_estimators': 160.0}
    """

    """ Model crossvalidation """
    if (False
        #or True
        ):
        param_grid = {
            #'objective': ['binary:logitraw'],
            'objective': ['rank:pairwise'],
            #'booster': ['gblinear'],
            'n_estimators': [580],
            'max_depth': [6],
            'min_child_weight': [45],
            'gamma': [0.],
            'subsample': [0.85],
            'colsample_bytree': [0.65],
            'learning_rate': [0.045],
        }
        for k, v in cv_grid.items():
            param_grid[k] = v

        from sklearn.cross_validation import StratifiedKFold
        from sklearn.grid_search import GridSearchCV
        grid = GridSearchCV(estimator=clf,
                            param_grid=param_grid,
                            cv=StratifiedKFold(train_y, n_folds=nfolds),
                            scoring='roc_auc',
                            n_jobs=1,
                            verbose=2,
                            refit=False)
        grid.fit(train_X, train_y)
        print('grid scores:')
        for item in grid.grid_scores_:
            print(' {:s}'.format(item))
        print('best score: {:.5f}'.format(grid.best_score_))
        print('best params:', grid.best_params_)

    """ Print feature importances """
    if (False
        #or True
        ):
        clf.fit(train_X, train_y)
        feature_names = train_X.columns.values.tolist()
        from numpy import zeros
        feature_importances = zeros(len(feature_names))
        importances = clf.booster().get_fscore()
        for i, feat in enumerate(feature_names):
            if feat in importances:
                feature_importances[i] += importances[feat]
                pass
            pass
        import operator
        sorted_importances = sorted(zip(feature_names, feature_importances),
                                    key=operator.itemgetter(1), reverse=True)
        for k, v in sorted_importances:
            print("{}\t{}".format(v, k))
            pass
        print([k for k, v in sorted_importances if v == 0])
        pass

    """ Hyperopt """

    return
def work(estimator, nest, njobs, nfolds, cv_grid, clf_kwargs):
    from numpy.random import seed as random_seed
    random_seed(1)

    from pandas import read_csv
    all_data = read_csv("../../data/demographic_membership_training.csv")

    train_y = all_data['DEMO_X'].values
    train_X = all_data.drop(['CONSUMER_ID', 'DEMO_X'], axis=1)

    from pandas import factorize
    train_X['GENDER'][train_X['GENDER'] == 'U'] = float('nan')
    for col in FACTORIZABLE:
        from pandas import isnull
        missing = isnull(train_X[col])
        train_X[col] = factorize(train_X[col])[0]  ## NANs become -1
        train_X[col][missing] = float('nan')

        from numpy import isnan
        print("NANs after factorization", sum(train_X[col].apply(isnan)))
        pass

    train_X = OneHot(train_X, NOMINALS)

    #train_X['**PROP_PAGE_IMPRESSIONS_DWELL'] = train_X['PAGE_IMPRESSIONS_DWELL'] / train_X['TOTAL_DWELL']
    #train_X['**PROP_VOD_VIEWS_DWELL'] = train_X['VOD_VIEWS_DWELL'] / train_X['TOTAL_DWELL']
#
#    train_X['**FLAG_WARD_WKDAY_COUNT'] = train_X[train_X.columns[train_X.columns.str.startswith('FLAG_WARD_WKDAY_')]].sum(axis=1)
#    train_X['**FLAG_WARD_WKEND_COUNT'] = train_X[train_X.columns[train_X.columns.str.startswith('FLAG_WARD_WKEND_')]].sum(axis=1)
#    train_X['**FLAG_UNI_CLUSTER_COUNT'] = train_X[train_X.columns[train_X.columns.str.startswith('FLAG_UNI_CLUSTER_')]].sum(axis=1)
#    train_X['**INTERESTS_COUNT'] = train_X[train_X.columns[train_X.columns.str.startswith('INTEREST_')]].sum(axis=1)

#    train_X['**AGE_25'] = train_X['AGE'] < 25
#    train_X['**AGE_30'] = train_X['AGE'] < 30
#    train_X['**AGE_35'] = train_X['AGE'] < 35
#    train_X['**AGE_40'] = train_X['AGE'] < 40
#    train_X['**AGE_45'] = train_X['AGE'] < 45

    #train_X['**PAGE_IMP_DWELL_PER_DAY'] = train_X['PAGE_IMPRESSIONS_DWELL'] / train_X['REGISTRATION_DAYS']
    #train_X['**LATE_PAGE_VIEWS_PER_DAY'] = train_X['LATE_PAGE_VIEWS'] / train_X['REGISTRATION_DAYS']
    #train_X['**TOTAL_DWELL_PER_DAY'] = train_X['TOTAL_DWELL'] / train_X['REGISTRATION_DAYS']
    #train_X['**AFTERNOON_PAGE_VIEWS_PER_DAY'] = train_X['AFTERNOON_PAGE_VIEWS'] / train_X['REGISTRATION_DAYS']
    #train_X['**PAGE_IMPRESSION_VISITS_PER_DAY'] = train_X['PAGE_IMPRESSION_VISITS'] / train_X['REGISTRATION_DAYS']
    #train_X['**LUNCHTIME_PAGE_VIEWS_PER_DAY'] = train_X['LUNCHTIME_PAGE_VIEWS'] / train_X['REGISTRATION_DAYS']
    #train_X['**NIGHT_TIME_PAGE_VIEWS_PER_DAY'] = train_X['NIGHT_TIME_PAGE_VIEWS'] / train_X['REGISTRATION_DAYS']
#    train_X['**BREAKFAST_PAGE_VIEWS_PER_DAY'] = train_X['BREAKFAST_PAGE_VIEWS'] / train_X['REGISTRATION_DAYS']
#    train_X['**VIDEO_STOPS_PER_DAY'] = train_X['VIDEO_STOPS'] / train_X['REGISTRATION_DAYS']

#    TO_DROP = [
#        'VIEWS_AFF4', 'FLAG_WARD_WKDAY_10_16', 'FLAG_WARD_WKDAY_17_19',
#        'FLAG_WARD_WKDAY_20_24', 'FLAG_WARD_WKEND_10_13', 'FLAG_WARD_WKEND_14_20',
#        'FLAG_UNI_CLUSTER_15', 'FLAG_UNI_CLUSTER_23', 'FLAG_UNI_CLUSTER_28',
#        'FLAG_UNI_CLUSTER_29', 'FLAG_UNI_CLUSTER_33', 'FLAG_WEBSITE',
#        'FLAG_BREAKFAST_VIEWS', 'FLAG_LUNCHTIME_VIEWS', 'FLAG_AFTERNOON_VIEWS',
#        'FLAG_CATCHUP_VIEWS', 'FLAG_ARCHIVE_VIEWS', 'FLAG_AFF3', 'FLAG_AFF4',
#        'REGISTRATION_ROUTE_3', 'REGISTRATION_ROUTE_4', 'REGISTRATION_CONTEXT_3',
#        'REGISTRATION_CONTEXT_6', 'REGISTRATION_CONTEXT_8', 'REGISTRATION_CONTEXT_9',
#        'REGISTRATION_CONTEXT_10', 'REGISTRATION_CONTEXT_11', 'REGISTRATION_CONTEXT_12',
#        'REGISTRATION_CONTEXT_13', 'REGISTRATION_CONTEXT_14', 'REGISTRATION_CONTEXT_15',
#        'REGISTRATION_CONTEXT_16', 'REGISTRATION_CONTEXT_17', 'REGISTRATION_CONTEXT_18',
#        'REGISTRATION_CONTEXT_19', 'REGISTRATION_CONTEXT_20', 'REGISTRATION_CONTEXT_21',
#        'REGISTRATION_CONTEXT_22', 'REGISTRATION_CONTEXT_23',
#        'REGISTRATION_CONTEXT_24',
#        'REGISTRATION_CONTEXT_25', 'REGISTRATION_CONTEXT_26', 'REGISTRATION_CONTEXT_27',
#        'MIGRATED_USER_TYPE_5', 'TOD_CENTRE_3', 'CONTENT_CENTRE_1', 'CONTENT_CENTRE_2',
#        'CONTENT_CENTRE_4', 'CONTENT_CENTRE_5', 'CONTENT_CENTRE_6', 'CONTENT_CENTRE_7',
#        'CONTENT_CENTRE_8', 'CONTENT_CENTRE_9', 'CONTENT_CENTRE_12', 'CONTENT_CENTRE_13',
#        'CONTENT_CENTRE_15']
#    TO_DROP += [
#        'SOCIAL_AUTH_TWITTER', 'FLAG_WARD_WKEND_3_9', 'FLAG_UNI_CLUSTER_7',
#        'FLAG_UNI_CLUSTER_13', 'FLAG_UNI_CLUSTER_21', 'FLAG_UNI_CLUSTER_22',
#        'FLAG_UNI_CLUSTER_25', 'FLAG_ANDROID', 'FLAG_LATE_PEAK_VIEWS',
#        'FLAG_NIGHT_TIME_VIEWS', 'FLAG_AFF1', 'FLAG_AFF2', 'MIGRATED_USER_TYPE_4',
#        'CONTENT_CENTRE_10', 'CONTENT_CENTRE_14', 'CONTENT_CENTRE_16']
#    TO_DROP += [
#        'FLAG_WARD_WKDAY_3_9', 'FLAG_UNI_CLUSTER_5', 'FLAG_UNI_CLUSTER_8',
#        'FLAG_UNI_CLUSTER_9', 'FLAG_UNI_CLUSTER_17', 'FLAG_UNI_CLUSTER_26',
#        'FLAG_MORNING_VIEWS', 'FLAG_EARLY_PEAK_VIEWS']
#    TO_DROP += [
#        'FLAG_WARD_WKEND_1_2', 'FLAG_WARD_WKEND_21_24', 'FLAG_UNI_CLUSTER_1',
#        'FLAG_UNI_CLUSTER_14', 'FLAG_MAIN', 'FLAG_OTHER_VIEWS', 'CONTENT_CENTRE_11']
#    TO_DROP += ['FLAG_UNI_CLUSTER_12', 'FLAG_UNI_CLUSTER_19', 'FLAG_UNI_CLUSTER_27']
#    TO_DROP += ['FLAG_UNI_CLUSTER_11', 'FLAG_UNI_CLUSTER_24']  ## 814000 ?
#    TO_DROP += ['FLAG_UNI_CLUSTER_2', 'FLAG_UNI_CLUSTER_16']
#    TO_DROP += ['FLAG_UNI_CLUSTER_10', 'FLAG_UNI_CLUSTER_31', 'FLAG_POST_PEAK_VIEWS', 'TOD_CENTRE_2']
#    TO_DROP += ['FLAG_UNI_CLUSTER_30']

#    train_X = train_X.drop(TO_DROP, axis=1)

#    train_X.fillna(-1, inplace=True)

    from sklearn.cross_validation import StratifiedKFold
    from sklearn.grid_search import GridSearchCV

    if (False
        #or True
        ):
        skf = StratifiedKFold(train_y, n_folds=nfolds)
        from numpy import asarray
        selection = asarray(['-'] * len(train_y))
        symbol = 0
        for train_index, test_index in skf:
            selection[test_index] = chr(symbol + 48)
            symbol += 1
            pass
        print(''.join(selection))
        return

    muse_kwargs = \
    {
        #'objective': 'reg:logistic',
        'objective': 'rank:pairwise',
        'learning_rate': 0.045,
        'min_child_weight': 50,
        'subsample': 0.8,
        'colsample_bytree': 0.7,
        'max_depth': 7,
        'n_estimators': nest,
        'nthread': njobs,
        'seed': 0,
        'missing': float('nan')
        #'scoring': NegQWKappaScorer
    }
    # override kwargs with any changes
    for k, v in clf_kwargs.items():
        muse_kwargs[k] = v
        pass

    #clf = globals()[estimator](**muse_kwargs)
    #from xgboost import XGBClassifier
    clf = XGBClassifier(**muse_kwargs)
    #clf = MillenialsClassifier(**muse_kwargs)

    from sklearn.metrics import make_scorer
    tco_scorer = make_scorer(MinPRScorer)

    """
    binary:logistic
    grid scores:
    mean: 787812.76918, std: 1297.55109, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 1}
    mean: 789084.73195, std: 1925.75110, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 3}
    mean: 789651.63043, std: 1855.11841, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 10}
    mean: 789958.10747, std: 1305.40202, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 20}
    mean: 788739.11423, std: 952.60469, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 50}
    mean: 788168.38281, std: 928.87371, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 80}
    best score: 789958.10747
    best params: {'n_estimators': 500, 'subsample': 0.9,
'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 20} reg:logistic (to samo co wyżej) grid scores: mean: 789651.63043, std: 1855.11841, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 10} mean: 789958.10747, std: 1305.40202, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 20} mean: 788739.11423, std: 952.60469, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 50} best score: 789958.10747 best params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 20} grid scores: mean: 786388.90860, std: 906.72660, params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 300, 'subsample': 0.9, 'objective': 'binary:logistic', 'max_depth': 7} mean: 789050.88848, std: 1708.63378, params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.9, 'objective': 'binary:logistic', 'max_depth': 7} mean: 789454.57872, std: 2059.68811, params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 700, 'subsample': 0.9, 'objective': 'binary:logistic', 'max_depth': 7} best score: 789454.57872 best params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 700, 'subsample': 0.9, 'objective': 'binary:logistic', 'max_depth': 7} ====================================================== rank:pairwise grid scores: mean: 806358.37855, std: 4488.86812, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 20} best score: 806358.37855 best params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 20} grid scores: mean: 750119.43597, std: 9120.06057, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 4, 'min_child_weight': 20} mean: 809673.54959, std: 4784.35577, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 20} mean: 798151.02989, std: 2162.04583, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 10, 'min_child_weight': 20} mean: 794998.50356, std: 2029.93836, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 20, 'min_child_weight': 20} mean: 794548.01245, std: 2062.41505, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 50, 'min_child_weight': 20} best score: 809673.54959 best params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 20} >>> 'max_depth': 6, 'min_child_weight': 20 grid scores: mean: 802508.37926, std: 4201.47242, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 5, 'min_child_weight': 10} mean: 793935.52998, std: 7607.45918, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 5, 'min_child_weight': 20} mean: 784568.74090, std: 7161.04235, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 5, 'min_child_weight': 40} mean: 802325.99222, std: 1833.64884, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 10} mean: 806358.37855, std: 4488.86812, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 20} mean: 
808437.63308, std: 3881.55687, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 40} mean: 798618.25778, std: 2948.03146, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 8, 'min_child_weight': 10} mean: 802665.25722, std: 2350.85430, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 8, 'min_child_weight': 20} mean: 806720.10926, std: 2543.82598, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 8, 'min_child_weight': 40} mean: 795701.38488, std: 2962.99442, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 10, 'min_child_weight': 10} mean: 798151.02989, std: 2162.04583, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 10, 'min_child_weight': 20} mean: 803385.26027, std: 2271.86591, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 10, 'min_child_weight': 40} best score: 808437.63308 best params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 40} >>> 'max_depth': 7, 'min_child_weight': 40 grid scores: mean: 782028.41606, std: 9637.64116, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 5, 'min_child_weight': 50} mean: 769010.75894, std: 6079.16367, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 5, 'min_child_weight': 60} mean: 760914.24094, std: 9643.26515, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 5, 'min_child_weight': 80} mean: 807557.88495, std: 4219.13250, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 50} mean: 801663.63876, std: 7556.18492, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 60} mean: 784727.73532, std: 7314.95469, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 80} mean: 811735.94787, std: 3476.37280, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 50} mean: 812694.98649, std: 4262.04853, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 60} mean: 806342.26320, std: 7227.93062, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 80} best score: 812694.98649 best params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 60} >>> 'max_depth': 7, 'min_child_weight': 60 grid scores: mean: 811261.01220, std: 1387.81968, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 55} mean: 812694.98649, std: 4262.04853, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 60} mean: 813522.63431, std: 5054.98775, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 65} mean: 811147.14498, std: 1469.98812, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 70} mean: 810716.38989, std: 3383.29928, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 8, 
'min_child_weight': 55} mean: 810977.37920, std: 3039.52816, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 8, 'min_child_weight': 60} mean: 809034.76724, std: 4751.72859, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 8, 'min_child_weight': 65} mean: 810902.03165, std: 3741.53151, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 8, 'min_child_weight': 70} best score: 813522.63431 best params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 65} >>> 'max_depth': 7, 'min_child_weight': 65 """ """ ONE HOT *** grid scores: mean: 808785.95756, std: 3732.33890, params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.7, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 812217.95285, std: 2332.88180, params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 807606.64678, std: 6685.61758, params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.9, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 798856.94075, std: 8380.25083, params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 1.0, 'objective': 'rank:pairwise', 'max_depth': 7} best score: 812217.95285 best params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} >>> 'subsample': 0.8 grid scores: mean: 807015.44496, std: 1745.61053, params: {'colsample_bytree': 0.6, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.7, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 811229.10339, std: 2626.00511, params: {'colsample_bytree': 0.6, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 809107.13182, std: 3766.10287, params: {'colsample_bytree': 0.6, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.9, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 806700.86249, std: 1673.53048, params: {'colsample_bytree': 0.7, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.7, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 808803.17397, std: 2141.65596, params: {'colsample_bytree': 0.7, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 810538.01687, std: 4532.84397, params: {'colsample_bytree': 0.7, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.9, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 808655.96400, std: 2917.13000, params: {'colsample_bytree': 0.8, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.7, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 809115.42707, std: 2625.94051, params: {'colsample_bytree': 0.8, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 809472.09619, std: 3144.06367, params: {'colsample_bytree': 0.8, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.9, 'objective': 'rank:pairwise', 'max_depth': 7} best score: 811229.10339 best params: {'colsample_bytree': 0.6, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} >>> 'colsample_bytree': 0.6, 'subsample': 0.8 grid scores: mean: 804112.81441, std: 8632.61400, params: {'colsample_bytree': 0.67, 
'learning_rate': 0.03, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 812217.95285, std: 2332.88180, params: {'colsample_bytree': 0.67, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 807667.00596, std: 2109.00871, params: {'colsample_bytree': 0.67, 'learning_rate': 0.06, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} best score: 812217.95285 best params: {'colsample_bytree': 0.67, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} >>> 'learning_rate': 0.045 grid scores: mean: 810936.40241, std: 2661.32895, params: {'colsample_bytree': 0.67, 'learning_rate': 0.04, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 812217.95285, std: 2332.88180, params: {'colsample_bytree': 0.67, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 811906.13100, std: 3339.45916, params: {'colsample_bytree': 0.67, 'learning_rate': 0.05, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} best score: 812217.95285 best params: {'colsample_bytree': 0.67, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} >>> 'learning_rate': 0.045 grid scores: mean: 811208.11854, std: 3435.62254, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.75, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 809690.89826, std: 2189.70344, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 812432.77112, std: 4630.29708, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 808096.24696, std: 3457.47957, params: {'colsample_bytree': 0.67, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.75, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 812217.95285, std: 2332.88180, params: {'colsample_bytree': 0.67, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 811478.50583, std: 2864.64292, params: {'colsample_bytree': 0.67, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7} best score: 812432.77112 best params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7} >>> 'learning_rate': 0.045, 'colsample_bytree': 0.65, 'subsample': 0.85 grid scores: mean: 810759.12376, std: 1528.10128, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 50, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 812432.77112, std: 4630.29708, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 
'max_depth': 7} mean: 811977.26246, std: 4991.06664, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 80, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7} best score: 812432.77112 best params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7} >>> no change grid scores: mean: 812432.77112, std: 4630.29708, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0} mean: 810059.35718, std: 1824.61383, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 1.0} mean: 810675.99076, std: 3153.29552, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 2.0} mean: 809731.68599, std: 4091.21405, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 4} mean: 751547.94084, std: 10351.41628, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 10} best score: 812432.77112 best params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0} >>> 'gamma': 0 grid scores: mean: 786632.68012, std: 7192.58756, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'base_score': 0.4, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0} mean: 746308.67706, std: 11236.07404, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'base_score': 0.3, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0} mean: 695506.87990, std: 7719.89820, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'base_score': 0.2, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0} mean: 812432.77112, std: 4630.29708, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'base_score': 0.5, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0} mean: 802604.50597, std: 2869.94386, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'base_score': 0.7, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0} mean: 798443.86047, std: 1863.64175, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'base_score': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0} best score: 812432.77112 best params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'base_score': 0.5, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0} >>> 'base_score': 0.5 """ param_grid = { #'objective': ['binary:logitraw'], 'objective': 
        ['rank:pairwise'],
        #'booster': ['gblinear'],
        'n_estimators': [580],
        'max_depth': [6],
        'min_child_weight': [45, 50, 55],
        'gamma': [0.],
        'subsample': [0.85],
        'colsample_bytree': [0.65],
        'learning_rate': [0.045],
    }
    for k, v in cv_grid.items():
        param_grid[k] = v

    grid = GridSearchCV(estimator=clf,
                        param_grid=param_grid,
                        cv=StratifiedKFold(train_y, n_folds=nfolds),
                        scoring=tco_scorer,
                        n_jobs=1,
                        verbose=2,
                        refit=False)

    if (False
        #or True
        ):
        grid.fit(train_X, train_y)
        print('grid scores:')
        for item in grid.grid_scores_:
            print(' {:s}'.format(item))
        print('best score: {:.5f}'.format(grid.best_score_))
        print('best params:', grid.best_params_)

    if (False
        #or True
        ):
        clf.fit(train_X, train_y)
        feature_names = train_X.columns.values.tolist()
        from numpy import zeros
        feature_importances = zeros(len(feature_names))
        importances = clf.booster().get_fscore()
        for i, feat in enumerate(feature_names):
            if feat in importances:
                feature_importances[i] += importances[feat]
                pass
            pass
        import operator
        sorted_importances = sorted(zip(feature_names, feature_importances),
                                    key=operator.itemgetter(1), reverse=True)
        for k, v in sorted_importances:
            print("{}\t{}".format(v, k))
            pass
        print([k for k, v in sorted_importances if v == 0])
        pass

    if (False or True):
        def objective(space):
            #param_grid = {'objective': ['binary:logistic']}
            #param_grid = {'objective': ['binary:logitraw']}
            param_grid = {
                'objective': ['rank:pairwise'],
                'booster_type': ['gblinear']
            }
            for k, v in space.items():
                if k in [
                        'n_estimators', 'max_depth', 'min_child_weight',
                        'num_pairwise'
                ]:
                    v = int(v)
                    pass
                param_grid[k] = [v]
                pass

            grid = GridSearchCV(estimator=clf,
                                param_grid=param_grid,
                                cv=StratifiedKFold(train_y, n_folds=nfolds),
                                scoring=tco_scorer,
                                n_jobs=1,
                                verbose=2,
                                refit=False)
            grid.fit(train_X, train_y)
            print('best score: {:.5f} best params: {}'.format(
                grid.best_score_, grid.best_params_))

            return -grid.best_score_

        from sys import path as sys_path
        sys_path.insert(0, './hyperopt')
        from hyperopt import fmin, tpe, hp

        # cheatsheet:
        # https://github.com/hyperopt/hyperopt/wiki/FMin#21-parameter-expressions
        space = {
            'n_estimators': hp.quniform("x_n_estimators", 500, 800, 10),
            'max_depth': hp.quniform("x_max_depth", 4, 8, 1),
            'min_child_weight': hp.quniform('x_min_child', 45, 240, 5),
            'gamma': hp.uniform('x_gamma', 0.0, 2.0),
            #'scale_pos_weight': hp.uniform('x_scale_pos_weight', 0.5, 1.0),
            'num_pairsample': hp.quniform('x_num_pairsample', 1, 4, 1),
            'subsample': hp.uniform('x_subsample', 0.4, 1.0),
            'colsample_bytree': hp.uniform('x_colsample_bytree', 0.4, 1.0)
        }
        best = fmin(
            fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
        )
        print(best)
        pass

    return
import os
from os.path import isfile, join

from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
# from progressbar import progressbar

import tensorflow as tf
from tensorflow.core.example import example_pb2

from numpy.random import seed as random_seed
from numpy.random import shuffle as random_shuffle

random_seed(21)  # Reproducibility

# Define the Global Variables
FLAGS = tf.app.flags.FLAGS

tf.app.flags.DEFINE_string('command', 'text_to_binary',
                           'Either text_to_vocabulary or text_to_binary.'
                           'Specify FLAGS.in_directories accordingly.')
tf.app.flags.DEFINE_string('in_folder', '', 'path to input json data file')
tf.app.flags.DEFINE_string('out_files', '', 'comma separated paths to files')  # specify the outfile during command line interface
tf.app.flags.DEFINE_string('split', '', 'comma separated fractions of data')  # specify during terminal command call

# tf.app.flags.DEFINE_string('vocab_file', 'data/vocabulary', 'path to output the vocab of the training corpus')
# tf.app.flags.DEFINE_integer('body_len', 30000, 'Define the length of body to consider')
# tf.app.flags.DEFINE_integer('abs_len', 1500, 'Define the length of abstract to consider')
tf.app.flags.DEFINE_integer('max_words', 500000000, 'Define the max number of words to consider in vocab')

# UNKNOWN_TOKEN = '<UNK>'
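A minimal sketch of how flags defined via tf.app.flags are consumed after command-line parsing. The main function and the example paths in the comment are illustrative assumptions, not part of the original snippet.

def main(unused_argv):
    # e.g. invoked as:
    #   python this_script.py --command text_to_binary --in_folder data/stories \
    #       --out_files data/train.bin,data/val.bin --split 0.8,0.2
    # tf.app.run() parses the command line before calling main, so the parsed
    # values are available on FLAGS here.
    out_files = FLAGS.out_files.split(',')
    split_fractions = [float(s) for s in FLAGS.split.split(',')] if FLAGS.split else []
    print(FLAGS.command, FLAGS.in_folder, out_files, split_fractions)

if __name__ == '__main__':
    tf.app.run()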
def work(estimator, nest, njobs, nfolds, cv_grid, clf_kwargs): from numpy.random import seed as random_seed random_seed(1) from pandas import read_csv all_data = read_csv("../../data/demographic_membership_training.csv") train_y = all_data['DEMO_X'].values train_X = all_data.drop(['CONSUMER_ID', 'DEMO_X'], axis=1) from pandas import factorize train_X['GENDER'][train_X['GENDER'] == 'U'] = float('nan') for col in FACTORIZABLE: from pandas import isnull missing = isnull(train_X[col]) train_X[col] = factorize(train_X[col])[0] ## NANs become -1 train_X[col][missing] = float('nan') from numpy import isnan print("NANs after factorization", sum(train_X[col].apply(isnan))) pass train_X = OneHot(train_X, NOMINALS) #train_X['**PROP_PAGE_IMPRESSIONS_DWELL'] = train_X['PAGE_IMPRESSIONS_DWELL'] / train_X['TOTAL_DWELL'] #train_X['**PROP_VOD_VIEWS_DWELL'] = train_X['VOD_VIEWS_DWELL'] / train_X['TOTAL_DWELL'] # # train_X['**FLAG_WARD_WKDAY_COUNT'] = train_X[train_X.columns[train_X.columns.str.startswith('FLAG_WARD_WKDAY_')]].sum(axis=1) # train_X['**FLAG_WARD_WKEND_COUNT'] = train_X[train_X.columns[train_X.columns.str.startswith('FLAG_WARD_WKEND_')]].sum(axis=1) # train_X['**FLAG_UNI_CLUSTER_COUNT'] = train_X[train_X.columns[train_X.columns.str.startswith('FLAG_UNI_CLUSTER_')]].sum(axis=1) # train_X['**INTERESTS_COUNT'] = train_X[train_X.columns[train_X.columns.str.startswith('INTEREST_')]].sum(axis=1) # train_X['**AGE_25'] = train_X['AGE'] < 25 # train_X['**AGE_30'] = train_X['AGE'] < 30 # train_X['**AGE_35'] = train_X['AGE'] < 35 # train_X['**AGE_40'] = train_X['AGE'] < 40 # train_X['**AGE_45'] = train_X['AGE'] < 45 #train_X['**PAGE_IMP_DWELL_PER_DAY'] = train_X['PAGE_IMPRESSIONS_DWELL'] / train_X['REGISTRATION_DAYS'] #train_X['**LATE_PAGE_VIEWS_PER_DAY'] = train_X['LATE_PAGE_VIEWS'] / train_X['REGISTRATION_DAYS'] #train_X['**TOTAL_DWELL_PER_DAY'] = train_X['TOTAL_DWELL'] / train_X['REGISTRATION_DAYS'] #train_X['**AFTERNOON_PAGE_VIEWS_PER_DAY'] = train_X['AFTERNOON_PAGE_VIEWS'] / train_X['REGISTRATION_DAYS'] #train_X['**PAGE_IMPRESSION_VISITS_PER_DAY'] = train_X['PAGE_IMPRESSION_VISITS'] / train_X['REGISTRATION_DAYS'] #train_X['**LUNCHTIME_PAGE_VIEWS_PER_DAY'] = train_X['LUNCHTIME_PAGE_VIEWS'] / train_X['REGISTRATION_DAYS'] #train_X['**NIGHT_TIME_PAGE_VIEWS_PER_DAY'] = train_X['NIGHT_TIME_PAGE_VIEWS'] / train_X['REGISTRATION_DAYS'] # train_X['**BREAKFAST_PAGE_VIEWS_PER_DAY'] = train_X['BREAKFAST_PAGE_VIEWS'] / train_X['REGISTRATION_DAYS'] # train_X['**VIDEO_STOPS_PER_DAY'] = train_X['VIDEO_STOPS'] / train_X['REGISTRATION_DAYS'] # TO_DROP = [ # 'VIEWS_AFF4', 'FLAG_WARD_WKDAY_10_16', 'FLAG_WARD_WKDAY_17_19', # 'FLAG_WARD_WKDAY_20_24', 'FLAG_WARD_WKEND_10_13', 'FLAG_WARD_WKEND_14_20', # 'FLAG_UNI_CLUSTER_15', 'FLAG_UNI_CLUSTER_23', 'FLAG_UNI_CLUSTER_28', # 'FLAG_UNI_CLUSTER_29', 'FLAG_UNI_CLUSTER_33', 'FLAG_WEBSITE', # 'FLAG_BREAKFAST_VIEWS', 'FLAG_LUNCHTIME_VIEWS', 'FLAG_AFTERNOON_VIEWS', # 'FLAG_CATCHUP_VIEWS', 'FLAG_ARCHIVE_VIEWS', 'FLAG_AFF3', 'FLAG_AFF4', # 'REGISTRATION_ROUTE_3', 'REGISTRATION_ROUTE_4', 'REGISTRATION_CONTEXT_3', # 'REGISTRATION_CONTEXT_6', 'REGISTRATION_CONTEXT_8', 'REGISTRATION_CONTEXT_9', # 'REGISTRATION_CONTEXT_10', 'REGISTRATION_CONTEXT_11', 'REGISTRATION_CONTEXT_12', # 'REGISTRATION_CONTEXT_13', 'REGISTRATION_CONTEXT_14', 'REGISTRATION_CONTEXT_15', # 'REGISTRATION_CONTEXT_16', 'REGISTRATION_CONTEXT_17', 'REGISTRATION_CONTEXT_18', # 'REGISTRATION_CONTEXT_19', 'REGISTRATION_CONTEXT_20', 'REGISTRATION_CONTEXT_21', # 'REGISTRATION_CONTEXT_22', 'REGISTRATION_CONTEXT_23', 
'REGISTRATION_CONTEXT_24', # 'REGISTRATION_CONTEXT_25', 'REGISTRATION_CONTEXT_26', 'REGISTRATION_CONTEXT_27', # 'MIGRATED_USER_TYPE_5', 'TOD_CENTRE_3', 'CONTENT_CENTRE_1', 'CONTENT_CENTRE_2', # 'CONTENT_CENTRE_4', 'CONTENT_CENTRE_5', 'CONTENT_CENTRE_6', 'CONTENT_CENTRE_7', # 'CONTENT_CENTRE_8', 'CONTENT_CENTRE_9', 'CONTENT_CENTRE_12','CONTENT_CENTRE_13', # 'CONTENT_CENTRE_15'] # TO_DROP += [ # 'SOCIAL_AUTH_TWITTER', 'FLAG_WARD_WKEND_3_9', 'FLAG_UNI_CLUSTER_7', # 'FLAG_UNI_CLUSTER_13', 'FLAG_UNI_CLUSTER_21', 'FLAG_UNI_CLUSTER_22', # 'FLAG_UNI_CLUSTER_25', 'FLAG_ANDROID', 'FLAG_LATE_PEAK_VIEWS', # 'FLAG_NIGHT_TIME_VIEWS', 'FLAG_AFF1', 'FLAG_AFF2', 'MIGRATED_USER_TYPE_4', # 'CONTENT_CENTRE_10', 'CONTENT_CENTRE_14', 'CONTENT_CENTRE_16'] # TO_DROP += [ # 'FLAG_WARD_WKDAY_3_9', 'FLAG_UNI_CLUSTER_5', 'FLAG_UNI_CLUSTER_8', # 'FLAG_UNI_CLUSTER_9', 'FLAG_UNI_CLUSTER_17', 'FLAG_UNI_CLUSTER_26', # 'FLAG_MORNING_VIEWS', 'FLAG_EARLY_PEAK_VIEWS'] # TO_DROP += [ # 'FLAG_WARD_WKEND_1_2', 'FLAG_WARD_WKEND_21_24', 'FLAG_UNI_CLUSTER_1', # 'FLAG_UNI_CLUSTER_14', 'FLAG_MAIN', 'FLAG_OTHER_VIEWS', 'CONTENT_CENTRE_11'] # TO_DROP += ['FLAG_UNI_CLUSTER_12', 'FLAG_UNI_CLUSTER_19', 'FLAG_UNI_CLUSTER_27'] # TO_DROP += ['FLAG_UNI_CLUSTER_11', 'FLAG_UNI_CLUSTER_24'] ## 814000 ? # TO_DROP += ['FLAG_UNI_CLUSTER_2', 'FLAG_UNI_CLUSTER_16'] # TO_DROP += ['FLAG_UNI_CLUSTER_10', 'FLAG_UNI_CLUSTER_31', 'FLAG_POST_PEAK_VIEWS', 'TOD_CENTRE_2'] # TO_DROP += ['FLAG_UNI_CLUSTER_30'] # train_X = train_X.drop(TO_DROP, axis=1) # train_X.fillna(-1, inplace=True) from sklearn.cross_validation import StratifiedKFold from sklearn.grid_search import GridSearchCV if (False #or True ): skf = StratifiedKFold(train_y, n_folds=nfolds) from numpy import asarray selection = asarray(['-'] * len(train_y)) symbol = 0 for train_index, test_index in skf: selection[test_index] =chr(symbol + 48) symbol += 1 pass print(''.join(selection)) return muse_kwargs = \ { #'objective': 'reg:logistic', 'objective': 'rank:pairwise', 'learning_rate': 0.045, 'min_child_weight': 50, 'subsample': 0.8, 'colsample_bytree': 0.7, 'max_depth': 7, 'n_estimators': nest, 'nthread': njobs, 'seed': 0, 'missing': float('nan') #'scoring': NegQWKappaScorer } # override kwargs with any changes for k, v in clf_kwargs.items(): muse_kwargs[k] = v pass #clf = globals()[estimator](**muse_kwargs) #from xgboost import XGBClassifier clf = XGBClassifier(**muse_kwargs) #clf = MillenialsClassifier(**muse_kwargs) from sklearn.metrics import make_scorer tco_scorer = make_scorer(MinPRScorer) """ binary:logistic grid scores: mean: 787812.76918, std: 1297.55109, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 1} mean: 789084.73195, std: 1925.75110, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 3} mean: 789651.63043, std: 1855.11841, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 10} mean: 789958.10747, std: 1305.40202, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 20} mean: 788739.11423, std: 952.60469, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 50} mean: 788168.38281, std: 928.87371, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 80} best score: 789958.10747 best params: {'n_estimators': 500, 'subsample': 0.9, 
'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 20} reg:logistic (to samo co wyżej) grid scores: mean: 789651.63043, std: 1855.11841, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 10} mean: 789958.10747, std: 1305.40202, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 20} mean: 788739.11423, std: 952.60469, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 50} best score: 789958.10747 best params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 20} grid scores: mean: 786388.90860, std: 906.72660, params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 300, 'subsample': 0.9, 'objective': 'binary:logistic', 'max_depth': 7} mean: 789050.88848, std: 1708.63378, params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.9, 'objective': 'binary:logistic', 'max_depth': 7} mean: 789454.57872, std: 2059.68811, params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 700, 'subsample': 0.9, 'objective': 'binary:logistic', 'max_depth': 7} best score: 789454.57872 best params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 700, 'subsample': 0.9, 'objective': 'binary:logistic', 'max_depth': 7} ====================================================== rank:pairwise grid scores: mean: 806358.37855, std: 4488.86812, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 20} best score: 806358.37855 best params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 20} grid scores: mean: 750119.43597, std: 9120.06057, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 4, 'min_child_weight': 20} mean: 809673.54959, std: 4784.35577, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 20} mean: 798151.02989, std: 2162.04583, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 10, 'min_child_weight': 20} mean: 794998.50356, std: 2029.93836, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 20, 'min_child_weight': 20} mean: 794548.01245, std: 2062.41505, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 50, 'min_child_weight': 20} best score: 809673.54959 best params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 20} >>> 'max_depth': 6, 'min_child_weight': 20 grid scores: mean: 802508.37926, std: 4201.47242, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 5, 'min_child_weight': 10} mean: 793935.52998, std: 7607.45918, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 5, 'min_child_weight': 20} mean: 784568.74090, std: 7161.04235, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 5, 'min_child_weight': 40} mean: 802325.99222, std: 1833.64884, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 10} mean: 806358.37855, std: 4488.86812, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 20} mean: 
808437.63308, std: 3881.55687, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 40} mean: 798618.25778, std: 2948.03146, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 8, 'min_child_weight': 10} mean: 802665.25722, std: 2350.85430, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 8, 'min_child_weight': 20} mean: 806720.10926, std: 2543.82598, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 8, 'min_child_weight': 40} mean: 795701.38488, std: 2962.99442, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 10, 'min_child_weight': 10} mean: 798151.02989, std: 2162.04583, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 10, 'min_child_weight': 20} mean: 803385.26027, std: 2271.86591, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 10, 'min_child_weight': 40} best score: 808437.63308 best params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 40} >>> 'max_depth': 7, 'min_child_weight': 40 grid scores: mean: 782028.41606, std: 9637.64116, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 5, 'min_child_weight': 50} mean: 769010.75894, std: 6079.16367, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 5, 'min_child_weight': 60} mean: 760914.24094, std: 9643.26515, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 5, 'min_child_weight': 80} mean: 807557.88495, std: 4219.13250, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 50} mean: 801663.63876, std: 7556.18492, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 60} mean: 784727.73532, std: 7314.95469, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 80} mean: 811735.94787, std: 3476.37280, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 50} mean: 812694.98649, std: 4262.04853, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 60} mean: 806342.26320, std: 7227.93062, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 80} best score: 812694.98649 best params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 60} >>> 'max_depth': 7, 'min_child_weight': 60 grid scores: mean: 811261.01220, std: 1387.81968, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 55} mean: 812694.98649, std: 4262.04853, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 60} mean: 813522.63431, std: 5054.98775, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 65} mean: 811147.14498, std: 1469.98812, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 70} mean: 810716.38989, std: 3383.29928, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 8, 
'min_child_weight': 55} mean: 810977.37920, std: 3039.52816, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 8, 'min_child_weight': 60} mean: 809034.76724, std: 4751.72859, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 8, 'min_child_weight': 65} mean: 810902.03165, std: 3741.53151, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 8, 'min_child_weight': 70} best score: 813522.63431 best params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 65} >>> 'max_depth': 7, 'min_child_weight': 65 """ """ ONE HOT *** grid scores: mean: 808785.95756, std: 3732.33890, params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.7, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 812217.95285, std: 2332.88180, params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 807606.64678, std: 6685.61758, params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.9, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 798856.94075, std: 8380.25083, params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 1.0, 'objective': 'rank:pairwise', 'max_depth': 7} best score: 812217.95285 best params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} >>> 'subsample': 0.8 grid scores: mean: 807015.44496, std: 1745.61053, params: {'colsample_bytree': 0.6, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.7, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 811229.10339, std: 2626.00511, params: {'colsample_bytree': 0.6, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 809107.13182, std: 3766.10287, params: {'colsample_bytree': 0.6, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.9, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 806700.86249, std: 1673.53048, params: {'colsample_bytree': 0.7, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.7, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 808803.17397, std: 2141.65596, params: {'colsample_bytree': 0.7, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 810538.01687, std: 4532.84397, params: {'colsample_bytree': 0.7, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.9, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 808655.96400, std: 2917.13000, params: {'colsample_bytree': 0.8, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.7, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 809115.42707, std: 2625.94051, params: {'colsample_bytree': 0.8, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 809472.09619, std: 3144.06367, params: {'colsample_bytree': 0.8, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.9, 'objective': 'rank:pairwise', 'max_depth': 7} best score: 811229.10339 best params: {'colsample_bytree': 0.6, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} >>> 'colsample_bytree': 0.6, 'subsample': 0.8 grid scores: mean: 804112.81441, std: 8632.61400, params: {'colsample_bytree': 0.67, 
'learning_rate': 0.03, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 812217.95285, std: 2332.88180, params: {'colsample_bytree': 0.67, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 807667.00596, std: 2109.00871, params: {'colsample_bytree': 0.67, 'learning_rate': 0.06, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} best score: 812217.95285 best params: {'colsample_bytree': 0.67, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} >>> 'learning_rate': 0.045 grid scores: mean: 810936.40241, std: 2661.32895, params: {'colsample_bytree': 0.67, 'learning_rate': 0.04, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 812217.95285, std: 2332.88180, params: {'colsample_bytree': 0.67, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 811906.13100, std: 3339.45916, params: {'colsample_bytree': 0.67, 'learning_rate': 0.05, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} best score: 812217.95285 best params: {'colsample_bytree': 0.67, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} >>> 'learning_rate': 0.045 grid scores: mean: 811208.11854, std: 3435.62254, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.75, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 809690.89826, std: 2189.70344, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 812432.77112, std: 4630.29708, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 808096.24696, std: 3457.47957, params: {'colsample_bytree': 0.67, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.75, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 812217.95285, std: 2332.88180, params: {'colsample_bytree': 0.67, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 811478.50583, std: 2864.64292, params: {'colsample_bytree': 0.67, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7} best score: 812432.77112 best params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7} >>> 'learning_rate': 0.045, 'colsample_bytree': 0.65, 'subsample': 0.85 grid scores: mean: 810759.12376, std: 1528.10128, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 50, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7} mean: 812432.77112, std: 4630.29708, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 
'max_depth': 7} mean: 811977.26246, std: 4991.06664, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 80, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7} best score: 812432.77112 best params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7} >>> no change grid scores: mean: 812432.77112, std: 4630.29708, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0} mean: 810059.35718, std: 1824.61383, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 1.0} mean: 810675.99076, std: 3153.29552, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 2.0} mean: 809731.68599, std: 4091.21405, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 4} mean: 751547.94084, std: 10351.41628, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 10} best score: 812432.77112 best params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0} >>> 'gamma': 0 grid scores: mean: 786632.68012, std: 7192.58756, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'base_score': 0.4, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0} mean: 746308.67706, std: 11236.07404, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'base_score': 0.3, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0} mean: 695506.87990, std: 7719.89820, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'base_score': 0.2, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0} mean: 812432.77112, std: 4630.29708, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'base_score': 0.5, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0} mean: 802604.50597, std: 2869.94386, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'base_score': 0.7, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0} mean: 798443.86047, std: 1863.64175, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'base_score': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0} best score: 812432.77112 best params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'base_score': 0.5, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0} >>> 'base_score': 0.5 """ param_grid = { #'objective': ['binary:logitraw'], 'objective': 
        ['rank:pairwise'],
        #'booster': ['gblinear'],
        'n_estimators': [580],
        'max_depth': [6],
        'min_child_weight': [45, 50, 55],
        'gamma': [0.],
        'subsample': [0.85],
        'colsample_bytree': [0.65],
        'learning_rate': [0.045],
    }

    # allow command-line overrides of the search grid
    for k, v in cv_grid.items():
        param_grid[k] = v

    grid = GridSearchCV(estimator=clf,
                        param_grid=param_grid,
                        cv=StratifiedKFold(train_y, n_folds=nfolds),
                        scoring=tco_scorer,
                        n_jobs=1,
                        verbose=2,
                        refit=False)

    if (False
        #or True
        ):
        grid.fit(train_X, train_y)

        print('grid scores:')
        for item in grid.grid_scores_:
            print('  {}'.format(item))
        print('best score: {:.5f}'.format(grid.best_score_))
        print('best params:', grid.best_params_)

    if (False
        #or True
        ):
        clf.fit(train_X, train_y)

        feature_names = train_X.columns.values.tolist()
        from numpy import zeros
        feature_importances = zeros(len(feature_names))
        importances = clf.booster().get_fscore()
        for i, feat in enumerate(feature_names):
            if feat in importances:
                feature_importances[i] += importances[feat]
                pass
            pass
        import operator
        sorted_importances = sorted(zip(feature_names, feature_importances),
                                    key=operator.itemgetter(1), reverse=True)
        for k, v in sorted_importances:
            print("{}\t{}".format(v, k))
            pass
        # features the booster never used
        print([k for k, v in sorted_importances if v == 0])
        pass

    if (False
        or True
        ):
        def objective(space):
            #param_grid = {'objective': ['binary:logistic']}
            #param_grid = {'objective': ['binary:logitraw']}
            param_grid = {'objective': ['rank:pairwise'], 'booster_type': ['gblinear']}
            for k, v in space.items():
                if k in ['n_estimators', 'max_depth', 'min_child_weight', 'num_pairsample']:
                    # hyperopt samples floats; these parameters must be integers
                    v = int(v)
                    pass
                param_grid[k] = [v]
                pass
            grid = GridSearchCV(estimator=clf,
                                param_grid=param_grid,
                                cv=StratifiedKFold(train_y, n_folds=nfolds),
                                scoring=tco_scorer,
                                n_jobs=1,
                                verbose=2,
                                refit=False)
            grid.fit(train_X, train_y)
            print('best score: {:.5f}  best params: {}'.format(grid.best_score_, grid.best_params_))
            # hyperopt minimises, so negate the score
            return -grid.best_score_

        from sys import path as sys_path
        sys_path.insert(0, './hyperopt')
        from hyperopt import fmin, tpe, hp
        # cheatsheet:
        # https://github.com/hyperopt/hyperopt/wiki/FMin#21-parameter-expressions
        space = {
            'n_estimators': hp.quniform('x_n_estimators', 500, 800, 10),
            'max_depth': hp.quniform('x_max_depth', 4, 8, 1),
            'min_child_weight': hp.quniform('x_min_child', 45, 240, 5),
            'gamma': hp.uniform('x_gamma', 0.0, 2.0),
            #'scale_pos_weight': hp.uniform('x_scale_pos_weight', 0.5, 1.0),
            'num_pairsample': hp.quniform('x_num_pairsample', 1, 4, 1),
            'subsample': hp.uniform('x_subsample', 0.4, 1.0),
            'colsample_bytree': hp.uniform('x_colsample_bytree', 0.4, 1.0)
        }
        best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100)
        print(best)
        pass

    return
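# The function above calls OneHot(train_X, NOMINALS) and make_scorer(MinPRScorer),
# both defined elsewhere in the project and not shown in this excerpt. Purely as a
# sketch, and assuming OneHot simply expands each nominal column into indicator
# columns (matching names such as 'REGISTRATION_ROUTE_3' in the commented TO_DROP
# lists), it could look like this:

from pandas import get_dummies, concat

def OneHot(frame, columns):
    """Assumed implementation: replace each nominal column with indicator columns."""
    for col in columns:
        dummies = get_dummies(frame[col], prefix=col, prefix_sep='_')
        frame = concat([frame.drop(col, axis=1), dummies], axis=1)
    return frame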
import sys
from time import time
import os
from os.path import isfile, join
import itertools

from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

import tensorflow as tf
from tensorflow.core.example import example_pb2

from numpy.random import seed as random_seed
from numpy.random import shuffle as random_shuffle

random_seed(21)  # Reproducibility

# Define the Global Variables
FLAGS = tf.app.flags.FLAGS

tf.app.flags.DEFINE_string(
    'command', 'text_to_binary',
    'Either text_to_vocabulary or text_to_binary. '
    'Specify FLAGS.in_directories accordingly.')
tf.app.flags.DEFINE_string('in_folder', '', 'path to input json data file')
tf.app.flags.DEFINE_string(
    'out_files', '',
    'comma separated paths to files')  # specify the outfile during command line interface
tf.app.flags.DEFINE_string(
    'split', '',
    'comma separated fractions of data')  # specify during terminal command call
# tf.app.flags.DEFINE_string('vocab_file', 'data/vocabulary', 'path to output the vocab of the training corpus')
# tf.app.flags.DEFINE_integer('body_len', 30000, 'Define the length of body to consider')
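# The flags above name a text_to_binary command, but the conversion code itself is
# not part of this excerpt. Below is a minimal sketch of one way to write such
# records, using a length-prefixed tf.Example layout (as in TensorFlow's textsum
# examples); the feature keys 'article' and 'abstract' and the helper name
# write_binary_record are assumptions, not taken from the original script.

import struct
from tensorflow.core.example import example_pb2

def write_binary_record(writer, article, abstract):
    """Write one document as a length-prefixed serialized tf.Example."""
    tf_example = example_pb2.Example()
    tf_example.features.feature['article'].bytes_list.value.extend([article.encode('utf8')])
    tf_example.features.feature['abstract'].bytes_list.value.extend([abstract.encode('utf8')])
    tf_example_str = tf_example.SerializeToString()
    writer.write(struct.pack('q', len(tf_example_str)))                      # 8-byte record length
    writer.write(struct.pack('%ds' % len(tf_example_str), tf_example_str))   # record payload

# Usage sketch:
# with open(out_path, 'wb') as writer:
#     write_binary_record(writer, article_text, abstract_text)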
import argparse
import numpy as np
from keras.layers import Activation, TimeDistributed, Dense, RepeatVector, Dropout
from keras.layers import recurrent
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint, TensorBoard, CSVLogger, LambdaCallback
from numpy.random import seed as random_seed
from numpy.random import randint as random_randint
import os
import pickle

from data import DataSet

random_seed(42)  # Reproducibility

# Parameters for the model and dataset
DATASET_FILENAME = 'data/dataset/news.2011.en.shuffled'
NUMBER_OF_EPOCHS = 2
RNN = recurrent.LSTM
INPUT_LAYERS = 2
OUTPUT_LAYERS = 2
AMOUNT_OF_DROPOUT = 0.3
BATCH_SIZE = 32
SAMPLES_PER_EPOCH = 65536
HIDDEN_SIZE = 700
INITIALIZATION = "he_normal"  # : Gaussian initialization scaled by fan_in (He et al., 2014)
NUMBER_OF_CHARS = 100  # 75
CHARS = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .")
INVERTED = True
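# This excerpt stops at the constants; the model construction itself is not shown.
# The sketch below is one plausible wiring of those constants into a character-level
# encoder-decoder, assuming the Keras 1.x API implied by the imports above (the
# init= keyword and TimeDistributed(Dense(...))); the exact layer stack is an
# assumption rather than the original implementation.

def build_model_sketch(output_len, chars=CHARS):
    """Sketch: stacked LSTM encoder -> RepeatVector -> stacked LSTM decoder."""
    model = Sequential()
    # Encoder: read the (possibly inverted) noisy input string.
    for layer_number in range(INPUT_LAYERS):
        model.add(RNN(HIDDEN_SIZE, input_shape=(None, len(chars)), init=INITIALIZATION,
                      return_sequences=layer_number + 1 < INPUT_LAYERS))
        model.add(Dropout(AMOUNT_OF_DROPOUT))
    # Repeat the encoding once per output character, then decode.
    model.add(RepeatVector(output_len))
    for _ in range(OUTPUT_LAYERS):
        model.add(RNN(HIDDEN_SIZE, return_sequences=True, init=INITIALIZATION))
        model.add(Dropout(AMOUNT_OF_DROPOUT))
    # Per-timestep distribution over the character vocabulary.
    model.add(TimeDistributed(Dense(len(chars), init=INITIALIZATION)))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model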