# NOTE: `P` (P.first, P.Pipe, P.select, ...) is assumed throughout this code to
# be the `pipe` module, imported at module level elsewhere, e.g. `import pipe as P`.


def work(in_h5, out_csv_file, nest, njobs):
    from h5pipes import h5open
    from pypipes import getitem, as_key, del_key
    from nppipes import (as_array, fit_transform, transform, fit, predict,
                         savetxt, stack, clip)
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.preprocessing import StandardScaler
    from xgboost import XGBRegressor

    # Column indices of the nominal (categorical) features.
    nominal_cidx = [0, 1, 2, 4, 5, 6, 12, 13, 15, 17, 18, 19, 20, 21, 22, 23,
                    24, 25, 26, 27, 29, 30, 31, 32, 38, 39, 40, 41, 42, 43,
                    44, 45, 47, 48, 49, 50, 52, 53, 54, 55, 56, 57, 58, 59,
                    61, 62, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 74, 75,
                    76, 77]

    data = (
        (in_h5,)
        | h5open
        | as_key('file')
        # Load the preprocessed arrays from the HDF5 file.
        | as_key('train_X', lambda d: (d['file'],)
                 | getitem('train_X') | as_array | P.first)
        | as_key('train_y', lambda d: (d['file'],)
                 | getitem('train_y') | as_array | P.first)
        | as_key('test_X', lambda d: (d['file'],)
                 | getitem('test_X') | as_array | P.first)
        | as_key('train_labels', lambda d: (d['file'],)
                 | getitem('train_labels') | as_array | P.first)
        | as_key('test_labels', lambda d: (d['file'],)
                 | getitem('test_labels') | as_array | P.first)
        # One-hot encode the nominal columns, fitting on the training split only.
        | as_key('one_hot', lambda _: (OneHotEncoder(categorical_features=nominal_cidx,
                                                     sparse=False),))
        | as_key('train_X', lambda d: (d['train_X'].copy(),)
                 | fit_transform(d['one_hot']) | P.first)
        | as_key('test_X', lambda d: (d['test_X'].copy(),)
                 | transform(d['one_hot']) | P.first)
        | del_key('one_hot')
        # Standardise features, fitting the scaler on the training split only.
        | as_key('std_scaler', lambda _: (StandardScaler(),))
        | as_key('train_X', lambda d: (d['train_X'].copy(),)
                 | fit_transform(d['std_scaler']) | P.first)
        | as_key('test_X', lambda d: (d['test_X'].copy(),)
                 | transform(d['std_scaler']) | P.first)
        | del_key('std_scaler')
        # Fit the regressor and predict the test split.
        | as_key('XGBReg', lambda d: (XGBRegressor(seed=1,
                                                   n_estimators=nest,
                                                   #n_jobs=njobs,
                                                   #verbose=1,
                                                   #max_features=1.0,
                                                   # min_samples_leaf is not an
                                                   # XGBRegressor parameter
                                                   # (left over from the
                                                   # ExtraTrees variant):
                                                   #min_samples_leaf=1.0,
                                                   max_depth=50),)
                 | fit((d['train_X'],), (d['train_y'],)) | P.first)
        | as_key('y_hat', lambda d: (d['test_X'],)
                 | predict((d['XGBReg'],)) | clip(1, 8) | P.first)
        | del_key('XGBReg')
        | P.first
    )

    # Write the "Id","Response" submission CSV.
    ((data['test_labels'], data['y_hat'])
     | stack(axis=1)
     | savetxt(out_csv_file, delimiter=',', fmt=['%d', '%d'],
               header='"Id","Response"', comments='')
     | P.first)

    return
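# A plain-numpy restatement of the final stages above (an illustrative sketch,
# not part of the original pipeline): clip the predictions to the valid response
# range 1..8 and write the "Id","Response" submission CSV.
def write_submission_sketch(test_labels, y_hat, out_csv_file):
    import numpy as np
    y_hat = np.clip(y_hat, 1, 8)
    np.savetxt(out_csv_file, np.stack((test_labels, y_hat), axis=1),
               delimiter=',', fmt=['%d', '%d'],
               header='"Id","Response"', comments='')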
def work(in_h5, out_csv_file, nest, njobs):
    from h5pipes import h5open
    from pypipes import getitem, as_key, del_key
    from nppipes import (as_array, fit_transform, transform, fit, predict,
                         savetxt, stack)
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.preprocessing import StandardScaler
    from sklearn.ensemble import ExtraTreesRegressor

    nominal_cidx = [0, 1, 2, 4, 5, 6, 12, 13, 15, 17, 18, 19, 20, 21, 22, 23,
                    24, 25, 26, 27, 29, 30, 31, 32, 38, 39, 40, 41, 42, 43,
                    44, 45, 47, 48, 49, 50, 52, 53, 54, 55, 56, 57, 58, 59,
                    61, 62, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 74, 75,
                    76, 77]

    data = (
        (in_h5,)
        | h5open
        | as_key('file')
        | as_key('train_X', lambda d: (d['file'],)
                 | getitem('train_X') | as_array | P.first)
        | as_key('train_y', lambda d: (d['file'],)
                 | getitem('train_y') | as_array | P.first)
        | as_key('test_X', lambda d: (d['file'],)
                 | getitem('test_X') | as_array | P.first)
        | as_key('train_labels', lambda d: (d['file'],)
                 | getitem('train_labels') | as_array | P.first)
        | as_key('test_labels', lambda d: (d['file'],)
                 | getitem('test_labels') | as_array | P.first)
        | as_key('one_hot', lambda _: (OneHotEncoder(categorical_features=nominal_cidx,
                                                     sparse=False),))
        | as_key('train_X', lambda d: (d['train_X'].copy(),)
                 | fit_transform(d['one_hot']) | P.first)
        | as_key('test_X', lambda d: (d['test_X'].copy(),)
                 | transform(d['one_hot']) | P.first)
        | del_key('one_hot')
        | as_key('std_scaler', lambda _: (StandardScaler(),))
        | as_key('train_X', lambda d: (d['train_X'].copy(),)
                 | fit_transform(d['std_scaler']) | P.first)
        | as_key('test_X', lambda d: (d['test_X'].copy(),)
                 | transform(d['std_scaler']) | P.first)
        | del_key('std_scaler')
        | as_key('RFReg', lambda d: (ExtraTreesRegressor(random_state=1,
                                                         n_estimators=nest,
                                                         n_jobs=njobs,
                                                         verbose=1,
                                                         max_features=1.0,
                                                         min_samples_leaf=1.0,
                                                         max_depth=50),)
                 | fit((d['train_X'],), (d['train_y'],)) | P.first)
        | as_key('y_hat', lambda d: (d['test_X'],)
                 | predict((d['RFReg'],)) | P.first)
        | del_key('RFReg')
        | P.first
    )

    ((data['test_labels'], data['y_hat'])
     | stack(axis=1)
     | savetxt(out_csv_file, delimiter=',', fmt=['%d', '%d'],
               header='"Id","Response"', comments='')
     | P.first)

    return
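# For readability, a plain scikit-learn restatement of what the two pipelines
# above do (an illustrative sketch, not part of the original pipe-based code):
# one-hot encode the nominal columns, standardise, fit the regressor on the
# training split, and predict the test split.
def work_plain_sketch(train_X, train_y, test_X, nominal_cidx, nest, njobs):
    from sklearn.preprocessing import OneHotEncoder, StandardScaler
    from sklearn.ensemble import ExtraTreesRegressor

    one_hot = OneHotEncoder(categorical_features=nominal_cidx, sparse=False)
    train_X = one_hot.fit_transform(train_X)
    test_X = one_hot.transform(test_X)

    scaler = StandardScaler()
    train_X = scaler.fit_transform(train_X)
    test_X = scaler.transform(test_X)

    reg = ExtraTreesRegressor(random_state=1, n_estimators=nest, n_jobs=njobs)
    reg.fit(train_X, train_y)
    return reg.predict(test_X)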
def work():
    from h5pipes import h5open
    from pypipes import getitem, as_key
    from nppipes import as_array
    from skll import kappa

    data = (
        ('raw-data.h5',)
        | h5open
        | as_key('file')
        | as_key('train_X', lambda d: (d['file'],)
                 | getitem('train_X') | as_array | P.first)
        | as_key('train_y', lambda d: (d['file'],)
                 | getitem('train_y') | as_array | P.first)
        | as_key('test_X', lambda d: (d['file'],)
                 | getitem('test_X') | as_array | P.first)
        | as_key('train_labels', lambda d: (d['file'],)
                 | getitem('train_labels') | as_array | P.first)
        | as_key('test_labels', lambda d: (d['file'],)
                 | getitem('test_labels') | as_array | P.first)
        | P.first
    )

    nominal_cidx = [0, 1, 2, 4, 5, 6, 12, 13, 15, 17, 18, 19, 20, 21, 22, 23,
                    24, 25, 26, 27, 29, 30, 31, 32, 38, 39, 40, 41, 42, 43,
                    44, 45, 47, 48, 49, 50, 52, 53, 54, 55, 56, 57, 58, 59,
                    61, 62, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 74, 75,
                    76, 77]

    from sklearn.preprocessing import OneHotEncoder
    enc = OneHotEncoder(categorical_features=nominal_cidx, sparse=False)
    data['train_X'] = enc.fit_transform(data['train_X'])
    data['test_X'] = enc.transform(data['test_X'])

    from sklearn.preprocessing import StandardScaler
    ss = StandardScaler()
    data['train_X'] = ss.fit_transform(data['train_X'])
    data['test_X'] = ss.transform(data['test_X'])

    # from sklearn.neighbors import KNeighborsClassifier
    # clf = KNeighborsClassifier(weights='uniform', n_neighbors=5)

    from sklearn.ensemble import RandomForestClassifier
    clf = RandomForestClassifier(random_state=1, n_estimators=10, n_jobs=1)
    rfc = RandomForestClassifier(random_state=1, n_jobs=3)

    #from sklearn.ensemble import GradientBoostingClassifier
    #clf = GradientBoostingClassifier(n_estimators=10)

    #from sklearn.ensemble import AdaBoostClassifier
    #clf = AdaBoostClassifier(rfc, n_estimators=30, random_state=1)

    #from sklearn.ensemble import ExtraTreesClassifier
    #clf = ExtraTreesClassifier(n_jobs=3, n_estimators=50, random_state=1)

    from sklearn.metrics import make_scorer
    qwkappa = make_scorer(kappa, weights='quadratic')

    # from sklearn.cross_validation import cross_val_score
    # scores = cross_val_score(clf, data['train_X'], data['train_y'], cv=10,
    #                          scoring=qwkappa, n_jobs=2)
    # print("Kappa: {:.5f} (+/- {:.5f})".format(scores.mean(), scores.std()))

    from sklearn.grid_search import GridSearchCV
    grid = GridSearchCV(estimator=clf,
                        param_grid={'n_estimators': [10, 20, 50],
                                    'criterion': ['gini', 'entropy'],
                                    #'max_depth': [3, 4, 5, 7, 10]
                                    },
                        cv=10, scoring=qwkappa,
                        n_jobs=2, verbose=2)
    grid.fit(data['train_X'], data['train_y'])
    print('grid scores:', grid.grid_scores_)
    print('best score:', grid.best_score_)
    print('best params:', grid.best_params_)

    pass
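# An illustrative, self-contained sketch of the metric behind the `qwkappa`
# scorer above (quadratic weighted kappa). This is a hypothetical restatement,
# not skll's `kappa`; it assumes integer ratings.
def quadratic_weighted_kappa_sketch(y_true, y_pred):
    import numpy as np
    y_true = np.asarray(y_true, dtype=int)
    y_pred = np.asarray(y_pred, dtype=int)
    min_r = min(y_true.min(), y_pred.min())
    max_r = max(y_true.max(), y_pred.max())
    n = max_r - min_r + 1
    # Observed rating-pair counts (confusion matrix).
    observed = np.zeros((n, n))
    for t, p in zip(y_true - min_r, y_pred - min_r):
        observed[t, p] += 1
    # Expected counts under independence, scaled to the same total.
    expected = np.outer(observed.sum(axis=1),
                        observed.sum(axis=0)) / len(y_true)
    # Quadratic disagreement weights.
    i, j = np.indices((n, n))
    weights = ((i - j) ** 2) / float((n - 1) ** 2)
    return 1.0 - (weights * observed).sum() / (weights * expected).sum()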
def work(in_train_arch, in_test_arch, in_train_csv, in_test_csv, out_h5):
    from pypipes import unzip, as_key, del_key, getitem, setitem
    from nppipes import (genfromtxt, place, astype, as_columns, label_encoder,
                         fit_transform, transform, stack)
    from nppipes import take as np_take
    from numpy.core.defchararray import strip
    from numpy import s_, mean, in1d, putmask
    from collections import Counter
    from h5pipes import h5new

    @P.Pipe
    def replace_missing_with(iterable, ftor):
        """Fill NaNs in each column with ftor() of that column's non-NaN values."""
        from numpy import isnan
        for item in iterable:
            for i in range(item.shape[1]):
                mask = isnan(item[:, i])
                value = ftor(item[~mask, i])
                item[mask, i] = value
            yield item

    # Columns with missing values, and nominal columns split into those whose
    # test-set categories all occur in training ("seen") and those with
    # categories that appear only in the test set ("unseen").
    missing_cidx = [11, 14, 16, 28, 33, 34, 35, 36, 37, 46, 51, 60, 68]
    unseen_nominal_cidx = [2, 12, 38, 69, 74]
    seen_nominal_cidx = [0, 1, 4, 5, 6, 13, 15, 17, 18, 19, 20, 21, 22, 23, 24,
                         25, 26, 27, 29, 30, 31, 32, 39, 40, 41, 42, 43, 44,
                         45, 47, 48, 49, 50, 52, 53, 54, 55, 56, 57, 58, 59,
                         61, 62, 63, 64, 65, 66, 67, 70, 71, 72, 73, 75, 76,
                         77]
    nominal_cidx = seen_nominal_cidx + unseen_nominal_cidx

    data = (
        in_train_arch
        | unzip(in_train_csv)
        | genfromtxt(delimiter=',', dtype=str)
        | place(lambda d: d == '', 'nan')
        | as_key('train')
        # Split the raw train table into column names, Id labels, features, target.
        | as_key('train_col_names', lambda d: strip(d['train'][0], '"'))
        | as_key('train_labels', lambda d: d['train'][1:, 0].astype(int))
        | as_key('train_X', lambda d: d['train'][1:, 1:-1])
        | as_key('train_y', lambda d: d['train'][1:, -1].astype(int))
        | del_key('train')
        # Same for the test table (which has no target column).
        | as_key('test', lambda d: in_test_arch
                 | unzip(in_test_csv)
                 | genfromtxt(delimiter=',', dtype=str)
                 | place(lambda d: d == '', 'nan')
                 | P.first)
        | as_key('test_col_names', lambda d: strip(d['test'][0], '"'))
        | as_key('test_labels', lambda d: d['test'][1:, 0].astype(int))
        | as_key('test_X', lambda d: d['test'][1:, 1:])
        | del_key('test')
        # Impute missing training values with the column mean.
        | as_key('train_X', lambda d: (d['train_X'],)
                 | np_take(missing_cidx, axis=1)
                 | astype(float)
                 | replace_missing_with(mean)
                 | astype(str)
                 | setitem(d['train_X'].copy(), s_[:, missing_cidx])
                 | P.first)
        # Create one label encoder per nominal column, then fit them on the
        # training columns and encode those columns in place.
        | as_key('label_encoders', lambda d: len(nominal_cidx)
                 | label_encoder
                 | P.as_tuple)
        | as_key('train_X', lambda d: (d['train_X'],)
                 | np_take(nominal_cidx, axis=1)
                 | as_columns
                 | fit_transform(d['label_encoders'])
                 | stack(axis=1)
                 | setitem(d['train_X'].copy(), s_[:, nominal_cidx])
                 | P.first)
        # Encode the test columns whose categories were all seen in training.
        | as_key('test_X', lambda d: (d['test_X'],)
                 | np_take(seen_nominal_cidx, axis=1)
                 | as_columns
                 | transform(d['label_encoders'][:-len(unseen_nominal_cidx)])
                 | stack(axis=1)
                 | setitem(d['test_X'].copy(), s_[:, seen_nominal_cidx])
                 | P.first)
        # For the remaining nominal columns, replace categories never seen in
        # training with the most common training category, then encode.
        | as_key('test_X', lambda d: (d['test_X'],)
                 | np_take(unseen_nominal_cidx, axis=1)
                 | as_key('test_unseen_nominals_features')
                 | as_key('test_unseen_nominals',
                          lambda d2: zip(d2['test_unseen_nominals_features'].T,
                                         d['label_encoders'][-len(unseen_nominal_cidx):])
                          | P.select(lambda t: list(set(t[0]) - set(t[1].classes_)))
                          | P.as_list)
                 | as_key('train_most_common_nominals',
                          lambda d2: zip(d['train_X'][:, unseen_nominal_cidx].T.astype(int),
                                         d['label_encoders'][-len(unseen_nominal_cidx):])
                          | P.select(lambda t: t[1].inverse_transform(t[0]))
                          | P.select(lambda s: Counter(s).most_common(1)[0][0])
                          | P.as_list)
                 | as_key('test_corrected_features',
                          lambda d2: zip(d2['test_unseen_nominals_features'].copy().T,
                                         d2['test_unseen_nominals'],
                                         d2['train_most_common_nominals'])
                          | P.select(lambda t: putmask(t[0], in1d(t[0], t[1]), t[2]) or t[0].T)
                          | stack(axis=1)
                          | P.first)
                 | getitem('test_corrected_features')
                 | as_columns
                 | transform(d['label_encoders'][-len(unseen_nominal_cidx):])
                 | stack(axis=1)
                 | setitem(d['test_X'].copy(), s_[:, unseen_nominal_cidx])
                 | P.first)
        | del_key('label_encoders')
        # Impute missing test values with the column mean.
        | as_key('test_X', lambda d: (d['test_X'],)
                 | np_take(missing_cidx, axis=1)
                 | astype(float)
                 | replace_missing_with(mean)
                 | astype(str)
                 | setitem(d['test_X'].copy(), s_[:, missing_cidx])
                 | P.first)
        | P.first
    )

    #print(data.keys())

    # Persist the cleaned arrays to the output HDF5 file.
    ((out_h5,)
     | h5new
     | as_key('train_X', lambda _: data['train_X'].astype(float))
     | as_key('train_y', lambda _: data['train_y'].astype(float))
     | as_key('test_X', lambda _: data['test_X'].astype(float))
     | as_key('train_labels', lambda _: data['train_labels'])
     | as_key('test_labels', lambda _: data['test_labels'])
     | P.first)

    return
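# An illustrative plain numpy/sklearn sketch of the unseen-nominal handling
# above (hypothetical helper, not part of the original pipeline): categories
# that appear only in the test column are replaced with the most common
# training category before the training-fitted LabelEncoder is applied.
# Assumes train_col and test_col are 1-D numpy string arrays.
def encode_nominal_column_sketch(train_col, test_col):
    from collections import Counter
    import numpy as np
    from sklearn.preprocessing import LabelEncoder

    le = LabelEncoder().fit(train_col)
    unseen = set(test_col) - set(le.classes_)
    most_common = Counter(train_col).most_common(1)[0][0]
    test_col = test_col.copy()
    np.putmask(test_col, np.in1d(test_col, list(unseen)), most_common)
    return le.transform(train_col), le.transform(test_col)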