def work(in_h5, out_csv_file, nest, njobs):
    from h5pipes import h5open
    from pypipes import getitem, as_key, del_key
    from nppipes import (as_array, fit_transform, transform, fit, predict,
                         savetxt, stack, clip)
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.preprocessing import StandardScaler
    from xgboost import XGBRegressor

    # Column indices of the nominal (categorical) features to one-hot encode.
    nominal_cidx = [0, 1, 2, 4, 5, 6, 12, 13, 15, 17, 18, 19, 20, 21, 22, 23,
                    24, 25, 26, 27, 29, 30, 31, 32, 38, 39, 40, 41, 42, 43,
                    44, 45, 47, 48, 49, 50, 52, 53, 54, 55, 56, 57, 58, 59,
                    61, 62, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 74, 75,
                    76, 77]

    # NOTE: P (providing P.first) is assumed to be the pipeline terminator
    # imported at module level alongside the *pipes helpers.
    data = (
        (in_h5,)
        | h5open
        | as_key('file')

        # Pull the train/test matrices and label vectors out of the HDF5 file.
        | as_key('train_X', lambda d: (d['file'],) | getitem('train_X') | as_array | P.first)
        | as_key('train_y', lambda d: (d['file'],) | getitem('train_y') | as_array | P.first)
        | as_key('test_X', lambda d: (d['file'],) | getitem('test_X') | as_array | P.first)
        | as_key('train_labels', lambda d: (d['file'],) | getitem('train_labels') | as_array | P.first)
        | as_key('test_labels', lambda d: (d['file'],) | getitem('test_labels') | as_array | P.first)

        # One-hot encode the nominal columns.  The categorical_features/sparse
        # arguments require scikit-learn < 0.22 (categorical_features was
        # removed in 0.22; newer versions use ColumnTransformer instead).
        | as_key('one_hot',
                 lambda _: (OneHotEncoder(categorical_features=nominal_cidx, sparse=False),))
        | as_key('train_X', lambda d: (d['train_X'].copy(),) | fit_transform(d['one_hot']) | P.first)
        | as_key('test_X', lambda d: (d['test_X'].copy(),) | transform(d['one_hot']) | P.first)
        | del_key('one_hot')

        # Standardize all columns with statistics fitted on the training set.
        | as_key('std_scaler', lambda _: (StandardScaler(),))
        | as_key('train_X', lambda d: (d['train_X'].copy(),) | fit_transform(d['std_scaler']) | P.first)
        | as_key('test_X', lambda d: (d['test_X'].copy(),) | transform(d['std_scaler']) | P.first)
        | del_key('std_scaler')

        # Fit the gradient-boosted regressor on the training data.
        | as_key('XGBReg',
                 lambda d: (XGBRegressor(seed=1,  # older xgboost spelling of random_state
                                         n_estimators=nest,
                                         #n_jobs=njobs,
                                         #verbose=1,
                                         #max_features=1.0,
                                         #min_samples_leaf=1.0,  # not an XGBRegressor parameter
                                         max_depth=50),)
                           | fit((d['train_X'],), (d['train_y'],))
                           | P.first)

        # Predict and clip the predictions to the valid Response range [1, 8].
        | as_key('y_hat',
                 lambda d: (d['test_X'],)
                           | predict((d['XGBReg'],))
                           | clip(1, 8)
                           | P.first)
        | del_key('XGBReg')
        | P.first)

    # Write the submission CSV: one "Id","Response" row per test sample.
    ((data['test_labels'], data['y_hat'])
     | stack(axis=1)
     | savetxt(out_csv_file, delimiter=',', fmt=['%d', '%d'],
               header='"Id","Response"', comments='')
     | P.first)

    return
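# Minimal driver sketch for the XGBoost variant of work() above.  The file
# names and default parameter values are illustrative placeholders, not part
# of the original script.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Fit XGBRegressor on the HDF5 data and write a submission CSV.')
    parser.add_argument('--in-h5', default='data.h5',
                        help='HDF5 file holding train_X, train_y, test_X, '
                             'train_labels and test_labels datasets')
    parser.add_argument('--out-csv', default='submission.csv')
    parser.add_argument('--n-estimators', type=int, default=100)
    parser.add_argument('--n-jobs', type=int, default=1)
    args = parser.parse_args()

    work(args.in_h5, args.out_csv, args.n_estimators, args.n_jobs)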
def work(in_h5, out_csv_file, nest, njobs):
    from h5pipes import h5open
    from pypipes import getitem, as_key, del_key
    from nppipes import (as_array, fit_transform, transform, fit, predict,
                         savetxt, stack)
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.preprocessing import StandardScaler
    from sklearn.ensemble import ExtraTreesRegressor

    # Column indices of the nominal (categorical) features to one-hot encode.
    nominal_cidx = [0, 1, 2, 4, 5, 6, 12, 13, 15, 17, 18, 19, 20, 21, 22, 23,
                    24, 25, 26, 27, 29, 30, 31, 32, 38, 39, 40, 41, 42, 43,
                    44, 45, 47, 48, 49, 50, 52, 53, 54, 55, 56, 57, 58, 59,
                    61, 62, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 74, 75,
                    76, 77]

    data = (
        (in_h5,)
        | h5open
        | as_key('file')

        # Pull the train/test matrices and label vectors out of the HDF5 file.
        | as_key('train_X', lambda d: (d['file'],) | getitem('train_X') | as_array | P.first)
        | as_key('train_y', lambda d: (d['file'],) | getitem('train_y') | as_array | P.first)
        | as_key('test_X', lambda d: (d['file'],) | getitem('test_X') | as_array | P.first)
        | as_key('train_labels', lambda d: (d['file'],) | getitem('train_labels') | as_array | P.first)
        | as_key('test_labels', lambda d: (d['file'],) | getitem('test_labels') | as_array | P.first)

        # One-hot encode the nominal columns (requires scikit-learn < 0.22,
        # where categorical_features was removed; see the sketch after this
        # function for a ColumnTransformer equivalent).
        | as_key('one_hot',
                 lambda _: (OneHotEncoder(categorical_features=nominal_cidx, sparse=False),))
        | as_key('train_X', lambda d: (d['train_X'].copy(),) | fit_transform(d['one_hot']) | P.first)
        | as_key('test_X', lambda d: (d['test_X'].copy(),) | transform(d['one_hot']) | P.first)
        | del_key('one_hot')

        # Standardize all columns with statistics fitted on the training set.
        | as_key('std_scaler', lambda _: (StandardScaler(),))
        | as_key('train_X', lambda d: (d['train_X'].copy(),) | fit_transform(d['std_scaler']) | P.first)
        | as_key('test_X', lambda d: (d['test_X'].copy(),) | transform(d['std_scaler']) | P.first)
        | del_key('std_scaler')

        # Fit the extra-trees regressor on the training data.
        | as_key('RFReg',
                 lambda d: (ExtraTreesRegressor(random_state=1,
                                                n_estimators=nest,
                                                n_jobs=njobs,
                                                verbose=1,
                                                max_features=1.0,    # consider all features at each split
                                                min_samples_leaf=1,  # a float here is read as a fraction of n_samples
                                                max_depth=50),)
                           | fit((d['train_X'],), (d['train_y'],))
                           | P.first)

        # Predict the test responses.
        | as_key('y_hat',
                 lambda d: (d['test_X'],)
                           | predict((d['RFReg'],))
                           | P.first)
        | del_key('RFReg')
        | P.first)

    # Write the submission CSV: one "Id","Response" row per test sample.
    ((data['test_labels'], data['y_hat'])
     | stack(axis=1)
     | savetxt(out_csv_file, delimiter=',', fmt=['%d', '%d'],
               header='"Id","Response"', comments='')
     | P.first)

    return
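# The OneHotEncoder(categorical_features=..., sparse=...) calls in both work()
# variants rely on a scikit-learn API that was removed in 0.22.  The sketch
# below is a rough modern equivalent of that preprocessing step (one-hot on
# the nominal columns, pass-through for the rest, then standardization); the
# helper name make_preprocessor is an assumption, not part of the original code.
def make_preprocessor(nominal_cidx):
    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import OneHotEncoder, StandardScaler

    # One-hot encode the nominal columns, keep all remaining columns, then
    # scale the full encoded matrix, mirroring the original pipeline steps.
    encode = ColumnTransformer(
        [('one_hot',
          OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
          nominal_cidx)],
        remainder='passthrough')
    return make_pipeline(encode, StandardScaler())

# Usage sketch (replacing the one_hot/std_scaler steps above):
#   prep = make_preprocessor(nominal_cidx)
#   train_X = prep.fit_transform(train_X)
#   test_X = prep.transform(test_X)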