def work(in_h5,
         out_csv_file,
         nest,
         njobs):
    """Fit an ExtraTreesRegressor on data from ``in_h5`` and write predictions.

    The pipeline reads the ``train_X``, ``train_y``, ``test_X``,
    ``train_labels`` and ``test_labels`` entries of the opened file, runs the
    features through a OneHotEncoder (on ``nominal_cidx``) and a
    StandardScaler that are both fitted on the train matrix only, fits the
    regressor, and writes ``test_labels`` next to the predictions as a
    two-column CSV with the header ``"Id","Response"``.

    Args:
        in_h5: Handed to ``h5open``; presumably a path to an HDF5 file.
        out_csv_file: Destination handed to ``savetxt``.
        nest: Forwarded as ``n_estimators``.
        njobs: Forwarded as ``n_jobs``.

    Returns:
        None.

    NOTE(review): ``P`` is not imported in this function; it is expected to
    be available at module level (not visible from here).
    """

    from h5pipes import h5open
    from pypipes import getitem,as_key,del_key
    from nppipes import as_array,fit_transform,transform,fit,predict,savetxt,stack
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.preprocessing import StandardScaler
    from sklearn.ensemble import ExtraTreesRegressor


    # Column indices of train_X/test_X that are handed to the one-hot encoder.
    nominal_cidx = [0, 1, 2, 4, 5, 6, 12, 13, 15, 17, 18, 19, 20, 21, 22, 23,
                 24, 25, 26, 27, 29, 30, 31, 32, 38, 39, 40, 41, 42, 43, 44, 45,
                 47, 48, 49, 50, 52, 53, 54, 55, 56, 57, 58, 59,
                 61, 62, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 74, 75, 76, 77]

    data = (
        (in_h5,)
        | h5open
        | as_key('file')
        # Pull every dataset out of the opened file as an array.
        | as_key('train_X', lambda d:
            (d['file'],)
            | getitem('train_X')
            | as_array
            | P.first
            )
        | as_key('train_y', lambda d:
            (d['file'],)
            | getitem('train_y')
            | as_array
            | P.first
            )
        | as_key('test_X', lambda d:
            (d['file'],)
            | getitem('test_X')
            | as_array
            | P.first
            )
        | as_key('train_labels', lambda d:
            (d['file'],)
            | getitem('train_labels')
            | as_array
            | P.first
            )
        | as_key('test_labels', lambda d:
            (d['file'],)
            | getitem('test_labels')
            | as_array
            | P.first
            )

        # One-hot encode: fitted on train, then applied to test.
        # NOTE(review): `categorical_features` only exists in older
        # scikit-learn releases -- confirm the pinned version.
        | as_key('one_hot', lambda _:
            (OneHotEncoder(categorical_features=nominal_cidx, sparse=False),))
        | as_key('train_X', lambda d:
            (d['train_X'].copy(),)
            | fit_transform(d['one_hot'])
            | P.first
            )
        | as_key('test_X', lambda d:
            (d['test_X'].copy(),)
            | transform(d['one_hot'])
            | P.first
            )
        | del_key('one_hot')

        # Standardize: fitted on train, then applied to test.
        | as_key('std_scaler', lambda _: (StandardScaler(),))
        | as_key('train_X', lambda d:
            (d['train_X'].copy(),)
            | fit_transform(d['std_scaler'])
            | P.first
            )
        | as_key('test_X', lambda d:
            (d['test_X'].copy(),)
            | transform(d['std_scaler'])
            | P.first
            )
        | del_key('std_scaler')

        # min_samples_leaf must be the integer 1: scikit-learn treats a float
        # as a fraction of the samples (and rejects values outside (0, 0.5]),
        # so the former 1.0 did not mean "one sample per leaf".
        | as_key('RFReg', lambda d:
            (ExtraTreesRegressor(random_state=1,
                                 n_estimators=nest, n_jobs=njobs,
                                 verbose=1,
                                 max_features=1.0, min_samples_leaf=1,
                                 max_depth=50),)
            | fit((d['train_X'],), (d['train_y'],))
            | P.first
            )
        | as_key('y_hat', lambda d:
            (d['test_X'],)
            | predict((d['RFReg'],))
            | P.first
            )
        | del_key('RFReg')

        | P.first
    )

    # Write "<test label>,<prediction>" rows; '%d' formats both as integers.
    (
        (data['test_labels'], data['y_hat'])
        | stack(axis=1)
        | savetxt(out_csv_file,
                  delimiter=',',
                  fmt=['%d', '%d'],
                  header='"Id","Response"', comments='')
        | P.first
    )

    return
def work(in_h5, out_csv_file, nest, njobs):
    """Fit an XGBRegressor on data from ``in_h5`` and write clipped predictions.

    Reads ``train_X``, ``train_y``, ``test_X``, ``train_labels`` and
    ``test_labels`` from the opened file, applies a OneHotEncoder (on
    ``nominal_cidx``) and a StandardScaler fitted on the train matrix,
    fits the regressor, clips the test predictions with ``clip(1, 8)`` and
    writes ``test_labels`` next to them as a CSV with the header
    ``"Id","Response"``.

    Args:
        in_h5: Handed to ``h5open``; presumably a path to an HDF5 file.
        out_csv_file: Destination handed to ``savetxt``.
        nest: Forwarded as ``n_estimators``.
        njobs: Accepted for interface compatibility; it is not forwarded to
            the regressor.

    NOTE(review): ``P`` is not imported in this function; it is expected to
    be available at module level (not visible from here).
    """
    from h5pipes import h5open
    from pypipes import getitem, as_key, del_key
    from nppipes import (as_array, fit_transform, transform, fit, predict,
                         savetxt, stack, clip)
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.preprocessing import StandardScaler
    from xgboost import XGBRegressor

    # Column indices handed to the one-hot encoder.
    nominal_cidx = [
        0, 1, 2, 4, 5, 6, 12, 13, 15, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
        29, 30, 31, 32, 38, 39, 40, 41, 42, 43,
        44, 45, 47, 48, 49, 50, 52, 53, 54, 55,
        56, 57, 58, 59, 61, 62, 63, 64, 65, 66,
        67, 69, 70, 71, 72, 73, 74, 75, 76, 77,
    ]

    def load(name):
        # Callable that extracts dataset `name` from the opened file.
        return lambda rec: (rec['file'], ) | getitem(name) | as_array | P.first

    def fit_on_train(step):
        # Callable that fit-transforms a copy of train_X with rec[step].
        return lambda rec: ((rec['train_X'].copy(), )
                            | fit_transform(rec[step])
                            | P.first)

    def apply_to_test(step):
        # Callable that transforms a copy of test_X with rec[step].
        return lambda rec: ((rec['test_X'].copy(), )
                            | transform(rec[step])
                            | P.first)

    def make_encoder(_):
        return (OneHotEncoder(categorical_features=nominal_cidx,
                              sparse=False), )

    def make_scaler(_):
        return (StandardScaler(), )

    def train_model(rec):
        model = XGBRegressor(seed=1, n_estimators=nest, max_depth=50)
        return ((model, )
                | fit((rec['train_X'], ), (rec['train_y'], ))
                | P.first)

    def predict_clipped(rec):
        return ((rec['test_X'], )
                | predict((rec['XGBReg'], ))
                | clip(1, 8)
                | P.first)

    data = (
        (in_h5, )
        | h5open
        | as_key('file')
        | as_key('train_X', load('train_X'))
        | as_key('train_y', load('train_y'))
        | as_key('test_X', load('test_X'))
        | as_key('train_labels', load('train_labels'))
        | as_key('test_labels', load('test_labels'))
        # One-hot encoding: fit on train, apply to test.
        | as_key('one_hot', make_encoder)
        | as_key('train_X', fit_on_train('one_hot'))
        | as_key('test_X', apply_to_test('one_hot'))
        | del_key('one_hot')
        # Standardization: fit on train, apply to test.
        | as_key('std_scaler', make_scaler)
        | as_key('train_X', fit_on_train('std_scaler'))
        | as_key('test_X', apply_to_test('std_scaler'))
        | del_key('std_scaler')
        # Model fitting and prediction.
        | as_key('XGBReg', train_model)
        | as_key('y_hat', predict_clipped)
        | del_key('XGBReg')
        | P.first
    )

    # Write "<test label>,<prediction>" rows; '%d' formats both as integers.
    id_and_response = (data['test_labels'], data['y_hat'])
    (id_and_response
     | stack(axis=1)
     | savetxt(out_csv_file,
               delimiter=',',
               fmt=['%d', '%d'],
               header='"Id","Response"',
               comments='')
     | P.first)
# Example #3
# 0
def work(in_train_arch,
         in_test_arch,
         in_train_csv,
         in_test_csv,
         out_h5):
    """Preprocess the train/test CSV tables and store them via ``h5new``.

    Steps, in pipeline order:

    1. Read the train table as strings; row 0 holds the column names,
       column 0 the labels, the last column the target, everything in
       between the features. The test table is read the same way but has
       no target column.
    2. In the train features, replace NaNs in the ``missing_cidx`` columns
       by the mean of the non-NaN values of the same column.
    3. Label-encode the nominal columns: encoders are fitted on train and
       then applied to test. For the ``unseen_nominal_cidx`` columns, test
       values not present in the encoder's ``classes_`` are first replaced
       by the most common train value of that column.
    4. Impute the ``missing_cidx`` columns of the test features with the
       test column means.
    5. Store ``train_X``, ``train_y`` and ``test_X`` (cast to float) plus
       both label vectors under ``out_h5``.

    Args:
        in_train_arch: Source piped into ``unzip`` for the train table;
            presumably an archive -- TODO confirm against ``pypipes.unzip``.
        in_test_arch: Same, for the test table.
        in_train_csv: Argument of ``unzip`` for the train table; presumably
            the member name inside the archive.
        in_test_csv: Same, for the test table.
        out_h5: Handed to ``h5new``; presumably the output file path.

    Returns:
        None.

    NOTE(review): ``P`` is not imported in this function; it is expected to
    be available at module level (not visible from here).
    """

    from pypipes import unzip,as_key,del_key,getitem,setitem
    from nppipes import (genfromtxt,
                         place,astype,as_columns,label_encoder,fit_transform,
                         transform,stack
                         )
    from nppipes import take as np_take
    from numpy.core.defchararray import strip
    from numpy import s_,mean,in1d,putmask
    from collections import Counter
    from h5pipes import h5new


    @P.Pipe
    def replace_missing_with(iterable, ftor):
        """Fill NaNs column by column, in place, and yield each array.

        For every 2-D array in ``iterable`` and every column of it, the NaN
        entries are overwritten with ``ftor`` applied to the non-NaN entries
        of that column.
        """
        from numpy import isnan
        for item in iterable:
            for i in range(item.shape[1]):
                mask = isnan(item[:, i])
                value = ftor(item[~mask, i])
                item[mask, i] = value
                pass
            yield item


    # Feature columns that go through NaN imputation.
    missing_cidx = [11, 14, 16, 28, 33, 34, 35, 36, 37, 46, 51, 60, 68]
    # Nominal columns whose test values get the "not in classes_" treatment.
    unseen_nominal_cidx = [2, 12, 38, 69, 74]
    seen_nominal_cidx = [0, 1, 4, 5, 6, 13, 15, 17, 18, 19, 20, 21, 22, 23,
                 24, 25, 26, 27, 29, 30, 31, 32, 39, 40, 41, 42, 43, 44, 45,
                 47, 48, 49, 50, 52, 53, 54, 55, 56, 57, 58, 59,
                 61, 62, 63, 64, 65, 66, 67, 70, 71, 72, 73, 75, 76, 77]
    # Order matters: the "unseen" columns come last, so the encoder tuple can
    # be split with [:-len(unseen_nominal_cidx)] / [-len(unseen_nominal_cidx):].
    nominal_cidx = seen_nominal_cidx + unseen_nominal_cidx


    data = (
        in_train_arch
        | unzip(in_train_csv)
        | genfromtxt(delimiter=',', dtype=str)
        # Mark empty cells as 'nan' so the later float cast yields NaN
        # (presumably `place` overwrites the cells selected by the predicate).
        | place(lambda d: d == '', 'nan')
        | as_key('train')
        | as_key('train_col_names', lambda d: strip(d['train'][0], '"'))
        | as_key('train_labels',    lambda d: d['train'][1:, 0].astype(int))
        | as_key('train_X',         lambda d: d['train'][1:, 1:-1])
        | as_key('train_y',         lambda d: d['train'][1:, -1].astype(int))
        | del_key('train')


        # Same parsing for the test table; it has no target column.
        | as_key('test', lambda d:
                in_test_arch
                | unzip(in_test_csv)
                | genfromtxt(delimiter=',', dtype=str)
                | place(lambda d: d == '', 'nan')
                | P.first
                )
        | as_key('test_col_names', lambda d: strip(d['test'][0], '"'))
        | as_key('test_labels',    lambda d: d['test'][1:, 0].astype(int))
        | as_key('test_X',         lambda d: d['test'][1:, 1:])
        | del_key('test')

        # Impute the train columns listed in missing_cidx with their column
        # means, then put the result (as strings) into a copy of train_X.
        | as_key('train_X', lambda d:
                (d['train_X'],)
                | np_take(missing_cidx, axis=1)
                | astype(float)

                | replace_missing_with(mean)

                | astype(str)
                | setitem(d['train_X'].copy(), s_[:, missing_cidx])
                | P.first
                )

        # Presumably one label encoder per nominal column, in nominal_cidx
        # order -- TODO confirm against nppipes.label_encoder.
        | as_key('label_encoders', lambda d:
                len(nominal_cidx)
                | label_encoder
                | P.as_tuple
                )

        # Fit the encoders on the train nominal columns and encode them.
        | as_key('train_X', lambda d:
                (d['train_X'],)
                | np_take(nominal_cidx, axis=1)
                | as_columns
                | fit_transform(d['label_encoders'])
                | stack(axis=1)
                | setitem(d['train_X'].copy(), s_[:, nominal_cidx])
                | P.first
                )

        # Encode the "seen" test columns with the matching (leading) encoders.
        | as_key('test_X', lambda d:
                (d['test_X'],)
                | np_take(seen_nominal_cidx, axis=1)
                | as_columns
                | transform(d['label_encoders'][:-len(unseen_nominal_cidx)])
                | stack(axis=1)
                | setitem(d['test_X'].copy(), s_[:, seen_nominal_cidx])
                | P.first
                )

        # Handle the "unseen" test columns with the trailing encoders.
        | as_key('test_X', lambda d:
                (d['test_X'],)
                | np_take(unseen_nominal_cidx, axis=1)
                | as_key('test_unseen_nominals_features')

                # Per column: test values that the encoder has not seen.
                | as_key('test_unseen_nominals', lambda d2:
                        zip(d2['test_unseen_nominals_features'].T,
                            d['label_encoders'][-len(unseen_nominal_cidx):])
                        | P.select(lambda t: list(set(t[0]) - set(t[1].classes_)))
                        | P.as_list
                        )

                # Per column: most frequent train value, decoded back from
                # the already label-encoded train_X.
                | as_key('train_most_common_nominals', lambda d2:
                        zip(d['train_X'][:, unseen_nominal_cidx].T.astype(int),
                            d['label_encoders'][-len(unseen_nominal_cidx):])
                        | P.select(lambda t: t[1].inverse_transform(t[0]))
                        | P.select(lambda s: Counter(s).most_common(1)[0][0])
                        | P.as_list
                        )

                # Overwrite the unknown test values with that train value.
                # putmask works in place and returns None, so the `or` makes
                # the lambda return the patched column itself.
                | as_key('test_corrected_features', lambda d2:
                        zip(d2['test_unseen_nominals_features'].copy().T,
                            d2['test_unseen_nominals'],
                            d2['train_most_common_nominals'])
                        | P.select(lambda t: putmask(t[0], in1d(t[0], t[1]), t[2]) or t[0].T)
                        | stack(axis=1)
                        | P.first
                        )

                | getitem('test_corrected_features')
                | as_columns
                | transform(d['label_encoders'][-len(unseen_nominal_cidx):])
                | stack(axis=1)
                | setitem(d['test_X'].copy(), s_[:, unseen_nominal_cidx])
                | P.first
                )

        | del_key('label_encoders')

        # Impute the test columns listed in missing_cidx; the means are
        # computed from the test columns themselves, not from train.
        | as_key('test_X', lambda d:
                (d['test_X'],)
                | np_take(missing_cidx, axis=1)
                | astype(float)

                | replace_missing_with(mean)

                | astype(str)
                | setitem(d['test_X'].copy(), s_[:, missing_cidx])
                | P.first
                )

        | P.first
        )

    #print(data.keys())

    # Store the arrays; the column-name entries of `data` are not written.
    (
        (out_h5,)
        | h5new
        | as_key('train_X',         lambda _: data['train_X'].astype(float))
        | as_key('train_y',         lambda _: data['train_y'].astype(float))
        | as_key('test_X',          lambda _: data['test_X'].astype(float))
        | as_key('train_labels',    lambda _: data['train_labels'])
        | as_key('test_labels',     lambda _: data['test_labels'])
        | P.first
    )

    return