Exemplo n.º 1
0
    def val(self, val_chunk):
        """Score this model's predictions on a validation chunk.

        Args:
            val_chunk: Column-indexed frame (it must support ``['y']`` and
                ``.drop``) holding the target in column ``'y'``. All other
                columns are handed to ``self.predict`` unchanged.

        Returns:
            The value of ``cal_r`` for the true and the predicted targets.
        """
        Y = val_chunk['y'].values
        # ``axis`` must be a keyword: pandas 2.0 removed the positional form
        # ``drop('y', 1)``, which now raises ``TypeError``.
        X = val_chunk.drop('y', axis=1)

        Y_pred = self.predict(X)

        return cal_r(Y, Y_pred)
Exemplo n.º 2
0
    def fit(self, tr_chunk):
        """Fit the preprocessing pipeline and the Gaussian-process regressor.

        The pipeline is: median imputation -> standardization -> k-best
        feature selection (mutual information) -> Gaussian-process regression.
        The fitted steps are stored on ``self.imputer``,
        ``self.normalization``, ``self.kbest`` and ``self.model``.

        Args:
            tr_chunk: Training frame containing at least the columns ``'id'``,
                ``'y'`` and ``'timestamp'``. Those three are excluded from
                the features; ``'y'`` is the regression target.

        Returns:
            ``cal_r`` of the true versus the predicted targets on the
            training data itself (an in-sample score).
        """
        # 1. Removing observations whose Y is saturated is currently disabled,
        #    so every row of ``tr_chunk`` is used.
        # ``axis`` must be a keyword: pandas 2.0 removed the positional form
        # ``drop(labels, 1)``, which now raises ``TypeError``.
        tr_X = tr_chunk.drop(['id', 'y', 'timestamp'], axis=1)
        tr_Y = tr_chunk['y'].values

        # 2. Replace NaN with the per-column median.
        self.imputer = Imputer(missing_values='NaN', strategy='median', axis=0)
        self.imputer.fit(tr_X)
        tr_X = self.imputer.transform(tr_X)

        # 3. Normalization.
        self.normalization = preprocessing.StandardScaler().fit(tr_X)
        tr_X = self.normalization.transform(tr_X)

        # 4. Keep only the k best features to limit over-fitting.
        self.kbest = SelectKBest(mutual_info_regression, k=self.kbest_k)
        self.kbest.fit(tr_X, tr_Y)
        tr_X = self.kbest.transform(tr_X)

        # 5. Apply regression.
        self.model = GaussianProcessRegressor(alpha=self.alpha,
                                              random_state=self.seed)
        self.model.fit(tr_X, tr_Y)

        tr_Y_pred = self.model.predict(tr_X)

        return cal_r(tr_Y, tr_Y_pred)
Exemplo n.º 3
0
    # Scale the imputed training and validation features with ``normalization``
    # (presumably a scaler fitted earlier, outside this excerpt).
    tr_X_norm = normalization.transform(tr_X_imputed)
    val_X_norm = normalization.transform(val_X_imputed)

    # default 1e-4
    alpha = 1e-4
    # NOTE(review): ``hidden_layer_sizes`` is only printed as a label; it is
    # never passed to MLPRegressor below, so the iterations differ only in
    # ``alpha`` (divided by 10 each pass) - confirm this is intended.
    for hidden_layer_sizes in range(100, 1000, 100):
        print('hidden_layer_sizes', hidden_layer_sizes, end='-->')
        '''
        activation : {‘identity’, ‘logistic’, ‘tanh’, ‘relu’}, default ‘relu’
        '''
        model = MLPRegressor(solver='lbfgs', alpha=alpha)

        # t0..t1 brackets training, t1..t2 brackets prediction on the training
        # set; the values are only consumed by the commented-out print below.
        t0 = time.time()
        model.fit(tr_X_norm, tr_Y)
        t1 = time.time()

        tr_Y_pred = model.predict(tr_X_norm)
        t2 = time.time()

        # In-sample score.
        tr_r = utils.cal_r(tr_Y, tr_Y_pred)
        print('tr:', tr_r, end=',')

        # Held-out score on the validation split.
        val_Y_pred = model.predict(val_X_norm)
        val_r = utils.cal_r(val_Y, val_Y_pred)
        print('val:', val_r)

        #print('cost: traning',t1-t0,',prediction:',t2-t1)

        # Next iteration uses a 10x smaller regularization strength.
        alpha /= 10

    # Exits a loop whose header lies outside this excerpt.
    break
Exemplo n.º 4
0
    def fit(self, tr_chunk, ts_chunk):
        """Fit a K-fold ensemble of SVR models and report its scores.

        The features are median-imputed and standardized (both fitted on the
        training chunk only). One SVR is then trained per K-fold split. A
        fold model is kept in ``self.models`` only when its validation score
        is greater than -1. Finally the kept models' predictions are averaged
        and the averaged train/test scores are printed.

        Args:
            tr_chunk: Training frame with at least the columns ``'id'`` and
                ``'y'``; the remaining columns are the features.
            ts_chunk: Test frame with the same layout, used only to report
                scores.

        Returns:
            ``self``, so calls can be chained.
        """
        # ``axis`` must be a keyword: pandas 2.0 removed the positional form
        # ``drop(labels, 1)``, which now raises ``TypeError``.
        model_Y = tr_chunk['y'].values
        model_X = tr_chunk.drop(['id', 'y'], axis=1)

        test_Y = ts_chunk['y'].values
        test_X = ts_chunk.drop(['id', 'y'], axis=1)

        # 2. Replace NaN with the per-column median learned on training data.
        self.imputer = Imputer(missing_values='NaN', strategy='median', axis=0)
        self.imputer.fit(model_X)
        model_X_imputed = self.imputer.transform(model_X)
        test_X_imputed = self.imputer.transform(test_X)

        # 3. Normalization, again fitted on the training data only.
        self.normalization = preprocessing.StandardScaler().fit(
            model_X_imputed)
        model_X_norm = self.normalization.transform(model_X_imputed)
        test_X_norm = self.normalization.transform(test_X_imputed)

        # Apply regression: one SVR per fold.
        X = model_X_norm
        Y = model_Y

        skf = KFold(n_splits=self.kf_k, shuffle=True, random_state=self.seed)
        self.models = []
        for kf_i, (tr_idx, val_idx) in enumerate(skf.split(X)):
            print('KF', kf_i, end=',')
            tr_X, tr_Y = X[tr_idx, :], Y[tr_idx]
            val_X, val_Y = X[val_idx, :], Y[val_idx]

            model = SVR(kernel=self.model_config['kernel'],
                        C=self.model_config['C'],
                        gamma=self.model_config['gamma'],
                        tol=self.model_config['tol'])
            model.fit(tr_X, tr_Y)

            tr_r = utils.cal_r(tr_Y, model.predict(tr_X))
            print('tr:', tr_r, end=',')
            val_r = utils.cal_r(val_Y, model.predict(val_X))
            print('val:', val_r, end=',')
            test_r = utils.cal_r(test_Y, model.predict(test_X_norm))
            print('test:', test_r, end='')

            # discard kf model whose val R is too low
            if val_r > -1:
                self.models.append(model)
                print(' (SAVED)')
            else:
                print(' (DISCARD)')

        if not self.models:
            # Every fold was discarded: averaging below would divide by zero,
            # so report it instead of failing on the ensemble scores.
            print('WARNING: no model found.')
            return self

        # Ensemble prediction = plain mean over the kept fold models.
        tr_Y_sum = np.zeros(Y.shape[0])
        test_Y_sum = np.zeros(test_Y.shape[0])
        for model in self.models:
            tr_Y_sum += model.predict(X)
            test_Y_sum += model.predict(test_X_norm)

        tr_Y_pred = tr_Y_sum / len(self.models)
        test_Y_pred = test_Y_sum / len(self.models)

        tr_r = utils.cal_r(Y, tr_Y_pred)
        print('AVERAGE tr:', tr_r, end=',')
        test_r = utils.cal_r(test_Y, test_Y_pred)
        print('test:', test_r)

        return self
Exemplo n.º 5
0
# Load the stored test data; ``tr_chunk`` is expected to exist already
# (it is defined outside this excerpt).
test_chunk = utils.read_variable('output/test_data')

tr_Y = tr_chunk['y']
#%%
'''
Linear Regression
'''


# Fit the K-fold linear ensemble (5 folds, seed 13); ``fit`` also receives the
# test chunk.
model_linear = ChunkLinearRegression(kf_k=5, seed=13)
model_linear.fit(tr_chunk, test_chunk)

# In-sample prediction and score of the fitted ensemble.
tr_Y_pred = model_linear.predict(tr_chunk)

print('r:',utils.cal_r(tr_chunk['y'],tr_Y_pred))

# The string below is recorded console output kept for reference
# (presumably from a previous run of this cell).
'''
KF 0,tr: 0.136013944038,val: -607.864299346,test: -51.432700711 (DISCARD)
KF 1,tr: 0.126612607023,val: -42707538498.9,test: -245495431.615 (DISCARD)
KF 2,tr: 0.140904353882,val: -0.37319746263,test: -9.83011647629 (SAVED)
KF 3,tr: 0.132568523411,val: -0.625998947552,test: -43.6620637906 (SAVED)
KF 4,tr: 0.126012579869,val: -0.266122719199,test: -48.915686375 (SAVED)
AVERAGE tr: 0.0448775523981,test: -27.5904174498
r: 0.0448775523981
'''
#%%
'''
Tree Regression
'''
Exemplo n.º 6
0
    def fit(self, tr_chunk, ts_chunk):
        """Fit a K-fold ensemble of Gaussian-process regressors.

        Only the columns listed in ``self.selected_feature_ids`` are used as
        features. They are median-imputed and standardized (both fitted on
        the training chunk only). One GaussianProcessRegressor is trained per
        K-fold split and kept in ``self.models`` only when its validation
        score is positive. If at least one model is kept, the averaged
        train/test scores of the ensemble are printed; otherwise a warning is
        printed.

        Args:
            tr_chunk: Training frame with at least the columns ``'id'``,
                ``'y'`` and every entry of ``self.selected_feature_ids``.
            ts_chunk: Test frame with the same layout, used only to report
                scores.

        Returns:
            ``self``, so calls can be chained.
        """
        # ``axis`` must be a keyword: pandas 2.0 removed the positional form
        # ``drop(labels, 1)``, which now raises ``TypeError``.
        model_Y = tr_chunk['y'].values
        model_X = tr_chunk.drop(['id', 'y'], axis=1)

        test_Y = ts_chunk['y'].values
        test_X = ts_chunk.drop(['id', 'y'], axis=1)

        # 2. Replace NaN with the per-column median learned on training data.
        selected_X = model_X[self.selected_feature_ids]
        self.imputer = Imputer(missing_values='NaN', strategy='median', axis=0)
        self.imputer.fit(selected_X)
        model_X_imputed = self.imputer.transform(selected_X)
        test_X_imputed = self.imputer.transform(
            test_X[self.selected_feature_ids])

        # 3. Normalization, again fitted on the training data only.
        self.normalization = preprocessing.StandardScaler().fit(
            model_X_imputed)
        model_X_norm = self.normalization.transform(model_X_imputed)
        test_X_norm = self.normalization.transform(test_X_imputed)

        # 4. k-best selection is a pass-through here: the normalized features
        #    are used as they are.
        tr_X_kbest = model_X_norm
        test_X_kbest = test_X_norm

        # Apply regression: one Gaussian process per fold.
        X = tr_X_kbest
        Y = model_Y

        skf = KFold(n_splits=self.kf_k, shuffle=True, random_state=self.seed)
        self.models = []
        for kf_i, (tr_idx, val_idx) in enumerate(skf.split(X)):
            print('KF', kf_i, end=',')
            tr_X, tr_Y = X[tr_idx, :], Y[tr_idx]
            val_X, val_Y = X[val_idx, :], Y[val_idx]

            model = GaussianProcessRegressor(
                alpha=self.model_config['alpha'],
                random_state=self.model_config['random_state'])
            model.fit(tr_X, tr_Y)

            tr_r = utils.cal_r(tr_Y, model.predict(tr_X))
            print('tr:', tr_r, end=',')
            val_r = utils.cal_r(val_Y, model.predict(val_X))
            print('val:', val_r, end=',')
            test_r = utils.cal_r(test_Y, model.predict(test_X_kbest))
            print('test:', test_r, end='')

            # discard kf model whose val R is too low
            if val_r > 0:
                self.models.append(model)
                print(' (SAVED)')
            else:
                print(' (DISCARD)')

        if self.models:
            # Ensemble prediction = plain mean over the kept fold models.
            tr_Y_sum = np.zeros(Y.shape[0])
            test_Y_sum = np.zeros(test_Y.shape[0])
            for model in self.models:
                tr_Y_sum += model.predict(X)
                test_Y_sum += model.predict(test_X_kbest)

            tr_Y_pred = tr_Y_sum / len(self.models)
            test_Y_pred = test_Y_sum / len(self.models)

            tr_r = utils.cal_r(Y, tr_Y_pred)
            print('AVERAGE tr:', tr_r, end=',')
            test_r = utils.cal_r(test_Y, test_Y_pred)
            print('test:', test_r)
        else:
            # Nothing to average; avoids a division by zero.
            print('WARNING: no model found.')

        return self
    # Fragment of a second-layer (stacking) step: the first-layer models in
    # ``lag_models`` each contribute one feature column. ``kf_tr_chunk``,
    # ``tr_chunk``, ``val_idx`` and ``best_model_2L_r`` come from code outside
    # this excerpt (presumably a K-fold loop - TODO confirm).
    '''
    training
    '''
    # One row per training observation, one column per first-layer model.
    lag_X = np.zeros([len(kf_tr_chunk.y), len(lag_models)])
    for lag_model_i, lag_model in enumerate(lag_models):
        y_pred = lag_model.predict(kf_tr_chunk)
        lag_X[:, lag_model_i] = y_pred
        #print('-->',lag_model_i,utils.cal_r(kf_tr_chunk.y,y_pred))

    model_2L = GaussianProcessRegressor(alpha=SUPER_alpha,
                                        random_state=SUPER_seed)

    # Second-layer model learns the target from the first-layer predictions.
    model_2L.fit(lag_X, kf_tr_chunk.y)

    tr_Y_pred = model_2L.predict(lag_X)
    tr_r = utils.cal_r(kf_tr_chunk.y, tr_Y_pred)
    print('tr:', tr_r, end=',')
    '''
    Val
    '''
    # Same feature construction on the validation rows; ``lag_X`` is reused.
    kf_val_chunk = tr_chunk.iloc[val_idx]
    lag_X = np.zeros([len(kf_val_chunk.y), len(lag_models)])
    for lag_model_i, lag_model in enumerate(lag_models):
        y_pred = lag_model.predict(kf_val_chunk)
        lag_X[:, lag_model_i] = y_pred
    val_Y_pred = model_2L.predict(lag_X)
    val_r = utils.cal_r(kf_val_chunk.y, val_Y_pred)
    print('val:', val_r, end='')

    # NOTE(review): ``best_model_2L_r`` is not updated in the visible lines;
    # the excerpt may be truncated here - confirm the best score is tracked.
    if val_r > best_model_2L_r:
        best_model_2L = model_2L
    # Fragment: ``imputer``, ``model``, ``X``, ``test_X``, ``model_Y``,
    # ``tr_Y``, ``test_Y`` and ``alpha`` are defined outside this excerpt.
    # The imputer and the scaler are both fitted on ``X`` only and then
    # applied to the test features.
    imputer.fit(X)
    model_X_imputed = imputer.transform(X)
    test_X_imputed = imputer.transform(test_X)
    normalization = preprocessing.StandardScaler().fit(model_X_imputed)
    model_X_norm = normalization.transform(model_X_imputed)

    test_X_norm = normalization.transform(test_X_imputed)

    # t0..t1 brackets training, t1..t2 brackets prediction on the training set.
    t0 = time.time()
    model.fit(model_X_norm, model_Y)
    t1 = time.time()

    tr_Y_pred = model.predict(model_X_norm)
    t2 = time.time()

    # NOTE(review): the model is fitted on ``model_Y`` but scored against
    # ``tr_Y`` - confirm both names refer to the same targets.
    tr_r = utils.cal_r(tr_Y, tr_Y_pred)
    print('tr:', tr_r, end=',')

    test_Y_pred = model.predict(test_X_norm)
    test_r = utils.cal_r(test_Y, test_Y_pred)
    print('test:', test_r)

    print('cost: traning', t1 - t0, ',prediction:', t2 - t1)

    # Presumably the body of a sweep over ``alpha``: the next pass uses a 10x
    # larger value.
    alpha *= 10

#%%
'''
alpha 0.1
tr: 0.995232505044,test: -0.115465678033
alpha 1.0
save Y_Pred of test chunk into files
'''
# Apply every stored first-layer model (ids 0..99) to the test chunk, print its
# score and save its predictions to a per-model file. Missing model files are
# reported and skipped.
for model_i in range(100):
    file_path = 'E:/two-sigma/output/chunk_gaussian_kbest/' + str(model_i)

    print('apply model', model_i, 'on chunk 98', end='...')
    t0 = time.time()
    if os.path.isfile(file_path):
        model = utils.read_variable(file_path)
        # Index of the feature with the highest k-best score in this model.
        print('kbest i:', np.argmax(model.kbest.scores_), end='...')
        print('cost', int(time.time() - t0), 'sec')

        # predict testing chunk, and save Y in file
        # Restart the timer so the cost printed below covers prediction only.
        # The original assigned to a misspelled ``to``, which left the load
        # time included in the reported prediction cost.
        t0 = time.time()
        test_Y_pred = model.predict(test_chunk_1L)
        test_r = utils.cal_r(test_chunk_1L_Y, test_Y_pred)
        print('test:', test_r, 'cost', int(time.time() - t0), 'sec')
        utils.save_variable(
            test_Y_pred,
            'E:/two-sigma/output/chunk_98_gaussian_kbest_Y_pred/' +
            str(model_i))
    else:
        print('No Model Found.')

#%%
'''
check model on chunk 0...kbest i: 88...cost 13 sec
test: -0.00879861627212 cost 21 sec
check model on chunk 1...kbest i: 88...cost 40 sec
test: -0.00431009568512 cost 61 sec
check model on chunk 2...kbest i: 88...cost 13 sec