예제 #1
0
    def fit(self, data):
        """Train a 5-neighbour DTW time-series classifier and score it.

        When ``data.kfold > 1``, ``data.Xy_train`` is read as a mapping from
        fold key to ``[(X_train, y_train), (X_val, y_val)]``; one classifier is
        fitted per fold and only the best-scoring one is stored in
        ``self.knn``.  "Best" is the lowest ``'mae'`` when ``data.tasktype`` is
        ``'regression'`` and the highest ``'balanced_acc_adj'`` otherwise.

        Otherwise a single classifier is fitted on ``data.Xy_train`` and scored
        on ``data.Xy_val``.

        Returns:
            dict with ``'model'`` (the retained classifier) and ``'metrics'``
            (the ``weareval.eval_output`` result for that classifier).
        """
        if not data.kfold > 1:
            # Single split: one model, scored on the dedicated validation set.
            train_x, train_y = data.Xy_train
            val_x, val_y = data.Xy_val
            train_x = from_2d_array_to_nested(train_x)
            val_x = from_2d_array_to_nested(val_x)
            self.knn = KNeighborsTimeSeriesClassifier(
                n_neighbors=5, distance="dtw", n_jobs=-1)
            self.knn.fit(train_x, train_y)
            scores = weareval.eval_output(
                self.knn.predict(val_x), val_y, tasktype=data.tasktype)
            return {'model': self.knn, 'metrics': scores}

        fold_results = {}
        for fold_id in data.Xy_train.keys():
            (train_x, train_y), (val_x, val_y) = data.Xy_train[fold_id]
            train_x = from_2d_array_to_nested(train_x)
            val_x = from_2d_array_to_nested(val_x)

            classifier = KNeighborsTimeSeriesClassifier(
                n_neighbors=5, distance="dtw", n_jobs=-1)
            classifier.fit(train_x, train_y)
            scores = weareval.eval_output(
                classifier.predict(val_x), val_y, tasktype=data.tasktype)

            # Metric used to rank folds against each other.
            if data.tasktype == 'regression':
                ranking_score = scores['mae']
            else:
                ranking_score = scores['balanced_acc_adj']
            fold_results[fold_id] = {
                'model': classifier,
                'metric': ranking_score,
                'metrics': scores,
            }

        # Keep only the best fold: smallest error or largest accuracy.
        chooser = min if data.tasktype == 'regression' else max
        best_fold = chooser(
            fold_results, key=lambda fold: fold_results[fold]['metric'])
        self.knn = fold_results[best_fold]['model']
        return {'model': self.knn,
                'metrics': fold_results[best_fold]['metrics']}
예제 #2
0
 def fit(self, data):
     """Train a composable time-series forest and score it on validation data.

     A ``ComposableTimeSeriesForestRegressor`` is used when ``data.tasktype``
     is ``'regression'``, a ``ComposableTimeSeriesForestClassifier``
     otherwise.

     With ``data.kfold > 1`` one forest is fitted per key of
     ``data.Xy_train`` and only the best fold (lowest ``'mae'`` for
     regression, highest ``'balanced_acc_adj'`` otherwise) is kept in
     ``self.tsf``.  Without k-fold, one forest is fitted on
     ``data.Xy_train`` and scored on ``data.Xy_val``.

     Returns:
         dict with the retained ``'model'`` and its ``'metrics'``.
     """
     def _new_forest():
         # Regressor for regression tasks, classifier for everything else.
         forest_cls = (ComposableTimeSeriesForestRegressor
                       if data.tasktype == 'regression'
                       else ComposableTimeSeriesForestClassifier)
         return forest_cls(n_jobs=-1)

     if not data.kfold > 1:
         train_x, train_y = data.Xy_train
         val_x, val_y = data.Xy_val
         train_x = from_2d_array_to_nested(train_x)
         val_x = from_2d_array_to_nested(val_x)
         self.tsf = _new_forest()
         self.tsf.fit(train_x, train_y)
         scores = weareval.eval_output(
             self.tsf.predict(val_x), val_y, tasktype=data.tasktype)
         return {'model': self.tsf, 'metrics': scores}

     per_fold = {}
     for fold_id in data.Xy_train.keys():
         (train_x, train_y), (val_x, val_y) = data.Xy_train[fold_id]
         train_x = from_2d_array_to_nested(train_x)
         val_x = from_2d_array_to_nested(val_x)

         forest = _new_forest()
         forest.fit(train_x, train_y)
         scores = weareval.eval_output(
             forest.predict(val_x), val_y, tasktype=data.tasktype)

         rank_key = 'mae' if data.tasktype == 'regression' else 'balanced_acc_adj'
         per_fold[fold_id] = {
             'model': forest,
             'metric': scores[rank_key],
             'metrics': scores,
         }

     # Retain only the winning fold's model.
     ranking = {fold_id: entry['metric'] for fold_id, entry in per_fold.items()}
     if data.tasktype == 'regression':
         winner = min(ranking, key=ranking.get)
     else:
         winner = max(ranking, key=ranking.get)
     self.tsf = per_fold[winner]['model']
     return {'model': self.tsf, 'metrics': per_fold[winner]['metrics']}
    def fit(self, target_name, smote=True, verbose=False):
        """Cross-validate k-nearest-neighbour models for one target variable.

        Features come from ``self.data`` (cast to float32) and targets from
        ``self.get_model_y(target_name)``.  For every split produced by
        ``self.splitter.split(X, y, self.grps)`` the evaluation is stored in
        ``self.res[f'{target_name}_fold{i}']``:

        * If ``'cat'`` is in ``self.voi[target_name]``, one
          ``KNeighborsClassifier`` is fitted per column of ``y`` and the stored
          value is the tuple ``(fpr, tpr, au_roc, eval_metrics)`` of dicts
          keyed by column index.  Columns for which SMOTE raises
          ``ValueError`` are skipped and therefore absent from those dicts.
        * Otherwise a single ``KNeighborsRegressor`` is fitted and the stored
          value is the ``weareval.eval_output`` result.

        Args:
            target_name: Key into ``self.voi``; also passed to
                ``self.get_model_y``.
            smote: If true, oversample each column's training data with
                ``SMOTE(k_neighbors=3)`` before fitting the classifier.
            verbose: If true, print per-column counts of 0 and 1 labels when
                SMOTE fails for a column.

        Returns:
            None.  Results are written to ``self.res``.
        """
        X = self.data.to_numpy(dtype=np.float32)
        y = self.get_model_y(target_name)

        # CV splits
        for i, (train_idx, test_idx) in enumerate(
                self.splitter.split(X, y, self.grps)):
            X_train, y_train = X[train_idx], y[train_idx]
            X_test, y_test = X[test_idx], y[test_idx]

            voi = self.voi[target_name]
            tasktype = 'regression' if 'cont' in voi else 'classification'
            res_key = f'{target_name}_fold{i}'

            if 'cat' not in voi:
                model = KNeighborsRegressor(n_jobs=12)
                model.fit(X_train, y_train)
                self.res[res_key] = weareval.eval_output(
                    model.predict(X_test), y_test, tasktype=tasktype)
                continue

            # Categorical target: one binary model per column of y.
            # NOTE(review): columns are presumably 0/1 indicators (the verbose
            # printout counts exactly those values) -- confirm.
            fpr, tpr, au_roc, eval_metrics = {}, {}, {}, {}
            for jj in range(y_train.shape[1]):
                y_col = y_train[:, jj]
                if smote:
                    try:
                        X_train_mod, y_train_mod = SMOTE(
                            k_neighbors=3).fit_resample(X_train, y_col)
                    except ValueError:
                        print(
                            "\n{}-th class cannot be computed. Too few n_samples. Skipping"
                            .format(jj))
                        if verbose:
                            # Report under the target's name.
                            print('{} class frequencies:'.format(target_name))
                            for jjj in range(y_train.shape[1]):
                                print(
                                    f"j: {jjj}\t0: {(y_train[:, jjj]==0).sum()}\t1: {(y_train[:, jjj]==1).sum()}"
                                )
                        continue
                else:
                    # Train on this column only, matching the per-column
                    # evaluation below; fitting on the full 2-D y would make
                    # predict_proba return one array per output.
                    X_train_mod, y_train_mod = X_train, y_col

                # model/eval
                model = KNeighborsClassifier(n_jobs=12)
                model.fit(X_train_mod, y_train_mod)
                fpr[jj], tpr[jj], _ = sklmetrics.roc_curve(
                    y_test[:, jj],
                    model.predict_proba(X_test)[:, 1])
                au_roc[jj] = sklmetrics.auc(fpr[jj], tpr[jj])
                eval_metrics[jj] = weareval.eval_output(
                    model.predict(X_test), y_test[:, jj], tasktype=tasktype)

            self.res[res_key] = (fpr, tpr, au_roc, eval_metrics)
예제 #4
0
 def eval_test(self, data):
     """Score the fitted ``self.kNNDTW`` model on the test split of *data*.

     When ``self.downsample_inference`` is truthy, prediction is run on a
     random sample (without replacement) of at most 20 test rows instead of
     the full split.

     Args:
         data: Object exposing ``Xy_test`` as an ``(X_test, y_test)`` pair
             and ``tasktype``.

     Returns:
         The result of ``weareval.eval_output`` for the predictions.
     """
     X_test, y_test = data.Xy_test
     if self.downsample_inference:
         n_rows = X_test.shape[0]
         # Cap the sample size: requesting more rows than exist without
         # replacement makes np.random.choice raise ValueError.
         idx = np.random.choice(np.arange(n_rows), min(20, n_rows), replace=False)
         X_test, y_test = X_test[idx], y_test[idx]
     eval_metrics = weareval.eval_output(self.kNNDTW.predict(X_test), y_test, tasktype=data.tasktype)
     return eval_metrics
예제 #5
0
 def fit(self, data):
     """Train a distance-weighted k-NN model using ``self.dtw`` as metric.

     A ``KNeighborsRegressor`` is used when ``data.tasktype`` is
     ``'regression'``, a ``KNeighborsClassifier`` otherwise.

     With ``data.kfold > 1`` one model is fitted per key of
     ``data.Xy_train`` and only the best fold (lowest ``'mae'`` for
     regression, highest ``'balanced_acc_adj'`` otherwise) is kept in
     ``self.kNNDTW``.  Without k-fold, one model is fitted on
     ``data.Xy_train`` and scored on ``data.Xy_val``.

     When ``self.downsample_inference`` is truthy, validation scoring uses a
     random sample (without replacement) of at most 20 validation rows.

     Returns:
         dict with the retained ``'model'`` and its ``'metrics'``.
     """
     knn_kwargs = dict(
         n_jobs=-1,
         algorithm='ball_tree',
         weights='distance',
         metric=self.dtw)

     def _new_model():
         # Regressor for regression tasks, classifier for everything else.
         if data.tasktype == 'regression':
             return KNeighborsRegressor(**knn_kwargs)
         return KNeighborsClassifier(**knn_kwargs)

     def _maybe_downsample(X_val, y_val):
         # Optionally shrink the validation set to keep inference cheap.
         if not self.downsample_inference:
             return X_val, y_val
         n_rows = X_val.shape[0]
         # Cap the sample size: requesting more rows than exist without
         # replacement makes np.random.choice raise ValueError.
         idx = np.random.choice(np.arange(n_rows), min(20, n_rows), replace=False)
         return X_val[idx], y_val[idx]

     if data.kfold > 1:
         cv_eval = {}
         for cv_fold in data.Xy_train.keys():
             print('    starting kfold=', cv_fold)
             [(X_train, y_train), (X_val, y_val)] = data.Xy_train[cv_fold]
             kNNDTW = _new_model()
             kNNDTW.fit(X_train, y_train)
             X_val, y_val = _maybe_downsample(X_val, y_val)
             eval_metrics = weareval.eval_output(kNNDTW.predict(X_val), y_val, tasktype=data.tasktype)
             cv_eval[cv_fold] = {'model': kNNDTW,
                                 'metric': eval_metrics['mae'] if data.tasktype=='regression' else eval_metrics['balanced_acc_adj'],
                                 'metrics': eval_metrics}
         # retain only best model
         tmp = {cv_fold: cv_eval[cv_fold]['metric'] for cv_fold in cv_eval}
         bst_fold = min(tmp, key=tmp.get) if data.tasktype=='regression' else max(tmp, key=tmp.get)
         self.kNNDTW = cv_eval[bst_fold]['model']
         return {'model': self.kNNDTW, 'metrics': cv_eval[bst_fold]['metrics']}
     else:
         X_train, y_train = data.Xy_train
         X_val, y_val = data.Xy_val
         self.kNNDTW = _new_model()
         self.kNNDTW.fit(X_train, y_train)
         X_val, y_val = _maybe_downsample(X_val, y_val)
         eval_metrics = weareval.eval_output(self.kNNDTW.predict(X_val), y_val, tasktype=data.tasktype)
         return {'model': self.kNNDTW, 'metrics': eval_metrics}
예제 #6
0
 def fit(self, data):
     """Train a default k-NN model and score it on validation data.

     ``self.tasktype`` selects the estimator: ``KNeighborsRegressor`` for
     ``'regression'``, ``KNeighborsClassifier`` otherwise.

     With ``data.kfold > 1`` one model (``n_jobs=16``) is fitted per key of
     ``data.Xy_train`` and only the best fold (lowest ``'mae'`` for
     regression, highest ``'balanced_acc_adj'`` otherwise) is kept in
     ``self.kNN``.  Without k-fold, one model (``n_jobs=12``) is fitted on
     ``data.Xy_train`` and scored on ``data.Xy_val``.

     Returns:
         dict with the retained ``'model'`` and its ``'metrics'``.
     """
     if not data.kfold > 1:
         train_x, train_y = data.Xy_train
         val_x, val_y = data.Xy_val
         if self.tasktype == 'regression':
             self.kNN = KNeighborsRegressor(n_jobs=12)
         else:
             self.kNN = KNeighborsClassifier(n_jobs=12)
         self.kNN.fit(train_x, train_y)
         scores = weareval.eval_output(
             self.kNN.predict(val_x), val_y, tasktype=self.tasktype)
         return {'model': self.kNN, 'metrics': scores}

     folds = {}
     for fold_id in data.Xy_train.keys():
         (train_x, train_y), (val_x, val_y) = data.Xy_train[fold_id]

         if self.tasktype == 'regression':
             estimator = KNeighborsRegressor(n_jobs=16)
         else:
             estimator = KNeighborsClassifier(n_jobs=16)
         estimator.fit(train_x, train_y)

         scores = weareval.eval_output(
             estimator.predict(val_x), val_y, tasktype=self.tasktype)
         rank_key = 'mae' if self.tasktype == 'regression' else 'balanced_acc_adj'
         folds[fold_id] = {
             'model': estimator,
             'metric': scores[rank_key],
             'metrics': scores,
         }

     # Retain only the best fold: minimise error or maximise accuracy.
     pick = min if self.tasktype == 'regression' else max
     best = pick(folds, key=lambda fold_id: folds[fold_id]['metric'])
     self.kNN = folds[best]['model']
     return {'model': self.kNN, 'metrics': folds[best]['metrics']}
예제 #7
0
 def fit(self, data):
     """Train a LightGBM gbdt booster with early stopping and score it.

     The objective is ``'regression'`` when ``data.tasktype`` is
     ``'regression'``; otherwise ``'multiclass'`` if the test targets
     (``data.Xy_test[1]``) have more than one dimension, else ``'binary'``.

     With ``data.kfold > 1`` one booster is trained per key of
     ``data.Xy_train`` and only the best fold (lowest ``'mae'`` for
     regression, highest ``'balanced_acc_adj'`` otherwise) is kept in
     ``self.gbm``.  Without k-fold, one booster is trained on
     ``data.Xy_train`` with ``data.Xy_val`` as the early-stopping set.

     Returns:
         dict with the retained ``'model'`` and its ``'metrics'``.
     """
     params = {
         'boosting_type': 'gbdt',
         'verbosity': 0,
     }
     # Objective names are plain strings (a trailing comma here would
     # silently store a 1-tuple instead).
     if data.tasktype == 'regression':
         params['objective'] = 'regression'
     elif len(data.Xy_test[1].shape) > 1:
         # NOTE(review): LightGBM's multiclass objective also expects
         # ``num_class``; confirm where that is meant to be supplied.
         params['objective'] = 'multiclass'
     else:
         params['objective'] = 'binary'

     if data.kfold > 1:
         cv_eval = {}
         for cv_fold in data.Xy_train.keys():
             [(X_train, y_train), (X_val, y_val)] = data.Xy_train[cv_fold]
             lgb_train = lgb.Dataset(X_train, y_train)
             lgb_eval = lgb.Dataset(X_val, y_val)
             gbm = lgb.train(params,
                             lgb_train,
                             valid_sets=lgb_eval,
                             callbacks=[lgb.early_stopping(stopping_rounds=5)])
             eval_metrics = weareval.eval_output(gbm.predict(X_val, num_iteration=gbm.best_iteration), y_val, tasktype=data.tasktype)
             cv_eval[cv_fold] = {'model': gbm,
                                 'metric': eval_metrics['mae'] if data.tasktype=='regression' else eval_metrics['balanced_acc_adj'],
                                 'metrics': eval_metrics}
         # retain only best model
         tmp = {cv_fold: cv_eval[cv_fold]['metric'] for cv_fold in cv_eval}
         bst_fold = min(tmp, key=tmp.get) if data.tasktype=='regression' else max(tmp, key=tmp.get)
         self.gbm = cv_eval[bst_fold]['model']
         return {'model': self.gbm, 'metrics': cv_eval[bst_fold]['metrics']}
     else:
         X_train, y_train = data.Xy_train
         X_val, y_val = data.Xy_val
         lgb_train = lgb.Dataset(X_train, y_train)
         lgb_eval = lgb.Dataset(X_val, y_val)
         self.gbm = lgb.train(params,
                              lgb_train,
                              valid_sets=lgb_eval,
                              callbacks=[lgb.early_stopping(stopping_rounds=5)])
         # Use the booster just trained; no local ``gbm`` exists in this branch.
         eval_metrics = weareval.eval_output(self.gbm.predict(X_val, num_iteration=self.gbm.best_iteration), y_val, tasktype=data.tasktype)
         return {'model': self.gbm, 'metrics': eval_metrics}
예제 #8
0
 def eval_test(self, data):
     """Score the fitted ``self.kNN`` model on the test split of *data*.

     Reads ``data.Xy_test`` as an ``(X_test, y_test)`` pair and returns the
     ``weareval.eval_output`` result for the model's predictions.
     """
     features, targets = data.Xy_test
     predictions = self.kNN.predict(features)
     return weareval.eval_output(predictions, targets, tasktype=data.tasktype)
예제 #9
0
 def eval_test(self, data):
     """Score the fitted ``self.knn`` model on the test split of *data*.

     The test features are passed through ``from_2d_array_to_nested`` before
     prediction; the ``weareval.eval_output`` result is returned.
     """
     features, targets = data.Xy_test
     nested_features = from_2d_array_to_nested(features)
     predictions = self.knn.predict(nested_features)
     return weareval.eval_output(predictions, targets, tasktype=data.tasktype)