def fit(self, data):
    """Fit a DTW k-nearest-neighbours time-series classifier on `data`.

    With k-fold CV (`data.kfold > 1`) one model is trained per fold and
    only the best is retained (lowest 'mae' for regression, highest
    'balanced_acc_adj' otherwise). Returns {'model': ..., 'metrics': ...}
    for the retained (or single) model.
    """
    def _train_and_score(X_tr, y_tr, X_vl, y_vl):
        # sktime estimators expect nested-DataFrame input
        X_tr = from_2d_array_to_nested(X_tr)
        X_vl = from_2d_array_to_nested(X_vl)
        clf = KNeighborsTimeSeriesClassifier(n_neighbors=5, distance="dtw", n_jobs=-1)
        clf.fit(X_tr, y_tr)
        metrics = weareval.eval_output(clf.predict(X_vl), y_vl, tasktype=data.tasktype)
        return clf, metrics

    regression = data.tasktype == 'regression'
    if data.kfold > 1:
        cv_eval = {}
        for fold in data.Xy_train:
            (X_tr, y_tr), (X_vl, y_vl) = data.Xy_train[fold]
            clf, metrics = _train_and_score(X_tr, y_tr, X_vl, y_vl)
            cv_eval[fold] = {
                'model': clf,
                'metric': metrics['mae'] if regression else metrics['balanced_acc_adj'],
                'metrics': metrics,
            }
        # keep only the best fold's model
        scores = {fold: entry['metric'] for fold, entry in cv_eval.items()}
        best = (min if regression else max)(scores, key=scores.get)
        self.knn = cv_eval[best]['model']
        return {'model': self.knn, 'metrics': cv_eval[best]['metrics']}
    X_tr, y_tr = data.Xy_train
    X_vl, y_vl = data.Xy_val
    self.knn, metrics = _train_and_score(X_tr, y_tr, X_vl, y_vl)
    return {'model': self.knn, 'metrics': metrics}
def fit(self, data):
    """Fit a composable time-series forest on `data`.

    Chooses regressor vs. classifier from `data.tasktype`. With k-fold CV
    one model is trained per fold and only the best is kept (lowest 'mae'
    for regression, highest 'balanced_acc_adj' otherwise). Returns
    {'model': ..., 'metrics': ...}.
    """
    regression = data.tasktype == 'regression'

    def _train_and_score(X_tr, y_tr, X_vl, y_vl):
        # sktime estimators expect nested-DataFrame input
        X_tr = from_2d_array_to_nested(X_tr)
        X_vl = from_2d_array_to_nested(X_vl)
        if regression:
            forest = ComposableTimeSeriesForestRegressor(n_jobs=-1)
        else:
            forest = ComposableTimeSeriesForestClassifier(n_jobs=-1)
        forest.fit(X_tr, y_tr)
        metrics = weareval.eval_output(forest.predict(X_vl), y_vl, tasktype=data.tasktype)
        return forest, metrics

    if data.kfold > 1:
        cv_eval = {}
        for fold in data.Xy_train:
            (X_tr, y_tr), (X_vl, y_vl) = data.Xy_train[fold]
            forest, metrics = _train_and_score(X_tr, y_tr, X_vl, y_vl)
            cv_eval[fold] = {
                'model': forest,
                'metric': metrics['mae'] if regression else metrics['balanced_acc_adj'],
                'metrics': metrics,
            }
        # retain only the best fold's model
        scores = {fold: entry['metric'] for fold, entry in cv_eval.items()}
        best = (min if regression else max)(scores, key=scores.get)
        self.tsf = cv_eval[best]['model']
        return {'model': self.tsf, 'metrics': cv_eval[best]['metrics']}
    X_tr, y_tr = data.Xy_train
    X_vl, y_vl = data.Xy_val
    self.tsf, metrics = _train_and_score(X_tr, y_tr, X_vl, y_vl)
    return {'model': self.tsf, 'metrics': metrics}
def fit(self, target_name, smote=True, verbose=False):
    """Cross-validated k-NN fit/evaluation for one target variable.

    For categorical targets ('cat' in self.voi[target_name]) fits a
    one-vs-rest KNeighborsClassifier per label column (optionally SMOTE-
    oversampled) and stores per-fold (fpr, tpr, au_roc, eval_metrics) in
    self.res; otherwise fits a KNeighborsRegressor and stores its metrics.

    Fixes vs. previous revision:
    - the verbose branch printed the undefined name `catvar` (NameError);
      now uses `target_name`.
    - with smote=False the classifier was fit on the full 2-D label matrix
      instead of the current column `jj`, which broke the one-vs-rest
      `predict_proba(...)[:, 1]` usage; now fits on `y_train[:, jj]`.
    """
    X = self.data.to_numpy(dtype=np.float32)
    y = self.get_model_y(target_name)
    # CV splits (grouped, via self.splitter)
    for i, (train_idx, test_idx) in enumerate(self.splitter.split(X, y, self.grps)):
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]
        if 'cat' in self.voi[target_name]:
            fpr, tpr = dict(), dict()
            au_roc, eval_metrics = dict(), dict()
            # one-vs-rest: one classifier per label column
            for jj in range(y_train.shape[1]):
                if smote:
                    oversample = SMOTE(k_neighbors=3)
                    try:
                        X_train_mod, y_train_mod = oversample.fit_resample(
                            X_train, y_train[:, jj])
                    except ValueError:
                        # class too rare for SMOTE's k neighbours — skip it
                        print(
                            "\n{}-th class cannot be computed. Too few n_samples. Skipping"
                            .format(jj))
                        if verbose:
                            print('{} class frequencies:'.format(target_name))
                            for jjj in range(y_train.shape[1]):
                                print(
                                    f"j: {jjj}\t0: {(y_train[:, jjj]==0).sum()}\t1: {(y_train[:, jjj]==1).sum()}"
                                )
                        continue
                    del oversample
                else:
                    X_train_mod, y_train_mod = X_train, y_train[:, jj]
                # model/eval
                model = KNeighborsClassifier(n_jobs=12)
                model.fit(X_train_mod, y_train_mod)
                fpr[jj], tpr[jj], thresholds = sklmetrics.roc_curve(
                    y_test[:, jj], model.predict_proba(X_test)[:, 1])
                au_roc[jj] = sklmetrics.auc(fpr[jj], tpr[jj])
                eval_metrics[jj] = weareval.eval_output(
                    model.predict(X_test), y_test[:, jj],
                    tasktype='regression' if 'cont' in self.voi[target_name]
                    else 'classification')
            self.res[f'{target_name}_fold{i}'] = (fpr, tpr, au_roc, eval_metrics)
        else:
            model = KNeighborsRegressor(n_jobs=12)
            model.fit(X_train, y_train)
            self.res[f'{target_name}_fold{i}'] = weareval.eval_output(
                model.predict(X_test), y_test,
                tasktype='regression' if 'cont' in self.voi[target_name]
                else 'classification')
def eval_test(self, data):
    """Score the fitted kNN-DTW model on the held-out test split.

    If `self.downsample_inference` is set, evaluation runs on a random
    subset of 20 test samples (DTW prediction is expensive).
    """
    X_test, y_test = data.Xy_test
    if self.downsample_inference:
        keep = np.random.choice(np.arange(X_test.shape[0]), 20, replace=False)
        X_test, y_test = X_test[keep], y_test[keep]
    predictions = self.kNNDTW.predict(X_test)
    return weareval.eval_output(predictions, y_test, tasktype=data.tasktype)
def fit(self, data):
    """Fit k-NN with a DTW distance (regressor or classifier per task type).

    With k-fold CV one model is trained per fold and only the best is
    kept (lowest 'mae' for regression, highest 'balanced_acc_adj'
    otherwise). If `self.downsample_inference` is set, validation scoring
    uses a random subset of 20 samples. Returns {'model': ..., 'metrics': ...}.
    """
    regression = data.tasktype == 'regression'

    def _new_model():
        cls = KNeighborsRegressor if regression else KNeighborsClassifier
        return cls(n_jobs=-1, algorithm='ball_tree', weights='distance',
                   metric=self.dtw)

    def _maybe_downsample(X_vl, y_vl):
        # DTW prediction is expensive; optionally score on 20 random samples
        if self.downsample_inference:
            keep = np.random.choice(np.arange(X_vl.shape[0]), 20, replace=False)
            return X_vl[keep], y_vl[keep]
        return X_vl, y_vl

    if data.kfold > 1:
        cv_eval = {}
        for fold in data.Xy_train:
            print(' starting kfold=', fold)
            (X_tr, y_tr), (X_vl, y_vl) = data.Xy_train[fold]
            model = _new_model()
            model.fit(X_tr, y_tr)
            X_vl, y_vl = _maybe_downsample(X_vl, y_vl)
            metrics = weareval.eval_output(model.predict(X_vl), y_vl,
                                           tasktype=data.tasktype)
            cv_eval[fold] = {
                'model': model,
                'metric': metrics['mae'] if regression else metrics['balanced_acc_adj'],
                'metrics': metrics,
            }
        # retain only the best fold's model
        scores = {fold: entry['metric'] for fold, entry in cv_eval.items()}
        best = (min if regression else max)(scores, key=scores.get)
        self.kNNDTW = cv_eval[best]['model']
        return {'model': self.kNNDTW, 'metrics': cv_eval[best]['metrics']}
    X_tr, y_tr = data.Xy_train
    X_vl, y_vl = data.Xy_val
    self.kNNDTW = _new_model()
    self.kNNDTW.fit(X_tr, y_tr)
    X_vl, y_vl = _maybe_downsample(X_vl, y_vl)
    metrics = weareval.eval_output(self.kNNDTW.predict(X_vl), y_vl,
                                   tasktype=data.tasktype)
    return {'model': self.kNNDTW, 'metrics': metrics}
def fit(self, data):
    """Fit a plain k-NN model (regressor or classifier per `self.tasktype`).

    With k-fold CV one model is trained per fold and only the best is
    kept (lowest 'mae' for regression, highest 'balanced_acc_adj'
    otherwise). Returns {'model': ..., 'metrics': ...}.
    """
    regression = self.tasktype == 'regression'

    def _new_model(jobs):
        cls = KNeighborsRegressor if regression else KNeighborsClassifier
        return cls(n_jobs=jobs)

    if data.kfold > 1:
        cv_eval = {}
        for fold in data.Xy_train:
            (X_tr, y_tr), (X_vl, y_vl) = data.Xy_train[fold]
            model = _new_model(16)
            model.fit(X_tr, y_tr)
            metrics = weareval.eval_output(model.predict(X_vl), y_vl,
                                           tasktype=self.tasktype)
            cv_eval[fold] = {
                'model': model,
                'metric': metrics['mae'] if regression else metrics['balanced_acc_adj'],
                'metrics': metrics,
            }
        # retain only the best fold's model
        scores = {fold: entry['metric'] for fold, entry in cv_eval.items()}
        best = (min if regression else max)(scores, key=scores.get)
        self.kNN = cv_eval[best]['model']
        return {'model': self.kNN, 'metrics': cv_eval[best]['metrics']}
    X_tr, y_tr = data.Xy_train
    X_vl, y_vl = data.Xy_val
    self.kNN = _new_model(12)
    self.kNN.fit(X_tr, y_tr)
    metrics = weareval.eval_output(self.kNN.predict(X_vl), y_vl,
                                   tasktype=self.tasktype)
    return {'model': self.kNN, 'metrics': metrics}
def fit(self, data):
    """Train a LightGBM GBDT model on `data` with early stopping.

    Objective is chosen from the task: 'regression', or 'multiclass' /
    'binary' depending on the label shape. With k-fold CV one booster is
    trained per fold and only the best is kept (lowest 'mae' for
    regression, highest 'balanced_acc_adj' otherwise). Returns
    {'model': ..., 'metrics': ...}.

    Fixes vs. previous revision:
    - `params['objective']` was assigned a 1-tuple (trailing comma), e.g.
      ('regression',), which LightGBM rejects as a parameter value; now a
      plain string.
    - the single-split branch predicted with the undefined local `gbm`
      (NameError); now uses `self.gbm`.
    """
    params = {'boosting_type': 'gbdt', 'verbosity': 0}
    if data.tasktype == 'regression':
        params['objective'] = 'regression'
    elif len(data.Xy_test[1].shape) > 1:
        # multi-column labels => one-hot / multiclass
        params['objective'] = 'multiclass'
    else:
        params['objective'] = 'binary'
    if data.kfold > 1:
        cv_eval = {}
        for cv_fold in data.Xy_train:
            [(X_train, y_train), (X_val, y_val)] = data.Xy_train[cv_fold]
            lgb_train = lgb.Dataset(X_train, y_train)
            lgb_eval = lgb.Dataset(X_val, y_val)
            gbm = lgb.train(params, lgb_train, valid_sets=lgb_eval,
                            callbacks=[lgb.early_stopping(stopping_rounds=5)])
            eval_metrics = weareval.eval_output(
                gbm.predict(X_val, num_iteration=gbm.best_iteration),
                y_val, tasktype=data.tasktype)
            cv_eval[cv_fold] = {
                'model': gbm,
                'metric': eval_metrics['mae'] if data.tasktype == 'regression'
                else eval_metrics['balanced_acc_adj'],
                'metrics': eval_metrics}
        # retain only the best fold's model
        tmp = {cv_fold: cv_eval[cv_fold]['metric'] for cv_fold in cv_eval}
        bst_fold = (min(tmp, key=tmp.get) if data.tasktype == 'regression'
                    else max(tmp, key=tmp.get))
        self.gbm = cv_eval[bst_fold]['model']
        return {'model': self.gbm, 'metrics': cv_eval[bst_fold]['metrics']}
    else:
        X_train, y_train = data.Xy_train
        X_val, y_val = data.Xy_val
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_val, y_val)
        self.gbm = lgb.train(params, lgb_train, valid_sets=lgb_eval,
                             callbacks=[lgb.early_stopping(stopping_rounds=5)])
        eval_metrics = weareval.eval_output(
            self.gbm.predict(X_val, num_iteration=self.gbm.best_iteration),
            y_val, tasktype=data.tasktype)
        return {'model': self.gbm, 'metrics': eval_metrics}
def eval_test(self, data):
    """Score the fitted k-NN model on the held-out test split."""
    X_test, y_test = data.Xy_test
    predictions = self.kNN.predict(X_test)
    return weareval.eval_output(predictions, y_test, tasktype=data.tasktype)
def eval_test(self, data):
    """Score the fitted sktime k-NN classifier on the held-out test split."""
    X_test, y_test = data.Xy_test
    # sktime estimators expect nested-DataFrame input
    nested = from_2d_array_to_nested(X_test)
    predictions = self.knn.predict(nested)
    return weareval.eval_output(predictions, y_test, tasktype=data.tasktype)