def QCRSC(x, t, qc, gamma_range, remove_outliers=True, remove_batch=True):
    """Fit a smoothing spline to the QC samples and subtract it from ``x``.

    The samples flagged with ``qc == 1`` are used to fit a
    ``CubicSmoothSpline`` over ``t``. The smoothing parameter is chosen from
    ``gamma_range`` by leave-one-out cross-validation when at least five QC
    values are available; otherwise the largest value in ``gamma_range`` is
    used and the fit is labelled ``'linear'``.

    Parameters
    ----------
    x : pandas.Series
        Values to correct (must support ``dropna``/``.index``/``.copy``).
    t : pandas.Series
        Positions at which ``x`` was measured (must support ``.iloc`` and
        ``.values``).
    qc : array-like
        Flags aligned with ``x``/``t``; entries equal to 1 mark QC samples.
    gamma_range : sequence of float
        Candidate exponents; the spline parameter is
        ``1 / (1 + epsilon * 10**gamma)``.
    remove_outliers : bool
        If true, QC values outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` are
        discarded before fitting.
    remove_batch : bool
        Only consulted when the spline fit raises ``ValueError``: if true an
        all-NaN copy of ``x`` is returned, otherwise ``x`` itself.

    Returns
    -------
    tuple
        ``(xx, f, type_fit, cvMse, gamma, mpa)``: corrected values, fitted
        curve evaluated at ``t``, ``'linear'`` or ``'cubic'``, the
        cross-validation MSE per candidate gamma (all NaN for ``'linear'``),
        the gamma used, and the median of the retained QC values.
    """
    TTqc = t[qc == 1]
    # Work on an explicit copy so masking outliers never touches ``x``.
    XXqc = x[qc == 1].copy()

    if remove_outliers:
        q75, q25 = np.percentile(XXqc, [75, 25])
        iqr = q75 - q25
        min_outlier = q25 - 1.5 * iqr
        max_outlier = q75 + 1.5 * iqr
        XXqc[XXqc < min_outlier] = np.nan
        XXqc[XXqc > max_outlier] = np.nan

    # Always defined, whether or not outlier removal ran.
    Xqc = XXqc.dropna()
    Tqc = TTqc[Xqc.index]

    mpa = np.median(Xqc)
    numQC = len(Tqc)

    # Median gap between consecutive QC positions sets the spline scale.
    dist = [
        TTqc.iloc[i + 1] - TTqc.iloc[i] - 1 for i in range(len(TTqc) - 1)
    ]
    h = np.median(dist)
    epsilon = h**3 / 16

    if numQC < 5:
        # QCs < 5 cannot effectively perform QCspline cross-validation;
        # use the largest gamma, i.e. effectively a linear correction.
        type_fit = 'linear'
        cvMse = np.full(len(gamma_range), np.nan)
        gamma = np.max(gamma_range)
    else:
        type_fit = 'cubic'
        loo = LeaveOneOut()
        cvMse = []
        for i in range(len(gamma_range)):
            p = 1 / (1 + epsilon * 10**(gamma_range[i]))
            mse = []
            for train_index, test_index in loo.split(Xqc):
                Tqc_train, Tqc_test = Tqc.iloc[train_index], Tqc.iloc[test_index]
                Xqc_train, Xqc_test = Xqc.iloc[train_index], Xqc.iloc[test_index]
                csaps = CubicSmoothSpline(p=p)
                csaps.fit(Tqc_train, Xqc_train)
                Xqc_pred = csaps.predict(Tqc_test.values.tolist())
                mse.append(mean_squared_error(Xqc_test, Xqc_pred))
            cvMse.append(np.mean(mse))
        cvMse = np.array(cvMse)
        gamma = gamma_range[np.argmin(cvMse)]

    p = 1 / (1 + epsilon * 10**(gamma))

    try:
        csaps = CubicSmoothSpline(p=p)
        csaps.fit(Tqc, Xqc)
        f = csaps.predict(t.values.tolist())
        zz = x - f
        xx = zz + mpa
    except ValueError:
        # Only 1 QC or less: no curve can be fitted.
        f = [np.nan] * len(x)
        if remove_batch:
            # Blank out a copy; the caller's ``x`` must stay intact.
            zz = x.copy()
            zz[:] = np.nan
        else:
            zz = x
        xx = zz

    return xx, f, type_fit, cvMse, gamma, mpa
# Fit on the training split and report hold-out performance.
model_fit = _linear_model.fit(x_train, y_train)
test_prediction = _linear_model.predict(x_test)
print(model_fit)
print(test_prediction)
print(x_test.shape, y_test.shape)
print(test_prediction.shape, y_test.shape)
# plt.scatter(x_test, y_test)
# plt.plot(x_test, test_prediction, 'r')
print(u'r\u00B2 = {0:.2f}\nMAE = {1:.4f}'.format(
    model_fit.score(x_test, y_test),
    median_absolute_error(y_test, test_prediction)))
# plt.show()

# Leave-one-out evaluation: refit on all rows but one, score the held-out row.
loo = LeaveOneOut()
loo.get_n_splits(x_data)
print(x_data)
MEA = []
R2 = []
for train_index, test_index in loo.split(x_data):
    # ``split`` yields positional indices, so rows must be selected with
    # ``.iloc``; ``.loc`` is label-based and only agrees with positions when
    # the index is the default 0..n-1 range.
    x_train, x_test = x_data.iloc[train_index], x_data.iloc[test_index]
    y_train, y_test = y_data.iloc[train_index], y_data.iloc[test_index]
    model_fit = _linear_model.fit(x_train, y_train)
    test_prediction = _linear_model.predict(x_test)
    MEA_loo = median_absolute_error(y_test, test_prediction)
    MEA.append(MEA_loo)
plt.xlabel('Class')
plt.ylabel('Frequency')
plt.show()

df['Label'] = df['Label'].replace(4, 0)  # For binary classification
X, y = df.iloc[:, :-1], df.iloc[:, -1]
y = y.to_numpy()

scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)

models = [('SVM', SVC()), ('DT', DecisionTreeClassifier()),
          ('NB', GaussianNB()), ('KNN', KNeighborsClassifier())]

# Local import: ``cross_val_predict`` returns the out-of-fold *predictions*.
# ``cross_val_score`` returns per-fold accuracy values, which must not be fed
# to ``f1_score``/``confusion_matrix`` as if they were predicted labels.
from sklearn.model_selection import cross_val_predict  # noqa: E402

for name, model in models:
    y_pred = cross_val_predict(model, scaled_X, y, cv=LeaveOneOut(),
                               n_jobs=-1)
    # With leave-one-out each fold holds one sample, so per-fold accuracy is
    # 1.0 for a correct prediction and 0.0 otherwise.
    fold_accuracy = (y_pred == y).astype(float)
    print(name, '', 'F1 Score:', f1_score(y, y_pred, average='binary'), '',
          'Accuracy: %.3f (%.3f)' % (np.mean(fold_accuracy),
                                     np.std(fold_accuracy)), '\n')
    df_cm = pd.DataFrame(confusion_matrix(y, y_pred), range(2), range(2))
    sn.set(font_scale=1.4)  # for label size
    sn.heatmap(df_cm, annot=True, annot_kws={"size": 16})  # font size
    plt.show()
def _cost_fn(argd, X, y, EX_list, valid_size, n_folds, shuffle, random_state,
             use_partial_fit, info, timeout, _conn, loss_fn=None,
             continuous_loss_fn=False, best_loss=None):
    '''Calculate the loss function

    Trains a copy of the learner described by ``argd`` on each training split,
    pools the validation predictions over all splits and computes one loss.
    Nothing is returned: the outcome is sent through ``_conn`` as a
    ``(rtype, rval)`` tuple, where ``rtype`` is ``'return'`` (``rval`` is a
    result dict carrying a hyperopt status) or ``'raise'`` (``rval`` is the
    exception instance).

    Split selection:
      * ``n_folds == -1``       -> leave-one-out
      * ``n_folds`` is a number -> (stratified, for classifiers) K-fold
      * ``n_folds is None``     -> a single hold-out split of ``valid_size``;
        the last samples when ``shuffle`` is false, otherwise a (stratified)
        shuffle split.

    Default loss (``loss_fn is None``) is ``1 - accuracy`` for classifiers
    and ``1 - R2`` otherwise. With ``continuous_loss_fn`` the pooled
    predictions come from ``predict_proba`` instead of ``predict``.
    '''
    try:
        t_start = time.time()
        # Extract info from calling function.
        if 'classifier' in argd:
            classifier = argd['classifier']
            regressor = argd['regressor']
            preprocessings = argd['preprocessing']
            ex_pps_list = argd['ex_preprocs']
        else:
            classifier = argd['model']['classifier']
            regressor = argd['model']['regressor']
            preprocessings = argd['model']['preprocessing']
            ex_pps_list = argd['model']['ex_preprocs']
        learner = classifier if classifier is not None else regressor
        is_classif = classifier is not None
        untrained_learner = copy.deepcopy(learner)
        # -- N.B. modify argd['preprocessing'] in-place

        # Determine cross-validation iterator.
        if n_folds is not None:
            if n_folds == -1:
                info('Will use leave-one-out CV')
                try:
                    cv_iter = LeaveOneOut().split(X)
                except TypeError:
                    # Older syntax before sklearn version 0.18
                    cv_iter = LeaveOneOut(len(y))
            elif is_classif:
                info('Will use stratified K-fold CV with K:', n_folds,
                     'and Shuffle:', shuffle)
                try:
                    cv_iter = StratifiedKFold(
                        n_splits=n_folds, shuffle=shuffle,
                        random_state=random_state).split(X, y)
                except TypeError:
                    # Older syntax before sklearn version 0.18
                    cv_iter = StratifiedKFold(y, n_folds=n_folds,
                                              shuffle=shuffle,
                                              random_state=random_state)
            else:
                info('Will use K-fold CV with K:', n_folds,
                     'and Shuffle:', shuffle)
                try:
                    cv_iter = KFold(n_splits=n_folds, shuffle=shuffle,
                                    random_state=random_state).split(X)
                except TypeError:
                    # Older syntax before sklearn version 0.18
                    cv_iter = KFold(len(y), n_folds=n_folds, shuffle=shuffle,
                                    random_state=random_state)
        else:
            if not shuffle:  # always choose the last samples.
                info('Will use the last', valid_size,
                     'portion of samples for validation')
                n_train = int(len(y) * (1 - valid_size))
                # Builtin ``int``: the ``np.int`` alias was removed from NumPy.
                valid_fold = np.ones(len(y), dtype=int)
                valid_fold[:n_train] = -1  # "-1" indicates train fold.
                try:
                    cv_iter = PredefinedSplit(valid_fold).split()
                except TypeError:
                    # Older syntax before sklearn version 0.18
                    cv_iter = PredefinedSplit(valid_fold)
            elif is_classif:
                # Adjacent literals instead of a backslash continuation, so
                # source indentation does not leak into the message.
                info('Will use stratified shuffle-and-split with validation '
                     'portion:', valid_size)
                try:
                    cv_iter = StratifiedShuffleSplit(
                        1, test_size=valid_size,
                        random_state=random_state).split(X, y)
                except TypeError:
                    # Older syntax before sklearn version 0.18
                    cv_iter = StratifiedShuffleSplit(
                        y, 1, test_size=valid_size,
                        random_state=random_state)
            else:
                info('Will use shuffle-and-split with validation portion:',
                     valid_size)
                try:
                    cv_iter = ShuffleSplit(
                        n_splits=1, test_size=valid_size,
                        random_state=random_state).split(X)
                except TypeError:
                    # Older syntax before sklearn version 0.18
                    cv_iter = ShuffleSplit(len(y), 1, test_size=valid_size,
                                           random_state=random_state)

        # Use the above iterator for cross-validation prediction.
        cv_y_pool = np.array([])
        cv_pred_pool = np.array([])
        cv_n_iters = np.array([])
        for train_index, valid_index in cv_iter:
            Xfit, Xval = X[train_index], X[valid_index]
            yfit, yval = y[train_index], y[valid_index]
            if EX_list is not None:
                _EX_list = [(EX[train_index], EX[valid_index])
                            for EX in EX_list]
                EXfit_list, EXval_list = zip(*_EX_list)
            else:
                EXfit_list = None
                EXval_list = None
            XEXfit, XEXval = transform_combine_XEX(
                Xfit, info, preprocessings, Xval,
                EXfit_list, ex_pps_list, EXval_list)
            # Every fold starts from a fresh, untrained copy.
            learner = copy.deepcopy(untrained_learner)
            info('Training learner', learner, 'on X/EX of dimension',
                 XEXfit.shape)
            if hasattr(learner, "partial_fit") and use_partial_fit:
                learner, n_iters = pfit_until_convergence(
                    learner, is_classif, XEXfit, yfit, info,
                    best_loss=best_loss, XEXval=XEXval, yval=yval,
                    timeout=timeout, t_start=t_start)
            else:
                learner.fit(XEXfit, yfit)
                n_iters = None
            # ``learner is None`` here is handled below as "not enough time".
            if learner is None:
                break
            cv_y_pool = np.append(cv_y_pool, yval)
            info('Scoring on X/EX validation of shape', XEXval.shape)
            if continuous_loss_fn:
                cv_pred_pool = np.append(cv_pred_pool,
                                         learner.predict_proba(XEXval))
            else:
                cv_pred_pool = np.append(cv_pred_pool,
                                         learner.predict(XEXval))
            cv_n_iters = np.append(cv_n_iters, n_iters)
        else:  # all CV folds are exhausted.
            if loss_fn is None:
                if is_classif:
                    loss = 1 - accuracy_score(cv_y_pool, cv_pred_pool)
                    # -- squared standard error of mean
                    lossvar = (loss * (1 - loss)) / max(1, len(cv_y_pool) - 1)
                    info('OK trial with accuracy %.1f +- %.1f' %
                         (100 * (1 - loss), 100 * np.sqrt(lossvar)))
                else:
                    loss = 1 - r2_score(cv_y_pool, cv_pred_pool)
                    lossvar = None  # variance of R2 is undefined.
                    info('OK trial with R2 score %.2e' % (1 - loss))
            else:
                # Use a user specified loss function
                loss = loss_fn(cv_y_pool, cv_pred_pool)
                lossvar = None
                info('OK trial with loss %.1f' % loss)
            t_done = time.time()
            rval = {
                'loss': loss,
                'loss_variance': lossvar,
                'learner': untrained_learner,
                'preprocs': preprocessings,
                'ex_preprocs': ex_pps_list,
                'status': hyperopt.STATUS_OK,
                'duration': t_done - t_start,
                'iterations': (cv_n_iters.max()
                               if (hasattr(learner, "partial_fit")
                                   and use_partial_fit)
                               else None),
            }
            rtype = 'return'
        # The for loop exit with break, one fold did not finish running.
        if learner is None:
            t_done = time.time()
            rval = {
                'status': hyperopt.STATUS_FAIL,
                'failure': 'Not enough time to finish training on '
                           'all CV folds',
                'duration': t_done - t_start,
            }
            rtype = 'return'

    ##==== Cost function exception handling ====##
    except (NonFiniteFeature, ) as exc:
        print('Failing trial due to NaN in', str(exc))
        t_done = time.time()
        rval = {
            'status': hyperopt.STATUS_FAIL,
            'failure': str(exc),
            'duration': t_done - t_start,
        }
        rtype = 'return'

    except (ValueError, ) as exc:
        if ('k must be less than or equal'
                ' to the number of training points') in str(exc):
            t_done = time.time()
            rval = {
                'status': hyperopt.STATUS_FAIL,
                'failure': str(exc),
                'duration': t_done - t_start,
            }
            rtype = 'return'
        else:
            rval = exc
            rtype = 'raise'

    except (AttributeError, ) as exc:
        print('Failing due to k_means_ weirdness')
        if "'NoneType' object has no attribute 'copy'" in str(exc):
            # -- sklearn/cluster/k_means_.py line 270 raises this sometimes
            t_done = time.time()
            rval = {
                'status': hyperopt.STATUS_FAIL,
                'failure': str(exc),
                'duration': t_done - t_start,
            }
            rtype = 'return'
        else:
            rval = exc
            rtype = 'raise'

    except Exception as exc:
        # Anything else is forwarded to the parent process.
        rval = exc
        rtype = 'raise'

    # -- return the result to calling process
    _conn.send((rtype, rval))
assert tokenize(cls(n_splits=3, random_state=0)) != tokenize( cls(n_splits=3, random_state=2) ) assert tokenize(cls(n_splits=3, random_state=0)) != tokenize( cls(n_splits=4, random_state=0) ) cv = cls(n_splits=3) assert compute_n_splits(cv, np_X, np_y, np_groups) == 3 with assert_dask_compute(False): assert compute_n_splits(cv, da_X, da_y, da_groups) == 3 @pytest.mark.parametrize("cvs", [(LeaveOneOut(),), (LeavePOut(2), LeavePOut(3))]) def test_leave_out(cvs): tokens = [] for cv in cvs: assert tokenize(cv) == tokenize(cv) tokens.append(cv) assert len(set(tokens)) == len(tokens) cv = cvs[0] sol = cv.get_n_splits(np_X, np_y, np_groups) assert compute_n_splits(cv, np_X, np_y, np_groups) == sol with assert_dask_compute(True): assert compute_n_splits(cv, da_X, da_y, da_groups) == sol with assert_dask_compute(False):
def pca_graph_pvals_less_than():
    """Leave-one-out XGBoost accuracy for 2..29 PCA components.

    For every component count, each leave-one-out fold selects the feature
    columns whose Spearman p-value against the label is <= 0.01 on the
    training rows, reduces them with ``apply_pca``, joins the metadata
    columns back and fits an ``XGBClassifier``. Results are appended to the
    module-level lists ``pcas``, ``train_accuracy`` and ``test_accuracy``;
    nothing is returned.
    """
    meta_columns = [
        'Age', 'BMI', 'FattyLiver', 'RegularExercise', 'Smoking',
        'DiagnosisGroup'
    ]
    data = preproccessed_data.join(mapping_file[meta_columns])
    X = data.drop(meta_columns, axis=1)
    y = data['DiagnosisGroup']
    # Loop-invariant: characters replaced in column names before fitting.
    regex = re.compile(r"\[|\]|<", re.IGNORECASE)

    for n_comp in range(2, 30):
        pcas.append(n_comp)
        loo = LeaveOneOut()
        y_pred_list = []
        auc_train = []

        for train_index, test_index in loo.split(X):
            train_index = list(train_index)
            # ``split`` yields positions, so both frames and series are
            # indexed with ``.iloc`` to stay aligned whatever the index is.
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # p-value of each column against the label (NaN counts as 1).
            most_corelated_taxon = {}
            for i in range(X_train.shape[1]):
                p_val = scipy.stats.spearmanr(X_train.iloc[:, i], y_train)[1]
                if math.isnan(p_val):
                    most_corelated_taxon[X_train.columns[i]] = 1
                else:
                    most_corelated_taxon[X_train.columns[i]] = p_val
            sorted_taxon = sorted(most_corelated_taxon.items(),
                                  key=operator.itemgetter(1))
            most_corelated_taxon = [i for i in sorted_taxon if i[1] <= 0.01]
            bact = [i[0] for i in most_corelated_taxon if i[0] != 1]

            new_data = X[bact]
            otu_after_pca, _ = apply_pca(new_data, n_components=n_comp)
            new_data = otu_after_pca.join(data[meta_columns], how='inner')

            X_new = new_data.drop(['DiagnosisGroup'], axis=1)
            y_new = new_data['DiagnosisGroup']
            X_new.columns = [
                regex.sub("_", col)
                if any(x in str(col) for x in set(('[', ']', '<'))) else col
                for col in X_new.columns.values
            ]

            X_train, X_test = X_new.iloc[train_index], X_new.iloc[test_index]
            y_train, y_test = (y_new.iloc[train_index],
                               y_new.iloc[test_index])

            model = XGBClassifier(max_depth=4,
                                  n_estimators=150,
                                  learning_rate=15 / 100,
                                  objective='multi:softmax')
            model.fit(X_train, y_train)
            pred_train = model.predict(X_train)
            auc_train.append(metrics.accuracy_score(y_train, pred_train))
            y_pred = model.predict(X_test)
            y_pred_list.append(y_pred[0])

        # No blanket ``except: pass`` here: a failure used to leave ``auc``
        # as a list and surface as an unrelated TypeError in ``round``.
        auc = metrics.accuracy_score(y, y_pred_list)
        print('PCA components' + str(n_comp), round(auc, 2))
        scores = round(auc, 2)
        scores_train = round(np.array(auc_train).mean(), 2)
        train_accuracy.append(scores_train)
        # ``scores`` is already a rounded scalar; no ``.mean()`` needed.
        test_accuracy.append(scores)
## 9.6 Other classification methods

### 9.6.1 K-nearest neighbours

import statsmodels.api as sm
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier

# iris: the first four columns are the features, ``Species`` is the label.
my_data = sm.datasets.get_rdataset('iris', 'datasets').data
X = my_data.iloc[:, :4]
y = my_data['Species']

# Leave-one-out cross-validation scores for a default KNN classifier.
my_scores = cross_val_score(estimator=KNeighborsClassifier(),
                            X=X,
                            y=y,
                            cv=LeaveOneOut())

my_scores.mean()
#> 0.9666666666666667

### 9.6.2 Neural network

import statsmodels.api as sm
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

my_data = sm.datasets.get_rdataset('iris', 'datasets').data
X = my_data.iloc[:, :4]
y = my_data['Species']

my_pipeline = Pipeline(steps=[
    ('sc', StandardScaler()),    # standardisation
    ('mlp', MLPClassifier()),    # neural network
])

# Same leave-one-out evaluation, with the folds run in parallel.
my_scores = cross_val_score(estimator=my_pipeline,
                            X=X,
                            y=y,
                            cv=LeaveOneOut(),
                            n_jobs=-1)

my_scores.mean()
#> 0.9533333333333334
def cross_validation_cc():
    """Leave-one-out cross-validation of a no-intercept linear regression.

    Loads ``train_S`` and ``G`` from
    ``./data/component_contribution_python.mat`` and the targets from
    ``./data/median_b.json``, drops duplicate columns of ``S``, regresses
    ``b`` on ``S.T @ G`` and prints the median, mean and standard deviation
    of the leave-one-out absolute errors. The empirical cumulative
    distribution of those errors is saved to
    ``./figures/cross_validation_cc.jpg`` and shown.
    """
    ac = loadmat('./data/component_contribution_python.mat')
    S = ac['train_S']

    # Keep only the first occurrence of each distinct column of S.
    df_S = pd.DataFrame(ac['train_S'])
    df_S_unique = df_S.T.drop_duplicates().T
    unque_cols = df_S_unique.columns.values.tolist()
    S = S[:, unque_cols]

    G = ac['G']
    # Close the file deterministically instead of leaking the handle.
    with open('./data/median_b.json') as b_file:
        b_list = json.load(b_file)
    b = np.asarray(b_list)
    b = np.reshape(b, (-1, 1))

    m, n = S.shape
    assert G.shape[0] == m
    assert b.shape == (n, 1)

    STG = np.dot(S.T, G)
    X = STG
    y = b

    # cross validation
    regression = LinearRegression(fit_intercept=False)
    # The scorer returns negated errors; flip the sign back.
    scores = -cross_val_score(regression,
                              X,
                              y,
                              cv=LeaveOneOut(),
                              scoring='neg_mean_absolute_error')

    print('median of cv is: ', median(scores))
    print('mean of cv is: ', mean(scores))
    # Call the method: ``scores.std`` alone prints the bound-method repr.
    print('std of cv is: ', scores.std())

    # Empirical cumulative distribution of the absolute errors.
    x = np.sort(scores)
    y = 1. * np.arange(len(x)) / (len(x) - 1)
    fig = plt.figure(figsize=(6, 6))
    plt.xlim(right=15)
    plt.plot(x, y, marker='.', linestyle='none')
    plt.axhline(y=0.5, linewidth=1, color='grey')
    # Raw string: same text, without invalid ``\D`` escape sequences.
    plt.xlabel(r"|$\Delta G^{'o}_{est} - \Delta G^{'o}_{obs}$|")
    plt.ylabel('Cumulative distribution')
    fig.savefig('./figures/cross_validation_cc.jpg')
    plt.show()
def IEM_cross_condition_l1out(testing_activity, testing_behaviour, decode_item, WM, WM_t, Inter, tr_st, tr_end):
    """Reconstruct every scan, using leave-one-out for the shared TRs.

    Scans outside ``range(tr_st, tr_end)`` are reconstructed directly with
    the supplied weights (``WM``, ``WM_t``, ``Inter``). Scans inside that
    range are reconstructed by leave-one-out over the first axis of
    ``testing_activity``: weights are re-estimated with ``Weights_matrix_LM``
    on all entries but one, the held-out entry is reconstructed, and the
    per-entry reconstructions are averaged.

    NOTE(review): the original header comment described a 10-split k-fold,
    but the code uses ``LeaveOneOut``; confirm which one is intended.

    Parameters
    ----------
    testing_activity : array indexed as ``[:, scan, :]``.
    testing_behaviour : mapping/DataFrame with the columns 'T', 'A_R', 'Dist'.
    decode_item : 'Target', 'Response' or 'Distractor'.
    WM, WM_t, Inter : weights, their transpose and the intercept, passed to
        ``Representation`` for the non-shared scans.
    tr_st, tr_end : first (inclusive) and last (exclusive) shared scan.

    Returns
    -------
    pandas.DataFrame
        One column per scan, labelled ``str(scan * TR)``, sorted by the
        numeric value of the label.

    Raises
    ------
    ValueError
        If ``decode_item`` is not one of the three supported values.
    """
    decode_columns = {'Target': 'T', 'Response': 'A_R', 'Distractor': 'Dist'}
    try:
        dec_I = decode_columns[decode_item]
    except KeyError:
        # The old bare-string expression was a no-op and led to a NameError.
        raise ValueError('Error specifying the decode item') from None

    # Split scans into those with and without shared information.
    # ``range`` has no ``remove`` on Python 3, so build the list directly.
    trs_shared = range(tr_st, tr_end)
    list_wm_scans2 = [scan for scan in range(nscans_wm)
                      if scan not in trs_shared]

    # Scans without shared information: reuse the supplied weights.
    testing_angles = np.array(testing_behaviour[dec_I])
    signal_paralel = [testing_activity[:, i, :] for i in list_wm_scans2]
    Reconstructions = Parallel(n_jobs=numcores)(
        delayed(Representation)(signal, testing_angles, WM, WM_t,
                                ref_angle=180, plot=False, intercept=Inter)
        for signal in signal_paralel)
    Reconstruction_indep = pd.concat(Reconstructions, axis=1)
    Reconstruction_indep.columns = [str(i * TR) for i in list_wm_scans2]

    # Scans with shared information: leave-one-out re-estimation.
    Recons_dfs_shared = []
    for shared_TR in trs_shared:
        testing_data = testing_activity[:, shared_TR, :]
        reconstrction_sh = []
        loo = LeaveOneOut()
        for train_index, test_index in loo.split(testing_data):
            X_train, X_test = testing_data[train_index], testing_data[test_index]
            y_train, y_test = testing_angles[train_index], testing_angles[test_index]
            # train
            WM2, Inter2 = Weights_matrix_LM(X_train, y_train)
            WM_t2 = WM2.transpose()
            # test
            rep_x = Representation(testing_data=X_test,
                                   testing_angles=y_test,
                                   Weights=WM2,
                                   Weights_t=WM_t2,
                                   ref_angle=180,
                                   plot=False,
                                   intercept=Inter2)
            reconstrction_sh.append(rep_x)
        # Side by side, then the row-wise mean leaves one column per scan.
        reconstrction_sh = pd.concat(reconstrction_sh, axis=1)
        reconstrction_sh_mean = reconstrction_sh.mean(axis=1)
        Recons_dfs_shared.append(reconstrction_sh_mean)

    Reconstruction_shared = pd.concat(Recons_dfs_shared, axis=1)
    Reconstruction_shared.columns = [str(i * TR) for i in trs_shared]

    # Merge both sets and restore chronological column order. Sorting the
    # existing labels by numeric value keeps them untouched; rebuilding them
    # via ``str(float(...))`` turned '4' into '4.0' when ``TR`` is an int.
    Reconstruction = pd.concat([Reconstruction_indep, Reconstruction_shared], axis=1)
    sorted_col = sorted(Reconstruction.columns, key=float)
    Reconstruction = Reconstruction.reindex(sorted_col, axis=1)

    return Reconstruction
def main():
    """Train one two-output ResNet-18 per validation fold and save each model.

    Parses the command line, writes the experiment description to
    ``<model-path>/<experiment>/description.txt`` and, for every leave-one-out
    split of ``fold_0``..``fold_2``, trains with ``train_model`` (``fold_3``
    is passed as the test data) and stores the resulting state dict in the
    experiment directory.
    """
    cli = argparse.ArgumentParser(description='PyTorch Cell predict')
    cli.add_argument('--batch-size', type=int, default=64, metavar='N',
                     help='input batch size for training (default: 64)')
    cli.add_argument('--num-epochs', type=int, default=50,
                     help='number of training epochs (default: 50)')
    string_options = (
        ('--model-path', '../results/',
         'path to saved models (default: ../results/)'),
        ('--data-dir', '../data',
         'path to the directory with data set (default: ../data)'),
        ('--experiment', 'exp_1',
         'name of the experiment (default: exp_1)'),
        ('--description', '',
         'description of the experiment (default: empty)'),
    )
    for flag, default_value, help_text in string_options:
        cli.add_argument(flag, default=default_value, help=help_text)
    args = cli.parse_args()

    data_dir = args.data_dir
    model_path = os.path.join(args.model_path, args.experiment)
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    with open(os.path.join(model_path, 'description.txt'), 'w') as out:
        out.write("%s\n" % args.description)

    use_gpu = torch.cuda.is_available()

    def augmenting_pipeline():
        # Fresh instance per call: augmentation followed by tensor conversion.
        return Compose([
            Rotate(15),
            CenterCrop(224, 224),
            VerticalFlip(),
            HorizontalFlip(),
            HueSaturationValue(hue_shift_limit=50,
                               sat_shift_limit=50,
                               val_shift_limit=40),
            ToTensor(),
        ])

    def plain_pipeline():
        # Fresh instance per call: centre crop and tensor conversion only.
        return Compose([CenterCrop(224, 224), ToTensor()])

    data_transforms = {
        'train': {0: augmenting_pipeline(), 1: augmenting_pipeline()},
        'val': {0: plain_pipeline(), 1: plain_pipeline()},
        'test': {0: plain_pipeline(), 1: plain_pipeline()},
    }
    target_transform = change_classes

    folds = np.array(['fold_0', 'fold_1', 'fold_2'])
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    splits = list(LeaveOneOut().split(folds))

    for tr, vl in splits:
        # ImageNet-pretrained backbone with a new two-output head.
        net = pretrainedmodels.__dict__['resnet18'](num_classes=1000,
                                                    pretrained='imagenet')
        net.last_linear = nn.Linear(512, 2)
        if use_gpu:
            net = net.to(device)

        criterion = FocalLoss(gamma=0.3, alpha=None, size_average=False)
        optimizer_ft = optim.Adam(net.parameters())
        plat_lr_scheduler = lr_scheduler.ReduceLROnPlateau(
            optimizer_ft, 'min', patience=3, factor=0.95, verbose=True)

        train_folds = folds[tr]
        val_folds = folds[vl]
        held_out_name = val_folds[0]
        test_data = ['fold_3']

        dataloaders, dataset_sizes, class_names = loader(
            data_transforms, train_folds, val_folds, test_data, data_dir,
            bs=args.batch_size, target_transform=target_transform)

        net, best_score = train_model(net, criterion, optimizer_ft,
                                      plat_lr_scheduler,
                                      dataset_sizes=dataset_sizes,
                                      model_path=model_path,
                                      dataloaders=dataloaders,
                                      device=device,
                                      num_epochs=args.num_epochs,
                                      fold_name=held_out_name,
                                      best='loss')

        checkpoint_name = ('val_' + held_out_name + '_f1_05_'
                           + str(best_score).replace('.', ''))
        torch.save(net.state_dict(), os.path.join(model_path, checkpoint_name))

        # Release per-fold objects before the next fold is set up.
        del criterion, optimizer_ft, plat_lr_scheduler
        torch.cuda.empty_cache()
        gc.collect()
# Fit LDA on the PCA-projected training data and score the hold-out split.
lr = LinearDiscriminantAnalysis()
lr.fit(X_train_pca, y_train)
y_pred = lr.predict(X_test_pca)
print("Accuracy score:{:.2f}".format(
    metrics.accuracy_score(y_test, y_pred)))

# Confusion matrix of the hold-out predictions as a heat map.
cm = metrics.confusion_matrix(y_test, y_pred)
plt.subplots(1, figsize=(16, 8))
sns.heatmap(cm)
plt.show()

print("Classification Results:\n{}".format(
    metrics.classification_report(y_test, y_pred)))

from sklearn.model_selection import LeaveOneOut

# Leave-one-out accuracy on the full PCA-projected data, per classifier.
for clf in (LogisticRegression(), LinearDiscriminantAnalysis()):
    loo_cv = LeaveOneOut()
    cv_scores = cross_val_score(clf, X_pca, target, cv=loo_cv)
    print("{} Leave One Out cross-validation mean accuracy score:{:.2f}".format(
        clf.__class__.__name__, cv_scores.mean()))

from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid and a fresh estimator for the search that follows.
params = {'penalty': ['l2'], 'C': np.logspace(0, 4, 10)}
clf = LogisticRegression()
def plot_fit(
    results,
    phenotype,
    variable_type="binary",
    variable_name="phenotype",
    filename="fit_%s.html" % datetime.now().strftime("%Y%m%d"),
    flux_type="production",
    min_coef=0.001,
    atol=1e-6
):
    """Test for differential metabolite production.

    This will fit the `phenotype` response using L1-regularized linear models
    with log-fluxes as features. Will use LASSO regression for a continuous
    response and L1-regularized Logistic regression for a binary response.

    Parameters
    ----------
    results : micom.workflows.GrowthResults
        The results returned by the `grow` workflow.
    phenotype : pandas.Series
        The data to be fitted. Its index must correspond to `sample_id` in
        `exchanges`.
    variable_type : str of ["binary", "continuous"]
        The type of the variable.
    variable_name : str
        A short description of the phenotype for instance "disease_status".
    filename : str
        The HTML file where the visualization will be saved.
    flux_type : str of ["import", "production"]
        Whether to fit using import or production fluxes.
    min_coef : float in [0.0, Inf]
        Only report coefficient that are at least that large.
    atol : float
        Tolerance to consider a flux different from zero. Should be roughly
        equivalent to the solver tolerance.

    Returns
    -------
    Visualization
        A MICOM visualization. Can be served with `viz.view`.

    Raises
    ------
    ValueError
        If no flux exceeds `atol`, or `phenotype` does not match
        `variable_type`, or `variable_type` is unsupported.
    RuntimeError
        If no coefficient reaches `min_coef` in absolute value.
    """
    exchanges = results.exchanges
    # Index a copy so the caller's ``results.annotations`` is left untouched.
    anns = results.annotations.copy()
    anns.index = anns.metabolite

    if flux_type == "import":
        # ``.copy()`` so the column assignment acts on an independent frame.
        exchanges = exchanges[
            (exchanges.taxon == "medium") & (exchanges.direction == "import")
        ].copy()
        exchanges["flux"] = exchanges.flux.abs()
    else:
        exchanges = exchanges[
            (exchanges.taxon != "medium") & (exchanges.direction == "export")
        ]
        # Abundance-weighted total flux per reaction/metabolite/sample.
        exchanges = (
            exchanges.groupby(["reaction", "metabolite", "sample_id"])
            .apply(
                lambda df: pd.Series(
                    {"flux": sum(df.abundance * df.flux.abs())}
                )
            )
            .reset_index()
        )
    exchanges = exchanges.loc[exchanges.flux > atol]
    # Rows, not columns: the column count is not changed by the filter above.
    if exchanges.shape[0] < 1:
        raise ValueError("None of the fluxes passed the tolerance threshold :(")

    if variable_type == "binary" and phenotype.nunique() != 2:
        raise ValueError(
            "Binary variables must have exactly two unique values, yours "
            # ``str`` each value: ``join`` rejects non-string labels.
            "has: %s." % ", ".join(str(v) for v in phenotype.unique())
        )
    elif variable_type == "continuous" and not is_numeric_dtype(phenotype):
        raise ValueError(
            "Continuous variables must have a numeric type, but yours is"
            " of type `%s`." % phenotype.dtype
        )
    elif variable_type not in ["binary", "continuous"]:
        raise ValueError(
            "Unsupported variable type. Must be either `binary` or "
            "`continuous`."
        )

    # Samples x metabolites matrix of log-fluxes; missing entries get `atol`.
    fluxes = exchanges.pivot_table(
        index="sample_id", columns="metabolite", values="flux", fill_value=atol
    )
    fluxes = np.log(fluxes)
    meta = phenotype[fluxes.index]

    # Per-metabolite (column) spread, so the mask lines up with the columns
    # that are dropped below.
    stds = fluxes.std(axis=0)
    bad = stds < 1e-6
    if bad.any():
        logger.warning("Removing %d fluxes due to zero variance." % bad.sum())
        fluxes = fluxes.loc[:, ~bad]
    scaled = StandardScaler().fit_transform(fluxes)

    if variable_type == "binary":
        # First pick the regularization strength, then refit with it.
        model = LogisticRegressionCV(
            penalty="l1",
            scoring="accuracy",
            solver="liblinear",
            cv=2,
            Cs=np.power(10.0, np.arange(-6, 6, 0.5)),
            max_iter=50000,
        )
        fit = model.fit(scaled, meta)
        model = LogisticRegression(
            penalty="l1", solver="liblinear", C=fit.C_[0], max_iter=10000,
        )
        fit = model.fit(scaled, meta)
        score = cross_val_score(model, X=scaled, y=meta, cv=LeaveOneOut())
        coefs = pd.DataFrame(
            {"coef": fit.coef_[0, :], "metabolite": fluxes.columns}
        )
    else:
        model = LassoCV(cv=2, max_iter=50000)
        fit = model.fit(scaled, meta)
        model = Lasso(alpha=fit.alpha_, max_iter=50000)
        fit = model.fit(scaled, meta)
        score = cross_val_score(model, X=scaled, y=meta, cv=3)
        coefs = pd.DataFrame({"coef": fit.coef_, "metabolite": fluxes.columns})
    coefs["description"] = anns.loc[coefs.metabolite, "name"].values

    # [CV mean, CV std, score on the full data].
    score = [np.mean(score), np.std(score)]
    score.append(model.score(scaled, meta))

    if all(coefs.coef.abs() < min_coef):
        raise RuntimeError(
            "Unfortunately no metabolite flux was predictive for the "
            "chosen phenotype and a cutoff of %g :(" % min_coef
        )

    # ``data`` keeps the unfiltered fluxes and coefficients.
    data = {"fluxes": exchanges, "coefficients": coefs}
    coefs = coefs[coefs.coef.abs() >= min_coef].sort_values(by="coef")
    predicted = cross_val_predict(model, scaled, meta, cv=LeaveOneOut())
    fitted = pd.DataFrame(
        {"real": meta, "predicted": predicted}, index=meta.index
    )

    # Copy before adding columns so ``data["fluxes"]`` is not affected.
    exchanges = exchanges.loc[
        exchanges.metabolite.isin(coefs.metabolite.values)
    ].copy()
    exchanges["meta"] = meta[exchanges.sample_id].values
    exchanges["description"] = anns.loc[exchanges.metabolite, "name"].values

    var_type = "nominal" if variable_type == "binary" else "quantitative"
    viz = Visualization(filename, data, "tests.html")

    viz.save(
        fitted=fitted.to_json(orient="records"),
        coefs=coefs.to_json(orient="records"),
        exchanges=exchanges.to_json(orient="records"),
        metabolites=json.dumps(coefs.metabolite.tolist()),
        variable=variable_name,
        type=var_type,
        score=score,
        width=400,
        height=300,
        cheight=max(2 * coefs.shape[0], 40),
        cwidth=max(8 * coefs.shape[0], 160),
    )

    return viz
def per_voxel_analysis(model, fmri_runs, design_matrices, subject, alpha_list):
    """Nested leave-one-run-out selection of ``alpha`` and scoring per voxel.

    Outer loop: each run in turn is the test run. Inner loop: among the
    remaining runs, each in turn is the validation run; for every value in
    ``alpha_list`` the model is fitted on the other runs and scored on the
    validation run with ``get_r2_score``. For each voxel the alpha with the
    best mean validation score is kept, the model is refitted on all
    non-test runs with that alpha, and ``sample_r2`` evaluates it on the
    test run.

    Parameters
    ----------
    model : estimator exposing ``set_params(alpha=...)`` and ``fit``.
    fmri_runs : list of 2D arrays, one per run (columns are voxels).
    design_matrices : list of 2D arrays, one per run.
    subject : unused here; kept for interface compatibility.
    alpha_list : candidate ``alpha`` values.

    Returns
    -------
    tuple
        ``(alphas_cv2, scores_cv2, distribution_array)`` with shapes
        ``(nb_runs, nb_voxels)``, ``(nb_runs, nb_voxels)`` and
        ``(nb_runs, n_sample, nb_voxels)``.
    """
    n_sample = params.n_sample
    nb_voxels = fmri_runs[0].shape[1]
    nb_alphas = len(alpha_list)
    nb_runs_test = len(fmri_runs)
    nb_runs_valid = nb_runs_test - 1
    alphas_cv2 = np.zeros((nb_runs_test, nb_voxels))
    scores_cv2 = np.zeros((nb_runs_test, nb_voxels))
    distribution_array = np.zeros((nb_runs_test, n_sample, nb_voxels))

    # Column permutations handed to ``sample_r2``. ``np.random.shuffle``
    # works in place, so a copy must be stored each time; appending the
    # array itself left ``n_sample`` references to one final permutation.
    columns_index = np.arange(design_matrices[0].shape[1])
    shuffling = []
    for _ in range(n_sample):
        np.random.shuffle(columns_index)
        shuffling.append(columns_index.copy())

    logo = LeaveOneOut()  # leave one run out
    for cv3, (train_, test) in enumerate(logo.split(fmri_runs)):
        fmri_data_train_ = [fmri_runs[i] for i in train_]
        predictors_train_ = [design_matrices[i] for i in train_]

        # Allocated once per outer fold so that every validation fold
        # contributes; re-zeroing it inside the inner loop meant only the
        # last validation fold influenced the alpha choice.
        scores_cv1 = np.zeros((nb_voxels, nb_runs_valid, nb_alphas))

        logo2 = LeaveOneOut()  # leave one run out
        for cv2, (train, valid) in enumerate(logo2.split(fmri_data_train_)):
            fmri_data_train = [fmri_data_train_[i] for i in train]
            predictors_train = [predictors_train_[i] for i in train]
            dm = np.vstack(predictors_train)
            fmri = np.vstack(fmri_data_train)

            # Validation score of every alpha, for all voxels at once.
            # (The debug timing write to a hard-coded absolute path, marked
            # "to delete" by its author, has been removed.)
            for cv1, alpha_tmp in enumerate(tqdm(alpha_list)):
                model.set_params(alpha=alpha_tmp)
                model_fitted = model.fit(dm, fmri)
                r2 = get_r2_score(model_fitted,
                                  fmri_data_train_[valid[0]],
                                  predictors_train_[valid[0]])
                scores_cv1[:, cv2, cv1] = r2

        # Best alpha per voxel: highest score averaged over validation runs.
        best_alphas_indexes = np.argmax(np.mean(scores_cv1, axis=1), axis=1)
        alphas_cv2[cv3, :] = np.array(
            [alpha_list[i] for i in best_alphas_indexes])

        fmri2 = np.vstack(fmri_data_train_)
        dm2 = np.vstack(predictors_train_)
        test_run = fmri_runs[test[0]]
        # Refit each voxel with its own best alpha, then score the test run.
        for voxel in tqdm(range(nb_voxels)):
            y = fmri2[:, voxel].reshape((fmri2.shape[0], 1))
            model.set_params(alpha=alphas_cv2[cv3, voxel])
            model_fitted = model.fit(dm2, y)
            r2, distribution = sample_r2(
                model_fitted,
                design_matrices[test[0]],
                test_run[:, voxel].reshape((test_run.shape[0], 1)),
                shuffling=shuffling,
                n_sample=n_sample,
                alpha_percentile=params.alpha_percentile,
                test=True)
            scores_cv2[cv3, voxel] = r2[0]
            distribution_array[cv3, :, voxel] = distribution

    # 2D arrays : (nb_runs_test, nb_voxels)
    return alphas_cv2, scores_cv2, distribution_array
def _plot_kde_vs_gaussian_absolute(x, ks2samp_txt, plname,
                                   err_prediction_seconds,
                                   manualkdebandwidth=None):
    """Plot a gaussian KDE of ``x`` against a zero-mean normal distribution.

    The figure is written to
    ``../results/verify_tess_timestamps/<plname>_kde_vs_gaussian_absolute.png``.

    Parameters
    ----------
    x : 1-D numpy array
        Samples the KDE is fitted to (indexed as ``x[:, None]``). The x-axis
        is labelled 'Observed - Prediction [seconds]'.
    ks2samp_txt : str
        Pre-formatted text that is copied into the annotation box.
    plname : str
        Name used in the output file name; 'WASP-18b' gets a special legend
        location.
    err_prediction_seconds : array-like
        Its mean is used as the sigma of the reference gaussian and to set the
        range on which the KDE is evaluated.
    manualkdebandwidth : float, optional
        KDE bandwidth. When falsy (``None`` or 0) the bandwidth is chosen by a
        leave-one-out cross-validated grid search over 100 log-spaced values
        in [0.1, 10].
    """
    # get the kernel bandwidth for the KDE
    if not manualkdebandwidth:
        bandwidths = 10**np.linspace(-1, 1, 100)
        params = {'bandwidth': bandwidths}
        grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                            params,
                            cv=LeaveOneOut())
        grid.fit(x[:, None])
        print('ran grid search for best kernel width.')
        bandwidth = grid.best_params_['bandwidth']
        print('got {:.3g}'.format(bandwidth))
    else:
        bandwidth = manualkdebandwidth

    # instantiate and fit the KDE model
    kde = KernelDensity(bandwidth=bandwidth, kernel='gaussian')
    kde.fit(x[:, None])

    # score_samples returns the log of the probability density
    meanerr = np.mean(err_prediction_seconds)
    x_d = np.linspace(-20 * meanerr, 20 * meanerr, num=1000)
    logprob = kde.score_samples(x_d[:, None])

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.fill_between(x_d, np.exp(logprob), alpha=0.5, label='KDE from data')
    ax.plot(x, np.full_like(x, 0), '|k', markeredgewidth=1, label='data')
    # Raw strings: '\m' and '\s' are not valid escape sequences, so the
    # non-raw form triggers a SyntaxWarning on recent Python versions.
    ax.plot(x_d,
            norm.pdf(x_d, loc=0, scale=meanerr),
            label=r'gaussian, $\mu=0$, $\sigma={:.3g} sec$'.format(meanerr))

    sigtxt = (
        r'1$\sigma$ error in prediction: {:.1f} seconds'.format(meanerr))
    if not manualkdebandwidth:
        txt = (
            'leaveoneout x-validated KDE bandwidth: {:.3g}\n{:s}\n{:s}'.format(
                bandwidth, ks2samp_txt, sigtxt))
    else:
        txt = ('manually selected KDE bandwidth: {:.3g} seconds\n{:s}\n{:s}'.
               format(bandwidth, ks2samp_txt, sigtxt))
    ax.text(0.02, 0.98, txt,
            transform=ax.transAxes,
            color='gray',
            fontsize='xx-small',
            va='top',
            ha='left')

    ax.set_xlabel('Observed - Prediction [seconds]')
    if plname == 'WASP-18b':
        loc = 'center left'
    else:
        loc = 'best'
    ax.legend(loc=loc, fontsize='x-small')
    ax.get_yaxis().set_tick_params(which='both', direction='in')
    ax.get_xaxis().set_tick_params(which='both', direction='in')

    fig.tight_layout(h_pad=0, w_pad=0)
    # Zoom on +/- 10 standard deviations around the sample mean.
    ax.set_xlim([np.mean(x) - 10 * np.std(x), np.mean(x) + 10 * np.std(x)])

    savdir = '../results/verify_tess_timestamps/'
    savname = '{:s}_kde_vs_gaussian_absolute.png'.format(plname)
    savpath = os.path.join(savdir, savname)
    fig.tight_layout()
    fig.savefig(savpath, bbox_inches='tight', dpi=400)
    print('saved {:s}'.format(savpath))
def grid_search_loocv_SMOTE(self,
                            X_train,
                            y_train,
                            X_test,
                            y_test,
                            y_test_patients,
                            params,
                            Classifier,
                            oversampling=False,
                            pos_label=1,
                            average='macro'):
    """Grid-search ``Classifier`` with leave-one-out CV and SMOTE, then report.

    For every configuration in ``ParameterGrid(params)`` the classifier is
    evaluated with leave-one-out cross-validation on the training set; inside
    each fold the training part is oversampled with ``SMOTE(random_state=42)``
    before fitting. The configuration with the highest f1-score is refitted on
    the SMOTE-resampled full training set and evaluated on the test set. All
    results are printed; nothing is returned.

    Parameters
    ----------
    X_train, y_train : numpy arrays
        Training samples and labels (indexed with integer index arrays).
    X_test, y_test : numpy arrays
        Held-out samples and labels; labels 1 and 0 are treated as the
        PD / healthy classes in the per-patient summary.
    y_test_patients : numpy array
        Patient identifier of every test sample, used to aggregate
        predictions per patient.
    params : dict
        Parameter grid handed to ``ParameterGrid``.
    Classifier : callable
        Estimator class, instantiated as ``Classifier(**configuration)``.
    oversampling : bool
        Not used by this method.
    pos_label, average :
        Forwarded to ``f1_score``.
    """
    loo = LeaveOneOut()
    print('---')

    best_f1_score = 0.0
    best_config = None
    for configuration in ParameterGrid(params):
        clf = Classifier(**configuration)
        y_pred = list()
        # Leave one out
        for train_indices, test_indices in loo.split(X_train):
            X_train_curr = X_train[train_indices]
            X_test_curr = X_train[test_indices]
            y_train_curr = y_train[train_indices]

            # Oversample only the training part of the fold so that the
            # held-out sample never influences the synthetic points.
            sampler = SMOTE(random_state=42)
            X_train_curr, y_train_curr = sampler.fit_resample(
                X_train_curr, y_train_curr)

            clf.fit(X_train_curr, y_train_curr)
            y_pred.append(clf.predict(X_test_curr))

        y_pred = np.array(y_pred)
        f1 = f1_score(y_train, y_pred, pos_label=pos_label, average=average)
        # Always accept the first configuration: otherwise a grid in which
        # every configuration scores 0 leaves best_config as None and the
        # refit below fails with a TypeError.
        if best_config is None or f1 > best_f1_score:
            best_f1_score = f1
            best_config = configuration

    print('\nBest configuration found:', best_config)
    print('With f1-score ' + str(average) + ':', best_f1_score)

    # Refit the winner on the (resampled) full training set.
    clf = Classifier(**best_config)
    sampler = SMOTE(random_state=42)
    X_train, y_train = sampler.fit_resample(X_train, y_train)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    f1_weighted = f1_score(y_test,
                           y_pred,
                           pos_label=pos_label,
                           average='weighted')
    p_macro, r_macro, f1_macro, s = precision_recall_fscore_support(
        y_test, y_pred, average='macro')
    p_none, r_none, f1_none, s_none = precision_recall_fscore_support(
        y_test, y_pred, average=None)
    conf_mat = confusion_matrix(y_test, y_pred)
    print('CONF\n', conf_mat)
    print('\n')
    print('\nEVALUATION ON TEST SET:')
    print('f1-score (weighted) ' + str(average) + ':', f1_weighted)
    print('accuracy:', accuracy)

    # --- Compute # of PD patients correctly recalled
    n_pd_patients_recalled = 0
    n_pd_patients = 0
    n_healthy_patients_recalled = 0
    n_healthy_patients = 0
    unique_names = set(y_test_patients)
    for name in unique_names:
        is_patient = (y_test_patients == name)
        pd_predictions = y_pred[is_patient & (y_test == 1)]
        if len(pd_predictions) > 0:
            # A patient with at least one label-1 sample counts as PD and is
            # recalled when most of those samples are predicted as 1.
            curr_mean = np.mean(pd_predictions)
            print('PD patient:', curr_mean)
            if curr_mean > 0.5:
                n_pd_patients_recalled += 1
            n_pd_patients += 1
        else:
            curr_mean = 1 - np.mean(y_pred[is_patient & (y_test == 0)])
            print('Healty patient:', curr_mean)
            if curr_mean > 0.5:
                n_healthy_patients_recalled += 1
            n_healthy_patients += 1

    print('# PD patients correctly recalled:',
          str(n_pd_patients_recalled) + '/' + str(n_pd_patients))
    print('# Healthy patients correctly recalled:',
          str(n_healthy_patients_recalled) + '/' + str(n_healthy_patients))
    print('\n----\n----\n')
    print('CLASSIFIER', self.DO_THIS_CLASSIFIER)
    print('PCA', self.DO_PCA)
    print('SMOTE', self.DO_SMOTE)
    print('accuracy', 'f1_weighted', 'f1_macro', 'precision_macro',
          'recall_macro', '#PDPatientsRecalled',
          '#HealthyPatientsRecalled, p1, r1, p0, r0, f1, f0')
    print([
        round(accuracy, 4),
        round(f1_weighted, 4),
        round(f1_macro, 4),
        round(p_macro, 4),
        round(r_macro, 4), n_pd_patients_recalled,
        n_healthy_patients_recalled,
        round(p_none[1], 4),
        round(r_none[1], 4),
        round(p_none[0], 4),
        round(r_none[0], 4),
        round(f1_none[1], 4),
        round(f1_none[0], 2)
    ])
    print('\n----\n----\n')
    print('\n--------\nEND GRID-SEARCH\n--------\n')
    return
def IEM_cross_condition_l1out_shuff(testing_activity,
                                    testing_behaviour,
                                    decode_item,
                                    WM,
                                    WM_t,
                                    Inter,
                                    condition,
                                    subject,
                                    region,
                                    iterations,
                                    tr_st,
                                    tr_end,
                                    ref_angle=180):
    """Build a shuffled-label (null) decoding distribution with the IEM.

    IEM applied to the WM test data. TRs in ``range(tr_st, tr_end)`` also
    contributed training data (conditions 1_7 and 2_7), so for them the model
    is retrained with cross-validation instead of reusing ``WM`` (which would
    overfit). The original header comment mentions a 10-split k-fold; the code
    uses ``LeaveOneOut``. Labels are shuffled once per iteration for the
    independent TRs and once per fold (test labels only) for the shared TRs.

    Parameters
    ----------
    testing_activity : numpy array
        Indexed as ``testing_activity[:, scan, :]``.
        NOTE(review): presumably (trials, scans, voxels) - confirm.
    testing_behaviour : mapping / DataFrame
        Must provide the column 'T', 'A_R' or 'Dist' selected by
        ``decode_item``.
    decode_item : {'Target', 'Response', 'Distractor'}
    WM, WM_t, Inter :
        Pre-trained weights, their transpose and the intercept, forwarded to
        ``Representation`` for the TRs without shared information.
    condition, subject, region :
        Copied verbatim into the output columns of the same name.
    iterations : int
        Number of shuffles.
    tr_st, tr_end : int
        Half-open range of scans that share data with the training set.
    ref_angle : int
        Row ``ref_angle * 2`` of each reconstruction is read and
        ``f2(ref_angle)`` is used for the population-vector read-out.
        NOTE(review): the ``Representation`` calls below hard-code
        ``ref_angle=180`` instead of forwarding this argument - confirm
        whether that is intended.

    Returns
    -------
    pandas.DataFrame
        One block of rows per iteration with the columns 'times', 'decoding',
        'region', 'subject' and 'condition'.

    Raises
    ------
    ValueError
        If ``decode_item`` is not one of the supported items.
    """
    decode_columns = {'Target': 'T', 'Response': 'A_R', 'Distractor': 'Dist'}
    if decode_item not in decode_columns:
        # Previously this branch was a no-op string expression, which only
        # surfaced later as a NameError on dec_I.
        raise ValueError(
            'Error specifying the decode item: %r' % (decode_item, ))
    dec_I = decode_columns[decode_item]

    # Split the scans into those with and without shared information.
    # (range objects have no .remove() on Python 3, so build the list
    # directly.)
    trs_shared = range(tr_st, tr_end)
    shared_scans = set(trs_shared)
    list_wm_scans2 = [
        scan for scan in range(nscans_wm) if scan not in shared_scans
    ]

    testing_angles = np.array(testing_behaviour[dec_I])  # A_R # T # Dist
    Reconstructions_shuffled = []
    for It in range(iterations):
        # ---- TRs without shared information: reuse the trained weights and
        # decode against randomly drawn angles.
        testing_angles_suhff = np.array([
            random.choice([0, 90, 180, 270])
            for i in range(len(testing_angles))
        ])
        signal_paralel = [testing_activity[:, i, :] for i in list_wm_scans2]
        Reconstructions = Parallel(n_jobs=numcores)(
            delayed(Representation)(signal,
                                    testing_angles_suhff,
                                    WM,
                                    WM_t,
                                    ref_angle=180,
                                    plot=False,
                                    intercept=Inter)
            for signal in signal_paralel)  # standard reconstruction (parallel)
        # mean of the reconstructions (all trials)
        Reconstruction_indep = pd.concat(Reconstructions, axis=1)
        Reconstruction_indep.columns = [
            str(i * TR) for i in list_wm_scans2
        ]  # column names

        # ---- TRs with shared information: leave-one-out retraining.
        Recons_dfs_shared = []
        for shared_TR in trs_shared:
            testing_data = testing_activity[:, shared_TR, :]
            reconstrction_sh = []
            loo = LeaveOneOut()
            for train_index, test_index in loo.split(testing_data):
                X_train, X_test = (testing_data[train_index],
                                   testing_data[test_index])
                # Training labels are NOT shuffled: the reference WM / WM_t
                # were not trained on shuffled data either.
                y_train, y_test = (testing_angles[train_index],
                                   testing_angles[test_index])
                # train
                WM2, Inter2 = Weights_matrix_LM(X_train, y_train)
                WM_t2 = WM2.transpose()
                # the shuffle happens here, on the held-out labels only
                y_test = np.array([
                    random.choice([0, 90, 180, 270])
                    for i in range(len(y_test))
                ])
                # test
                rep_x = Representation(testing_data=X_test,
                                       testing_angles=y_test,
                                       Weights=WM2,
                                       Weights_t=WM_t2,
                                       ref_angle=180,
                                       plot=False,
                                       intercept=Inter2)
                reconstrction_sh.append(rep_x)

            # Put the folds side by side and average them, keeping the index:
            # a single column with the mean of every channel remains.
            reconstrction_sh = pd.concat(reconstrction_sh, axis=1)
            reconstrction_sh_mean = reconstrction_sh.mean(axis=1)
            Recons_dfs_shared.append(reconstrction_sh_mean)

        Reconstruction_shared = pd.concat(Recons_dfs_shared, axis=1)
        Reconstruction_shared.columns = [str(i * TR) for i in trs_shared]

        # ---- Merge both reconstruction frames into a single one.
        Reconstruction = pd.concat(
            [Reconstruction_indep, Reconstruction_shared], axis=1)
        # Sort the columns by time so the shared TRs do not end up last.
        # Sorting the existing labels (instead of rebuilding them through
        # str(float(label))) keeps them matching even when TR is an integer.
        sorted_col = sorted(Reconstruction.columns, key=float)
        Reconstruction = Reconstruction.reindex(sorted_col, axis=1)
        Reconstructions_shuffled.append(Reconstruction)

    # ---- Keep only what is of interest: the supposed target location.
    df_shuffle = []
    for reconstruction in Reconstructions_shuffled:
        # around the ref_angle (x2 because there are now 720 rows, not 360)
        n = reconstruction.iloc[ref_angle * 2, :]
        n = n.reset_index()
        n.columns = ['times', 'decoding']
        # population vector method (scalar product)
        n['decoding'] = [
            sum(reconstruction.iloc[:, ts] * f2(ref_angle))
            for ts in range(len(n))
        ]
        n['times'] = n['times'].astype(float)
        n['region'] = region
        n['subject'] = subject
        n['condition'] = condition
        df_shuffle.append(n)

    # same shape as the decoding of the signal
    df_shuffle = pd.concat(df_shuffle)
    return df_shuffle
def grid_search_loocv(self,
                      X_train,
                      y_train,
                      X_test,
                      y_test,
                      y_test_patients,
                      params,
                      Classifier,
                      oversampling=False,
                      pos_label=1,
                      average='macro',
                      columns_new=[]):
    """Grid-search ``Classifier`` with leave-one-out CV, then report on test.

    Every configuration in ``ParameterGrid(params)`` is scored with the
    f1-score of its leave-one-out ``cross_val_predict`` predictions on the
    training set. The best configuration is refitted on the full training set
    and evaluated on the test set. All results are printed; nothing is
    returned.

    Parameters
    ----------
    X_train, y_train :
        Training samples and labels.
    X_test, y_test : numpy arrays
        Held-out samples and labels; labels 1 and 0 are treated as the
        PD / healthy classes in the per-patient summary.
    y_test_patients : numpy array
        Patient identifier of every test sample, used to aggregate
        predictions per patient.
    params : dict
        Parameter grid handed to ``ParameterGrid``.
    Classifier : callable
        Estimator class, instantiated as ``Classifier(**configuration)``.
    oversampling : bool
        Not used by this method.
    pos_label, average :
        Forwarded to ``f1_score``.
    columns_new : list
        Feature names; only referenced by the commented-out plotting code
        below, so the (mutable) default is never read or modified.
    """
    best_f1_score = 0.0
    best_config = None
    all_scores = []
    for configuration in ParameterGrid(params):
        print(configuration)
        clf = Classifier(**configuration)
        y_pred_val = cross_val_predict(clf,
                                       X_train,
                                       y_train,
                                       cv=LeaveOneOut())
        f1 = f1_score(y_train,
                      y_pred_val,
                      pos_label=pos_label,
                      average=average)
        # Always accept the first configuration: otherwise a grid in which
        # every configuration scores 0 leaves best_config as None and the
        # refit below fails with a TypeError.
        if best_config is None or f1 > best_f1_score:
            best_f1_score = f1
            best_config = configuration
        all_scores.append(f1)

    print('all scores:', all_scores)
    print('\nBest configuration found:', best_config)
    print('With f1-score ' + str(average) + ':', best_f1_score)

    # Refit the winner on the full training set.
    clf = Classifier(**best_config)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    f1_weighted = f1_score(y_test,
                           y_pred,
                           pos_label=pos_label,
                           average='weighted')
    p_macro, r_macro, f1_macro, s = precision_recall_fscore_support(
        y_test, y_pred, average='macro')
    p_none, r_none, f1_none, s_none = precision_recall_fscore_support(
        y_test, y_pred, average=None)
    conf_mat = confusion_matrix(y_test, y_pred)
    print('CONF\n', conf_mat)
    print('\n')
    print('\nEVALUATION ON TEST SET:')
    print('f1-score (weighted) ' + str(average) + ':', f1_weighted)
    print('accuracy:', accuracy)

    # --- Compute # of PD patients correctly recalled
    n_pd_patients_recalled = 0
    n_pd_patients = 0
    n_healthy_patients_recalled = 0
    n_healthy_patients = 0
    unique_names = set(y_test_patients)
    for name in unique_names:
        is_patient = (y_test_patients == name)
        pd_predictions = y_pred[is_patient & (y_test == 1)]
        if len(pd_predictions) > 0:
            # A patient with at least one label-1 sample counts as PD and is
            # recalled when most of those samples are predicted as 1.
            curr_mean = np.mean(pd_predictions)
            print('PD patient:', curr_mean)
            if curr_mean > 0.5:
                n_pd_patients_recalled += 1
            n_pd_patients += 1
        else:
            curr_mean = 1 - np.mean(y_pred[is_patient & (y_test == 0)])
            print('Healty patient:', curr_mean)
            if curr_mean > 0.5:
                n_healthy_patients_recalled += 1
            n_healthy_patients += 1

    print('# PD patients correctly recalled:',
          str(n_pd_patients_recalled) + '/' + str(n_pd_patients))
    print('# Healthy patients correctly recalled:',
          str(n_healthy_patients_recalled) + '/' + str(n_healthy_patients))
    print('\n----\n----\n')
    print('CLASSIFIER', self.DO_THIS_CLASSIFIER)
    print('PCA', self.DO_PCA)
    print('SMOTE', self.DO_SMOTE)
    print('accuracy', 'f1_weighted', 'f1_macro', 'precision_macro',
          'recall_macro', '#PDPatientsRecalled',
          '#HealthyPatientsRecalled, p1, r1, p0, r0, f1, f0')
    print([
        round(accuracy, 4),
        round(f1_weighted, 4),
        round(f1_macro, 4),
        round(p_macro, 4),
        round(r_macro, 4), n_pd_patients_recalled,
        n_healthy_patients_recalled,
        round(p_none[1], 4),
        round(r_none[1], 4),
        round(p_none[0], 4),
        round(r_none[0], 4),
        round(f1_none[1], 4),
        round(f1_none[0], 2)
    ])
    print('\n----\n----\n')
    # coeff_values = pd.DataFrame({'Coefficient value': clf.coef_[0], 'Features': columns_new})
    # coeff_values.sort_values(by=['Coefficient value'], inplace=True, ascending=False)
    # fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6,4.5))
    # ax = sns.barplot("Coefficient value", "Features", data=coeff_values,
    #                  palette="Blues_d")
    # plt.show()
    print('\n--------\nEND GRID-SEARCH\n--------\n')
    # print(export_graphviz(clf, feature_names=columns_new, out_file='tree.dot'))
    # dot_data = export_graphviz(clf, feature_names=columns_new)
    # graph = graphviz.Source(dot_data)
    # graph.render("ultimo_tree")
    return
def main():
    """Compare stellar masses fitted with different band sets.

    Reads the GOODS North/South SN Ia host catalogs, loads the fitting
    results for three band sets (all bands, ubriz, briz) and produces:

    * ``mass_dist.pdf``      - KDEs of log stellar mass per band set,
    * ``mass_residuals.pdf`` - log-mass residuals w.r.t. the all-bands fit,
    * an on-screen figure with KDEs of the measurement significance.

    The catalogs are loaded with ``pandas.read_pickle`` from ``adap_dir``;
    only trusted files should be placed there.
    """
    # Empty lists for storing masses
    fit_mass_allbands = []
    fit_mass_allbands_err = []
    fit_mass_ubriz = []
    fit_mass_ubriz_err = []
    fit_mass_briz = []
    fit_mass_briz_err = []

    # List for storing redshifts
    redshifts = []

    for field in ['North', 'South']:

        # Read in catalog from Lou
        if 'North' in field:
            df = pandas.read_pickle(adap_dir +
                                    'GOODS_North_SNeIa_host_phot.pkl')
            key = 'ID'
        elif 'South' in field:
            df = pandas.read_pickle(adap_dir +
                                    'GOODS_South_SNeIa_host_phot.pkl')
            key = 'Seq'

        # Loop over all of our objects
        for i in range(len(df)):

            # Now read in the fitting results and get our stellar masses
            galaxy_seq = df[key][i]

            h5file_allbands = adap_dir + "goodss_param_sfh/all_bands/" + "emcee_" + \
                field + "_" + str(galaxy_seq) + ".h5"
            h5file_ubriz = adap_dir + "goodss_param_sfh/ubriz/" + "emcee_" + \
                field + "_" + str(galaxy_seq) + ".h5"
            h5file_briz = adap_dir + "goodss_param_sfh/briz/" + "emcee_" + \
                field + "_" + str(galaxy_seq) + ".h5"

            result_all, obs, _ = reader.results_from(h5file_allbands,
                                                     dangerous=False)
            result_ubriz, obs, _ = reader.results_from(h5file_ubriz,
                                                       dangerous=False)
            result_briz, obs, _ = reader.results_from(h5file_briz,
                                                      dangerous=False)

            # Each result is indexed as [0], [1], [2]; [1] is used as the
            # value and the differences to [0] / [2] as lower / upper error.
            cq_mass_all = get_cq_mass(result_all)
            cq_mass_ubriz = get_cq_mass(result_ubriz)
            cq_mass_briz = get_cq_mass(result_briz)

            # Append to plotting arrays
            fit_mass_allbands.append(cq_mass_all[1])
            fit_mass_allbands_lowerr = cq_mass_all[1] - cq_mass_all[0]
            fit_mass_allbands_uperr = cq_mass_all[2] - cq_mass_all[1]
            fit_mass_allbands_err.append(
                [fit_mass_allbands_lowerr, fit_mass_allbands_uperr])

            fit_mass_ubriz.append(cq_mass_ubriz[1])
            fit_mass_ubriz_lowerr = cq_mass_ubriz[1] - cq_mass_ubriz[0]
            fit_mass_ubriz_uperr = cq_mass_ubriz[2] - cq_mass_ubriz[1]
            fit_mass_ubriz_err.append(
                [fit_mass_ubriz_lowerr, fit_mass_ubriz_uperr])

            fit_mass_briz.append(cq_mass_briz[1])
            fit_mass_briz_lowerr = cq_mass_briz[1] - cq_mass_briz[0]
            fit_mass_briz_uperr = cq_mass_briz[2] - cq_mass_briz[1]
            fit_mass_briz_err.append(
                [fit_mass_briz_lowerr, fit_mass_briz_uperr])

            redshifts.append(df['zbest'][i])

    # --------- Convert to numpy arrays
    # The error lists hold one [lower, upper] pair per object, i.e. shape
    # (N, 2). They must be TRANSPOSED to (2, N): reshape((2, N)) would keep
    # the memory order and mix lower/upper errors of different objects.
    fit_mass_allbands = np.array(fit_mass_allbands)
    fit_mass_allbands_err = np.array(fit_mass_allbands_err).T

    fit_mass_ubriz = np.array(fit_mass_ubriz)
    fit_mass_ubriz_err = np.array(fit_mass_ubriz_err).T

    fit_mass_briz = np.array(fit_mass_briz)
    fit_mass_briz_err = np.array(fit_mass_briz_err).T

    # --------
    xdata = np.log10(fit_mass_allbands)
    n_obj = len(xdata)

    x_arr = np.arange(5.0, 13.0, 0.01)

    # ------------------ histogram and KDE
    fig = plt.figure()
    ax = fig.add_subplot(111)

    ax.set_xlabel(r'$\mathrm{log(M_s)}$')
    ax.set_ylabel(r'$\mathrm{Normalized\ Density}$')

    from sklearn.neighbors import KernelDensity
    from sklearn.model_selection import GridSearchCV, LeaveOneOut
    from scipy.stats import gaussian_kde

    xdata_for_kde = xdata[:, None]
    x2 = np.log10(fit_mass_ubriz)[:, None]
    x3 = np.log10(fit_mass_briz)[:, None]
    x_arr_for_kde = x_arr[:, None]

    # ---- get bandwidth estimates
    # NOTE(review): bw1..bw3 are computed but the KDEs below use a fixed
    # bandwidth of 0.25 - confirm whether that is intended.
    bandwidths = 10**np.linspace(-1, 1, 100)
    grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                        {'bandwidth': bandwidths},
                        cv=LeaveOneOut())
    grid.fit(xdata_for_kde)
    bw1 = grid.best_params_['bandwidth']

    grid.fit(x2)
    bw2 = grid.best_params_['bandwidth']

    grid.fit(x3)
    bw3 = grid.best_params_['bandwidth']

    # Now estimate KDEs
    kde1 = KernelDensity(kernel='gaussian', bandwidth=0.25).fit(xdata_for_kde)
    log_dens1 = kde1.score_samples(x_arr_for_kde)

    kde2 = KernelDensity(kernel='gaussian', bandwidth=0.25).fit(x2)
    log_dens2 = kde2.score_samples(x_arr_for_kde)

    kde3 = KernelDensity(kernel='gaussian', bandwidth=0.25).fit(x3)
    log_dens3 = kde3.score_samples(x_arr_for_kde)

    # Plot KDEs
    ax.plot(x_arr, np.exp(log_dens1), color='k', lw=2.5,
            label='UV-Optical-NIR-MIR', zorder=2)
    ax.plot(x_arr, np.exp(log_dens2), color='mediumblue', lw=1.4,
            label='ubriz', zorder=1)
    ax.plot(x_arr, np.exp(log_dens3), color='darkturquoise', lw=1.4,
            label='briz', zorder=1)

    # KDEs using Scipy
    x1_kde = gaussian_kde(xdata)
    ax.plot(x_arr, x1_kde(x_arr), ls='--', color='k', lw=2.5, zorder=2)

    x2_kde = gaussian_kde(np.log10(fit_mass_ubriz))
    ax.plot(x_arr, x2_kde(x_arr), ls='--', color='mediumblue', lw=1.4,
            zorder=2)

    x3_kde = gaussian_kde(np.log10(fit_mass_briz))
    ax.plot(x_arr, x3_kde(x_arr), ls='--', color='darkturquoise', lw=1.4,
            zorder=2)

    ax.legend(loc=2, fontsize=10, frameon=False)
    ax.set_xlim(7.5, 12.5)

    fig.savefig(adap_dir + 'mass_dist.pdf', dpi=300, bbox_inches='tight')

    # ------------------ make residual figure
    fig1 = plt.figure()
    ax1 = fig1.add_subplot(111)

    ax1.set_xlabel(r'$\mathrm{log(M_{s;\,all\ bands})}$')
    ax1.set_ylabel(
        r'$\mathrm{log(M_{s;\,all\ bands}) - log(M_{s;\,(u)briz}) }$')

    deltamass1 = xdata - np.log10(fit_mass_ubriz)
    deltamass2 = xdata - np.log10(fit_mass_briz)

    # Log-space error bars, rows = (lower, upper); sized from the data
    # instead of a hard-coded object count.
    xdata_err = np.empty((2, n_obj))
    deltamass1_err = np.empty((2, n_obj))
    deltamass2_err = np.empty((2, n_obj))

    for j in range(n_obj):
        xd = fit_mass_allbands[j]
        xdl = np.abs(np.log10(1 - fit_mass_allbands_err[0, j] / xd))
        xdu = np.log10(1 + fit_mass_allbands_err[1, j] / xd)
        xdata_err[:, j] = [xdl, xdu]

        val1 = fit_mass_ubriz[j]
        dm1l = np.abs(
            np.log10(1 - fit_mass_allbands_err[0, j] / xd) +
            np.log10(1 + fit_mass_ubriz_err[1, j] / val1))
        dm1u = np.log10(1 + fit_mass_allbands_err[1, j] /
                        xd) + np.log10(1 - fit_mass_ubriz_err[0, j] / val1)
        deltamass1_err[:, j] = [dm1l, dm1u]

        val2 = fit_mass_briz[j]
        dm2l = np.abs(
            np.log10(1 - fit_mass_allbands_err[0, j] / xd) +
            np.log10(1 + fit_mass_briz_err[1, j] / val2))
        dm2u = np.log10(1 + fit_mass_allbands_err[1, j] /
                        xd) + np.log10(1 - fit_mass_briz_err[0, j] / val2)
        deltamass2_err[:, j] = [dm2l, dm2u]

    ax1.axhline(y=0.0, ls='--', color='k', zorder=1)

    dm1_lbl = r'$\mathrm{log(M_{s;\,all}) - log(M_{s;\,ubriz})}$'
    dm2_lbl = r'$\mathrm{log(M_{s;\,all}) - log(M_{s;\,briz})}$'

    #ax1.errorbar(xdata, deltamass1, xerr=xdata_err, yerr=deltamass1_err,
    #    fmt='o', ms=2.0, elinewidth=1.0, ecolor='mediumblue',
    #    color='mediumblue', zorder=2, label=dm1_lbl)
    #ax1.errorbar(xdata, deltamass2, xerr=xdata_err, yerr=deltamass2_err,
    #    fmt='o', ms=2.0, elinewidth=1.0, ecolor='darkturquoise',
    #    color='darkturquoise', zorder=2, label=dm2_lbl)

    ax1.scatter(xdata, deltamass1, s=12, color='mediumblue', zorder=2,
                label=dm1_lbl)
    ax1.scatter(xdata, deltamass2, s=10, color='darkturquoise', zorder=2,
                label=dm2_lbl)

    # Fit a line to the points
    m1, b1 = np.polyfit(xdata, deltamass1, 1)
    m2, b2 = np.polyfit(xdata, deltamass2, 1)

    ax1.plot(x_arr, b1 + x_arr * m1, '--', color='mediumblue')
    ax1.plot(x_arr, b2 + x_arr * m2, '--', color='darkturquoise')

    print("Errors for the points and the line estimate --")

    ax1.legend(fontsize=10, frameon=False)

    ax1.set_xlim(6.8, 12.5)
    ax1.set_ylim(-1.6, 0.8)

    # The annotations belong to ax1, so they must use ax1's axes transform
    # (the transform of `ax` refers to the other figure).
    ax1.text(x=0.38, y=0.15,
             s=r'$\mathrm{Slope}\,=\,$' + "{:.2f}".format(m1),
             verticalalignment='top', horizontalalignment='left',
             transform=ax1.transAxes, color='mediumblue', size=14)
    ax1.text(x=0.38, y=0.11,
             s=r'$\mathrm{Slope}\,=\,$' + "{:.2f}".format(m2),
             verticalalignment='top', horizontalalignment='left',
             transform=ax1.transAxes, color='darkturquoise', size=14)

    fig1.savefig(adap_dir + 'mass_residuals.pdf', dpi=300,
                 bbox_inches='tight')

    # --------------
    # Histograms of measurement significance: value divided by the mean of
    # its lower and upper error (axis 0 of the (2, N) error arrays).
    allbands_sig = fit_mass_allbands / np.mean(fit_mass_allbands_err, axis=0)
    ubriz_sig = fit_mass_ubriz / np.mean(fit_mass_ubriz_err, axis=0)
    briz_sig = fit_mass_briz / np.mean(fit_mass_briz_err, axis=0)

    fig2 = plt.figure()
    ax2 = fig2.add_subplot(111)

    # Reshape data
    allbands_sig_kde = allbands_sig[:, None]
    ubriz_sig_kde = ubriz_sig[:, None]
    briz_sig_kde = briz_sig[:, None]

    xsig = np.arange(0.0, 10.0, 0.01)
    xsig_kde = xsig[:, None]

    # Estimate optimal bandwidth
    # I think I can use the same grid of bandwidths as before
    grid.fit(allbands_sig_kde)
    bw1 = grid.best_params_['bandwidth']
    print("BW1:", bw1)

    # Now estimate KDEs
    kde1 = KernelDensity(kernel='gaussian',
                         bandwidth=1.0).fit(allbands_sig_kde)
    log_dens1 = kde1.score_samples(xsig_kde)

    kde2 = KernelDensity(kernel='gaussian', bandwidth=1.0).fit(ubriz_sig_kde)
    log_dens2 = kde2.score_samples(xsig_kde)

    kde3 = KernelDensity(kernel='gaussian', bandwidth=1.0).fit(briz_sig_kde)
    log_dens3 = kde3.score_samples(xsig_kde)

    # Plot KDEs
    ax2.plot(xsig, np.exp(log_dens1), color='k', lw=2.5,
             label='UV-Optical-NIR-MIR', zorder=2)
    ax2.plot(xsig, np.exp(log_dens2), color='mediumblue', lw=1.4,
             label='ubriz', zorder=1)
    ax2.plot(xsig, np.exp(log_dens3), color='darkturquoise', lw=1.4,
             label='briz', zorder=1)

    plt.show()

    return None
def cross_validate(cfg, featdata, cv_file=None):
    """
    Perform cross validation and log (optionally export) a text report.

    Parameters
    ----------
    cfg : config object
        Read attributes: CLASSIFIER, CV_PERFORM, CV, N_JOBS, tdef, SP_FILTER,
        SP_CHANNELS, TP_FILTER, NOTCH_FILTER, FEATURES, EPOCH, EXPORT_CLS and
        DATA_PATH.
    featdata : dict
        Needs the keys 'X_data' (3-D array: trials x samples x features),
        'Y_data', 'wlen', 'sfreq', 'ch_names' and 'picks'.
    cv_file : str, optional
        Report destination. When None and exporting is enabled, the report is
        written below ``cfg.DATA_PATH``.

    Raises
    ------
    ValueError
        If ``cfg.CLASSIFIER['selected']`` is not a known classifier.
    NotImplementedError
        If ``cfg.CV_PERFORM['selected']`` is not a supported CV scheme.
    """
    # Init a classifier
    selected_classifier = cfg.CLASSIFIER['selected']
    if selected_classifier == 'GB':
        cls = GradientBoostingClassifier(
            loss='deviance',
            learning_rate=cfg.CLASSIFIER['GB']['learning_rate'],
            presort='auto',
            n_estimators=cfg.CLASSIFIER['GB']['trees'],
            subsample=1.0,
            max_depth=cfg.CLASSIFIER['GB']['depth'],
            random_state=cfg.CLASSIFIER['GB']['seed'],
            max_features='sqrt',
            verbose=0,
            warm_start=False)
    elif selected_classifier == 'XGB':
        cls = XGBClassifier(
            loss='deviance',
            learning_rate=cfg.CLASSIFIER['XGB']['learning_rate'],
            presort='auto',
            n_estimators=cfg.CLASSIFIER['XGB']['trees'],
            subsample=1.0,
            max_depth=cfg.CLASSIFIER['XGB']['depth'],
            # The seed, not the whole XGB configuration dict.
            random_state=cfg.CLASSIFIER['XGB']['seed'],
            max_features='sqrt',
            verbose=0,
            warm_start=False)
    elif selected_classifier == 'RF':
        cls = RandomForestClassifier(
            n_estimators=cfg.CLASSIFIER['RF']['trees'],
            max_features='auto',
            max_depth=cfg.CLASSIFIER['RF']['depth'],
            n_jobs=cfg.N_JOBS,
            random_state=cfg.CLASSIFIER['RF']['seed'],
            oob_score=False,
            class_weight='balanced_subsample')
    elif selected_classifier == 'LDA':
        cls = LDA()
    elif selected_classifier == 'rLDA':
        cls = rLDA(cfg.CLASSIFIER['rLDA']['r_coeff'])
    else:
        logger.error('Unknown classifier type %s' % selected_classifier)
        raise ValueError('Unknown classifier type %s' % selected_classifier)

    # Setup features
    X_data = featdata['X_data']
    Y_data = featdata['Y_data']
    wlen = featdata['wlen']

    # Choose CV type
    ntrials, nsamples, fsize = X_data.shape
    selected_cv = cfg.CV_PERFORM['selected']
    if selected_cv == 'LeaveOneOut':
        logger.info_green('%d-fold leave-one-out cross-validation' % ntrials)
        if SKLEARN_OLD:
            cv = LeaveOneOut(len(Y_data))
        else:
            cv = LeaveOneOut()
    elif selected_cv == 'StratifiedShuffleSplit':
        logger.info_green(
            '%d-fold stratified cross-validation with test set ratio %.2f' %
            (cfg.CV_PERFORM[selected_cv]['folds'],
             cfg.CV_PERFORM[selected_cv]['test_ratio']))
        if SKLEARN_OLD:
            cv = StratifiedShuffleSplit(
                Y_data[:, 0],
                cfg.CV_PERFORM[selected_cv]['folds'],
                test_size=cfg.CV_PERFORM[selected_cv]['test_ratio'],
                random_state=cfg.CV_PERFORM[selected_cv]['seed'])
        else:
            cv = StratifiedShuffleSplit(
                n_splits=cfg.CV_PERFORM[selected_cv]['folds'],
                test_size=cfg.CV_PERFORM[selected_cv]['test_ratio'],
                random_state=cfg.CV_PERFORM[selected_cv]['seed'])
    else:
        # Report the offending name itself; indexing cfg.CV_PERFORM with an
        # unsupported key may fail before the message is even built.
        logger.error('%s is not supported yet. Sorry.' % selected_cv)
        raise NotImplementedError('%s is not supported yet.' % selected_cv)
    cv_params = cfg.CV_PERFORM[selected_cv]
    logger.info('%d trials, %d samples per trial, %d feature dimension' %
                (ntrials, nsamples, fsize))

    # Do it!
    timer_cv = qc.Timer()
    scores, cm_txt = crossval_epochs(cv,
                                     X_data,
                                     Y_data,
                                     cls,
                                     cfg.tdef.by_value,
                                     cfg.CV['BALANCE_SAMPLES'],
                                     n_jobs=cfg.N_JOBS,
                                     ignore_thres=cfg.CV['IGNORE_THRES'],
                                     decision_thres=cfg.CV['DECISION_THRES'])
    t_cv = timer_cv.sec()

    # Export results
    txt = 'Cross validation took %d seconds.\n' % t_cv
    txt += '\n- Class information\n'
    txt += '%d epochs, %d samples per epoch, %d feature dimension (total %d samples)\n' %\
        (ntrials, nsamples, fsize, ntrials * nsamples)
    for ev in np.unique(Y_data):
        txt += '%s: %d trials\n' % (cfg.tdef.by_value[ev],
                                    len(np.where(Y_data[:, 0] == ev)[0]))
    if cfg.CV['BALANCE_SAMPLES']:
        # Report the same setting that was handed to crossval_epochs above.
        txt += 'The number of samples was balanced using %ssampling.\n' % str(
            cfg.CV['BALANCE_SAMPLES']).lower()
    txt += '\n- Experiment condition\n'
    txt += 'Sampling frequency: %.3f Hz\n' % featdata['sfreq']
    txt += 'Spatial filter: %s (channels: %s)\n' % (cfg.SP_FILTER,
                                                    cfg.SP_CHANNELS)
    txt += 'Spectral filter: %s\n' % cfg.TP_FILTER[cfg.TP_FILTER['selected']]
    txt += 'Notch filter: %s\n' % cfg.NOTCH_FILTER[
        cfg.NOTCH_FILTER['selected']]
    txt += 'Channels: ' + ','.join(
        [str(featdata['ch_names'][p]) for p in featdata['picks']]) + '\n'
    txt += 'PSD range: %.1f - %.1f Hz\n' % (cfg.FEATURES['PSD']['fmin'],
                                            cfg.FEATURES['PSD']['fmax'])
    txt += 'Window step: %.2f msec\n' % (
        1000.0 * cfg.FEATURES['PSD']['wstep'] / featdata['sfreq'])
    if isinstance(wlen, list):
        for i, w in enumerate(wlen):
            txt += 'Window size: %.1f msec\n' % (w * 1000.0)
            txt += 'Epoch range: %s sec\n' % (cfg.EPOCH[i])
    else:
        txt += 'Window size: %.1f msec\n' % (cfg.FEATURES['PSD']['wlen'] *
                                             1000.0)
        txt += 'Epoch range: %s sec\n' % (cfg.EPOCH)
    txt += 'Decimation factor: %d\n' % cfg.FEATURES['PSD']['decim']

    # Compute stats
    cv_mean, cv_std = np.mean(scores), np.std(scores)
    # A CV scheme without a seed entry is reported with seed None instead of
    # failing on the missing key.
    cv_seed = cv_params['seed'] if 'seed' in cv_params else None
    txt += '\n- Average CV accuracy over %d epochs (random seed=%s)\n' % (
        ntrials, cv_seed)
    # Compare the scheme NAME; comparing its parameter dict against these
    # strings is never true and silently dropped the mean/std line.
    if selected_cv in ['LeaveOneOut', 'StratifiedShuffleSplit']:
        txt += "mean %.3f, std: %.3f\n" % (cv_mean, cv_std)
    txt += 'Classifier: %s, ' % selected_classifier
    if selected_classifier == 'RF':
        txt += '%d trees, %s max depth, random state %s\n' % (
            cfg.CLASSIFIER['RF']['trees'], cfg.CLASSIFIER['RF']['depth'],
            cfg.CLASSIFIER['RF']['seed'])
    elif selected_classifier == 'GB' or selected_classifier == 'XGB':
        # Report the hyper-parameters of the classifier actually used.
        cls_params = cfg.CLASSIFIER[selected_classifier]
        txt += '%d trees, %s max depth, %s learing_rate, random state %s\n' % (
            cls_params['trees'], cls_params['depth'],
            cls_params['learning_rate'], cls_params['seed'])
    elif selected_classifier == 'rLDA':
        txt += 'regularization coefficient %.2f\n' % cfg.CLASSIFIER['rLDA'][
            'r_coeff']
    # NOTE(review): the label says "Decision threshold" but the value is
    # IGNORE_THRES - confirm which of the two is meant.
    if cfg.CV['IGNORE_THRES'] is not None:
        txt += 'Decision threshold: %.2f\n' % cfg.CV['IGNORE_THRES']
    txt += '\n- Confusion Matrix\n' + cm_txt
    logger.info(txt)

    # Export to a file
    if 'export_result' in cv_params and cv_params['export_result'] is True:
        if cv_file is None:
            if cfg.EXPORT_CLS is True:
                qc.make_dirs('%s/classifier' % cfg.DATA_PATH)
                out_path = '%s/classifier/cv_result.txt' % cfg.DATA_PATH
            else:
                out_path = '%s/cv_result.txt' % cfg.DATA_PATH
        else:
            out_path = cv_file
        # Context manager: the file is closed even if the write fails.
        with open(out_path, 'w') as fout:
            fout.write(txt)
@author: likhith """ import numpy as np import matplotlib.pyplot as plt from sklearn import datasets from sklearn.model_selection import LeaveOneOut iris = datasets.load_iris() X_new = iris.data[50:, 2:4] #Not Considering setosa #X X = np.array([100, 2]) #array of size X = (X_new - np.min(X_new, axis=0) / np.max(X_new, axis=0) - np.min(X_new, axis=0)) #scaling splitS = LeaveOneOut() #LeaveOut cross validation X1 = splitS.get_n_splits(X) #using Leaveout y = iris.target[50:] Y = [] def mismatchcal(j, y): mismatch = 0 if j > 0.5: p = 1 else: p = 0 if y == p: mismatch += 1 else:
def _str2bool(value):
    """Convert a command-line string to a boolean.

    ``argparse`` with ``type=bool`` turns every non-empty string (including
    ``"False"`` and ``"0"``) into ``True``.  This converter maps the usual
    "false" spellings to ``False`` and, exactly as before, any other
    non-empty string to ``True``.
    """
    if isinstance(value, bool):
        return value
    return value.strip().lower() not in ('', '0', 'false', 'f', 'no', 'n',
                                         'off')


def main():
    """Command-line entry point: cross-validate, train or apply the classifier.

    With ``--train`` and without ``--savemodel`` a leave-one-out
    cross-validation of a random forest is run on resampled training data and
    the confusion matrix is printed.  With ``--train --savemodel`` a forest is
    trained on the full resampled set and pickled together with the feature
    means and standard deviations.  Without ``--train`` a pickled model is
    loaded from ``--modelfile`` and the class probabilities of every event in
    the feature file are written as a LaTeX table.
    """
    parser = ArgumentParser()
    parser.add_argument('metatable', type=str, default='',
                        help='Get training set labels')
    parser.add_argument('--featurefile', default='./products/feat.npz',
                        type=str, help='Feature file')
    parser.add_argument('--outdir', type=str, default='./products/',
                        help='Path in which to save the LC data (single file)')
    parser.add_argument('--train', action='store_true',
                        help='Train classification model')
    parser.add_argument('--savemodel', action='store_true',
                        help='Save output model, training on full set')
    # The boolean options below take an explicit value (e.g. ``--add-random
    # True``); _str2bool makes "False"/"0"/"no" actually mean False.
    parser.add_argument('--add-random', dest='add_random', type=_str2bool,
                        default=False,
                        help='Add random number as feature (for testing)')
    parser.add_argument('--calc-importance', dest='calc_importance',
                        type=_str2bool, default=False,
                        help='Calculate feature importance')
    parser.add_argument('--only-raenn', dest='only_raenn', type=_str2bool,
                        default=False, help='Use ony RAENN features')
    parser.add_argument('--not-raenn', dest='not_raenn', type=_str2bool,
                        default=False, help='Exclude RAENN features')
    parser.add_argument('--no-int', dest='no_int', type=_str2bool,
                        default=False,
                        help='Exclude integral features (for testing)')
    parser.add_argument(
        '--resampling', dest='resampling', type=str, default='KDE',
        help='Resampling methods. Either KDE or Gauss available')
    parser.add_argument('--modelfile', dest='modelfile', type=str,
                        default='model', help='Name of model file to save')
    parser.add_argument('--randomseed', type=int, default=42,
                        help='Random seed for the classifier')
    parser.add_argument('--outfile', dest='outfile', type=str,
                        default='superprob',
                        help='Name of probability table file')
    args = parser.parse_args()

    # Only the keys (class names, in insertion order) are used below, as the
    # column names of the probability table.
    sn_dict = {'SLSN': 0, 'SNII': 1, 'SNIIn': 2, 'SNIa': 3, 'SNIbc': 4}

    if args.train:
        X, y, names, means, stds, feature_names = prep_data_for_training(
            args.featurefile, args.metatable)
        names = np.asarray(names, dtype=str)

        # Optional feature sub-selection by name.
        if args.only_raenn:
            gind = [
                i for i, feat in enumerate(feature_names) if 'raenn' in feat
            ]
            X = X[:, gind]
            feature_names = feature_names[gind]
        if args.not_raenn:
            gind = [
                i for i, feat in enumerate(feature_names)
                if 'raenn' not in feat
            ]
            X = X[:, gind]
            feature_names = feature_names[gind]
        if args.no_int:
            gind = [
                i for i, feat in enumerate(feature_names) if 'int' not in feat
            ]
            X = X[:, gind]
            feature_names = feature_names[gind]
        if args.add_random:
            feature_names = np.append(feature_names, 'random')

        if not args.savemodel:
            loo = LeaveOneOut()
            y_pred = np.zeros(len(y))
            for train_index, test_index in loo.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                if args.resampling == 'Gauss':
                    X_res, y_res = Gauss_resample(X_train, y_train, 500)
                else:
                    X_res, y_res = KDE_resample(X_train, y_train, 500)

                new_ind = np.arange(len(y_res), dtype=int)
                np.random.shuffle(new_ind)
                X_res = X_res[new_ind]
                y_res = y_res[new_ind]

                # NOTE(review): X_res2 / y_res2 are never used for fitting or
                # scoring.  The calls are kept because they presumably draw
                # from the global NumPy random state, so removing them would
                # change every subsequent random draw.
                if args.calc_importance:
                    X_res2, y_res2 = Gauss_resample(X_train, y_train, 500)
                    X_res2 = X_res2[:-40, :]
                    y_res2 = y_res2[:-40]

                if args.add_random:
                    X_res2, y_res2 = Gauss_resample(X_train, y_train, 500)
                    X_res2 = X_res2[:-40, :]
                    y_res2 = y_res2[:-40]
                    # Append one column of Gaussian noise as an extra feature.
                    X_res = np.vstack(
                        (X_res.T, np.random.randn(len(X_res)))).T
                    X_res2 = np.vstack(
                        (X_res2.T, np.random.randn(len(X_res2)))).T
                    X_test = np.vstack(
                        (X_test.T, np.random.randn(len(X_test)))).T

                clf = RandomForestClassifier(n_estimators=400,
                                             max_depth=None,
                                             random_state=args.randomseed,
                                             criterion='gini',
                                             class_weight='balanced',
                                             max_features=None,
                                             oob_score=False)
                clf.fit(X_res, y_res)
                proba = clf.predict_proba(X_test)
                print(proba, y_test, names[test_index])

                if args.calc_importance:
                    feature_names = np.asarray(feature_names, dtype=str)
                    importances = clf.feature_importances_
                    indices = importances.argsort()[::-1]

                    print("Feature ranking:")
                    for idx in indices:
                        print(feature_names[idx], importances[idx])

                    plt.ylabel("Feature importances")
                    plt.bar(range(X_res.shape[1]), importances[indices],
                            color="grey", align="center")
                    plt.xticks(np.arange(len(importances)) + 0.5,
                               feature_names[indices], rotation=45,
                               ha='right')
                    plt.show()

                # predict_proba columns follow clf.classes_, which can differ
                # from 0..K-1 when a class is absent from this training fold,
                # so map the arg-max column back to the real label.
                y_pred[test_index] = clf.classes_[np.argmax(proba, axis=1)]

            cnf_matrix = confusion_matrix(y, y_pred)
            print(cnf_matrix)

        if args.savemodel:
            if args.resampling == 'Gauss':
                X_res, y_res = Gauss_resample(X, y, 500)
            else:
                X_res, y_res = KDE_resample(X, y, 500)

            new_ind = np.arange(len(y_res), dtype=int)
            np.random.shuffle(new_ind)
            X_res = X_res[new_ind]
            y_res = y_res[new_ind]

            clf = RandomForestClassifier(n_estimators=350,
                                         max_depth=None,
                                         random_state=args.randomseed,
                                         criterion='gini',
                                         class_weight='balanced',
                                         max_features=None,
                                         oob_score=False)
            clf.fit(X_res, y_res)

            # save the model to disk
            os.makedirs(args.outdir, exist_ok=True)
            if not args.outdir.endswith('/'):
                args.outdir += '/'
            payload = [clf, means, stds]
            # NOTE(review): `date` is not defined in this function; presumably
            # a module-level timestamp string -- confirm.
            model_paths = (
                args.outdir + args.modelfile + '_' + date + '.sav',
                args.outdir + args.modelfile + '.sav',
            )
            for model_path in model_paths:
                with open(model_path, 'wb') as fout:
                    pickle.dump(payload, fout)
    else:
        # NOTE: unpickling executes arbitrary code; only load model files
        # that come from a trusted source.
        with open(args.modelfile, 'rb') as fin:
            info = pickle.load(fin)
        loaded_model = info[0]
        means = info[1]
        stds = info[2]

        X, names, means, stds, feature_names = prep_data_for_classifying(
            args.featurefile, means, stds)
        names = np.asarray(names, dtype=str)

        probabilities = np.zeros((len(names), len(sn_dict)))
        for i, name in enumerate(names):
            probabilities[i] = loaded_model.predict_proba([X[i]])[0]

        probability_table = QTable(np.vstack((names, probabilities.T)).T,
                                   names=['Event Name', *sn_dict],
                                   meta={'name': 'SuperRAENN probabilities'})

        # save the table to disk
        os.makedirs(args.outdir, exist_ok=True)
        if not args.outdir.endswith('/'):
            args.outdir += '/'
        ascii.write(probability_table,
                    args.outdir + args.outfile + '.tex',
                    format='latex',
                    overwrite=True)
# Targets for the third coefficient ("param_3") of the train / test splits.
y_train_drug = train_drug["param_3"].values
y_test_drug = test_drug["param_3"].values

print("Coefficient_3 ....")
print("Sigmoid KernelRidge")

# Hyper-parameter candidates explored by the grid search.
param_tested_alphas = [1, 10, 50, 100, 500]
param_tested_gamma = [0.00001, 0.0001, 0.01, 0.1]
param_tested_coef0 = [0.01, 0.1, 0.5, 1]
param_grid = {
    "alpha": param_tested_alphas,
    "gamma": param_tested_gamma,
    "coef0": param_tested_coef0,
}

# Leave-one-out grid search scored by (negated) mean absolute error.
splitter_loo = LeaveOneOut()
grid = GridSearchCV(
    KernelRidge(kernel="sigmoid"),
    param_grid=param_grid,
    cv=splitter_loo,
    scoring="neg_mean_absolute_error",
)

# Result table, seeded with the identifier column of the test frame.
results = pd.DataFrame({"COSMIC_ID": test_df_50["COSMIC_ID"]})

grid.fit(Xtrain_drug, y_train_drug)

# Build a fresh estimator configured with the best parameters found above.
model = KernelRidge(
    kernel="sigmoid",
    alpha=grid.best_params_["alpha"],
    gamma=grid.best_params_["gamma"],
    coef0=grid.best_params_["coef0"],
)
cls(n_splits=3, random_state=0)) assert tokenize(cls(n_splits=3, random_state=0)) != tokenize( cls(n_splits=3, random_state=2)) assert tokenize(cls(n_splits=3, random_state=0)) != tokenize( cls(n_splits=4, random_state=0)) cv = cls(n_splits=3) assert compute_n_splits(cv, np_X, np_y, np_groups) == 3 with assert_dask_compute(False): assert compute_n_splits(cv, da_X, da_y, da_groups) == 3 @pytest.mark.parametrize("cvs", [(LeaveOneOut(), ), (LeavePOut(2), LeavePOut(3))]) def test_leave_out(cvs): tokens = [] for cv in cvs: assert tokenize(cv) == tokenize(cv) tokens.append(cv) assert len(set(tokens)) == len(tokens) cv = cvs[0] sol = cv.get_n_splits(np_X, np_y, np_groups) assert compute_n_splits(cv, np_X, np_y, np_groups) == sol with assert_dask_compute(True): assert compute_n_splits(cv, da_X, da_y, da_groups) == sol
def fit(self, X, y):
    """Fit the model using X as training data and y as target values.

    Parameters
    ----------
    X : sktime-format pandas dataframe with shape([n_cases,n_dimensions]),
        or numpy ndarray with shape([n_cases,n_readings,n_dimensions])
    y : {array-like, sparse matrix}
        Target values of shape = [n_samples]

    Returns
    -------
    The value returned by ``self._fit(X)``.
    """
    X, y = check_X_y(
        X,
        y,
        enforce_univariate=not self.capabilities["multivariate"],
        coerce_to_numpy=True,
    )
    # Transpose to work correctly with distance functions
    X = X.transpose((0, 2, 1))

    y = np.asarray(y)
    check_classification_targets(y)

    # if internal cv is desired, the relevant flag forces a grid search
    # to evaluate the possible values,
    # find the best, and then set this classifier's params to match
    if self._cv_for_params:
        grid = GridSearchCV(
            estimator=KNeighborsTimeSeriesClassifier(distance=self.metric,
                                                     n_neighbors=1),
            param_grid=self._param_matrix,
            cv=LeaveOneOut(),
            scoring="accuracy",
        )
        grid.fit(X, y)
        self.distance_params = grid.best_params_["distance_params"]

    if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1:
        if y.ndim != 1:
            warnings.warn(
                "IN TS-KNN: A column-vector y was passed when a 1d array "
                "was expected. Please change the shape of y to "
                "(n_samples, ), for example using ravel().",
                DataConversionWarning,
                stacklevel=2,
            )

        self.outputs_2d_ = False
        y = y.reshape((-1, 1))
    else:
        self.outputs_2d_ = True

    # Encode every output column as integer class indices.  The builtin
    # ``int`` replaces the ``np.int`` alias, which was removed in NumPy 1.24.
    self.classes_ = []
    self._y = np.empty(y.shape, dtype=int)
    for k in range(self._y.shape[1]):
        classes, self._y[:, k] = np.unique(y[:, k], return_inverse=True)
        self.classes_.append(classes)

    if not self.outputs_2d_:
        self.classes_ = self.classes_[0]
        self._y = self._y.ravel()

    # Temporarily swap the code object of check_array (or of the function it
    # wraps) for the time-series aware version while the parent _fit runs.
    patched = getattr(check_array, "__wrapped__", check_array)
    original_code = patched.__code__
    patched.__code__ = _check_array_ts.__code__
    try:
        # this not fx = self._fit(X, self_y) in order to maintain backward
        # compatibility with scikit learn 0.23, where _fit does not take an
        # arg y
        fx = self._fit(X)
    finally:
        # Always undo the patch, even when _fit raises, so that check_array
        # is not left replaced for the rest of the process.
        patched.__code__ = original_code

    self._is_fitted = True
    return fx
model.fit(X1, y1) y2_model = model.predict(X2) accuracy_score(y2, y2_model) # %% y2_model = model.fit(X1, y1).predict(X2) y1_model = model.fit(X2, y2).predict(X1) accuracy_score(y1, y1_model), accuracy_score(y2, y2_model) # %% cross_val_score(model, X, y, cv=5) # %% scores = cross_val_score(model, X, y, cv=LeaveOneOut()) scores # %% scores.mean() # %% def PolynomialRegression(degree=2, **kwargs): return make_pipeline(PolynomialFeatures(degree), LinearRegression(**kwargs)) # %% def make_data(N, err=1.0, rseed=1):
# kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed) # for index, (train, test) in enumerate(kfold.split(x_data_all, y_data_all)): # x_train = data_util.scale(x_data_all.iloc[train]) # x_test = data_util.scale(x_data_all.iloc[test]) # y_train = y_data_all.iloc[train] # model, history= ann(x_train, y_train) # loss, acc = model.evaluate(x_test, to_categorical(y_data_all.iloc[test])) # y_pred = model.predict(x_test) # predict_result_hold = id_all.iloc[test] # predict_result_hold['label'] = y_data_all.iloc[test] # predict_result_hold['0'] = y_pred[:, 0] # predict_result_hold['1'] = y_pred[:, 1] # predict_result_hold.to_csv(cdu.get_save_path(fName+'_'+str(index)+'.csv'), sep=',', encoding='utf-8') # print(acc, loss) # leave-one-out lst = [] scaled_data = data_util.scale(x_data_all) x_data_all = pd.DataFrame(scaled_data, index=x_data_all.index, columns=x_data_all.columns) for train, test in LeaveOneOut().split(x_data_all): y_train = y_data_all.iloc[train] model, history = ann(x_data_all.iloc[train], y_train) loss, acc = model.evaluate(x_data_all.iloc[test], to_categorical(y_data_all.iloc[test], 2)) y_pred = model.predict(x_data_all.iloc[test]) one_reslut = y_pred[0] lst.append([id_all.iloc[test].values[0][0], y_data_all.iloc[test].values[0][0], one_reslut[0], one_reslut[1]]) predict_result = pd.DataFrame(lst, columns=['id', 'label', '0', '1']) predict_result.to_csv(cdu.get_save_path(fName+'.csv'), sep=',', encoding='utf-8') print('done')
output_train = "{}({}: {}) ".format(output_train, i, data[i]) for i in test: bar[i] = "T" output_test = "{}({}: {}) ".format(output_test, i, data[i]) print("[ {} ]".format(" ".join(bar))) print("Train: {}".format(output_train)) print("Test: {}\n".format(output_test)) P_VAL = 2 data = numpy.array([[1, 2], [3, 4], [5, 6], [7, 8]]) loocv = LeaveOneOut() lpocv = LeavePOut(p=P_VAL) split_loocv = loocv.split(data) split_lpocv = lpocv.split(data) print("Data:\n{}\n".format(data)) print("Leave-One-Out:\n") print_result(split_loocv) print("Leave-P-Out (where p = {}):\n".format(P_VAL)) print_result(split_lpocv) ''' Data: [[1 2] [3 4] [5 6] [7 8]]
def retrain(hololist=r'trainedmodels/turbulencetraining.xlsx', validate=True,
            save=True):
    """Retrain the turbulence SVM from the holograms listed in a spreadsheet.

    Features are extracted from every listed hologram with ``featuremodel``,
    optionally evaluated with leave-one-out cross-validation, and a
    ``StandardScaler`` + ``SVC`` pipeline is then fitted on all samples.

    Parameters
    ----------
    hololist : str
        Path of the training spreadsheet.  One column lists holograms with
        "no turbulence" (class 0) and the next one holograms with
        "turbulence" (class 1); the first row of each column is skipped.
    validate : bool
        When True, run leave-one-out validation and print precision, recall
        and accuracy.
    save : bool
        When True, pickle the fitted pipeline to ``trainedmodels\\turbsvm.p``.
    """
    # read training data spreadsheet
    book = xlrd.open_workbook(hololist)
    sheet = book.sheet_by_index(0)
    # NOTE(review): the original comment spoke of the "first" and "second"
    # column, but col_values() is 0-based, so these are the 2nd and 3rd
    # sheet columns -- confirm against the spreadsheet layout.
    noturb = sheet.col_values(1)[1:]
    turb = sheet.col_values(2)[1:]
    files = noturb + turb
    classes = [0] * len(noturb) + [1] * len(turb)

    # extract features from each hologram with the feature model
    features = np.zeros((len(files), 1000))
    for i, file in enumerate(files):
        imgpath = holopath(file)  # generate path of hologram image file
        img = image.load_img(imgpath,
                             target_size=(224, 224))  # load hologram image
        x = image.img_to_array(img)
        x = np.expand_dims(x, axis=0)
        x = preprocess_input(x)
        features[i, :] = featuremodel.predict(x)

    # Defined unconditionally: the final fit below needs them even when
    # validation is skipped (previously validate=False raised NameError).
    X = features.copy()
    y = np.array(classes)

    # perform validation
    if validate:
        loo = LeaveOneOut()  # create leave one out validation
        yp = np.zeros_like(y)
        # iterate for every hologram in training data
        for train_index, test_index in loo.split(X):
            fold_pline = make_pipeline(StandardScaler(), SVC(gamma='auto'))
            fold_pline.fit(X[train_index], y[train_index])
            # test_index holds the single held-out sample; indexing with it
            # avoids the deprecated size-1-array-to-scalar assignment.
            yp[test_index] = fold_pline.predict(X[test_index])

        # calculate and display performance metrics
        precision = sum(yp[y == 1]) / sum(yp == 1)
        recall = sum(yp[y == 1]) / sum(y == 1)
        accuracy = sum(y == yp) / len(y)
        print('Precision: ' + str(precision))
        print('Recall: ' + str(recall))
        print('Accuracy: ' + str(accuracy))

    # fit SVM with all data
    pline = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    pline.fit(X, y)

    # save SVM for later use
    if save:
        # NOTE(review): the backslash makes this path Windows-specific,
        # unlike the forward-slash default of `hololist`.
        with open(r'trainedmodels\turbsvm.p', 'wb') as fout:
            pickle.dump(pline, fout)

    # NOTE(review): the original ended with `turbdetector = pline`, a local
    # assignment with no effect.  If a module-level detector was meant to be
    # refreshed here, that needs an explicit `global` declaration.
def balance_tpr(cfg, featdata):
    """
    Find the threshold of class index 0 that yields equal number of true
    positive samples of each class.
    Currently only available for binary classes.

    Params
    ======
    cfg: config module
    featdata: feature data computed using compute_features()

    Returns
    =======
    The median of the cross-validated predictions returned by
    get_predict_proba(), or 0.5 when there is only a single prediction.

    Raises
    ======
    ValueError: the selected classifier name is unknown.
    NotImplementedError: the selected cross-validation type is unsupported.
    """

    n_jobs = cfg.N_JOBS
    if n_jobs is None:
        n_jobs = mp.cpu_count()
    if n_jobs > 1:
        logger.info('balance_tpr(): Using %d cores' % n_jobs)

    # Init a classifier.
    # cfg.CLASSIFIER['selected'] holds the *name* of the classifier; the name
    # is what is compared against below and used as a key into cfg.CLASSIFIER
    # (the original bound the per-classifier settings dict here instead, so
    # no branch could ever match).
    selected_classifier = cfg.CLASSIFIER['selected']
    if selected_classifier == 'GB':
        cls = GradientBoostingClassifier(
            loss='deviance',
            learning_rate=cfg.CLASSIFIER['GB']['learning_rate'],
            n_estimators=cfg.CLASSIFIER['GB']['trees'],
            subsample=1.0,
            max_depth=cfg.CLASSIFIER['GB']['depth'],
            random_state=cfg.CLASSIFIER[selected_classifier]['seed'],
            max_features='sqrt', verbose=0, warm_start=False,
            presort='auto')
    elif selected_classifier == 'XGB':
        cls = XGBClassifier(
            loss='deviance',
            learning_rate=cfg.CLASSIFIER['XGB']['learning_rate'],
            n_estimators=cfg.CLASSIFIER['XGB']['trees'],
            subsample=1.0,
            max_depth=cfg.CLASSIFIER['XGB']['depth'],
            random_state=cfg.CLASSIFIER['XGB']['seed'],
            max_features='sqrt', verbose=0, warm_start=False,
            presort='auto')
    elif selected_classifier == 'RF':
        cls = RandomForestClassifier(
            n_estimators=cfg.CLASSIFIER['RF']['trees'],
            max_features='auto',
            max_depth=cfg.CLASSIFIER['RF']['depth'],
            n_jobs=cfg.N_JOBS,
            random_state=cfg.CLASSIFIER['RF']['seed'],
            oob_score=False, class_weight='balanced_subsample')
    elif selected_classifier == 'NN':
        cls = MLPClassifier(solver='lbfgs', alpha=1e-5,
                            hidden_layer_sizes=(1, 1), random_state=666)
    elif selected_classifier == 'Ada':
        cls = AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
                                 learning_rate=1.0, n_estimators=100,
                                 random_state=0)
    elif selected_classifier == 'LDA':
        cls = LDA()
    elif selected_classifier == 'rLDA':
        # NOTE(review): this passes the whole rLDA settings dict; confirm
        # whether rLDA() expects the dict or only its regularization value.
        cls = rLDA(cfg.CLASSIFIER['rLDA'])
    else:
        logger.error('Unknown classifier type %s' % selected_classifier)
        raise ValueError

    # Setup features
    X_data = featdata['X_data']
    Y_data = featdata['Y_data']
    wlen = featdata['wlen']
    # NOTE(review): the window length is looked up under cfg.CLASSIFIER['PSD']
    # -- verify this is the intended config section.
    if cfg.CLASSIFIER['PSD']['wlen'] is None:
        cfg.CLASSIFIER['PSD']['wlen'] = wlen

    # Choose CV type (again by name; fixes the `cselected_CV` NameError).
    ntrials, nsamples, fsize = X_data.shape
    selected_CV = cfg.CV_PERFORM['selected']
    if selected_CV == 'LeaveOneOut':
        logger.info_green('\n%d-fold leave-one-out cross-validation' % ntrials)
        if SKLEARN_OLD:
            cv = LeaveOneOut(len(Y_data))
        else:
            cv = LeaveOneOut()
    elif selected_CV == 'StratifiedShuffleSplit':
        logger.info_green(
            '\n%d-fold stratified cross-validation with test set ratio %.2f'
            % (cfg.CV_PERFORM[selected_CV]['folds'],
               cfg.CV_PERFORM[selected_CV]['test_ratio']))
        if SKLEARN_OLD:
            cv = StratifiedShuffleSplit(
                Y_data[:, 0], cfg.CV_PERFORM[selected_CV]['folds'],
                test_size=cfg.CV_PERFORM[selected_CV]['test_ratio'],
                random_state=cfg.CV_PERFORM[selected_CV]['random_seed'])
        else:
            cv = StratifiedShuffleSplit(
                n_splits=cfg.CV_PERFORM[selected_CV]['folds'],
                test_size=cfg.CV_PERFORM[selected_CV]['test_ratio'],
                random_state=cfg.CV_PERFORM[selected_CV]['random_seed'])
    else:
        logger.error('%s is not supported yet. Sorry.' % selected_CV)
        raise NotImplementedError
    logger.info('%d trials, %d samples per trial, %d feature dimension' %
                (ntrials, nsamples, fsize))

    # For classifier itself, single core is usually faster
    cls.n_jobs = 1
    Y_preds = []
    results = []

    if SKLEARN_OLD:
        splits = cv
    else:
        splits = cv.split(X_data, Y_data[:, 0])

    # The pool is created only once the configuration is known to be valid
    # and is always torn down, even when a fold raises.
    pool = mp.Pool(n_jobs) if n_jobs > 1 else None
    try:
        # Folds are numbered from 1 for get_predict_proba().
        for cnum, (train, test) in enumerate(splits, start=1):
            X_train = np.concatenate(X_data[train])
            X_test = np.concatenate(X_data[test])
            Y_train = np.concatenate(Y_data[train])
            Y_test = np.concatenate(Y_data[test])
            if pool is not None:
                results.append(pool.apply_async(
                    get_predict_proba,
                    [cls, X_train, Y_train, X_test, Y_test, cnum]))
            else:
                Y_preds.append(get_predict_proba(
                    cls, X_train, Y_train, X_test, Y_test, cnum))

        # Aggregate predictions
        if pool is not None:
            pool.close()
            pool.join()
            for r in results:
                Y_preds.append(r.get())
    finally:
        if pool is not None:
            pool.terminate()
    Y_preds = np.concatenate(Y_preds, axis=0)

    # Find threshold for class index 0: the median of the sorted predictions.
    Y_preds = sorted(Y_preds)
    mid_idx = int(len(Y_preds) / 2)
    if len(Y_preds) == 1:
        return 0.5  # should not reach here in normal conditions
    elif len(Y_preds) % 2 == 0:
        thres = Y_preds[mid_idx - 1] + (Y_preds[mid_idx] -
                                        Y_preds[mid_idx - 1]) / 2
    else:
        thres = Y_preds[mid_idx]
    return thres
def __init__(self):
    """Initialise the parent class, then store a new LeaveOneOut instance."""
    super(CrossValidationLeaveOneOut, self).__init__()
    splitter = LeaveOneOut()
    # Double-underscore attribute: name-mangled, private to this class.
    self.__cv = splitter