Example #1
def QCRSC(x, t, qc, gamma_range, remove_outliers=True, remove_batch=True):

    TTqc = t[qc == 1]
    XXqc = x[qc == 1]

    if remove_outliers:
        q75, q25 = np.percentile(XXqc, [75, 25])
        iqr = q75 - q25
        min_outlier = q25 - 1.5 * iqr
        max_outlier = q75 + 1.5 * iqr
        XXqc[XXqc < min_outlier] = np.nan
        XXqc[XXqc > max_outlier] = np.nan

    Xqc = XXqc.dropna()
    Tqc = TTqc[Xqc.index]

    mpa = np.median(Xqc)
    numQC = len(Tqc)
    dist = []
    for i in range(len(TTqc) - 1):
        dist.append(TTqc.iloc[i + 1] - TTqc.iloc[i] - 1)

    h = np.median(dist)
    epsilon = h**3 / 16

    if numQC < 5:
        # Fewer than 5 QCs cannot support QCspline cross-validation;
        # fall back to an effectively linear correction.
        type_fit = 'linear'

        cvMse = np.empty(len(gamma_range))
        cvMse[:] = np.nan
        gamma = np.max(gamma_range)
    else:
        type_fit = 'cubic'

        loo = LeaveOneOut()
        cvMse = []
        for i in range(len(gamma_range)):
            p = 1 / (1 + epsilon * 10**(gamma_range[i]))
            mse = []
            for train_index, test_index in loo.split(Xqc):
                Tqc_train, Tqc_test = Tqc.iloc[train_index], Tqc.iloc[
                    test_index]
                Xqc_train, Xqc_test = Xqc.iloc[train_index], Xqc.iloc[
                    test_index]
                csaps = CubicSmoothSpline(p=p)
                csaps.fit(Tqc_train, Xqc_train)
                Xqc_pred = csaps.predict(Tqc_test.values.tolist())
                mse.append(mean_squared_error(Xqc_test, Xqc_pred))
            cvMse.append(np.mean(mse))

    cvMse = np.array(cvMse)
    min_cvMse = np.argmin(cvMse)

    if type_fit == 'cubic':
        gamma = gamma_range[min_cvMse]

    p = 1 / (1 + epsilon * 10**(gamma))

    try:
        csaps = CubicSmoothSpline(p=p)
        csaps.fit(Tqc, Xqc)
        f = csaps.predict(t.values.tolist())
        zz = x - f
        xx = zz + mpa
    except ValueError:
        # Only 1 QC or less
        if remove_batch:
            f = [np.nan] * len(x)
            zz = x
            zz[:] = np.nan
            xx = zz
        else:
            f = [np.nan] * len(x)
            zz = x
            xx = zz

    return xx, f, type_fit, cvMse, gamma, mpa
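
# A minimal sketch of the selection pattern above: choose a smoothing
# hyperparameter by leave-one-out mean squared error. A hypothetical
# polynomial degree stands in for the spline parameter gamma; the data
# are synthetic.
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import LeaveOneOut

rng = np.random.default_rng(0)
t = np.linspace(0, 10, 20)
x = np.sin(t) + rng.normal(scale=0.1, size=t.size)

cv_mse = []
for degree in range(1, 6):
    mse = []
    for train_index, test_index in LeaveOneOut().split(t):
        coeffs = np.polyfit(t[train_index], x[train_index], deg=degree)
        mse.append(mean_squared_error(x[test_index],
                                      np.polyval(coeffs, t[test_index])))
    cv_mse.append(np.mean(mse))
best_degree = 1 + int(np.argmin(cv_mse))
print(best_degree)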
Example #2
model_fit = _linear_model.fit(x_train, y_train)
test_prediction = _linear_model.predict(x_test)

print(model_fit)
print(test_prediction)
print(x_test.shape, y_test.shape)
print(test_prediction.shape, y_test.shape)
# plt.scatter(x_test, y_test)
# plt.plot(x_test, test_prediction, 'r')
print(u'r\u00B2 = {0:.2f}\nMAE = {1:.4f}'.format(
    model_fit.score(x_test, y_test),
    median_absolute_error(y_test, test_prediction)))
# plt.show()

loo = LeaveOneOut()
loo.get_n_splits(x_data)

print(x_data)
MAE = []  # median absolute error per left-out sample
for train_index, test_index in loo.split(x_data):
    # print(train_index)
    # print(test_index)
    x_train, x_test = x_data.loc[train_index], x_data.loc[test_index]
    y_train, y_test = y_data.loc[train_index], y_data.loc[test_index]
    model_fit = _linear_model.fit(x_train, y_train)
    test_prediction = _linear_model.predict(x_test)
    MAE_loo = median_absolute_error(y_test, test_prediction)
    MAE.append(MAE_loo)
print('mean LOO MAE = {0:.4f}'.format(sum(MAE) / len(MAE)))
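
# The same evaluation in a single call through sklearn's scoring interface;
# a sketch that assumes the x_data, y_data and _linear_model objects defined
# above.
from sklearn.model_selection import LeaveOneOut, cross_val_score

neg_mae = cross_val_score(_linear_model, x_data, y_data, cv=LeaveOneOut(),
                          scoring='neg_median_absolute_error')
print('LOO median absolute error: {0:.4f}'.format(-neg_mae.mean()))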
Example #3
    plt.xlabel('Class')
    plt.ylabel('Frequency')
    plt.show()

    df['Label'] = df['Label'].replace(4, 0)  #For binary classification

    X, y = df.iloc[:, :-1], df.iloc[:, -1]
    y = y.to_numpy()

    scaler = StandardScaler()
    scaled_X = scaler.fit_transform(X)

    models = [('SVM', SVC()), ('DT', DecisionTreeClassifier()),
              ('NB', GaussianNB()), ('KNN', KNeighborsClassifier())]

    for name, model in models:
        # cross_val_score with LeaveOneOut returns one 0/1 accuracy per
        # sample, not predictions, so F1 and a confusion matrix computed on
        # those fold scores would be wrong; cross_val_predict (also from
        # sklearn.model_selection) yields per-sample predictions instead.
        y_pred = cross_val_predict(model,
                                   scaled_X,
                                   y,
                                   cv=LeaveOneOut(),
                                   n_jobs=-1)

        print(name, '', 'F1 Score:', f1_score(y, y_pred, average='binary'), '',
              'Accuracy: %.3f' % accuracy_score(y, y_pred), '\n')

        df_cm = pd.DataFrame(confusion_matrix(y, y_pred), range(2), range(2))
        sn.set(font_scale=1.4)  # for label size
        sn.heatmap(df_cm, annot=True, annot_kws={"size": 16})  # font size
        plt.show()
Example #4
def _cost_fn(argd,
             X,
             y,
             EX_list,
             valid_size,
             n_folds,
             shuffle,
             random_state,
             use_partial_fit,
             info,
             timeout,
             _conn,
             loss_fn=None,
             continuous_loss_fn=False,
             best_loss=None):
    '''Calculate the loss function
    '''
    try:
        t_start = time.time()
        # Extract info from calling function.
        if 'classifier' in argd:
            classifier = argd['classifier']
            regressor = argd['regressor']
            preprocessings = argd['preprocessing']
            ex_pps_list = argd['ex_preprocs']
        else:
            classifier = argd['model']['classifier']
            regressor = argd['model']['regressor']
            preprocessings = argd['model']['preprocessing']
            ex_pps_list = argd['model']['ex_preprocs']
        learner = classifier if classifier is not None else regressor
        is_classif = classifier is not None
        untrained_learner = copy.deepcopy(learner)
        # -- N.B. modify argd['preprocessing'] in-place

        # Determine cross-validation iterator.
        if n_folds is not None:
            if n_folds == -1:
                info('Will use leave-one-out CV')
                try:
                    cv_iter = LeaveOneOut().split(X)
                except TypeError:
                    # Older syntax before sklearn version 0.18
                    cv_iter = LeaveOneOut(len(y))
            elif is_classif:
                info('Will use stratified K-fold CV with K:', n_folds,
                     'and Shuffle:', shuffle)
                try:
                    cv_iter = StratifiedKFold(n_splits=n_folds,
                                              shuffle=shuffle,
                                              random_state=random_state).split(
                                                  X, y)
                except TypeError:
                    # Older syntax before sklearn version 0.18
                    cv_iter = StratifiedKFold(y,
                                              n_folds=n_folds,
                                              shuffle=shuffle,
                                              random_state=random_state)
            else:
                info('Will use K-fold CV with K:', n_folds, 'and Shuffle:',
                     shuffle)
                try:
                    cv_iter = KFold(n_splits=n_folds,
                                    shuffle=shuffle,
                                    random_state=random_state).split(X)
                except TypeError:
                    # Older syntax before sklearn version 0.18
                    cv_iter = KFold(len(y),
                                    n_folds=n_folds,
                                    shuffle=shuffle,
                                    random_state=random_state)
        else:
            if not shuffle:  # always choose the last samples.
                info('Will use the last', valid_size,
                     'portion of samples for validation')
                n_train = int(len(y) * (1 - valid_size))
                valid_fold = np.ones(len(y), dtype=int)
                valid_fold[:n_train] = -1  # "-1" indicates train fold.
                try:
                    cv_iter = PredefinedSplit(valid_fold).split()
                except TypeError:
                    # Older syntax before sklearn version 0.18
                    cv_iter = PredefinedSplit(valid_fold)
            elif is_classif:
                info('Will use stratified shuffle-and-split with validation'
                     ' portion:', valid_size)
                try:
                    cv_iter = StratifiedShuffleSplit(
                        1, test_size=valid_size,
                        random_state=random_state).split(X, y)
                except TypeError:
                    # Older syntax before sklearn version 0.18
                    cv_iter = StratifiedShuffleSplit(y,
                                                     1,
                                                     test_size=valid_size,
                                                     random_state=random_state)
            else:
                info('Will use shuffle-and-split with validation portion:',
                     valid_size)
                try:
                    cv_iter = ShuffleSplit(n_splits=1,
                                           test_size=valid_size,
                                           random_state=random_state).split(X)
                except TypeError:
                    # Older syntax before sklearn version 0.18
                    cv_iter = ShuffleSplit(len(y),
                                           1,
                                           test_size=valid_size,
                                           random_state=random_state)

        # Use the above iterator for cross-validation prediction.
        cv_y_pool = np.array([])
        cv_pred_pool = np.array([])
        cv_n_iters = np.array([])
        for train_index, valid_index in cv_iter:
            Xfit, Xval = X[train_index], X[valid_index]
            yfit, yval = y[train_index], y[valid_index]
            if EX_list is not None:
                _EX_list = [(EX[train_index], EX[valid_index])
                            for EX in EX_list]
                EXfit_list, EXval_list = zip(*_EX_list)
            else:
                EXfit_list = None
                EXval_list = None
            XEXfit, XEXval = transform_combine_XEX(Xfit, info, preprocessings,
                                                   Xval, EXfit_list,
                                                   ex_pps_list, EXval_list)
            learner = copy.deepcopy(untrained_learner)
            info('Training learner', learner, 'on X/EX of dimension',
                 XEXfit.shape)
            if hasattr(learner, "partial_fit") and use_partial_fit:
                learner, n_iters = pfit_until_convergence(learner,
                                                          is_classif,
                                                          XEXfit,
                                                          yfit,
                                                          info,
                                                          best_loss=best_loss,
                                                          XEXval=XEXval,
                                                          yval=yval,
                                                          timeout=timeout,
                                                          t_start=t_start)
            else:
                learner.fit(XEXfit, yfit)
                n_iters = None
            if learner is None:
                break
            cv_y_pool = np.append(cv_y_pool, yval)
            info('Scoring on X/EX validation of shape', XEXval.shape)
            if continuous_loss_fn:
                cv_pred_pool = np.append(cv_pred_pool,
                                         learner.predict_proba(XEXval))
            else:
                cv_pred_pool = np.append(cv_pred_pool, learner.predict(XEXval))
            cv_n_iters = np.append(cv_n_iters, n_iters)
        else:  # all CV folds are exhausted.
            if loss_fn is None:
                if is_classif:
                    loss = 1 - accuracy_score(cv_y_pool, cv_pred_pool)
                    # -- squared standard error of mean
                    lossvar = (loss * (1 - loss)) / max(1, len(cv_y_pool) - 1)
                    info('OK trial with accuracy %.1f +- %.1f' %
                         (100 * (1 - loss), 100 * np.sqrt(lossvar)))
                else:
                    loss = 1 - r2_score(cv_y_pool, cv_pred_pool)
                    lossvar = None  # variance of R2 is undefined.
                    info('OK trial with R2 score %.2e' % (1 - loss))
            else:
                # Use a user specified loss function
                loss = loss_fn(cv_y_pool, cv_pred_pool)
                lossvar = None
                info('OK trial with loss %.1f' % loss)
            t_done = time.time()
            rval = {
                'loss': loss,
                'loss_variance': lossvar,
                'learner': untrained_learner,
                'preprocs': preprocessings,
                'ex_preprocs': ex_pps_list,
                'status': hyperopt.STATUS_OK,
                'duration': t_done - t_start,
                'iterations': (cv_n_iters.max()
                               if (hasattr(learner, "partial_fit")
                                   and use_partial_fit) else None),
            }
            rtype = 'return'
        # The for loop exited via break: one fold did not finish running.
        if learner is None:
            t_done = time.time()
            rval = {
                'status': hyperopt.STATUS_FAIL,
                'failure': 'Not enough time to finish training on '
                           'all CV folds',
                'duration': t_done - t_start,
            }
            rtype = 'return'

    ##==== Cost function exception handling ====##
    except (NonFiniteFeature, ) as exc:
        print('Failing trial due to NaN in', str(exc))
        t_done = time.time()
        rval = {
            'status': hyperopt.STATUS_FAIL,
            'failure': str(exc),
            'duration': t_done - t_start,
        }
        rtype = 'return'

    except (ValueError, ) as exc:
        if ('k must be less than or equal'
                ' to the number of training points') in str(exc):
            t_done = time.time()
            rval = {
                'status': hyperopt.STATUS_FAIL,
                'failure': str(exc),
                'duration': t_done - t_start,
            }
            rtype = 'return'
        else:
            rval = exc
            rtype = 'raise'

    except (AttributeError, ) as exc:
        if "'NoneType' object has no attribute 'copy'" in str(exc):
            print('Failing due to k_means_ weirdness')
            # -- sklearn/cluster/k_means_.py line 270 raises this sometimes
            t_done = time.time()
            rval = {
                'status': hyperopt.STATUS_FAIL,
                'failure': str(exc),
                'duration': t_done - t_start,
            }
            rtype = 'return'
        else:
            rval = exc
            rtype = 'raise'

    except Exception as exc:
        rval = exc
        rtype = 'raise'

    # -- return the result to calling process
    _conn.send((rtype, rval))
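
# The repeated try/except TypeError above is a compatibility shim: sklearn
# >= 0.18 builds fold iterators via .split(), while the legacy API took the
# sample count in the constructor. The shim distilled, runnable under the
# modern API (the legacy branch is only reached on very old sklearn):
import numpy as np
from sklearn.model_selection import LeaveOneOut

X = np.arange(10).reshape(5, 2)
y = np.array([0, 1, 0, 1, 0])
try:
    cv_iter = LeaveOneOut().split(X)   # sklearn >= 0.18
except TypeError:
    cv_iter = LeaveOneOut(len(y))      # pre-0.18 signature
for train_index, valid_index in cv_iter:
    pass  # fit on train_index, score on valid_index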
Example #5
    assert tokenize(cls(n_splits=3, random_state=0)) != tokenize(
        cls(n_splits=3, random_state=2)
    )

    assert tokenize(cls(n_splits=3, random_state=0)) != tokenize(
        cls(n_splits=4, random_state=0)
    )

    cv = cls(n_splits=3)
    assert compute_n_splits(cv, np_X, np_y, np_groups) == 3

    with assert_dask_compute(False):
        assert compute_n_splits(cv, da_X, da_y, da_groups) == 3


@pytest.mark.parametrize("cvs", [(LeaveOneOut(),), (LeavePOut(2), LeavePOut(3))])
def test_leave_out(cvs):
    tokens = []
    for cv in cvs:
        assert tokenize(cv) == tokenize(cv)
        tokens.append(cv)
    assert len(set(tokens)) == len(tokens)

    cv = cvs[0]
    sol = cv.get_n_splits(np_X, np_y, np_groups)
    assert compute_n_splits(cv, np_X, np_y, np_groups) == sol

    with assert_dask_compute(True):
        assert compute_n_splits(cv, da_X, da_y, da_groups) == sol

    with assert_dask_compute(False):
Example #6
def pca_graph_pvals_less_than():

    data = preproccessed_data.join(mapping_file[[
        'Age', 'BMI', 'FattyLiver', 'RegularExercise', 'Smoking',
        'DiagnosisGroup'
    ]])
    X = data.drop([
        'Age', 'BMI', 'FattyLiver', 'RegularExercise', 'Smoking',
        'DiagnosisGroup'
    ],
                  axis=1)

    y = data['DiagnosisGroup']

    for n_comp in range(2, 30):
        pcas.append(n_comp)

        loo = LeaveOneOut()

        y_pred_list = []
        auc = []
        auc_train = []
        for train_index, test_index in loo.split(X):
            train_index = list(train_index)
            # print("%s %s" % (train_index, test_index))
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y[train_index], y[test_index]
            most_corelated_taxon = {}
            for i in range(X_train.shape[1]):
                p_val = scipy.stats.spearmanr(X_train.iloc[:, i], y_train)[1]
                if math.isnan(p_val):
                    most_corelated_taxon[X_train.columns[i]] = 1
                else:
                    most_corelated_taxon[X_train.columns[i]] = p_val
            sorted_taxon = sorted(most_corelated_taxon.items(),
                                  key=operator.itemgetter(1))
            most_corelated_taxon = [i for i in sorted_taxon if i[1] <= 0.01]
            bact = [i[0] for i in most_corelated_taxon if i[1] != 1]  # i[0] is the taxon name, i[1] its p-value
            new_data = X[bact]

            otu_after_pca, _ = apply_pca(new_data, n_components=n_comp)

            new_data = otu_after_pca.join(data[[
                'Age', 'BMI', 'FattyLiver', 'RegularExercise', 'Smoking',
                'DiagnosisGroup'
            ]],
                                          how='inner')

            X_new = new_data.drop(['DiagnosisGroup'], axis=1)
            y_new = new_data['DiagnosisGroup']
            regex = re.compile(r"\[|\]|<", re.IGNORECASE)
            X_new.columns = [
                regex.sub("_", col) if any(x in str(col)
                                           for x in set(('[', ']',
                                                         '<'))) else col
                for col in X_new.columns.values
            ]

            X_train, X_test = X_new.iloc[train_index], X_new.iloc[test_index]
            y_train, y_test = y_new[train_index], y_new[test_index]

            model = XGBClassifier(max_depth=4,
                                  n_estimators=150,
                                  learning_rate=15 / 100,
                                  objective='multi:softmax')
            #objective='binary:logistic',
            #scale_pos_weight=(np.sum(y_train == -1) / np.sum(y_train == 1)))
            model.fit(X_train, y_train)
            pred_train = model.predict(X_train)
            auc_train.append(metrics.accuracy_score(y_train, pred_train))
            y_pred = model.predict(X_test)
            y_pred_list.append(y_pred[0])
        accuracy = metrics.accuracy_score(y, y_pred_list)
        print('PCA components: ' + str(n_comp), round(accuracy, 2))
        scores_train = round(np.array(auc_train).mean(), 2)
        train_accuracy.append(scores_train)
        test_accuracy.append(round(accuracy, 2))  # round() returns a float, so no .mean() here
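
# The screen-inside-the-split pattern above matters: the p-value filter is
# recomputed on each training fold so the held-out sample never influences
# feature selection. The same idea with sklearn utilities (SelectKBest
# stands in for the Spearman screen; data are synthetic):
import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import LeaveOneOut, cross_val_predict
from sklearn.pipeline import make_pipeline

X, y = make_classification(n_samples=40, n_features=50, random_state=0)
pipe = make_pipeline(SelectKBest(f_classif, k=10),
                     LogisticRegression(max_iter=1000))
# cross_val_predict refits the whole pipeline per split, so selection
# happens on the training fold only
y_pred = cross_val_predict(pipe, X, y, cv=LeaveOneOut())
print(accuracy_score(y, y_pred))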
Example #7
## 9.6 Other Classification Methods

### 9.6.1 K-Nearest Neighbors

import statsmodels.api as sm
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier

my_data = sm.datasets.get_rdataset('iris', 'datasets').data
X, y = my_data.iloc[:, 0:4], my_data.Species

my_scores = cross_val_score(KNeighborsClassifier(), X, y, cv=LeaveOneOut())
my_scores.mean()
#> 0.9666666666666667

### 9.6.2 Neural Networks

import statsmodels.api as sm
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

my_data = sm.datasets.get_rdataset('iris', 'datasets').data
X, y = my_data.iloc[:, 0:4], my_data.Species

my_pipeline = Pipeline([('sc', StandardScaler()),  # standardization
                        ('mlp', MLPClassifier())]) # neural network
my_scores = cross_val_score(my_pipeline, X, y, cv=LeaveOneOut(), n_jobs=-1)
my_scores.mean()
#> 0.9533333333333334
Example #8
def cross_validation_cc():
    ac = loadmat('./data/component_contribution_python.mat')

    S = ac['train_S']

    df_S = pd.DataFrame(ac['train_S'])
    df_S_unique = df_S.T.drop_duplicates().T
    unique_cols = df_S_unique.columns.values.tolist()
    S = S[:, unique_cols]

    G = ac['G']
    # b = ac['b']

    b_list = json.load(open('./data/median_b.json'))
    b = np.asarray(b_list)
    b = np.reshape(b, (-1, 1))

    # w = ac['w']

    # pdb.set_trace()

    m, n = S.shape
    assert G.shape[0] == m
    assert b.shape == (n, 1)

    STG = np.dot(S.T, G)

    X = STG
    y = b

    # reg = LinearRegression(fit_intercept=False).fit(X, y)

    # y_pred = reg.predict(X)
    # print('Mean squared error: %.2f'
    #   % mean_squared_error(y, y_pred))
    # print('R2',reg.score(X, y))
    # # compare dG_gc with matlab
    # print reg.coef_

    # cross validation
    regression = LinearRegression(fit_intercept=False)
    # lasso = linear_model.Lasso()
    scores = -cross_val_score(
        regression, X, y, cv=LeaveOneOut(), scoring='neg_mean_absolute_error')
    # print scores
    # pdb.set_trace()
    print('median of cv is: ', median(scores))
    print('mean of cv is: ', mean(scores))

    print('std of cv is: ', scores.std())
    x = np.sort(scores)
    # y = np.arange(1,len(x)+1)/len(x)
    y = 1. * np.arange(len(x)) / (len(x) - 1)

    fig = plt.figure(figsize=(6, 6))
    plt.xlim(right=15)
    plt.plot(x, y, marker='.', linestyle='none')  #,color="#273c75")
    plt.axhline(y=0.5, linewidth=1, color='grey')
    plt.xlabel(r"|$\Delta G^{'o}_{est} - \Delta G^{'o}_{obs}$|")
    plt.ylabel('Cumulative distribution')
    fig.savefig('./figures/cross_validation_cc.jpg')
    plt.show()
Example #9
def IEM_cross_condition_l1out(testing_activity, testing_behaviour, decode_item, WM, WM_t, Inter, 
    tr_st, tr_end):
    ####
    ####
    #### IEM using WM test data
    #### IEM for those TRs where training data is also used (conditions 1_7 and 2_7)
    #### Instead of leave-one-out, which takes long, or reusing the same data (overfitting), do a k-fold with 10 splits.
    ####
    ####
    if decode_item == 'Target':
        dec_I = 'T'
    elif decode_item == 'Response':
        dec_I = 'A_R'
    elif decode_item == 'Distractor':
        dec_I = 'Dist'
    else:
        raise ValueError('Error specifying the decode item')
    ####
    #### Get the TRs with shared information and the TRs without shared information
    trs_shared = range(tr_st, tr_end)
    # range objects have no .remove(); build the filtered list directly
    list_wm_scans2 = [tr for tr in range(nscans_wm) if tr not in trs_shared]
    ####
    #### Run the ones without shared information the same way
    testing_angles = np.array(testing_behaviour[dec_I])    # A_R # T # Dist
    ### Representation
    signal_paralel = [testing_activity[:, i, :] for i in list_wm_scans2]
    Reconstructions = Parallel(n_jobs=numcores)(delayed(Representation)(signal, testing_angles, WM, WM_t, ref_angle=180, plot=False, intercept=Inter) for signal in signal_paralel)  #### standard reconstruction (parallel)
    Reconstruction_indep = pd.concat(Reconstructions, axis=1)  # one column per TR
    Reconstruction_indep.columns = [str(i * TR) for i in list_wm_scans2]  ## column names
    ####
    #### Run the ones with shared information: k fold
    Recons_dfs_shared=[]
    for shared_TR in trs_shared:
        testing_data= testing_activity[:, shared_TR, :]            
        reconstrction_sh=[]
        loo = LeaveOneOut()
        for train_index, test_index in loo.split(testing_data):
            X_train, X_test = testing_data[train_index], testing_data[test_index]
            y_train, y_test = testing_angles[train_index], testing_angles[test_index]
            ## train
            WM2, Inter2 = Weights_matrix_LM(X_train, y_train)
            WM_t2 = WM2.transpose()
            ## test
            rep_x = Representation(testing_data=X_test, testing_angles=y_test, Weights=WM2, Weights_t=WM_t2, ref_angle=180, plot=False, intercept=Inter2)
            reconstrction_sh.append(rep_x)
        ###
        reconstrction_sh = pd.concat(reconstrction_sh, axis=1)  ## side by side; now take a mean that keeps the index
        reconstrction_sh_mean = reconstrction_sh.mean(axis=1)  # a single column remains: the mean of each channel
        Recons_dfs_shared.append(reconstrction_sh_mean)
    ####
    Reconstruction_shared = pd.concat(Recons_dfs_shared, axis=1)
    Reconstruction_shared.columns =  [str(i * TR) for i in trs_shared ]  
    #### 
    #### Merge both reconstruction dfs to get a single one
    Reconstruction = pd.concat([Reconstruction_indep, Reconstruction_shared], axis=1)
    ### sort the columns so the independent TRs do not end up at the end
    sorted_col = np.sort([float(Reconstruction.columns[i]) for i in range(len(Reconstruction.columns))])           
    sorted_col = [str(sorted_col[i]) for i in range(len(sorted_col))]
    Reconstruction = Reconstruction.reindex( sorted_col, axis=1)  
    #
    return Reconstruction
Example #10
def main():

    parser = argparse.ArgumentParser(description='PyTorch Cell predict')

    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')

    parser.add_argument('--num-epochs', type=int, default=50,
                        help='number of training epochs (default: 50)')

    parser.add_argument('--model-path', default='../results/',
                        help='path to saved models (default: ../results/)')

    parser.add_argument('--data-dir', default='../data',
                        help='path to the directory with data set (default: ../data)')

    parser.add_argument('--experiment', default='exp_1',
                        help='name of the experiment (default: exp_1)')

    parser.add_argument('--description', default='',
                        help='description of the experiment (default: empty)')

    args = parser.parse_args()

    data_dir = args.data_dir
    model_path = args.model_path
    experiment = args.experiment
    description = args.description

    model_path = os.path.join(model_path, experiment)
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    with open(os.path.join(model_path, 'description.txt'), 'w') as f:
        f.write("%s\n" % description)

    use_gpu = torch.cuda.is_available()

    # Identical augmentation pipelines for both class labels (0 and 1) in
    # each phase.
    train_tf = Compose([
        Rotate(15),
        CenterCrop(224, 224),
        VerticalFlip(),
        HorizontalFlip(),
        HueSaturationValue(hue_shift_limit=50, sat_shift_limit=50,
                           val_shift_limit=40),
        ToTensor()])
    eval_tf = Compose([
        CenterCrop(224, 224),
        ToTensor()])

    data_transforms = {
        'train': {0: train_tf, 1: train_tf},
        'val': {0: eval_tf, 1: eval_tf},
        'test': {0: eval_tf, 1: eval_tf},
    }

    target_transform = change_classes
    loo = LeaveOneOut()

    folds = np.array(['fold_0','fold_1','fold_2'])

    device  = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    for n, (tr, vl) in enumerate(loo.split(folds)):
        if True:
            model_ft = pretrainedmodels.__dict__['resnet18'](num_classes=1000, pretrained='imagenet')
            model_ft.last_linear = nn.Linear(512, 2)
    
    
            if use_gpu:
                model_ft = model_ft.to(device)
            criterion = FocalLoss(gamma=0.3, alpha=None, size_average=False)
            params_to_train = model_ft.parameters()
            optimizer_ft = optim.Adam(params_to_train)
            plat_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer_ft,
                                                                   'min', patience=3,
                                                                   factor=0.95, verbose=True)
            train_folds = folds[tr]
            val_folds = folds[vl]
            test_data = ['fold_3']
    
            dataloaders, dataset_sizes, class_names = loader(data_transforms, train_folds, val_folds,
                                                             test_data, data_dir, bs=args.batch_size,
                                                             target_transform=target_transform)
    
            model_ft, best_score = train_model(model_ft, criterion, optimizer_ft,
                                       plat_lr_scheduler,
                                       dataset_sizes=dataset_sizes,
                                       model_path=model_path,
                                       dataloaders=dataloaders,
                                       device=device, 
                                       num_epochs=args.num_epochs,
                                       fold_name=folds[vl][0], best='loss')
    
            torch.save(model_ft.state_dict(),
                       os.path.join(model_path, 'val_' + folds[vl][0] + '_f1_05_' + str(best_score).replace('.', '')))
            del criterion, optimizer_ft, plat_lr_scheduler
            torch.cuda.empty_cache()
            gc.collect()
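
# Here LeaveOneOut is applied to an array of fold names rather than to
# samples, so each iteration holds out one whole fold for validation. That
# use, isolated from the training code above:
import numpy as np
from sklearn.model_selection import LeaveOneOut

folds = np.array(['fold_0', 'fold_1', 'fold_2'])
for tr, vl in LeaveOneOut().split(folds):
    print('train on', folds[tr], '| validate on', folds[vl])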
Example #11
lr = LinearDiscriminantAnalysis()
lr.fit(X_train_pca, y_train)
y_pred = lr.predict(X_test_pca)
print("Accuracy score:{:.2f}".format(metrics.accuracy_score(y_test, y_pred)))

cm = metrics.confusion_matrix(y_test, y_pred)

plt.subplots(1, figsize=(16, 8))
sns.heatmap(cm)
plt.show()

print("Classification Results:\n{}".format(metrics.classification_report(y_test, y_pred)))

from sklearn.model_selection import LeaveOneOut

loo_cv = LeaveOneOut()
clf = LogisticRegression()
cv_scores = cross_val_score(clf, X_pca, target, cv=loo_cv)
print("{} Leave One Out cross-validation mean accuracy score:{:.2f}".format(clf.__class__.__name__, cv_scores.mean()))

loo_cv = LeaveOneOut()
clf = LinearDiscriminantAnalysis()
cv_scores = cross_val_score(clf, X_pca, target, cv=loo_cv)
print("{} Leave One Out cross-validation mean accuracy score:{:.2f}".format(clf.__class__.__name__, cv_scores.mean()))

from sklearn.model_selection import GridSearchCV

params = {'penalty': ['l2'], 'C': np.logspace(0, 4, 10)}

clf = LogisticRegression()
Example #12
def plot_fit(
    results,
    phenotype,
    variable_type="binary",
    variable_name="phenotype",
    filename="fit_%s.html" % datetime.now().strftime("%Y%m%d"),
    flux_type="production",
    min_coef=0.001,
    atol=1e-6
):
    """Test for differential metabolite production.

    This will fit the `phenotype` response using L1-regularized linear models
    with log-fluxes as features. Will use LASSO regression for a continuous
    response and L1-regularized Logistic regression for a binary response.

    Parameters
    ----------
    results : micom.workflows.GrowthResults
        The results returned by the `grow` workflow.
    phenotype : pandas.Series
        The data to be fitted. Its index must correspond to `sample_id` in
        `exchanges`.
    variable_type : str of ["binary", "continuous"]
        The type of the variable.
    variable_name : str
        A short description of the phenotype for instance "disease_status".
    filename : str
        The HTML file where the visualization will be saved.
    flux_type : str of ["import", "production"]
        Whether to fit using import or production fluxes.
    min_coef : float in [0.0, Inf]
        Only report coefficient that are at least that large.
    atol : float
        Tolerance to consider a flux different from zero. Should be roughly equivalent
        to the solver tolerance.

    Returns
    -------
    Visualization
        A MICOM visualization. Can be served with `viz.view`.

    """
    exchanges = results.exchanges
    anns = results.annotations
    anns.index = anns.metabolite
    if flux_type == "import":
        exchanges = exchanges[
            (exchanges.taxon == "medium") & (exchanges.direction == "import")
        ]
        exchanges["flux"] = exchanges.flux.abs()
    else:
        exchanges = exchanges[
            (exchanges.taxon != "medium") & (exchanges.direction == "export")
        ]
        exchanges = (
            exchanges.groupby(["reaction", "metabolite", "sample_id"])
            .apply(
                lambda df: pd.Series(
                    {"flux": sum(df.abundance * df.flux.abs())}
                )
            )
            .reset_index()
        )
    exchanges = exchanges.loc[exchanges.flux > atol]
    if exchanges.shape[0] < 1:  # rows = flux records that passed the threshold
        raise ValueError("None of the fluxes passed the tolerance threshold :(")
    if variable_type == "binary" and phenotype.nunique() != 2:
        raise ValueError(
            "Binary variables must have exactly two unique values, yours "
            "has: %s." % ", ".join(phenotype.unique())
        )
    elif variable_type == "continuous" and not is_numeric_dtype(phenotype):
        raise ValueError(
            "Continuous variables must have a numeric type, but yours is"
            " of type `%s`." % phenotype.dtype
        )
    elif variable_type not in ["binary", "continuous"]:
        raise ValueError(
            "Unsupported variable type. Must be either `binary` or "
            "`continuous`."
        )

    fluxes = exchanges.pivot_table(
        index="sample_id", columns="metabolite", values="flux", fill_value=atol
    )
    fluxes = fluxes.applymap(np.log)
    meta = phenotype[fluxes.index]
    stds = fluxes.std(axis=1)
    bad = stds < 1e-6
    if bad.any():
        logger.warning("Removing %d fluxes due to zero variance." % bad.sum())
        fluxes = fluxes.loc[:, ~bad]
    scaled = StandardScaler().fit_transform(fluxes)
    if variable_type == "binary":
        model = LogisticRegressionCV(
            penalty="l1",
            scoring="accuracy",
            solver="liblinear",
            cv=2,
            Cs=np.power(10.0, np.arange(-6, 6, 0.5)),
            max_iter=50000,
        )
        fit = model.fit(scaled, meta)
        model = LogisticRegression(
            penalty="l1", solver="liblinear", C=fit.C_[0], max_iter=10000,
        )
        fit = model.fit(scaled, meta)
        score = cross_val_score(model, X=scaled, y=meta, cv=LeaveOneOut())
        coefs = pd.DataFrame(
            {"coef": fit.coef_[0, :], "metabolite": fluxes.columns}
        )
    else:
        model = LassoCV(cv=2, max_iter=50000)
        fit = model.fit(scaled, meta)
        model = Lasso(alpha=fit.alpha_, max_iter=50000)
        fit = model.fit(scaled, meta)
        score = cross_val_score(model, X=scaled, y=meta, cv=3)
        coefs = pd.DataFrame({"coef": fit.coef_, "metabolite": fluxes.columns})
    coefs["description"] = anns.loc[coefs.metabolite, "name"].values
    score = [np.mean(score), np.std(score)]
    score.append(model.score(scaled, meta))

    if all(coefs.coef.abs() < min_coef):
        raise RuntimeError(
            "Unfortunately no metabolite flux was predictive for the "
            "chosen phenotype and a cutoff of %g :(" % min_coef
        )

    data = {"fluxes": exchanges, "coefficients": coefs}
    coefs = coefs[coefs.coef.abs() >= min_coef].sort_values(by="coef")
    predicted = cross_val_predict(model, scaled, meta, cv=LeaveOneOut())
    fitted = pd.DataFrame(
        {"real": meta, "predicted": predicted}, index=meta.index
    )

    exchanges = exchanges.loc[
        exchanges.metabolite.isin(coefs.metabolite.values)
    ]
    exchanges["meta"] = meta[exchanges.sample_id].values
    exchanges["description"] = anns.loc[exchanges.metabolite, "name"].values
    var_type = "nominal" if variable_type == "binary" else "quantitative"
    viz = Visualization(filename, data, "tests.html")

    viz.save(
        fitted=fitted.to_json(orient="records"),
        coefs=coefs.to_json(orient="records"),
        exchanges=exchanges.to_json(orient="records"),
        metabolites=json.dumps(coefs.metabolite.tolist()),
        variable=variable_name,
        type=var_type,
        score=score,
        width=400,
        height=300,
        cheight=max(2 * coefs.shape[0], 40),
        cwidth=max(8 * coefs.shape[0], 160),
    )

    return viz
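
# The fitting pattern above: tune C with LogisticRegressionCV, refit a plain
# L1 LogisticRegression at the chosen C, then report leave-one-out accuracy.
# A self-contained sketch of that pattern on synthetic data (the data and
# grids here are illustrative):
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=30, n_features=10, random_state=0)
X = StandardScaler().fit_transform(X)
cv_model = LogisticRegressionCV(penalty="l1", solver="liblinear", cv=2,
                                Cs=np.power(10.0, np.arange(-3, 3, 0.5)))
fit = cv_model.fit(X, y)
model = LogisticRegression(penalty="l1", solver="liblinear", C=fit.C_[0])
loo_acc = cross_val_score(model, X, y, cv=LeaveOneOut())
print(loo_acc.mean())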
Example #13
def per_voxel_analysis(model, fmri_runs, design_matrices, subject, alpha_list):
    # compute alphas and test score with cross validation
    #   - fmri_runs: list of fMRI data runs (1 for each run)
    #   - design_matrices: list of design matrices (1 for each run)
    #   - nb_voxels: number of voxels
    #   - indexes: dict specifying row indexes for each run
    # n_sample = min(max(100 * design_matrices[0].shape[1], design_matrices[0].shape[0]), 8000)
    n_sample = params.n_sample
    nb_voxels = fmri_runs[0].shape[1]
    nb_alphas = len(alpha_list)
    nb_runs_test = len(fmri_runs)
    nb_runs_valid = nb_runs_test - 1
    alphas_cv2 = np.zeros((nb_runs_test, nb_voxels))
    scores_cv2 = np.zeros((nb_runs_test, nb_voxels))
    distribution_array = np.zeros((nb_runs_test, n_sample, nb_voxels))

    # loop for r2 computation
    cv3 = 0
    logo = LeaveOneOut()  # leave one run out!
    columns_index = np.arange(design_matrices[0].shape[1])
    shuffling = []
    for _ in range(n_sample):
        np.random.shuffle(columns_index)
        shuffling.append(columns_index.copy())  # copy: shuffle works in place
    for train_, test in logo.split(fmri_runs):
        fmri_data_train_ = [
            fmri_runs[i] for i in train_
        ]  # fmri_runs: list of 2D arrays, columns = voxels, each row = a time point
        predictors_train_ = [design_matrices[i] for i in train_]

        # Allocate before the inner loop so the scores from every validation
        # run are kept (re-creating the array inside the loop would
        # overwrite previous folds).
        scores_cv1 = np.zeros((nb_voxels, nb_runs_valid, nb_alphas))
        cv2 = 0
        logo2 = LeaveOneOut()  # leave one run out!
        for train, valid in logo2.split(fmri_data_train_):
            fmri_data_train = [
                fmri_data_train_[i] for i in train
            ]  # list of 2D arrays: columns = voxels, each row = a time point
            predictors_train = [predictors_train_[i] for i in train]
            dm = np.vstack(predictors_train)
            fmri = np.vstack(fmri_data_train)

            cv1 = 0
            for alpha_tmp in tqdm(
                    alpha_list
            ):  # compute the r2 for a given alpha for all the voxel
                start = time()
                model.set_params(alpha=alpha_tmp)
                model_fitted = model.fit(dm, fmri)
                # to delete
                with open(
                        os.path.join(
                            "/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/derivatives/fMRI/ridge-indiv/english/sub-057/yaml_files",
                            'fitting_time.txt'), 'a+') as f:
                    f.write(
                        'alpha = {}- Fitted in {} s on chris station.'.format(
                            alpha_tmp,
                            time() - start))
                    f.write('\n')
                # end of to delete
                r2 = get_r2_score(model_fitted, fmri_data_train_[valid[0]],
                                  predictors_train_[valid[0]])
                scores_cv1[:, cv2, cv1] = r2
                cv1 += 1
            cv2 += 1
        best_alphas_indexes = np.argmax(np.mean(scores_cv1, axis=1), axis=1)
        alphas_cv2[cv3, :] = np.array(
            [alpha_list[i] for i in best_alphas_indexes])
        fmri2 = np.vstack(fmri_data_train_)
        dm2 = np.vstack(predictors_train_)
        for voxel in tqdm(
                range(nb_voxels)
        ):  # loop through the voxels and fit the model with the best alpha for this voxel
            y = fmri2[:, voxel].reshape((fmri2.shape[0], 1))
            model.set_params(alpha=alphas_cv2[cv3, voxel])
            model_fitted = model.fit(dm2, y)
            # scores_cv2[cv3, voxel] = get_r2_score(model_fitted,
            #                                 fmri_runs[test[0]][:,voxel].reshape((fmri_runs[test[0]].shape[0],1)),
            #                                 design_matrices[test[0]])
            r2, distribution = sample_r2(
                model_fitted,
                design_matrices[test[0]],
                fmri_runs[test[0]][:, voxel].reshape(
                    (fmri_runs[test[0]].shape[0], 1)),
                shuffling=shuffling,
                n_sample=n_sample,
                alpha_percentile=params.alpha_percentile,
                test=True)
            scores_cv2[cv3, voxel] = r2[0]
            distribution_array[cv3, :, voxel] = distribution

            # log the results
            # log(subject, voxel=voxel, alpha=alphas_cv2[cv3, voxel], r2=scores_cv2[cv3, voxel])
        cv3 += 1

    return alphas_cv2, scores_cv2, distribution_array  # 2D arrays : (nb_runs_test, nb_voxels)
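
# per_voxel_analysis is a nested leave-one-run-out scheme: an outer LOO over
# runs for testing and an inner LOO over the remaining runs to select the
# ridge alpha. The same structure in miniature (synthetic data, one target;
# names are illustrative):
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import LeaveOneOut

rng = np.random.default_rng(0)
runs_X = [rng.normal(size=(50, 5)) for _ in range(4)]
runs_y = [x @ np.ones(5) + rng.normal(size=50) for x in runs_X]
alphas = [0.1, 1.0, 10.0]

for train_, test in LeaveOneOut().split(runs_X):
    mean_scores = []
    for alpha in alphas:  # inner loop: validate each alpha on held-out runs
        scores = []
        for train, valid in LeaveOneOut().split(train_):
            tr = [train_[i] for i in train]
            va = train_[valid[0]]
            model = Ridge(alpha=alpha).fit(
                np.vstack([runs_X[i] for i in tr]),
                np.hstack([runs_y[i] for i in tr]))
            scores.append(model.score(runs_X[va], runs_y[va]))
        mean_scores.append(np.mean(scores))
    best = alphas[int(np.argmax(mean_scores))]
    model = Ridge(alpha=best).fit(np.vstack([runs_X[i] for i in train_]),
                                  np.hstack([runs_y[i] for i in train_]))
    print('test run', test[0], 'R2 = %.3f' % model.score(runs_X[test[0]],
                                                         runs_y[test[0]]))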
Example #14
def _plot_kde_vs_gaussian_absolute(x,
                                   ks2samp_txt,
                                   plname,
                                   err_prediction_seconds,
                                   manualkdebandwidth=None):

    # get the kernel bandwidth for the KDE
    if not manualkdebandwidth:
        bandwidths = 10**np.linspace(-1, 1, 100)
        params = {'bandwidth': bandwidths}
        grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                            params,
                            cv=LeaveOneOut())
        grid.fit(x[:, None])

        print('ran grid search for best kernel width.')
        bandwidth = grid.best_params_['bandwidth']
        print('got {:.3g}'.format(bandwidth))
    else:
        bandwidth = manualkdebandwidth

    # instantiate and fit the KDE model
    kde = KernelDensity(bandwidth=bandwidth, kernel='gaussian')
    kde.fit(x[:, None])

    # score_samples returns the log of the probability density
    meanerr = np.mean(err_prediction_seconds)
    x_d = np.linspace(-20 * meanerr, 20 * meanerr, num=1000)
    logprob = kde.score_samples(x_d[:, None])

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.fill_between(x_d, np.exp(logprob), alpha=0.5, label='KDE from data')
    ax.plot(x, np.full_like(x, 0), '|k', markeredgewidth=1, label='data')

    ax.plot(x_d,
            norm.pdf(x_d, loc=0, scale=meanerr),
            label=r'gaussian, $\mu=0$, $\sigma={:.3g} sec$'.format(meanerr))

    sigtxt = (r'1$\sigma$ error in prediction: {:.1f} seconds'.format(meanerr))
    if not manualkdebandwidth:
        txt = (
            'leaveoneout x-validated KDE bandwidth: {:.3g}\n{:s}\n{:s}'.format(
                bandwidth, ks2samp_txt, sigtxt))
    else:
        txt = ('manually selected KDE bandwidth: {:.3g} seconds\n{:s}\n{:s}'.
               format(bandwidth, ks2samp_txt, sigtxt))
    ax.text(0.02,
            0.98,
            txt,
            transform=ax.transAxes,
            color='gray',
            fontsize='xx-small',
            va='top',
            ha='left')

    ax.set_xlabel('Observed - Prediction [seconds]')

    if plname == 'WASP-18b':
        loc = 'center left'
    else:
        loc = 'best'
    ax.legend(loc=loc, fontsize='x-small')

    ax.get_yaxis().set_tick_params(which='both', direction='in')
    ax.get_xaxis().set_tick_params(which='both', direction='in')
    fig.tight_layout(h_pad=0, w_pad=0)

    ax.set_xlim([np.mean(x) - 10 * np.std(x), np.mean(x) + 10 * np.std(x)])

    savdir = '../results/verify_tess_timestamps/'
    savname = '{:s}_kde_vs_gaussian_absolute.png'.format(plname)
    savpath = os.path.join(savdir, savname)

    fig.tight_layout()
    fig.savefig(savpath, bbox_inches='tight', dpi=400)
    print('saved {:s}'.format(savpath))
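
# The grid search above maximizes held-out log-likelihood: with
# cv=LeaveOneOut(), KernelDensity.score evaluates each left-out point under
# a KDE fit to the remaining points. The pattern distilled (synthetic data;
# the bandwidth grid is illustrative):
import numpy as np
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.neighbors import KernelDensity

x = np.random.default_rng(0).normal(size=50)
grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                    {'bandwidth': 10**np.linspace(-1, 1, 20)},
                    cv=LeaveOneOut())
grid.fit(x[:, None])
print(grid.best_params_['bandwidth'])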
Example #15
    def grid_search_loocv_SMOTE(self,
                                X_train,
                                y_train,
                                X_test,
                                y_test,
                                y_test_patients,
                                params,
                                Classifier,
                                oversampling=False,
                                pos_label=1,
                                average='macro'):

        # kf = KFold(n)
        loo = LeaveOneOut()

        # y_pred = list()
        print('---')

        best_f1_score = 0.0
        best_config = None

        for configuration in ParameterGrid(params):
            # myFunction(**configuration)

            # print('SMOTE config:',configuration)

            clf = Classifier(**configuration)
            y_pred = list()

            # Leave one out
            for train_indices, test_indices in loo.split(X_train):
                X_train_curr = X_train[train_indices]
                X_test_curr = X_train[test_indices]
                y_train_curr = y_train[train_indices]
                y_test_curr = y_train[test_indices]

                ## print('--- SMOTE ---')
                # print(X_train_curr.shape, y_train_curr.shape)
                # unique, counts = np.unique(y_train_curr, return_counts=True)
                # print(np.asarray((unique, counts)).transpose())
                sampler = SMOTE(random_state=42)
                X_train_curr, y_train_curr = sampler.fit_resample(
                    X_train_curr, y_train_curr)
                # print(X_train_curr.shape, y_train_curr.shape)
                # unique, counts = np.unique(y_train_curr, return_counts=True)
                # print(np.asarray((unique, counts)).transpose())
                ## print('------')

                clf.fit(X_train_curr, y_train_curr)
                y_pred_curr = clf.predict(X_test_curr)
                y_pred.append(y_pred_curr)

            y_pred = np.concatenate(y_pred)  # flatten the per-split single-sample predictions
            f1 = f1_score(y_train,
                          y_pred,
                          pos_label=pos_label,
                          average=average)
            # print('clf f1-weighted', f1)
            if f1 > best_f1_score:
                best_f1_score = f1
                best_config = configuration

        print('\nBest configuration found:', best_config)
        print('With f1-score ' + str(average) + ':', best_f1_score)
        clf = Classifier(**best_config)

        sampler = SMOTE(random_state=42)
        X_train, y_train = sampler.fit_resample(X_train, y_train)

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        f1_weighted = f1_score(y_test,
                               y_pred,
                               pos_label=pos_label,
                               average='weighted')

        p_macro, r_macro, f1_macro, s = precision_recall_fscore_support(
            y_test, y_pred, average='macro')

        p_none, r_none, f1_none, s_none = precision_recall_fscore_support(
            y_test, y_pred, average=None)
        # print('NUOVE:',p,r,f1)

        conf_mat = confusion_matrix(y_test, y_pred)
        print('CONF\n', conf_mat)
        print('\n')

        print('\nEVALUATION ON TEST SET:')
        print('f1-score (weighted) ' + str(average) + ':', f1_weighted)
        print('accuracy:', accuracy)

        # --- Compute # of PD patients correctly recalled
        n_pd_patients_recalled = 0
        n_pd_patients = 0
        n_healthy_patients_recalled = 0
        n_healthy_patients = 0
        unique_names = set(y_test_patients)

        for name in unique_names:
            if len(y_pred[(y_test_patients == name) & (y_test == 1)]) > 0:
                curr_mean = np.mean(y_pred[(y_test_patients == name)
                                           & (y_test == 1)])
                print('PD patient:', curr_mean)
                if curr_mean > 0.5:
                    n_pd_patients_recalled += 1
                n_pd_patients += 1
            else:
                curr_mean = 1 - np.mean(
                    y_pred[(y_test_patients == name) & (y_test == 0)])
                print('Healthy patient:', curr_mean)
                if curr_mean > 0.5:
                    n_healthy_patients_recalled += 1
                n_healthy_patients += 1

        print('# PD patients correctly recalled:',
              str(n_pd_patients_recalled) + '/' + str(n_pd_patients))
        print('# Healthy patients correctly recalled:',
              str(n_healthy_patients_recalled) + '/' + str(n_healthy_patients))

        print('\n----\n----\n')

        print('CLASSIFIER', self.DO_THIS_CLASSIFIER)
        print('PCA', self.DO_PCA)
        print('SMOTE', self.DO_SMOTE)

        print('accuracy', 'f1_weighted', 'f1_macro', 'precision_macro',
              'recall_macro', '#PDPatientsRecalled',
              '#HealthyPatientsRecalled, p1, r1, p0, r0, f1, f0')
        print([
            round(accuracy, 4),
            round(f1_weighted, 4),
            round(f1_macro, 4),
            round(p_macro, 4),
            round(r_macro, 4), n_pd_patients_recalled,
            n_healthy_patients_recalled,
            round(p_none[1], 4),
            round(r_none[1], 4),
            round(p_none[0], 4),
            round(r_none[0], 4),
            round(f1_none[1], 4),
            round(f1_none[0], 2)
        ])

        print('\n----\n----\n')

        print('\n--------\nEND GRID-SEARCH\n--------\n')

        return
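
# SMOTE is re-run inside every leave-one-out split above, so synthetic
# minority samples are generated from training data only. The same guarantee
# via an imblearn Pipeline (a sketch assuming imbalanced-learn is installed;
# the data and classifier are illustrative):
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from sklearn.datasets import make_classification
from sklearn.metrics import f1_score
from sklearn.model_selection import LeaveOneOut, cross_val_predict
from sklearn.svm import SVC

X, y = make_classification(n_samples=40, weights=[0.7], random_state=42)
pipe = make_pipeline(SMOTE(random_state=42), SVC())
# the pipeline re-fits SMOTE on each training fold; test folds stay untouched
y_pred = cross_val_predict(pipe, X, y, cv=LeaveOneOut())
print(f1_score(y, y_pred))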
Example #16
def IEM_cross_condition_l1out_shuff(testing_activity, testing_behaviour, decode_item, WM, WM_t, Inter, condition, subject, region,
    iterations, tr_st, tr_end, ref_angle=180):
    ####
    ####
    #### IEM using WM test data
    #### IEM for those TRs where training data is also used (conditions 1_7 and 2_7)
    #### Instead of leave-one-out, which takes long, or reusing the same data (overfitting), do a k-fold with 10 splits.
    #### The shuffle is done at the start, according to the number of iterations
    ####
    ####
    if decode_item == 'Target':
        dec_I = 'T'
    elif decode_item == 'Response':
        dec_I = 'A_R'
    elif decode_item == 'Distractor':
        dec_I = 'Dist'
    else:
        raise ValueError('Error specifying the decode item')
    ####
    #### Get the TRs with shared information and the TRs without shared information
    trs_shared = range(tr_st, tr_end)
    # range objects have no .remove(); build the filtered list directly
    list_wm_scans2 = [tr for tr in range(nscans_wm) if tr not in trs_shared]
    ####
    #### Run the ones without shared information the same way
    testing_angles = np.array(testing_behaviour[dec_I])    # A_R # T # Dist
    Reconstructions_shuffled=[]
    for It in range(iterations):
        testing_angles_shuff = np.array([random.choice([0, 90, 180, 270]) for i in range(len(testing_angles))])
        signal_paralel = [testing_activity[:, i, :] for i in list_wm_scans2]
        Reconstructions = Parallel(n_jobs=numcores)(delayed(Representation)(signal, testing_angles_shuff, WM, WM_t, ref_angle=180, plot=False, intercept=Inter) for signal in signal_paralel)  #### standard reconstruction (parallel)
        Reconstruction_indep = pd.concat(Reconstructions, axis=1)  # one column per TR
        Reconstruction_indep.columns = [str(i * TR) for i in list_wm_scans2]  ## column names
        ###
        #### Run the ones with shared information: k fold
        Recons_dfs_shared=[]
        for shared_TR in trs_shared:
            testing_data= testing_activity[:, shared_TR, :] 
            reconstrction_sh=[]
            loo = LeaveOneOut()
            for train_index, test_index in loo.split(testing_data):
                X_train, X_test = testing_data[train_index], testing_data[test_index]
                y_train, y_test = testing_angles[train_index], testing_angles[test_index]  ## no shuffle here: WM and WM_t were not trained on shuffled data
                ## train
                WM2, Inter2 = Weights_matrix_LM(X_train, y_train)
                WM_t2 = WM2.transpose()
                ## do the shuffle here!
                y_test = np.array([random.choice([0, 90, 180, 270]) for i in range(len(y_test))]) 
                ## test
                rep_x = Representation(testing_data=X_test, testing_angles=y_test, Weights=WM2, Weights_t=WM_t2, ref_angle=180, plot=False, intercept=Inter2)
                reconstrction_sh.append(rep_x)
            ###
            reconstrction_sh = pd.concat(reconstrction_sh, axis=1)  ## side by side; now take a mean that keeps the index
            reconstrction_sh_mean = reconstrction_sh.mean(axis=1)  # a single column remains: the mean of each channel
            Recons_dfs_shared.append(reconstrction_sh_mean)
        ####
        Reconstruction_shared = pd.concat(Recons_dfs_shared, axis=1)
        Reconstruction_shared.columns =  [str(i * TR) for i in trs_shared ]   
        #### 
        #### Merge both reconstruction dfs to get a single one
        Reconstruction = pd.concat([Reconstruction_indep, Reconstruction_shared], axis=1)
        ### sort the columns so the independent TRs do not end up at the end
        sorted_col = np.sort([float(Reconstruction.columns[i]) for i in range(len(Reconstruction.columns))])           
        sorted_col = [str(sorted_col[i]) for i in range(len(sorted_col))]
        Reconstruction = Reconstruction.reindex( sorted_col, axis=1)  
        #      
        Reconstructions_shuffled.append(Reconstruction)
        ##
    ######
    ###### Keep only what we need
    ### Get just the supposed target location
    df_shuffle=[]
    for i in range(len(Reconstructions_shuffled)):
        n = Reconstructions_shuffled[i].iloc[ref_angle*2, :] # around the ref_angle (x2 because now we have 720 instead of 360)
        n = n.reset_index()
        n.columns = ['times', 'decoding']
        n['decoding'] = [sum(Reconstructions_shuffled[i].iloc[:, ts] * f2(ref_angle)) for ts in range(len(n))] #population vector method (scalar product)
        n['times']=n['times'].astype(float)
        n['region'] = region
        n['subject'] = subject
        n['condition'] = condition
        df_shuffle.append(n)  # save this
    
    ##
    df_shuffle = pd.concat(df_shuffle)  # same shape as the decoding of the signal
    return df_shuffle
Example #17
    def grid_search_loocv(self,
                          X_train,
                          y_train,
                          X_test,
                          y_test,
                          y_test_patients,
                          params,
                          Classifier,
                          oversampling=False,
                          pos_label=1,
                          average='macro',
                          columns_new=[]):

        best_f1_score = 0.0
        best_config = None

        all_scores = []

        for configuration in ParameterGrid(params):
            print(configuration)

            clf = Classifier(**configuration)

            y_pred_val = cross_val_predict(clf,
                                           X_train,
                                           y_train,
                                           cv=LeaveOneOut())

            f1 = f1_score(y_train,
                          y_pred_val,
                          pos_label=pos_label,
                          average=average)
            if f1 > best_f1_score:
                best_f1_score = f1
                best_config = configuration
            all_scores.append(f1)

        print('all scores:', all_scores)

        print('\nBest configuration found:', best_config)
        print('With f1-score ' + str(average) + ':', best_f1_score)
        clf = Classifier(**best_config)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        f1_weighted = f1_score(y_test,
                               y_pred,
                               pos_label=pos_label,
                               average='weighted')

        p_macro, r_macro, f1_macro, s = precision_recall_fscore_support(
            y_test, y_pred, average='macro')

        p_none, r_none, f1_none, s_none = precision_recall_fscore_support(
            y_test, y_pred, average=None)
        # print('NUOVE:',p,r,f1)

        conf_mat = confusion_matrix(y_test, y_pred)
        print('CONF\n', conf_mat)
        print('\n')

        print('\nEVALUATION ON TEST SET:')
        print('f1-score (weighted) ' + str(average) + ':', f1_weighted)
        print('accuracy:', accuracy)

        # --- Compute # of PD patients correctly recalled
        n_pd_patients_recalled = 0
        n_pd_patients = 0
        n_healthy_patients_recalled = 0
        n_healthy_patients = 0
        unique_names = set(y_test_patients)

        for name in unique_names:
            if len(y_pred[(y_test_patients == name) & (y_test == 1)]) > 0:
                curr_mean = np.mean(y_pred[(y_test_patients == name)
                                           & (y_test == 1)])
                print('PD patient:', curr_mean)
                if curr_mean > 0.5:
                    n_pd_patients_recalled += 1
                n_pd_patients += 1
            else:
                curr_mean = 1 - np.mean(
                    y_pred[(y_test_patients == name) & (y_test == 0)])
                print('Healthy patient:', curr_mean)
                if curr_mean > 0.5:
                    n_healthy_patients_recalled += 1
                n_healthy_patients += 1

        print('# PD patients correctly recalled:',
              str(n_pd_patients_recalled) + '/' + str(n_pd_patients))
        print('# Healthy patients correctly recalled:',
              str(n_healthy_patients_recalled) + '/' + str(n_healthy_patients))

        print('\n----\n----\n')

        print('CLASSIFIER', self.DO_THIS_CLASSIFIER)
        print('PCA', self.DO_PCA)
        print('SMOTE', self.DO_SMOTE)

        print('accuracy', 'f1_weighted', 'f1_macro', 'precision_macro',
              'recall_macro', '#PDPatientsRecalled',
              '#HealthyPatientsRecalled, p1, r1, p0, r0, f1, f0')
        print([
            round(accuracy, 4),
            round(f1_weighted, 4),
            round(f1_macro, 4),
            round(p_macro, 4),
            round(r_macro, 4), n_pd_patients_recalled,
            n_healthy_patients_recalled,
            round(p_none[1], 4),
            round(r_none[1], 4),
            round(p_none[0], 4),
            round(r_none[0], 4),
            round(f1_none[1], 4),
            round(f1_none[0], 4)
        ])

        print('\n----\n----\n')

        # coeff_values = pd.DataFrame({'Coefficient value': clf.coef_[0], 'Features': columns_new})
        # coeff_values.sort_values(by=['Coefficient value'], inplace=True, ascending=False)

        # fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6,4.5))
        # ax = sns.barplot("Coefficient value", "Features", data=coeff_values,
        # 		               palette="Blues_d")
        # plt.show()

        print('\n--------\nEND GRID-SEARCH\n--------\n')

        #print(export_graphviz(clf, feature_names=columns_new, out_file='tree.dot'))
        # dot_data = export_graphviz(clf, feature_names=columns_new)
        # graph = graphviz.Source(dot_data)
        # graph.render("ultimo_tree")

        return
Exemplo n.º 18
0
def main():

    # Empty lists for storing masses
    fit_mass_allbands = []
    fit_mass_allbands_err = []

    fit_mass_ubriz = []
    fit_mass_ubriz_err = []

    fit_mass_briz = []
    fit_mass_briz_err = []

    # List for storing redshifts
    redshifts = []

    for field in ['North', 'South']:

        # Read in catalog from Lou
        if 'North' in field:
            df = pandas.read_pickle(adap_dir +
                                    'GOODS_North_SNeIa_host_phot.pkl')
            key = 'ID'

        elif 'South' in field:
            df = pandas.read_pickle(adap_dir +
                                    'GOODS_South_SNeIa_host_phot.pkl')
            key = 'Seq'

        # Loop over all of our objects
        for i in range(len(df)):

            # Now read in the fitting results and get our stellar masses
            galaxy_seq = df[key][i]

            h5file_allbands = adap_dir + "goodss_param_sfh/all_bands/" + "emcee_" + \
                              field + "_" + str(galaxy_seq) + ".h5"
            h5file_ubriz    = adap_dir + "goodss_param_sfh/ubriz/"     + "emcee_" + \
                              field + "_" + str(galaxy_seq) + ".h5"
            h5file_briz     = adap_dir + "goodss_param_sfh/briz/"      + "emcee_" + \
                              field + "_" + str(galaxy_seq) + ".h5"

            result_all, obs, _ = reader.results_from(h5file_allbands,
                                                     dangerous=False)
            result_ubriz, obs, _ = reader.results_from(h5file_ubriz,
                                                       dangerous=False)
            result_briz, obs, _ = reader.results_from(h5file_briz,
                                                      dangerous=False)

            cq_mass_all = get_cq_mass(result_all)
            cq_mass_ubriz = get_cq_mass(result_ubriz)
            cq_mass_briz = get_cq_mass(result_briz)

            # Append to plotting arrays
            fit_mass_allbands.append(cq_mass_all[1])
            fit_mass_allbands_lowerr = cq_mass_all[1] - cq_mass_all[0]
            fit_mass_allbands_uperr = cq_mass_all[2] - cq_mass_all[1]
            fit_mass_allbands_err.append(
                [fit_mass_allbands_lowerr, fit_mass_allbands_uperr])

            fit_mass_ubriz.append(cq_mass_ubriz[1])
            fit_mass_ubriz_lowerr = cq_mass_ubriz[1] - cq_mass_ubriz[0]
            fit_mass_ubriz_uperr = cq_mass_ubriz[2] - cq_mass_ubriz[1]
            fit_mass_ubriz_err.append(
                [fit_mass_ubriz_lowerr, fit_mass_ubriz_uperr])

            fit_mass_briz.append(cq_mass_briz[1])
            fit_mass_briz_lowerr = cq_mass_briz[1] - cq_mass_briz[0]
            fit_mass_briz_uperr = cq_mass_briz[2] - cq_mass_briz[1]
            fit_mass_briz_err.append(
                [fit_mass_briz_lowerr, fit_mass_briz_uperr])

            redshifts.append(df['zbest'][i])

    # ---------Convert to numpy arrays and transpose error pairs
    fit_mass_allbands = np.array(fit_mass_allbands)
    fit_mass_allbands_err = np.array(fit_mass_allbands_err)
    # errors were appended as [lower, upper] pairs, i.e. shape (N, 2);
    # transpose to the (2, N) layout used below (a plain reshape would
    # interleave lower and upper errors)
    fit_mass_allbands_err = fit_mass_allbands_err.T

    fit_mass_ubriz = np.array(fit_mass_ubriz)
    fit_mass_ubriz_err = np.array(fit_mass_ubriz_err)
    fit_mass_ubriz_err = fit_mass_ubriz_err.T

    fit_mass_briz = np.array(fit_mass_briz)
    fit_mass_briz_err = np.array(fit_mass_briz_err)
    fit_mass_briz_err = fit_mass_briz_err.T

    # --------
    xdata = np.log10(fit_mass_allbands)
    x_arr = np.arange(5.0, 13.0, 0.01)

    # ------------------ histogram and KDE
    fig = plt.figure()
    ax = fig.add_subplot(111)

    ax.set_xlabel(r'$\mathrm{log(M_s)}$')
    ax.set_ylabel(r'$\mathrm{Normalized\ Density}$')

    from sklearn.neighbors import KernelDensity
    from sklearn.model_selection import GridSearchCV, LeaveOneOut

    from scipy.stats import gaussian_kde

    xdata_for_kde = xdata[:, None]
    x2 = np.log10(fit_mass_ubriz)[:, None]
    x3 = np.log10(fit_mass_briz)[:, None]
    x_arr_for_kde = x_arr[:, None]

    # ---- get bandwidth estimates
    bandwidths = 10**np.linspace(-1, 1, 100)
    grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                        {'bandwidth': bandwidths},
                        cv=LeaveOneOut())
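    # LeaveOneOut here scores each candidate bandwidth by the total
    # log-likelihood of every left-out point under a KDE fit to the rest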

    grid.fit(xdata_for_kde)
    bw1 = grid.best_params_['bandwidth']
    grid.fit(x2)
    bw2 = grid.best_params_['bandwidth']
    grid.fit(x3)
    bw3 = grid.best_params_['bandwidth']

    # Now estimate KDEs with the cross-validated bandwidths
    kde1 = KernelDensity(kernel='gaussian', bandwidth=bw1).fit(xdata_for_kde)
    log_dens1 = kde1.score_samples(x_arr_for_kde)
    kde2 = KernelDensity(kernel='gaussian', bandwidth=bw2).fit(x2)
    log_dens2 = kde2.score_samples(x_arr_for_kde)
    kde3 = KernelDensity(kernel='gaussian', bandwidth=bw3).fit(x3)
    log_dens3 = kde3.score_samples(x_arr_for_kde)

    # Plot KDEs
    ax.plot(x_arr,
            np.exp(log_dens1),
            color='k',
            lw=2.5,
            label='UV-Optical-NIR-MIR',
            zorder=2)
    ax.plot(x_arr,
            np.exp(log_dens2),
            color='mediumblue',
            lw=1.4,
            label='ubriz',
            zorder=1)
    ax.plot(x_arr,
            np.exp(log_dens3),
            color='darkturquoise',
            lw=1.4,
            label='briz',
            zorder=1)

    # KDEs using Scipy
    x1_kde = gaussian_kde(xdata)
    ax.plot(x_arr, x1_kde(x_arr), ls='--', color='k', lw=2.5, zorder=2)
    x2_kde = gaussian_kde(np.log10(fit_mass_ubriz))
    ax.plot(x_arr,
            x2_kde(x_arr),
            ls='--',
            color='mediumblue',
            lw=1.4,
            zorder=2)
    x3_kde = gaussian_kde(np.log10(fit_mass_briz))
    ax.plot(x_arr,
            x3_kde(x_arr),
            ls='--',
            color='darkturquoise',
            lw=1.4,
            zorder=2)

    ax.legend(loc=2, fontsize=10, frameon=False)

    ax.set_xlim(7.5, 12.5)

    fig.savefig(adap_dir + 'mass_dist.pdf', dpi=300, bbox_inches='tight')

    # ------------------ make residual figure
    fig1 = plt.figure()
    ax1 = fig1.add_subplot(111)

    ax1.set_xlabel(r'$\mathrm{log(M_{s;\,all\ bands})}$')
    ax1.set_ylabel(
        r'$\mathrm{log(M_{s;\,all\ bands})  -  log(M_{s;\,(u)briz}) }$')

    deltamass1 = xdata - np.log10(fit_mass_ubriz)
    deltamass2 = xdata - np.log10(fit_mass_briz)

    xdata_err = np.empty((2, len(xdata)))
    deltamass1_err = np.empty((2, len(xdata)))
    deltamass2_err = np.empty((2, len(xdata)))

    for j in range(len(xdata)):
        xd = fit_mass_allbands[j]
        xdl = np.abs(np.log10(1 - fit_mass_allbands_err[0, j] / xd))
        xdu = np.log10(1 + fit_mass_allbands_err[1, j] / xd)
        xdata_err[:, j] = [xdl, xdu]

        val1 = fit_mass_ubriz[j]
        dm1l = np.abs(
            np.log10(1 - fit_mass_allbands_err[0, j] / xd) +
            np.log10(1 + fit_mass_ubriz_err[1, j] / val1))
        dm1u = np.log10(1 + fit_mass_allbands_err[1, j] /
                        xd) + np.log10(1 - fit_mass_ubriz_err[0, j] / val1)
        deltamass1_err[:, j] = [dm1l, dm1u]

        val2 = fit_mass_briz[j]
        dm2l = np.abs(
            np.log10(1 - fit_mass_allbands_err[0, j] / xd) +
            np.log10(1 + fit_mass_briz_err[1, j] / val2))
        dm2u = np.log10(1 + fit_mass_allbands_err[1, j] /
                        xd) + np.log10(1 - fit_mass_briz_err[0, j] / val2)
        deltamass2_err[:, j] = [dm2l, dm2u]

    ax1.axhline(y=0.0, ls='--', color='k', zorder=1)

    dm1_lbl = r'$\mathrm{log(M_{s;\,all}) - log(M_{s;\,ubriz})}$'
    dm2_lbl = r'$\mathrm{log(M_{s;\,all}) - log(M_{s;\,briz})}$'

    #ax1.errorbar(xdata, deltamass1, xerr=xdata_err, yerr=deltamass1_err,
    #    fmt='o', ms=2.0, elinewidth=1.0, ecolor='mediumblue',
    #    color='mediumblue', zorder=2, label=dm1_lbl)
    #ax1.errorbar(xdata, deltamass2, xerr=xdata_err, yerr=deltamass2_err,
    #    fmt='o', ms=2.0, elinewidth=1.0, ecolor='darkturquoise',
    #    color='darkturquoise', zorder=2, label=dm2_lbl)

    ax1.scatter(xdata,
                deltamass1,
                s=12,
                color='mediumblue',
                zorder=2,
                label=dm1_lbl)
    ax1.scatter(xdata,
                deltamass2,
                s=10,
                color='darkturquoise',
                zorder=2,
                label=dm2_lbl)

    # Fit a line to the points
    m1, b1 = np.polyfit(xdata, deltamass1, 1)
    m2, b2 = np.polyfit(xdata, deltamass2, 1)

    ax1.plot(x_arr, b1 + x_arr * m1, '--', color='mediumblue')
    ax1.plot(x_arr, b2 + x_arr * m2, '--', color='darkturquoise')

    print("Errors for the points and the line estimate --")

    ax1.legend(fontsize=10, frameon=False)
    ax1.set_xlim(6.8, 12.5)
    ax1.set_ylim(-1.6, 0.8)

    ax1.text(x=0.38,
             y=0.15,
             s=r'$\mathrm{Slope}\,=\,$' + "{:.2f}".format(m1),
             verticalalignment='top',
             horizontalalignment='left',
             transform=ax1.transAxes,
             color='mediumblue',
             size=14)
    ax1.text(x=0.38,
             y=0.11,
             s=r'$\mathrm{Slope}\,=\,$' + "{:.2f}".format(m2),
             verticalalignment='top',
             horizontalalignment='left',
             transform=ax1.transAxes,
             color='darkturquoise',
             size=14)

    fig1.savefig(adap_dir + 'mass_residuals.pdf', dpi=300, bbox_inches='tight')

    # --------------
    # Histograms of measurement significance
    allbands_sig = fit_mass_allbands / np.mean(fit_mass_allbands_err, axis=0)
    ubriz_sig = fit_mass_ubriz / np.mean(fit_mass_ubriz_err, axis=0)
    briz_sig = fit_mass_briz / np.mean(fit_mass_briz_err, axis=0)

    fig2 = plt.figure()
    ax2 = fig2.add_subplot(111)

    # Reshape data
    allbands_sig_kde = allbands_sig[:, None]
    ubriz_sig_kde = ubriz_sig[:, None]
    briz_sig_kde = briz_sig[:, None]

    xsig = np.arange(0.0, 10.0, 0.01)
    xsig_kde = xsig[:, None]

    # Estimate optimal bandwidths
    # I think I can use the same grid of bandwidths as before
    grid.fit(allbands_sig_kde)
    bw1 = grid.best_params_['bandwidth']
    grid.fit(ubriz_sig_kde)
    bw2 = grid.best_params_['bandwidth']
    grid.fit(briz_sig_kde)
    bw3 = grid.best_params_['bandwidth']

    print("BW1:", bw1)

    # Now estimate KDEs with the cross-validated bandwidths
    kde1 = KernelDensity(kernel='gaussian',
                         bandwidth=bw1).fit(allbands_sig_kde)
    log_dens1 = kde1.score_samples(xsig_kde)
    kde2 = KernelDensity(kernel='gaussian', bandwidth=bw2).fit(ubriz_sig_kde)
    log_dens2 = kde2.score_samples(xsig_kde)
    kde3 = KernelDensity(kernel='gaussian', bandwidth=bw3).fit(briz_sig_kde)
    log_dens3 = kde3.score_samples(xsig_kde)

    # Plot KDEs
    ax2.plot(xsig,
             np.exp(log_dens1),
             color='k',
             lw=2.5,
             label='UV-Optical-NIR-MIR',
             zorder=2)
    ax2.plot(xsig,
             np.exp(log_dens2),
             color='mediumblue',
             lw=1.4,
             label='ubriz',
             zorder=1)
    ax2.plot(xsig,
             np.exp(log_dens3),
             color='darkturquoise',
             lw=1.4,
             label='briz',
             zorder=1)

    plt.show()

    return None
Exemplo n.º 19
0
def cross_validate(cfg, featdata, cv_file=None):
    """
    Perform cross validation
    """
    # Init a classifier
    selected_classifier = cfg.CLASSIFIER['selected']
    if selected_classifier == 'GB':
        cls = GradientBoostingClassifier(
            loss='deviance',
            learning_rate=cfg.CLASSIFIER['GB']['learning_rate'],
            presort='auto',
            n_estimators=cfg.CLASSIFIER['GB']['trees'],
            subsample=1.0,
            max_depth=cfg.CLASSIFIER['GB']['depth'],
            random_state=cfg.CLASSIFIER['GB']['seed'],
            max_features='sqrt',
            verbose=0,
            warm_start=False)
    elif selected_classifier == 'XGB':
        # note: loss, presort, max_features, verbose and warm_start are
        # GradientBoosting options, not XGBClassifier parameters, so they
        # are omitted
        cls = XGBClassifier(
            learning_rate=cfg.CLASSIFIER['XGB']['learning_rate'],
            n_estimators=cfg.CLASSIFIER['XGB']['trees'],
            subsample=1.0,
            max_depth=cfg.CLASSIFIER['XGB']['depth'],
            random_state=cfg.CLASSIFIER['XGB']['seed'])
    elif selected_classifier == 'RF':
        cls = RandomForestClassifier(
            n_estimators=cfg.CLASSIFIER['RF']['trees'],
            max_features='auto',
            max_depth=cfg.CLASSIFIER['RF']['depth'],
            n_jobs=cfg.N_JOBS,
            random_state=cfg.CLASSIFIER['RF']['seed'],
            oob_score=False,
            class_weight='balanced_subsample')
    elif selected_classifier == 'LDA':
        cls = LDA()
    elif selected_classifier == 'rLDA':
        cls = rLDA(cfg.CLASSIFIER['rLDA']['r_coeff'])
    else:
        logger.error('Unknown classifier type %s' % selected_classifier)
        raise ValueError

    # Setup features
    X_data = featdata['X_data']
    Y_data = featdata['Y_data']
    wlen = featdata['wlen']

    # Choose CV type
    ntrials, nsamples, fsize = X_data.shape
    selected_cv = cfg.CV_PERFORM['selected']
    if selected_cv == 'LeaveOneOut':
        logger.info_green('%d-fold leave-one-out cross-validation' % ntrials)
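        # scikit-learn < 0.18 required the number of samples at construction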
        if SKLEARN_OLD:
            cv = LeaveOneOut(len(Y_data))
        else:
            cv = LeaveOneOut()
    elif selected_cv == 'StratifiedShuffleSplit':
        logger.info_green(
            '%d-fold stratified cross-validation with test set ratio %.2f' %
            (cfg.CV_PERFORM[selected_cv]['folds'],
             cfg.CV_PERFORM[selected_cv]['test_ratio']))
        if SKLEARN_OLD:
            cv = StratifiedShuffleSplit(
                Y_data[:, 0],
                cfg.CV_PERFORM[selected_cv]['folds'],
                test_size=cfg.CV_PERFORM[selected_cv]['test_ratio'],
                random_state=cfg.CV_PERFORM[selected_cv]['seed'])
        else:
            cv = StratifiedShuffleSplit(
                n_splits=cfg.CV_PERFORM[selected_cv]['folds'],
                test_size=cfg.CV_PERFORM[selected_cv]['test_ratio'],
                random_state=cfg.CV_PERFORM[selected_cv]['seed'])
    else:
        logger.error('%s is not supported yet. Sorry.' % selected_cv)
        raise NotImplementedError
    logger.info('%d trials, %d samples per trial, %d feature dimension' %
                (ntrials, nsamples, fsize))

    # Do it!
    timer_cv = qc.Timer()
    scores, cm_txt = crossval_epochs(cv,
                                     X_data,
                                     Y_data,
                                     cls,
                                     cfg.tdef.by_value,
                                     cfg.CV['BALANCE_SAMPLES'],
                                     n_jobs=cfg.N_JOBS,
                                     ignore_thres=cfg.CV['IGNORE_THRES'],
                                     decision_thres=cfg.CV['DECISION_THRES'])
    t_cv = timer_cv.sec()

    # Export results
    txt = 'Cross validation took %d seconds.\n' % t_cv
    txt += '\n- Class information\n'
    txt += '%d epochs, %d samples per epoch, %d feature dimension (total %d samples)\n' %\
        (ntrials, nsamples, fsize, ntrials * nsamples)
    for ev in np.unique(Y_data):
        txt += '%s: %d trials\n' % (cfg.tdef.by_value[ev],
                                    len(np.where(Y_data[:, 0] == ev)[0]))
    if cfg.CV['BALANCE_SAMPLES']:
        txt += 'The number of samples was balanced using %ssampling.\n' % \
            cfg.CV['BALANCE_SAMPLES'].lower()
    txt += '\n- Experiment condition\n'
    txt += 'Sampling frequency: %.3f Hz\n' % featdata['sfreq']
    txt += 'Spatial filter: %s (channels: %s)\n' % (cfg.SP_FILTER,
                                                    cfg.SP_CHANNELS)
    txt += 'Spectral filter: %s\n' % cfg.TP_FILTER[cfg.TP_FILTER['selected']]
    txt += 'Notch filter: %s\n' % cfg.NOTCH_FILTER[
        cfg.NOTCH_FILTER['selected']]
    txt += 'Channels: ' + ','.join(
        [str(featdata['ch_names'][p]) for p in featdata['picks']]) + '\n'
    txt += 'PSD range: %.1f - %.1f Hz\n' % (cfg.FEATURES['PSD']['fmin'],
                                            cfg.FEATURES['PSD']['fmax'])
    txt += 'Window step: %.2f msec\n' % (
        1000.0 * cfg.FEATURES['PSD']['wstep'] / featdata['sfreq'])
    if type(wlen) is list:
        for i, w in enumerate(wlen):
            txt += 'Window size: %.1f msec\n' % (w * 1000.0)
            txt += 'Epoch range: %s sec\n' % (cfg.EPOCH[i])
    else:
        txt += 'Window size: %.1f msec\n' % (cfg.FEATURES['PSD']['wlen'] *
                                             1000.0)
        txt += 'Epoch range: %s sec\n' % (cfg.EPOCH)
    txt += 'Decimation factor: %d\n' % cfg.FEATURES['PSD']['decim']

    # Compute stats
    cv_mean, cv_std = np.mean(scores), np.std(scores)
    txt += '\n- Average CV accuracy over %d epochs (random seed=%s)\n' % (
        ntrials, cfg.CV_PERFORM[selected_cv]['seed'])
    if selected_cv in ['LeaveOneOut', 'StratifiedShuffleSplit']:
        txt += "mean %.3f, std: %.3f\n" % (cv_mean, cv_std)
    txt += 'Classifier: %s, ' % selected_classifier
    if selected_classifier == 'RF':
        txt += '%d trees, %s max depth, random state %s\n' % (
            cfg.CLASSIFIER['RF']['trees'], cfg.CLASSIFIER['RF']['depth'],
            cfg.CLASSIFIER['RF']['seed'])
    elif selected_classifier == 'GB' or selected_classifier == 'XGB':
        txt += '%d trees, %s max depth, %s learning_rate, random state %s\n' % (
            cfg.CLASSIFIER[selected_classifier]['trees'],
            cfg.CLASSIFIER[selected_classifier]['depth'],
            cfg.CLASSIFIER[selected_classifier]['learning_rate'],
            cfg.CLASSIFIER[selected_classifier]['seed'])
    elif selected_classifier == 'rLDA':
        txt += 'regularization coefficient %.2f\n' % cfg.CLASSIFIER['rLDA'][
            'r_coeff']
    if cfg.CV['IGNORE_THRES'] is not None:
        txt += 'Ignore threshold: %.2f\n' % cfg.CV['IGNORE_THRES']
    txt += '\n- Confusion Matrix\n' + cm_txt
    logger.info(txt)

    # Export to a file
    if 'export_result' in cfg.CV_PERFORM[selected_cv] and cfg.CV_PERFORM[
            selected_cv]['export_result'] is True:
        if cv_file is None:
            if cfg.EXPORT_CLS is True:
                qc.make_dirs('%s/classifier' % cfg.DATA_PATH)
                cv_file = '%s/classifier/cv_result.txt' % cfg.DATA_PATH
            else:
                cv_file = '%s/cv_result.txt' % cfg.DATA_PATH
        with open(cv_file, 'w') as fout:
            fout.write(txt)
Exemplo n.º 20
0
"""
@author: likhith
"""

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import LeaveOneOut

iris = datasets.load_iris()
X_new = iris.data[50:, 2:4]  #Not Considering setosa
#X
# min-max scale each feature to [0, 1]
X = (X_new - np.min(X_new, axis=0)) / (np.max(X_new, axis=0) -
                                       np.min(X_new, axis=0))
splitS = LeaveOneOut()  # leave-one-out cross-validation
X1 = splitS.get_n_splits(X)  # number of splits equals the number of samples
y = iris.target[50:]
Y = []


def mismatchcal(j, y):
    # threshold the predicted score at 0.5 and compare with the true label;
    # assumed completion of the truncated original: count a mismatch when
    # the thresholded prediction differs from the label
    mismatch = 0
    if j > 0.5:
        p = 1
    else:
        p = 0

    if y != p:
        mismatch += 1
    return mismatch
Exemplo n.º 21
0
def main():
    parser = ArgumentParser()
    parser.add_argument('metatable',
                        type=str,
                        default='',
                        help='Get training set labels')
    parser.add_argument('--featurefile',
                        default='./products/feat.npz',
                        type=str,
                        help='Feature file')
    parser.add_argument('--outdir',
                        type=str,
                        default='./products/',
                        help='Path in which to save the LC data (single file)')
    parser.add_argument('--train',
                        action='store_true',
                        help='Train classification model')
    parser.add_argument('--savemodel',
                        action='store_true',
                        help='Save output model, training on full set')
    parser.add_argument('--add-random',
                        dest='add_random',
                        action='store_true',
                        help='Add random number as feature (for testing)')
    parser.add_argument('--calc-importance',
                        dest='calc_importance',
                        action='store_true',
                        help='Calculate feature importance')
    parser.add_argument('--only-raenn',
                        dest='only_raenn',
                        action='store_true',
                        help='Use only RAENN features')
    parser.add_argument('--not-raenn',
                        dest='not_raenn',
                        action='store_true',
                        help='Exclude RAENN features')
    parser.add_argument('--no-int',
                        dest='no_int',
                        action='store_true',
                        help='Exclude integral features (for testing)')
    parser.add_argument(
        '--resampling',
        dest='resampling',
        type=str,
        default='KDE',
        help='Resampling methods. Either KDE or Gauss available')
    parser.add_argument('--modelfile',
                        dest='modelfile',
                        type=str,
                        default='model',
                        help='Name of model file to save')
    parser.add_argument('--randomseed',
                        type=int,
                        default=42,
                        help='Random seed for the classifier')
    parser.add_argument('--outfile',
                        dest='outfile',
                        type=str,
                        default='superprob',
                        help='Name of probability table file')
    args = parser.parse_args()

    sn_dict = {'SLSN': 0, 'SNII': 1, 'SNIIn': 2, 'SNIa': 3, 'SNIbc': 4}

    if args.train:
        X, y, names, means, stds, feature_names = prep_data_for_training(
            args.featurefile, args.metatable)
        names = np.asarray(names, dtype=str)
        if args.only_raenn:
            gind = [
                i for i, feat in enumerate(feature_names) if 'raenn' in feat
            ]
            X = X[:, gind]
            feature_names = feature_names[gind]

        if args.not_raenn:
            gind = [
                i for i, feat in enumerate(feature_names)
                if 'raenn' not in feat
            ]
            X = X[:, gind]
            feature_names = feature_names[gind]

        if args.no_int:
            gind = [
                i for i, feat in enumerate(feature_names) if 'int' not in feat
            ]
            X = X[:, gind]
            feature_names = feature_names[gind]

        if args.add_random:
            feature_names = np.append(feature_names, 'random')

        if not args.savemodel:
            loo = LeaveOneOut()
            y_pred = np.zeros(len(y))
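            # leave-one-out over the full training set: each object is
            # classified by a forest trained on resampled versions of all
            # the others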
            for train_index, test_index in loo.split(X):

                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                if args.resampling == 'Gauss':
                    X_res, y_res = Gauss_resample(X_train, y_train, 500)
                else:
                    X_res, y_res = KDE_resample(X_train, y_train, 500)

                new_ind = np.arange(len(y_res), dtype=int)
                np.random.shuffle(new_ind)
                X_res = X_res[new_ind]
                y_res = y_res[new_ind]

                if args.calc_importance:
                    X_res2, y_res2 = Gauss_resample(X_train, y_train, 500)
                    X_res2 = X_res2[:-40, :]
                    y_res2 = y_res2[:-40]

                if args.add_random:
                    X_res2, y_res2 = Gauss_resample(X_train, y_train, 500)
                    X_res2 = X_res2[:-40, :]
                    y_res2 = y_res2[:-40]
                    X_res = np.vstack((X_res.T, np.random.randn(len(X_res)))).T
                    X_res2 = np.vstack(
                        (X_res2.T, np.random.randn(len(X_res2)))).T
                    X_test = np.vstack(
                        (X_test.T, np.random.randn(len(X_test)))).T

                clf = RandomForestClassifier(n_estimators=400,
                                             max_depth=None,
                                             random_state=args.randomseed,
                                             criterion='gini',
                                             class_weight='balanced',
                                             max_features=None,
                                             oob_score=False)
                clf.fit(X_res, y_res)
                print(clf.predict_proba(X_test), y_test, names[test_index])

                if args.calc_importance:
                    feature_names = np.asarray(feature_names, dtype=str)
                    importances = clf.feature_importances_
                    indices = importances.argsort()[::-1]

                    print("Feature ranking:")

                    for f in range(X_res.shape[1]):
                        print(feature_names[indices[f]],
                              importances[indices[f]])

                    plt.ylabel("Feature importances")
                    plt.bar(range(X_res.shape[1]),
                            importances[indices],
                            color="grey",
                            align="center")
                    plt.xticks(np.arange(len(importances)) + 0.5,
                               feature_names[indices],
                               rotation=45,
                               ha='right')
                    plt.show()
                y_pred[test_index] = np.argmax(clf.predict_proba(X_test))
            cnf_matrix = confusion_matrix(y, y_pred)
            print(cnf_matrix)
        if args.savemodel:
            if args.resampling == 'Gauss':
                X_res, y_res = Gauss_resample(X, y, 500)
            else:
                X_res, y_res = KDE_resample(X, y, 500)

            new_ind = np.arange(len(y_res), dtype=int)
            np.random.shuffle(new_ind)
            X_res = X_res[new_ind]
            y_res = y_res[new_ind]

            clf = RandomForestClassifier(n_estimators=350,
                                         max_depth=None,
                                         random_state=args.randomseed,
                                         criterion='gini',
                                         class_weight='balanced',
                                         max_features=None,
                                         oob_score=False)
            clf.fit(X_res, y_res)

            # save the model to disk
            if not os.path.exists(args.outdir):
                os.makedirs(args.outdir)
            if args.outdir[-1] != '/':
                args.outdir += '/'
            pickle.dump([clf, means, stds],
                        open(
                            args.outdir + args.modelfile + '_' + date + '.sav',
                            'wb'))
            pickle.dump([clf, means, stds],
                        open(args.outdir + args.modelfile + '.sav', 'wb'))

    else:
        info = pickle.load(open(args.modelfile, 'rb'))
        loaded_model = info[0]
        means = info[1]
        stds = info[2]
        X, names, means, stds, feature_names = prep_data_for_classifying(
            args.featurefile, means, stds)
        names = np.asarray(names, dtype=str)
        probabilities = np.zeros((len(names), len(sn_dict)))
        for i, name in enumerate(names):
            probabilities[i] = loaded_model.predict_proba([X[i]])[0]
        probability_table = QTable(np.vstack((names, probabilities.T)).T,
                                   names=['Event Name', *sn_dict],
                                   meta={'name': 'SuperRAENN probabilities'})

        # save the model to disk
        if not os.path.exists(args.outdir):
            os.makedirs(args.outdir)
        if args.outdir[-1] != '/':
            args.outdir += '/'
        ascii.write(probability_table,
                    args.outdir + args.outfile + '.tex',
                    format='latex',
                    overwrite=True)
Exemplo n.º 22
0
y_train_drug = train_drug["param_3"].values
y_test_drug = test_drug["param_3"].values

print("Coefficient_3 ....")

print("Sigmoid KernelRidge")

param_tested_alphas = [1, 10, 50, 100, 500]
param_tested_gamma = [0.00001, 0.0001, 0.01, 0.1]
param_tested_coef0 = [0.01, 0.1, 0.5, 1]

param_grid = dict(alpha=param_tested_alphas,
                  gamma=param_tested_gamma,
                  coef0=param_tested_coef0)

splitter_loo = LeaveOneOut()
grid = GridSearchCV(KernelRidge(kernel="sigmoid"),
                    param_grid=param_grid,
                    cv=splitter_loo,
                    scoring="neg_mean_absolute_error")

results = pd.DataFrame()
results["COSMIC_ID"] = test_df_50["COSMIC_ID"]

grid.fit(Xtrain_drug, y_train_drug)

# Pick the best parameters, retrain, and predict on the test data
model = KernelRidge(kernel="sigmoid",
                    alpha=grid.best_params_["alpha"],
                    gamma=grid.best_params_["gamma"],
                    coef0=grid.best_params_["coef0"])
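# note: with GridSearchCV's default refit=True, grid.best_estimator_ already
# holds this same configuration refit on the full training set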
Exemplo n.º 23
0
    assert tokenize(cls(n_splits=3, random_state=0)) == tokenize(
        cls(n_splits=3, random_state=0))

    assert tokenize(cls(n_splits=3, random_state=0)) != tokenize(
        cls(n_splits=3, random_state=2))

    assert tokenize(cls(n_splits=3, random_state=0)) != tokenize(
        cls(n_splits=4, random_state=0))

    cv = cls(n_splits=3)
    assert compute_n_splits(cv, np_X, np_y, np_groups) == 3

    with assert_dask_compute(False):
        assert compute_n_splits(cv, da_X, da_y, da_groups) == 3


@pytest.mark.parametrize("cvs", [(LeaveOneOut(), ),
                                 (LeavePOut(2), LeavePOut(3))])
def test_leave_out(cvs):
    tokens = []
    for cv in cvs:
        assert tokenize(cv) == tokenize(cv)
        tokens.append(tokenize(cv))
    assert len(set(tokens)) == len(tokens)

    cv = cvs[0]
    sol = cv.get_n_splits(np_X, np_y, np_groups)
    assert compute_n_splits(cv, np_X, np_y, np_groups) == sol

    with assert_dask_compute(True):
        assert compute_n_splits(cv, da_X, da_y, da_groups) == sol
Exemplo n.º 24
0
    def fit(self, X, y):
        """Fit the model using X as training data and y as target values.

        Parameters
        ----------
        X : sktime-format pandas dataframe with shape([n_cases,n_dimensions]),
        or numpy ndarray with shape([n_cases,n_readings,n_dimensions])

        y : {array-like, sparse matrix}
            Target values of shape = [n_samples]
        """
        X, y = check_X_y(
            X,
            y,
            enforce_univariate=not self.capabilities["multivariate"],
            coerce_to_numpy=True,
        )
        # Transpose to work correctly with distance functions
        X = X.transpose((0, 2, 1))

        y = np.asarray(y)
        check_classification_targets(y)
        # if internal cv is desired, the relevant flag forces a grid search
        # to evaluate the possible values,
        # find the best, and then set this classifier's params to match
        if self._cv_for_params:
            grid = GridSearchCV(
                estimator=KNeighborsTimeSeriesClassifier(distance=self.metric,
                                                         n_neighbors=1),
                param_grid=self._param_matrix,
                cv=LeaveOneOut(),
                scoring="accuracy",
            )
            grid.fit(X, y)
            self.distance_params = grid.best_params_["distance_params"]

        if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1:
            if y.ndim != 1:
                warnings.warn(
                    "IN TS-KNN: A column-vector y was passed when a 1d array "
                    "was expected. Please change the shape of y to "
                    "(n_samples, ), for example using ravel().",
                    DataConversionWarning,
                    stacklevel=2,
                )

            self.outputs_2d_ = False
            y = y.reshape((-1, 1))
        else:
            self.outputs_2d_ = True

        self.classes_ = []
        self._y = np.empty(y.shape, dtype=int)
        for k in range(self._y.shape[1]):
            classes, self._y[:, k] = np.unique(y[:, k], return_inverse=True)
            self.classes_.append(classes)

        if not self.outputs_2d_:
            self.classes_ = self.classes_[0]
            self._y = self._y.ravel()

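        # temporarily swap in a time-series-aware check_array so the parent
        # k-NN _fit accepts 3-D input; the original code is restored below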
        if hasattr(check_array, "__wrapped__"):
            temp = check_array.__wrapped__.__code__
            check_array.__wrapped__.__code__ = _check_array_ts.__code__
        else:
            temp = check_array.__code__
            check_array.__code__ = _check_array_ts.__code__
        # this is not fx = self._fit(X, self._y) in order to maintain backward
        # compatibility with scikit-learn 0.23, where _fit does not take an arg y
        fx = self._fit(X)

        if hasattr(check_array, "__wrapped__"):
            check_array.__wrapped__.__code__ = temp
        else:
            check_array.__code__ = temp

        self._is_fitted = True
        return fx
Exemplo n.º 25
0
model.fit(X1, y1)

y2_model = model.predict(X2)
accuracy_score(y2, y2_model)

# %%
y2_model = model.fit(X1, y1).predict(X2)
y1_model = model.fit(X2, y2).predict(X1)
accuracy_score(y1, y1_model), accuracy_score(y2, y2_model)

# %%
cross_val_score(model, X, y, cv=5)

# %%
scores = cross_val_score(model, X, y, cv=LeaveOneOut())
scores

# %%
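# averaging the per-sample 0/1 scores gives the leave-one-out accuracy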
scores.mean()

# %%


def PolynomialRegression(degree=2, **kwargs):
    return make_pipeline(PolynomialFeatures(degree),
                         LinearRegression(**kwargs))


# %%
def make_data(N, err=1.0, rseed=1):
Exemplo n.º 26
0
# kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)
# for index, (train, test) in enumerate(kfold.split(x_data_all, y_data_all)):
#     x_train = data_util.scale(x_data_all.iloc[train])
#     x_test = data_util.scale(x_data_all.iloc[test])
#     y_train = y_data_all.iloc[train]
#     model, history= ann(x_train, y_train)
#     loss, acc = model.evaluate(x_test, to_categorical(y_data_all.iloc[test]))
#     y_pred = model.predict(x_test)
#     predict_result_hold = id_all.iloc[test]
#     predict_result_hold['label'] = y_data_all.iloc[test]
#     predict_result_hold['0'] = y_pred[:, 0]
#     predict_result_hold['1'] = y_pred[:, 1]
#     predict_result_hold.to_csv(cdu.get_save_path(fName+'_'+str(index)+'.csv'), sep=',', encoding='utf-8')
#     print(acc, loss)


# leave-one-out
lst = []
scaled_data = data_util.scale(x_data_all)
x_data_all = pd.DataFrame(scaled_data, index=x_data_all.index, columns=x_data_all.columns)
for train, test in LeaveOneOut().split(x_data_all):
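    # train a fresh network on all-but-one sample and record the softmax
    # output for the single held-out sample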
    y_train = y_data_all.iloc[train]
    model, history = ann(x_data_all.iloc[train], y_train)
    loss, acc = model.evaluate(x_data_all.iloc[test], to_categorical(y_data_all.iloc[test], 2))
    y_pred = model.predict(x_data_all.iloc[test])
    one_result = y_pred[0]
    lst.append([id_all.iloc[test].values[0][0], y_data_all.iloc[test].values[0][0], one_result[0], one_result[1]])
predict_result = pd.DataFrame(lst, columns=['id', 'label', '0', '1'])
predict_result.to_csv(cdu.get_save_path(fName+'.csv'), sep=',', encoding='utf-8')

print('done')
Exemplo n.º 27
0
            output_train = "{}({}: {}) ".format(output_train, i, data[i])

        for i in test:
            bar[i] = "T"
            output_test = "{}({}: {}) ".format(output_test, i, data[i])

        print("[ {} ]".format(" ".join(bar)))
        print("Train: {}".format(output_train))
        print("Test:  {}\n".format(output_test))


P_VAL = 2

data = numpy.array([[1, 2], [3, 4], [5, 6], [7, 8]])

loocv = LeaveOneOut()
lpocv = LeavePOut(p=P_VAL)
split_loocv = loocv.split(data)
split_lpocv = lpocv.split(data)

print("Data:\n{}\n".format(data))
print("Leave-One-Out:\n")
print_result(split_loocv)
print("Leave-P-Out (where p = {}):\n".format(P_VAL))
print_result(split_lpocv)
'''
Data:
[[1 2]
 [3 4]
 [5 6]
 [7 8]]
Exemplo n.º 28
0
def retrain(hololist=r'trainedmodels/turbulencetraining.xlsx',
            validate=True,
            save=True):
    # retrain SVM model with files in hololist spreadsheet
    # first column has paths of holograms with  "no turbulence" and
    # second column has "turbulence"

    # read training data spreadsheet
    book = xlrd.open_workbook(hololist)
    sheet = book.sheet_by_index(0)
    noturb = sheet.col_values(1)[1:]
    Cnoturb = [0] * len(noturb)
    turb = sheet.col_values(2)[1:]
    Cturb = [1] * len(turb)
    files = noturb + turb
    classes = Cnoturb + Cturb

    # extract features from each hologram  with VGG19
    features = np.zeros((len(files), 1000))
    for i, file in enumerate(files):
        imgpath = holopath(file)  # generate path of hologram image file
        img = image.load_img(imgpath,
                             target_size=(224,
                                          224))  # load hologram image file
        x = image.img_to_array(img)
        x = np.expand_dims(x, axis=0)
        x = preprocess_input(x)
        t = time.time()
        features[i, :] = featuremodel.predict(x)

    # define X and y outside the validation block: the final fit below
    # uses them even when validate is False
    X = features.copy()
    y = np.array(classes)

    # perform validation
    if validate:
        loo = LeaveOneOut()  # create leave-one-out validation
        n = 0
        results = np.zeros_like(y)
        yp = np.zeros_like(y)
        # iterate for every hologram in training data
        for train_index, test_index in loo.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            pline = make_pipeline(StandardScaler(), SVC(gamma='auto'))
            pline.fit(X_train, y_train)
            t = time.time()
            y_testp = pline.predict(X_test)
            yp[n] = y_testp
            if (y_testp == y_test):
                results[n] = 1
            n += 1

        # calculate and display performance metrics
        precision = sum(yp[y == 1]) / sum(yp == 1)
        recall = sum(yp[y == 1]) / sum(y == 1)
        accuracy = sum(y == yp) / len(y)
        print('Precision: ' + str(precision))
        print('Recall: ' + str(recall))
        print('Accuracy: ' + str(accuracy))

    # fit SVM with all data
    pline = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    pline.fit(X, y)

    # save SVM for later use
    if save:
        pickle.dump(pline, open(r'trainedmodels\turbsvm.p', 'wb'))
        turbdetector = pline
Exemplo n.º 29
0
def balance_tpr(cfg, featdata):
    """
    Find the threshold of class index 0 that yields equal number of true positive samples of each class.
    Currently only available for binary classes.

    Params
    ======
    cfg: config module
    featdata: feature data computed using compute_features()
    """

    n_jobs = cfg.N_JOBS
    if n_jobs is None:
        n_jobs = mp.cpu_count()
    if n_jobs > 1:
        logger.info('balance_tpr(): Using %d cores' % n_jobs)
        pool = mp.Pool(n_jobs)
        results = []

    # Init a classifier
    selected_classifier = cfg.CLASSIFIER['selected']
    if selected_classifier == 'GB':
        cls = GradientBoostingClassifier(loss='deviance', learning_rate=cfg.CLASSIFIER['GB']['learning_rate'],
                                         n_estimators=cfg.CLASSIFIER['GB']['trees'], subsample=1.0, max_depth=cfg.CLASSIFIER['GB']['depth'],
                                         random_state=cfg.CLASSIFIER[selected_classifier]['seed'], max_features='sqrt', verbose=0, warm_start=False,
                                         presort='auto')
    elif selected_classifier == 'XGB':
        # note: loss, presort, max_features and warm_start are GradientBoosting
        # options, not XGBClassifier parameters, so they are omitted
        cls = XGBClassifier(learning_rate=cfg.CLASSIFIER['XGB']['learning_rate'],
                            n_estimators=cfg.CLASSIFIER['XGB']['trees'], subsample=1.0,
                            max_depth=cfg.CLASSIFIER['XGB']['depth'],
                            random_state=cfg.CLASSIFIER['XGB']['seed'])
    elif selected_classifier == 'RF':
        cls = RandomForestClassifier(n_estimators=cfg.CLASSIFIER['RF']['trees'], max_features='auto',
                                     max_depth=cfg.CLASSIFIER['RF']['depth'], n_jobs=cfg.N_JOBS, random_state=cfg.CLASSIFIER['RF']['seed'],
                                     oob_score=False, class_weight='balanced_subsample')

    elif selected_classifier == 'NN':
        cls = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(1,1), random_state=666)    
    elif selected_classifier == 'Ada':
        cls = AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
        learning_rate=1.0, n_estimators=100, random_state=0)

    elif selected_classifier == 'LDA':
        cls = LDA()
    elif selected_classifier == 'rLDA':
        cls = rLDA(cfg.CLASSIFIER['rLDA']['r_coeff'])
    else:
        logger.error('Unknown classifier type %s' % selected_classifier)
        raise ValueError

    # Setup features
    X_data = featdata['X_data']
    Y_data = featdata['Y_data']
    wlen = featdata['wlen']
    if cfg.CLASSIFIER['PSD']['wlen'] is None:
        cfg.CLASSIFIER['PSD']['wlen'] = wlen

    # Choose CV type
    ntrials, nsamples, fsize = X_data.shape
    selected_CV = cfg.CV_PERFORM['selected']
    if selected_CV == 'LeaveOneOut':
        logger.info_green('\n%d-fold leave-one-out cross-validation' % ntrials)
        if SKLEARN_OLD:
            cv = LeaveOneOut(len(Y_data))
        else:
            cv = LeaveOneOut()
    elif selected_CV == 'StratifiedShuffleSplit':
        logger.info_green('\n%d-fold stratified cross-validation with test set ratio %.2f' % (cfg.CV_PERFORM[selected_CV]['folds'], cfg.CV_PERFORM[selected_CV]['test_ratio']))
        if SKLEARN_OLD:
            cv = StratifiedShuffleSplit(Y_data[:, 0], cfg.CV_PERFORM[selected_CV]['folds'], test_size=cfg.CV_PERFORM[selected_CV]['test_ratio'], random_state=cfg.CV_PERFORM[selected_CV]['random_seed'])
        else:
            cv = StratifiedShuffleSplit(n_splits=cfg.CV_PERFORM[selected_CV]['folds'], test_size=cfg.CV_PERFORM[selected_CV]['test_ratio'], random_state=cfg.CV_PERFORM[selected_CV]['random_seed'])
    else:
        logger.error('%s is not supported yet. Sorry.' % selected_CV)
        raise NotImplementedError
    logger.info('%d trials, %d samples per trial, %d feature dimension' % (ntrials, nsamples, fsize))

    # For classifier itself, single core is usually faster
    cls.n_jobs = 1
    Y_preds = []

    if SKLEARN_OLD:
        splits = cv
    else:
        splits = cv.split(X_data, Y_data[:, 0])
    for cnum, (train, test) in enumerate(splits):
        X_train = np.concatenate(X_data[train])
        X_test = np.concatenate(X_data[test])
        Y_train = np.concatenate(Y_data[train])
        Y_test = np.concatenate(Y_data[test])
        if n_jobs > 1:
            results.append(pool.apply_async(get_predict_proba, [cls, X_train, Y_train, X_test, Y_test, cnum+1]))
        else:
            Y_preds.append(get_predict_proba(cls, X_train, Y_train, X_test, Y_test, cnum+1))

    # Aggregate predictions
    if n_jobs > 1:
        pool.close()
        pool.join()
        for r in results:
            Y_preds.append(r.get())
    Y_preds = np.concatenate(Y_preds, axis=0)

    # Find threshold for class index 0
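    # the threshold is the median of the out-of-fold class-0 probabilities,
    # which splits the predictions evenly between the two classes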
    Y_preds = sorted(Y_preds)
    mid_idx = int(len(Y_preds) / 2)
    if len(Y_preds) == 1:
        return 0.5 # should not reach here in normal conditions
    elif len(Y_preds) % 2 == 0:
        thres = Y_preds[mid_idx-1] + (Y_preds[mid_idx] - Y_preds[mid_idx-1]) / 2
    else:
        thres = Y_preds[mid_idx]
    return thres
Exemplo n.º 30
0
    def __init__(self):
        super(CrossValidationLeaveOneOut, self).__init__()
        self.__cv = LeaveOneOut()