def generateLearningCurve(X, y, degree, regLambda):
    """ computing learning curve via leave one out CV """
    n = len(X)

    errorTrains = np.zeros((n, n - 1))
    errorTests = np.zeros((n, n - 1))

    loo = model_selection.LeaveOneOut()
    itrial = 0
    for train_index, test_index in loo.split(X):
        # print("TRAIN indices:", train_index, "TEST indices:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        (errTrain, errTest) = learningCurve(X_train, y_train, X_test, y_test, regLambda, degree)
        errorTrains[itrial, :] = errTrain
        errorTests[itrial, :] = errTest
        itrial = itrial + 1

    errorTrain = errorTrains.mean(axis=0)
    errorTest = errorTests.mean(axis=0)

    plotLearningCurve(errorTrain, errorTest, regLambda, degree)
def leave_out_example():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 2, 1, 2])
    groups = np.array([0, 0, 2, 2])

    if False:
        lo = model_selection.LeavePOut(p=2)
        print('#splits =', lo.get_n_splits(X))
    elif False:
        # The same group will not appear in two different folds.
        # The number of distinct groups has to be at least equal to the number of folds.
        lo = model_selection.LeaveOneGroupOut()
        #print('#splits =', lo.get_n_splits(X, y, groups))
        print('#splits =', lo.get_n_splits(groups=groups))
    elif False:
        # Every combination of p groups is left out as the test set.
        # LeaveOneGroupOut takes no n_groups argument; LeavePGroupsOut is the splitter that does.
        lo = model_selection.LeavePGroupsOut(n_groups=2)
        #print('#splits =', lo.get_n_splits(X, y, groups))
        print('#splits =', lo.get_n_splits(groups=groups))
    else:
        lo = model_selection.LeaveOneOut()
        print('#splits =', lo.get_n_splits(X))

    print('Leave-out:', lo)
    #for train_indices, test_indices in lo.split(X, y, groups):
    for train_indices, test_indices in lo.split(X):
        #print('TRAIN:', train_indices.shape, 'TEST:', test_indices.shape)
        print('TRAIN:', train_indices, 'TEST:', test_indices)
        X_train, X_test = X[train_indices], X[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]
def test_split(self):
    X = np.array([1, 2, 3, 4])
    fold1 = model_selection.LeaveOneOut().split(X)
    fold2 = sklearn_model_selection.LeaveOneOut().split(X)
    self.assertFoldEqual(fold1, fold2)
def caseLOO(X, Y, para):
    para = 0
    loo = skmdls.LeaveOneOut()
    N = label_all.shape[1]
    mdl = loo
    X_train, X_test, y_train, y_test = train_test_constructor(N, mdl, X, Y)
    return X_train, X_test, y_train, y_test, N
def cv_LinearRegression_Bias(xM, yV):
    """
    N_it iterations are performed for cross-validation in order to average the results further.
    The 'disp' flag is turned off so each iteration is not shown.
    """
    #print("cv_LinearRegression_None", xM.shape, yV.shape)
    X, y = np.array(xM)[:, 0], np.array(yV)[:, 0]

    # only 1-dim is allowed for both X and y
    assert (X.ndim == 1) or (X.shape[1] == 1) and (yV.ndim == 1) or (yV.shape[1] == 1)

    loo_c = model_selection.LeaveOneOut()
    loo = loo_c.split(X)

    yP = y.copy()
    for train, test in loo:
        bias = np.mean(y[train] - X[train])
        yP[test] = X[test] + bias

    cv_score_le = np.abs(np.array(y - yP)).tolist()

    o_d = {'median_abs_err': np.median(cv_score_le),
           'mean_abs_err': np.mean(cv_score_le),
           'std_abs_err': np.std(cv_score_le),  # this can be std(err)
           'list': cv_score_le,
           'ci': "t.b.d",
           'yVp': X.tolist()}

    return o_d
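# Usage sketch (my assumption, not part of the original source): cv_LinearRegression_Bias
# expects xM and yV as single-column arrays and reports leave-one-out absolute errors of a
# bias-only correction. The toy values below are made up, and the imports mirror what the
# function itself appears to rely on.
import numpy as np
from sklearn import model_selection

xM_demo = np.array([[1.0], [2.0], [3.0], [4.0]])
yV_demo = np.array([[1.2], [2.1], [3.3], [3.9]])

o_d = cv_LinearRegression_Bias(xM_demo, yV_demo)
print("median |err|:", o_d['median_abs_err'], "mean |err|:", o_d['mean_abs_err'])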
def __init__(self):
    f = open(r'C:\Sagar_Agrawal\Projects\ML\Codes\output\Reports\out.csv', encoding='utf-8')
    # firstrow = f.readline()  # skip the header
    data = np.loadtxt(f, skiprows=1, delimiter=',')

    X = data[:, 1:]  # select columns 1 through end
    y = data[:, 0]   # select column 0, the risk

    imageno = len(y)
    healthy = np.zeros(np.sum(y == 0), int)   # int instead of the removed np.int
    diseased = np.zeros(np.sum(y != 0), int)  # count must match the y[i] != 0 branch below
    j = 0
    k = 0
    for i in range(0, imageno, 1):
        if y[i] == 0:
            healthy[j] = i
            j += 1
        else:
            diseased[k] = i
            k += 1

    loo = model_selection.LeaveOneOut()
    self.classify(X, y, healthy, diseased, loo)
def split(self, X, y=None):
    folds = []
    loo = model_selection.LeaveOneOut()
    for train, test in loo.split(X):
        folds.append((train, test))
    return folds
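# Usage sketch (my assumption, not part of the original snippet): scikit-learn accepts any
# iterable of (train, test) index arrays as `cv`, so a pre-materialized list of LeaveOneOut
# folds like the one returned above can be passed to cross_val_score directly. The data and
# the LogisticRegression estimator here are toy placeholders.
import numpy as np
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression

X_demo = np.arange(20, dtype=float).reshape(10, 2)
y_demo = np.array([0, 1] * 5)

folds = list(model_selection.LeaveOneOut().split(X_demo))  # same structure as split() above
scores = model_selection.cross_val_score(LogisticRegression(), X_demo, y_demo, cv=folds)
print("LOO accuracy:", scores.mean())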
def cv_model(nsplits):
    if nsplits <= 0:
        cv = None
    elif nsplits == 1:
        cv = model_selection.LeaveOneOut()
    else:
        cv = model_selection.KFold(n_splits=nsplits)
    return cv
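# Usage sketch (assumption, not from the original source): the object returned by cv_model()
# can be handed to cross_val_score. cv=None falls back to scikit-learn's default 5-fold CV,
# nsplits == 1 requests leave-one-out, and larger values give plain KFold. The Ridge
# estimator and the random data are placeholders; MSE scoring is used because R^2 is not
# defined on single-sample test folds.
import numpy as np
from sklearn import model_selection
from sklearn.linear_model import Ridge

rng = np.random.RandomState(0)
X_demo = rng.rand(12, 3)
y_demo = X_demo @ np.array([1.0, 2.0, 3.0]) + 0.1 * rng.randn(12)

scores = model_selection.cross_val_score(Ridge(), X_demo, y_demo, cv=cv_model(1),
                                         scoring='neg_mean_squared_error')
print("LOO folds:", len(scores), "mean squared error:", -scores.mean())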
def do_leave_one_out_cv(model):
    # Leave one out cross validation
    from sklearn import model_selection
    loocv = model_selection.LeaveOneOut()
    results = model_selection.cross_val_score(model, x_train, y_train, cv=loocv)
    for fold in results:
        print("Accuracy: {:.2%}".format(fold))
def train_and_test(mlp, X, y):
    loo = skselection.LeaveOneOut()
    hits = 0
    for train_index, test_index in loo.split(X):
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]
        mlp.fit(X_train, y_train)
        if mlp.predict(X_test) == y_test:
            hits += 1
    accuracy = hits / X.shape[0] * 100
    return accuracy
def runOneOut():
    num_folds = 5
    num_instances = len(X)
    num_trees = 50
    loocv = model_selection.LeaveOneOut()
    loocv.get_n_splits(X)
    model = SVC()
    results = model_selection.cross_val_score(model, X, Y.ravel(), cv=loocv, n_jobs=-1)
    #print("LogisticRegression Accuracy: %.3f%% (%.3f%%)") % (results.mean() * 100.0, results.std() * 100.0)
    print("Accuracy: %0.2f (+/- %0.2f)" % (results.mean(), results.std() * 2))
    print(results.std())
def leaveOneOutCrossValidationEvaluation(self):
    Model.models[:] = []
    __results = []
    __names = []
    __looCrossValidation = model_selection.LeaveOneOut()
    for name, model in self.__models:
        cv_results = model_selection.cross_val_score(
            model, self.x_train, self.y_train, cv=__looCrossValidation)
        __results.append(cv_results)
        __names.append(name)
        Model.models.append(Model(name, cv_results.mean(), cv_results.std()))
    return Model.getHighestScore()[0]
def tune_PLSR(x, y):
    """ Parameter tuning of PLS regression """
    n_comp_range = range(1, int(maxComp))
    param_grid = dict(n_components=n_comp_range)
    scorer = make_scorer(mean_squared_error, greater_is_better=False)

    # Leave-one-out cross validation
    cv = model_selection.LeaveOneOut()
    cv.get_n_splits(x)

    # grid search
    grid = model_selection.GridSearchCV(PLSRegression(), param_grid=param_grid,
                                        scoring=scorer, cv=cv)
    grid.fit(x, y)
    scores = grid.cv_results_  # grid_scores_ was removed from GridSearchCV; cv_results_ is its replacement
    return grid, scores
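# Usage sketch (my assumption, not part of the original source): tune_PLSR relies on
# module-level imports and a global maxComp; the ones below mirror what it appears to expect,
# and the data is a random placeholder. With cv_results_, the chosen component count comes
# from best_params_ and the per-candidate scores from "mean_test_score".
import numpy as np
from sklearn import model_selection
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import make_scorer, mean_squared_error

maxComp = 5  # assumed global upper bound on n_components

rng = np.random.RandomState(0)
x_demo = rng.rand(15, 8)
y_demo = x_demo[:, 0] + 0.05 * rng.randn(15)

grid, scores = tune_PLSR(x_demo, y_demo)
print("best n_components:", grid.best_params_["n_components"])
print("mean CV score per candidate:", scores["mean_test_score"])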
def cv_detail():
    iris = datasets.load_iris()
    lr = linear_model.LogisticRegression()

    print(model_selection.cross_val_score(lr, iris.data, iris.target,
                                          cv=model_selection.KFold()))
    # [ 0.  0.  0.]
    print(model_selection.cross_val_score(lr, iris.data, iris.target,
                                          cv=model_selection.KFold(n_splits=5)))
    # [ 1.  0.93333333  0.43333333  0.96666667  0.43333333]
    print(model_selection.cross_val_score(lr, iris.data, iris.target,
                                          cv=model_selection.KFold(shuffle=True, random_state=0)))
    # [ 0.9  0.96  0.96]
    print(model_selection.cross_val_score(lr, iris.data, iris.target,
                                          cv=model_selection.KFold(shuffle=True, random_state=0, n_splits=5)))
    # [ 0.96666667  0.9  0.96666667  0.96666667  0.93333333]

    # LeaveOneOut() splits into as many folds as there are data points;
    # it is the method to use when the dataset is small.
    loocv = model_selection.cross_val_score(lr, iris.data, iris.target,
                                            cv=model_selection.LeaveOneOut())
    print(loocv)
    print(len(loocv))

    loocv = model_selection.cross_val_score(lr, iris.data, iris.target,
                                            cv=model_selection.KFold(n_splits=150))
    print(loocv.mean())
def evaluateModel(self, model, features, classes, train_size=0.7):
    XT, XF, YT, YF = model_selection.train_test_split(features, classes, train_size=train_size)
    kf2 = model_selection.KFold(n_splits=5, shuffle=True, random_state=12345)
    # https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation
    # https://chrisalbon.com/machine_learning/model_evaluation/cross_validation_parameter_tuning_grid_search/

    # Splits so that each element lands in the test set exactly once, in order
    kf1 = model_selection.KFold(n_splits=5, shuffle=False)
    # Splits so that each element lands in the test set exactly once, in random order
    kf2 = model_selection.KFold(n_splits=5, shuffle=True, random_state=12345)
    # Splits so that every test set contains roughly the same number of elements of each class
    kf3 = model_selection.StratifiedKFold(n_splits=5, shuffle=False)
    # Splits in random order; elements may repeat across test sets
    kf4 = model_selection.ShuffleSplit(n_splits=10, random_state=12345)
    # Splits in random order; elements may repeat, and test sets contain roughly the same number of elements of each class
    kf5 = model_selection.StratifiedShuffleSplit(n_splits=10, random_state=12345)
    # Creates N test sets, each containing one element in turn
    kf6 = model_selection.LeaveOneOut()

    self.trainModel(model, XT, YT)
    YP = self.predictModel(model, XF)
    acc = metrics.accuracy_score(YF, YP)
    prec = metrics.precision_score(YF, YP)
    rec = metrics.recall_score(YF, YP)
    f1 = metrics.f1_score(YF, YP)
    return f1, prec, rec, acc
def MODEL_CV(cv_type="KFold", n_splits=N_SPLITS, random_state=RANDOM_STATE,
             test_size=TEST_SIZE, scoring=SCORING, shuffle=SHUFFLE):
    if cv_type == "KFold":
        return model_selection.KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
    elif cv_type == "LeaveOneOut":
        return model_selection.LeaveOneOut()
    elif cv_type == "ShuffleSplit":
        return model_selection.ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_state)
    else:
        raise Exception()
def resultValidation(model_classifier, featureValues, type_label, validator=1):
    """
    Estimates the model's prediction accuracy by calling a selected cross validation method;
    uses the K-Fold cross validator by default.
    :param model_classifier: input classifier
    :param featureValues: list of feature values
    :param type_label: list of label results
    :return: a string of the model's prediction accuracy
    """
    if not isinstance(validator, int):
        raise Exception("Non Integer Value entered into result validation function")
    elif type(featureValues) != list or type(type_label) != list:
        raise Exception("Non list values added in feature value/type label parameter")
    elif validator < 0:
        raise Exception("Invalid validator selected")
    elif len(featureValues) < 10 or len(type_label) < 10:
        raise Exception("Number of samples cannot be less than 10")
    else:
        # K-FOLD CROSS VALIDATION
        if (validator == 1):
            kfold = KFold(n_splits=10)  # <- Change split number here
            model_Kfold = model_classifier
            results_Kfold = model_selection.cross_val_score(model_Kfold, featureValues, type_label, cv=kfold)
            return '{0:.2f}'.format(results_Kfold.mean() * 100.0)
        # STRATIFIED K-FOLD CROSS VALIDATION
        elif (validator == 2):
            skfold = StratifiedKFold(n_splits=10)  # <- Change split number here
            model_SKfold = model_classifier
            results_SKfold = model_selection.cross_val_score(model_SKfold, featureValues, type_label, cv=skfold)
            return '{0:.2f}'.format(results_SKfold.mean() * 100.0)
        # LEAVE ONE OUT CROSS VALIDATION (LOOCV)
        elif (validator == 3):
            loocv = model_selection.LeaveOneOut()
            model_loocv = model_classifier
            results_loocv = model_selection.cross_val_score(model_loocv, featureValues, type_label, cv=loocv)
            return '{0:.2f}'.format(results_loocv.mean() * 100.0)
        # REPEATED RANDOM TEST-TRAIN SPLITS
        else:
            rrtt = model_selection.ShuffleSplit(n_splits=10, test_size=0.30, random_state=100)  # <- Change split number and test_size here
            model_shufflecv = model_classifier
            results_4 = model_selection.cross_val_score(model_shufflecv, featureValues, type_label, cv=rrtt)
            return '{0:.2f}'.format(results_4.mean() * 100.0)
def _PARA_GRIDDING(Model, X, y, param_grid,
                   _Scale=False, _CVType="KFold",
                   n_splits=N_SPLITS, random_state=RANDOM_STATE,
                   scoring=SCORING, test_size=TEST_SIZE):  # Fine!
    if _Scale:
        _Scaler = preprocessing.StandardScaler().fit(X=X)
        X = _Scaler.transform(X=X)

    if _CVType == "KFold":
        _Cross_Val = model_selection.KFold(n_splits=n_splits, random_state=random_state)
    elif _CVType == "LeaveOneOut":
        _Cross_Val = model_selection.LeaveOneOut()
    elif _CVType == "ShuffleSplit":
        _Cross_Val = model_selection.ShuffleSplit(n_splits=n_splits, test_size=test_size,
                                                  random_state=random_state)
    else:
        raise Exception()

    _Grid = model_selection.GridSearchCV(estimator=Model, param_grid=param_grid,
                                         cv=_Cross_Val, scoring=scoring)
    _Grid_Result = _Grid.fit(X=X, y=y)
    return _Grid_Result
def leave_one_out_knn_diversity(images_paths, size, k=3):
    """
    Summarize the distance to the k closest images in the set.
    :note: loads all images into memory, so the number of images in the set should be reasonable
    :param images_paths: paths to the imgs
    :param size: size of the imgs
    :returns: a tuple (mean, std, min, max) of the distances
    """
    loo = model_selection.LeaveOneOut()
    images = decode_images(images_paths, size)
    images = np.reshape(images, [len(images), -1])  # Flatten for sklearn [#samples, #features] framework
    dists = []
    for train_idx, test_idx in tqdm(loo.split(images)):
        train = images[train_idx]
        test = images[test_idx]
        d = knn_diversity_stats(train, test, k)
        dists.append(d)
    return np.average(dists), np.std(dists), np.amin(dists), np.amax(dists)
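# knn_diversity_stats is not defined in this snippet; the sketch below is my assumption of
# what it might look like, returning the mean distance from the held-out image to its k
# nearest neighbours in the remaining set, using scikit-learn's NearestNeighbors.
import numpy as np
from sklearn.neighbors import NearestNeighbors


def knn_diversity_stats(train, test, k):
    # Fit on the "train" images (all but the held-out one) and query with the held-out image.
    nn = NearestNeighbors(n_neighbors=min(k, len(train)))
    nn.fit(train)
    distances, _ = nn.kneighbors(test)
    return float(np.mean(distances))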
def UVECV(xTest, yTest, uveLv):
    # kf = model_selection.KFold(n_splits=5, random_state=10)
    loo = model_selection.LeaveOneOut()
    squareArray = np.array([[]])
    coefs = np.array([[]])
    for train, test in loo.split(xTest):
        xTrainTemp = xTest[train, :]
        yTrainTemp = yTest[train]
        xTestTemp = xTest[test, :]
        yTestTemp = yTest[test]
        yPredictTemp, plsModes = PLS(xTestTemp, yTestTemp, xTrainTemp, yTrainTemp, uveLv)
        coefTemp = plsModes.coef_.T
        if coefs.shape[1] == 0:
            coefs = coefTemp
        else:
            coefs = np.append(coefs, coefTemp, axis=0)
        residual = yPredictTemp - yTestTemp
        square = np.dot(residual.T, residual)
        squareArray = np.append(squareArray, square)
        # squareArray.append(square)
    RMSECV = np.sqrt(np.sum(squareArray) / xTest.shape[0])
    return RMSECV, coefs
def __init__(self): f = open("C:\\Users\\Admin\\Desktop\\Final.txt") f.readline() # skip the header data = np.loadtxt(f) X = data[:, 1:] # select columns 1 through end y = data[:, 0] # select column 0, the risk imageno = len(y) healthy = np.zeros(np.sum(y == 0), np.int) diseased = np.zeros(np.sum(y == 1), np.int) j = 0 k = 0 for i in range(0, imageno, 1): if y[i] == 0: healthy[j] = i j += 1 else: diseased[k] = i k += 1 loo = model_selection.LeaveOneOut(imageno) self.classify(X, y, healthy, diseased, loo)
def _ALGO_CMP(Models, X, y,
              _Scale=False, _Plot=False, _CVType="KFold",
              n_splits=N_SPLITS, random_state=RANDOM_STATE,
              scoring=SCORING, test_size=TEST_SIZE):  # Fine!
    _Results = []
    if _Scale:
        _Scaler = preprocessing.StandardScaler().fit(X=X)
        X = _Scaler.transform(X=X)

    if _CVType == "KFold":
        _Cross_Val = model_selection.KFold(n_splits=n_splits, random_state=random_state)
    elif _CVType == "LeaveOneOut":
        _Cross_Val = model_selection.LeaveOneOut()
    elif _CVType == "ShuffleSplit":
        _Cross_Val = model_selection.ShuffleSplit(n_splits=n_splits, test_size=test_size,
                                                  random_state=random_state)
    else:
        raise Exception()

    for _Each in Models:
        _CVResult = model_selection.cross_val_score(estimator=Models[_Each], X=X, y=y,
                                                    scoring=scoring,  # use the parameter rather than the module-level SCORING
                                                    cv=_Cross_Val)
        _Results.append((_Each, _CVResult))

    if _Plot:
        plt.title("Comparison")
        plt.boxplot(x=[_Results[i][1] for i in range(len(_Results))],
                    labels=Models.keys())
        plt.show()

    _Best_Model = _Results[0]
    for i in range(1, len(_Results)):
        if _Results[i][1].mean() > _Best_Model[1].mean():
            _Best_Model = _Results[i]
        elif _Results[i][1].mean() == _Best_Model[1].mean():
            if _Results[i][1].std() > _Best_Model[1].std():
                _Best_Model = _Results[i]
    _Best_Model = Models[_Best_Model[0]]

    return _Results, _Best_Model
def train_svm(train, test, leave_one_out=False, dim_reduc=None, norms=True,
              kernel="LinearSVC", final_pred=False):
    """
    Function to train svm
    :param train: train data (in a pandas dataframe)
    :param test: test data (same)
    :param leave_one_out: whether or not to perform leave-one-out cross validation
    :param dim_reduc: dimensionality reduction of input data. Implemented values are pca and som.
    :param norms: perform normalisations, i.e. z-scores and L2 (default True)
    :param kernel: kernel for SVM
    :param final_pred: do the final predictions?
    :return: returns a pipeline with a fitted svm model, and if possible prints evaluation and writes to disk:
             confusion_matrix.csv, misattributions.csv and (if required) FINAL_PREDICTIONS.csv
    """

    print(".......... Formatting data ........")
    # Save the classes
    classes = list(train.loc[:, 'author'])
    train = train.drop(['author', 'lang'], axis=1)

    if test is not None:
        classes_test = list(test.loc[:, 'author'])
        test = test.drop(['author', 'lang'], axis=1)
        preds_index = list(test.index)

    nfeats = train.columns.__len__()

    # CREATING PIPELINE
    print(".......... Creating pipeline according to user choices ........")
    estimators = []

    if dim_reduc == 'pca':
        print(".......... using PCA ........")
        estimators.append(('dim_reduc', decomp.PCA()))  # chosen with default,
        # which is: n_components = min(n_samples, n_features)

    # if dim_reduc == 'som':
    #     print(".......... using SOM ........")
    #     # TODO: fix SOM
    #     som = minisom.MiniSom(20, 20, nfeats, sigma=0.3, learning_rate=0.5)  # initialization of 50x50 SOM
    #     # TODO: set robust defaults, and calculate number of columns automatically
    #     som.train_random(train.values, 100)
    #     # too long to compute
    #     # som.quantization_error(train)
    #     print(".......... assigning SOM coordinates to texts ........")
    #     train = som.quantization(train.values)
    #     test = som.quantization(test.values)

    if norms:
        # Z-scores
        # TODO: I went to the trouble of implementing something that already
        # exists via sklearn.preprocessing.StandardScaler()
        print(".......... using normalisations ........")
        estimators.append(('scaler', preproc.StandardScaler()))
        # scaler = preproc.StandardScaler().fit(train)
        # train = scaler.transform(train)
        # test = scaler.transform(test)
        # feat_stats = pandas.DataFrame(columns=["mean", "std"])
        # feat_stats.loc[:, "mean"] = list(train.mean(axis=0))
        # feat_stats.loc[:, "std"] = list(train.std(axis=0))
        # feat_stats.to_csv("feat_stats.csv")
        #
        # for col in list(train.columns):
        #     if not train[col].sum() == 0:
        #         train[col] = (train[col] - train[col].mean()) / train[col].std()
        #
        # for index, col in enumerate(test.columns):
        #     if not test.iloc[:, index].sum() == 0:
        #         # keep same as train if possible
        #         if not feat_stats.loc[index, "mean"] == 0 and not feat_stats.loc[index, "std"] == 0:
        #             test.iloc[:, index] = (test.iloc[:, index] - feat_stats.loc[index, "mean"]) / feat_stats.loc[index, "std"]
        #         else:
        #             test.iloc[:, index] = (test.iloc[:, index] - test.iloc[:, index].mean()) / test.iloc[:, index].std()

        # NB: not making the same mistake again; this time using the built-in
        # L2 normalisation
        # cf. https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html#sklearn.preprocessing.Normalizer
        estimators.append(('normalizer', preproc.Normalizer()))
        # transformer = preproc.Normalizer().fit(train)
        # train = transformer.transform(train)
        # transformer = preproc.Normalizer().fit(test)
        # test = transformer.transform(test)

    print(".......... choosing SVM ........")
    # let's try a standard one: only with PCA, otherwise too hard
    # if withPca:
    #     classif = sk.SVC(kernel='linear')
    # else:
    #     # try a faster one
    #     classif = sk.LinearSVC()

    if kernel == "LinearSVC":
        # try a faster one
        estimators.append(('model', sk.LinearSVC()))
        # classif = sk.LinearSVC()
    else:
        estimators.append(('model', sk.SVC(kernel=kernel)))
        # classif = sk.SVC(kernel=kernel)

    print(".......... Creating pipeline with steps ........")
    print(estimators)
    pipe = skp.Pipeline(estimators)

    # Now, doing leave one out validation or training single SVM with train / test split
    if leave_one_out:
        loo = skmodel.LeaveOneOut()
        print(".......... leave-one-out cross validation will be performed ........")
        print(".......... using " + str(loo.get_n_splits(train)) + " samples ........")

        # Will need to
        # 1. train a model
        # 2. get prediction
        # 3. compute score: precision, recall, F1 for all categories
        skmodel.cross_val_score(pipe, train, classes, cv=loo)

        # Create the preds array
        preds = np.array([], dtype='<U9')
        for train_index, test_index in loo.split(train):
            # print(test_index)
            pipe.fit(train.iloc[train_index, ], [classes[i] for i in list(train_index)])
            preds = np.concatenate((preds, pipe.predict(train.iloc[test_index, ])))

        # and now, leave one out evaluation (very small redundancy here, one line that could be stored elsewhere)
        unique_labels = list(set(classes))
        pandas.DataFrame(metrics.confusion_matrix(classes, preds, labels=unique_labels),
                         index=['true:{:}'.format(x) for x in unique_labels],
                         columns=['pred:{:}'.format(x) for x in unique_labels]).to_csv("confusion_matrix.csv")

        print(metrics.classification_report(classes, preds))

        # writing misattributions
        pandas.DataFrame(
            [i for i in zip(list(train.index), list(classes), list(preds)) if i[1] != i[2]],
            columns=["id", "True", "Pred"]
        ).set_index('id').to_csv("misattributions.csv")

        # and now making the model for final preds after leave one out if necessary
        if final_pred:
            print(".......... Training final SVM with all train set ........")
            pipe.fit(train, classes)
            preds = pipe.predict(test)
            pandas.DataFrame(data={'filename': preds_index, 'author': list(preds)}).to_csv("FINAL_PREDICTIONS.csv")

    # And now the simple case where there is only one svm to train
    else:
        pipe.fit(train, classes)
        preds = pipe.predict(test)

        # and evaluate
        unique_labels = list(set(classes + classes_test))
        pandas.DataFrame(metrics.confusion_matrix(classes_test, preds, labels=unique_labels),
                         index=['true:{:}'.format(x) for x in unique_labels],
                         columns=['pred:{:}'.format(x) for x in unique_labels]).to_csv("confusion_matrix.csv")

        print(metrics.classification_report(classes_test, preds))

        # AND NOW, we need to evaluate or create the final predictions
        if final_pred:
            pandas.DataFrame(data={'filename': preds_index, 'author': list(preds)}).to_csv("FINAL_PREDICTIONS.csv")

    return pipe
def setup_indices(self, train_data, test_data):
    splitter = skl.LeaveOneOut()
    splitter.get_n_splits(test_data)
    self.indices = list(splitter.split(test_data))
X_test = sc.transform(X_test)  # scale X_test

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(C=0.00018263636363636363, penalty='none', solver='sag')  # cv = leave one out
# classifier = LogisticRegression(C=1e-06, penalty='none', solver='sag')  # cv = 10
# classifier = LogisticRegression(C=8.172727272727273e-05, penalty='l2', solver='sag')  # cv = 5
classifier.fit(X, y)
y_pred = classifier.predict(X_test)

# Model evaluation
leave_one_out = model_selection.LeaveOneOut()
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score

accuracies = cross_val_score(estimator=classifier, X=X, y=y, cv=leave_one_out)
print("Training Accuracy = {:.2f} %".format(accuracies.mean() * 100))

cm = confusion_matrix(y_test, y_pred)  # (real values, predicted values)
print("Testing accuracy = {:.2f} %".format(accuracy_score(y_test, y_pred) * 100))
print("TN =", cm[0][0], "TP =", cm[1][1])
print("FP =", cm[0][1], "FN =", cm[1][0])

joblib.dump(classifier, 'model_logistic_regresssion.pkl')
def func(args):
    # input parameters
    atlasobj = args[0]
    ChanggungPatientNets = args[1]

    # return results
    ret = [None, None, None, None, None]  # discover rate, accuracy, precision, recall, specificity

    ChanggungHealthyNets = io_utils.loadRandomDynamicNets(
        ChanggungAllFullPath, atlasobj, totalNum=len(ChanggungPatientNets),
        scanList=os.path.join(ChanggungRootPath, 'normal_scans.txt'))
    sig_connections = stats_utils.filter_sigdiff_connections(ChanggungPatientNets, ChanggungHealthyNets)
    ret[0] = float(len(sig_connections)) / (atlasobj.count * (atlasobj.count - 1) / 2.0)

    X1 = np.zeros((len(ChanggungHealthyNets), 1))  # healthy
    y1 = -1 * np.ones((len(ChanggungHealthyNets), 1))
    X2 = np.zeros((len(ChanggungPatientNets), 1))  # patient
    y2 = np.ones((len(ChanggungPatientNets), 1))
    for c in sig_connections:
        normalCList = result_utils.getAllFCAtIdx(c[0], c[1], ChanggungHealthyNets)
        X1 = np.insert(X1, 0, normalCList, axis=1)
        patientCList = result_utils.getAllFCAtIdx(c[0], c[1], ChanggungPatientNets)
        X2 = np.insert(X2, 0, patientCList, axis=1)
    X = np.concatenate([X1[:, :-1], X2[:, :-1]])
    y = np.concatenate((y1, y2)).ravel()

    # classifier
    classifier = svm.SVC(kernel='linear')

    # leave one out cross validation
    accuracy = []
    truePositive = 0
    falsePositive = 0
    trueNegative = 0
    falseNegative = 0
    loo = model_selection.LeaveOneOut()
    for trainIdx, testIdx in loo.split(X, y):
        classifier.fit(X[trainIdx, :], y[trainIdx])
        accuracy.append(classifier.score(X[testIdx, :], y[testIdx]))
        p = classifier.predict(X[testIdx, :])[0]
        if p == 1 and y[testIdx] == 1:
            truePositive += 1
        elif p == -1 and y[testIdx] == -1:
            trueNegative += 1
        elif p == 1 and y[testIdx] == -1:
            falsePositive += 1
        else:
            falseNegative += 1
    ret[1] = np.mean(accuracy)
    precision = float(truePositive) / (truePositive + falsePositive)
    recall = float(truePositive) / (truePositive + falseNegative)
    specificity = float(trueNegative) / (trueNegative + falsePositive)
    ret[2] = precision
    ret[3] = recall
    ret[4] = specificity
    return ret
def SFBS(self):
    """ Set the regression scheme """
    if self.objFunction == 'MLR':
        self.ObjFunctionRun = MultipleRegression
    elif self.objFunction == 'PCAR':
        self.ObjFunctionRun = PrincipalComponentsRegression
    elif self.objFunction == 'ZSCR':
        self.ObjFunctionRun = ZScoreRegression
    elif self.objFunction == 'ANN':
        self.ObjFunctionRun = NeuralNetwork

    """ Set the Cross validation type """
    if self.crossVal == 'Leave One Out':
        self.cv = model_selection.LeaveOneOut()
    elif self.crossVal == 'K-Fold (5 folds)':
        self.cv = model_selection.KFold(n_splits=5)
    else:
        self.cv = model_selection.KFold(n_splits=10)

    """ Get the predictand Data """
    self.predictandData = pd.DataFrame().from_dict(
        self.equationDict['Predictand']['Data'], orient='columns')
    self.predictandData.columns = ['Predictand']

    """ Initialize data for predictors """
    self.predictorData = pd.DataFrame()
    for predictorName in self.predictorDict:
        for interval in self.predictorDict[predictorName]:
            if self.predictorDict[predictorName][interval]['prdID'] in list(self.equationDict['PredictorPool']):
                self.predictorData = pd.concat([
                    self.predictorData,
                    pd.DataFrame().from_dict(
                        self.predictorDict[predictorName][interval]['Data'], orient='columns')
                ], axis=1)
    self.predictorDataNames = list(self.predictorData.columns)

    """ Initialize a list of dictionaries to store model information """
    self.searchDictList = [{
        "fcstID": "",
        "Type": "Linear - {0}".format(self.objFunction),
        "Coef": [],
        "prdIDs": self.predictorDataNames,
        "Intercept": [],
        "PrincCompData": {},
        "Metrics": {
            "Cross Validated Adjusted R2": -1e4,
            "Root Mean Squared Prediction Error": 1e5,
            "Cross Validated Nash-Sutcliffe": -1e4,
            "Adjusted R2": -1e4,
            "Root Mean Squared Error": 1e5,
            "Nash-Sutcliffe": -1e4,
            "Sample Variance": 1e5
        },
        "CrossValidation": self.crossVal,
        "Forecasted": "",
        "CV_Forecasted": "",
        "Years Used": [],
        "FeatSelectionProgress": "Running"
    } for n in range(self.numModels)]

    """ Begin a loop to iterate through parallelized floating selection """
    iterCounter = 0
    modelsAnalyzed = 0
    modelsCompleted = 0

    """ Array to store current models """
    currentModels = [self.predictorDataNames for i in range(self.numModels)]

    """ Set up a multiprocessing pool """
    pool = ThreadPool(processes=CPUCount() - 1)

    while iterCounter < 1000:
        iterCounter = iterCounter + 1
        print('iteration: ', iterCounter)
        input("continue with next iteration...")

        """ Iterate through each model and perform 1 iteration of Sequential Floating Selection """
        for i in range(self.numModels):
            input()

            """ Check to see if this model has completed yet """
            if self.searchDictList[i]['FeatSelectionProgress'] == 'Completed':
                continue

            """ Set some variables for this iteration """
            modelChanged = False
            currentPredictorSet = self.searchDictList[i]['prdIDs']
            predictorsToBeRemoved = currentPredictorSet

            print("""
Model Number: {0}
current predictor set: {1}
predictors to try and remove: {2}
""".format(i, currentPredictorSet, predictorsToBeRemoved))

            results = list(map(testPredictorSet, [list(l) for l in zip(
                repeat(currentPredictorSet),
                predictorsToBeRemoved,
                repeat('Remove'),
                repeat(currentModels),
                repeat(self.cv),
                repeat(self.perfMetric),
                repeat(self.predictorData),
                repeat(self.predictandData),
                repeat(self.ObjFunctionRun),
                repeat(pool))]))

            """ Determine if any of the removals increased model performance """
            for result in results:
                print("")
                input()
                print("""
We tried removing predictor: {0}
The new metrics are: {1}
the new predictor set is: {2}
""".format(list(set(currentPredictorSet) - set(result[0]['prdID'])), result[1], result[0]['prdID']))

                if result[0]['prdID'] == ['000'] or result[0]['prdID'] == ['-1000']:
                    continue

                if Metrics.metricBetterThan(
                        newMetric=result[1][self.perfMetric],
                        oldMetric=self.searchDictList[i]['Metrics'][self.perfMetric],
                        perfMeasure=self.perfMetric):
                    predictorRemoved = list(set(currentPredictorSet) - set(result[0]['prdID']))
                    self.searchDictList[i]['Metrics'] = result[1]
                    self.searchDictList[i]['prdIDs'] = result[0]['prdID']
                    self.searchDictList[i]['Forecasted'] = result[2]['Forecasted']
                    self.searchDictList[i]['CV_Forecasted'] = result[2]['CV_Forecasted']
                    self.searchDictList[i]['Coef'] = result[3]
                    self.searchDictList[i]['Intercept'] = result[4]
                    self.searchDictList[i]['PrincCompData'] = result[5]
                    currentModels[i] = result[0]['prdID']
                    modelChanged = True

                modelsAnalyzed = modelsAnalyzed + 1
                self.signals.updateRunLabel.emit("Models Analyzed: {0}".format(modelsAnalyzed))

            """ If we didn't remove a predictor, attempt to skip a step and try removing 2 predictors """
            # if modelChanged == False:
            #     predictorsToBeRemoved = list(combinations(currentPredictorSet, 2))
            #     results = list(map(testPredictorSet, [list(l) for l in zip(repeat(currentPredictorSet),
            #                                                                predictorsToBeRemoved,
            #                                                                repeat('Remove'),
            #                                                                repeat(currentModels),
            #                                                                repeat(self.cv),
            #                                                                repeat(self.perfMetric),
            #                                                                repeat(self.predictorData),
            #                                                                repeat(self.predictandData),
            #                                                                repeat(self.ObjFunctionRun),
            #                                                                repeat(pool))]))
            #     for result in results:
            #         if result[0]['prdID'] == '000':
            #             continue
            #         if Metrics.metricBetterThan(newMetric=result[1][self.perfMetric], oldMetric=self.searchDictList[i]['Metrics'][self.perfMetric], perfMeasure=self.perfMetric):
            #             predictorRemoved = list(set(currentPredictorSet) - set(result[0]['prdID']))
            #             self.searchDictList[i]['Metrics'] = result[1]
            #             self.searchDictList[i]['prdIDs'] = result[0]['prdID']
            #             self.searchDictList[i]['Forecasted'] = result[2]['Forecasted']
            #             self.searchDictList[i]['CV_Forecasted'] = result[2]['CV_Forecasted']
            #             self.searchDictList[i]['Coef'] = result[3]
            #             self.searchDictList[i]['Intercept'] = result[4]
            #             self.searchDictList[i]['PrincCompData'] = result[5]
            #             currentModels[i] = result[0]['prdID']
            #             modelChanged = True
            #         modelsAnalyzed = modelsAnalyzed + 1
            #         self.signals.updateRunLabel.emit("Models Analyzed: {0}".format(modelsAnalyzed))

            """ Try and add a variable back in, but don't add in a predictor we just removed """
            currentPredictorSet = self.searchDictList[i]['prdIDs']
            if modelChanged == True:
                predictorsToBeAdded = list(
                    set([prd for prd in self.predictorDataNames if prd not in currentPredictorSet])
                    - set(predictorRemoved))
            else:
                predictorsToBeAdded = [prd for prd in self.predictorDataNames if prd not in currentPredictorSet]

            results = list(map(testPredictorSet, [list(l) for l in zip(
                repeat(currentPredictorSet),
                predictorsToBeAdded,
                repeat('Add'),
                repeat(currentModels),
                repeat(self.cv),
                repeat(self.perfMetric),
                repeat(self.predictorData),
                repeat(self.predictandData),
                repeat(self.ObjFunctionRun),
                repeat(pool))]))

            """ Determine if any of the additions increased model performance """
            for result in results:
                if result[0]['prdID'] == ['000']:
                    continue
                if Metrics.metricBetterThan(
                        newMetric=result[1][self.perfMetric],
                        oldMetric=self.searchDictList[i]['Metrics'][self.perfMetric],
                        perfMeasure=self.perfMetric):
                    predictorRemoved = list(set(currentPredictorSet) - set(result[0]['prdID']))
                    self.searchDictList[i]['Metrics'] = result[1]
                    self.searchDictList[i]['prdIDs'] = result[0]['prdID']
                    self.searchDictList[i]['Forecasted'] = result[2]['Forecasted']
                    self.searchDictList[i]['CV_Forecasted'] = result[2]['CV_Forecasted']
                    self.searchDictList[i]['Coef'] = result[3]
                    self.searchDictList[i]['Intercept'] = result[4]
                    self.searchDictList[i]['PrincCompData'] = result[5]
                    currentModels[i] = result[0]['prdID']
                    modelChanged = True

                modelsAnalyzed = modelsAnalyzed + 1
                self.signals.updateRunLabel.emit("Models Analyzed: {0}".format(modelsAnalyzed))

            """ If the model hasn't changed, complete the model and update the progress bar """
            if modelChanged == False and currentPredictorSet != []:
                self.searchDictList[i]['FeatSelectionProgress'] = 'Completed'
                modelsCompleted = modelsCompleted + 1
                self.signals.updateProgBar.emit(int(100 * modelsCompleted / self.numModels))

    for i in range(len(self.searchDictList)):
        if self.searchDictList[i]['prdIDs'] == []:
            fcstID = 'EMPTY'
        else:
            fcstID = encryptions.generateFcstID(
                self.searchDictList[i]['Type'], self.searchDictList[i]['prdIDs'])
        self.searchDictList[i]['fcstID'] = fcstID

    pool.close()
    pool.join()

    self.signals.returnFcstDict.emit(self.searchDictList)
# exercise 7.1.2
from matplotlib.pyplot import figure, plot, xlabel, ylabel, show
import numpy as np
from scipy.io import loadmat
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection

# requires data from exercise 1.5.1
from ex1_5_1 import *

# Maximum number of neighbors
L = 40

CV = model_selection.LeaveOneOut()
errors = np.zeros((N, L))
i = 0
for train_index, test_index in CV.split(X, y):
    print('Crossvalidation fold: {0}/{1}'.format(i + 1, N))

    # extract training and test set for current CV fold
    X_train = X[train_index, :]
    y_train = y[train_index]
    X_test = X[test_index, :]
    y_test = y[test_index]

    # Fit classifier and classify the test points (consider 1 to 40 neighbors)
    for l in range(1, L + 1):
        knclassifier = KNeighborsClassifier(n_neighbors=l)
        knclassifier.fit(X_train, y_train)
        y_est = knclassifier.predict(X_test)
        errors[i, l - 1] = np.sum(y_est[0] != y_test[0])  # record the misclassification for this fold

    i += 1
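# Plotting sketch (my assumption, suggested by the otherwise unused matplotlib imports in the
# exercise script above): once the folds finish, show the leave-one-out error rate as a
# function of the number of neighbors, reusing errors, N, and L from the loop above.
figure()
plot(range(1, L + 1), 100.0 * errors.sum(axis=0) / N)
xlabel('Number of neighbors')
ylabel('Classification error rate (%)')
show()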
def loo_regression(standardized_X, Y):
    'Leave one out regression and provide the mean absolute error for train test'
    # convert to numpy arrays
    Y = np.array(Y)
    standardized_X = np.array(standardized_X)

    loocv = model_selection.LeaveOneOut()

    reg1 = LinearRegression()
    reg2 = Ridge()
    print(reg2)
    reg3 = Lasso()
    print(reg3)
    reg4 = ElasticNet()
    print(reg4)
    reg5 = xgboost.XGBRegressor(max_depth=3, reg_lambda=1, reg_alpha=1)
    print(reg5)
    reg7 = RandomForestRegressor(n_estimators=10, max_depth=3)  # , max_features=10
    print(reg7)
    # reg8 = SVR(kernel='poly')

    regs = [reg1, reg2, reg3, reg4, reg5, reg7]
    classifier_name = ['LinearRegression', 'Ridge', 'Lasso', 'ElasticNet', 'XGB', 'RF']

    table = []
    for i in range(0, len(classifier_name)):
        print('\n')
        print('Classifier: ', classifier_name[i])
        reg = regs[i]
        loop = 0
        temp = []
        temp_summary = []
        temp_train_err = []
        temp_test_err = []
        for train_index, test_index in loocv.split(standardized_X):
            loop = loop + 1
            # print("TRAIN:", train_index, "TEST:", test_index)
            X_train, X_test = standardized_X[train_index], standardized_X[test_index]
            y_train, y_test = Y[train_index], Y[test_index]
            # train_accuracy.append(reg.score(X_train, y_train))
            reg.fit(X_train, y_train)
            y_train_predict = reg.predict(X_train)
            train_error = mean_absolute_error(y_train, y_train_predict)
            y_test_predict = reg.predict(X_test)
            test_error = mean_absolute_error(y_test, y_test_predict)
            temp.append([y_test[0], round(y_test_predict[0], 2)])
            temp_train_err.append(train_error)
            temp_test_err.append(test_error)

        train_mean = round(np.mean(temp_train_err), 2)
        train_std = round(np.std(temp_train_err), 2)
        test_mean = round(np.mean(temp_test_err), 2)
        test_std = round(np.std(temp_test_err), 2)
        temp_summary.extend([classifier_name[i], train_mean, train_std, test_mean, test_std])
        table.append(temp_summary)

        temp = pd.DataFrame(temp)
        filename3 = 'output/moreno_corrupt/Val_Result_' + classifier_name[i] + '.csv'
        temp.to_csv(filename3, header=False, index=False)

    table = pd.DataFrame(table)
    table.to_csv('output/moreno_corrupt/result_summary.csv', header=False, index=False)
    print(table)
def correlation_supervisor(path, rootdir, simple=False, lig_only=False, max_descriptors=False):
    # load the files from the given input file
    file_dict, fail_dict = accquire_file(path)
    # loop over successful imports to get descriptors:
    big_mat = list()
    col_names = list()
    for i, keyv in enumerate(file_dict.keys()):
        file_dict[keyv].get_descriptor_vector(lig_only, simple, name=False, loud=False)
        # print('i = ', str(i))
        if i == 0:
            col_names = file_dict[keyv].descriptor_names
        # reorganize the data
        this_row = list()
        this_row.append(float(file_dict[keyv].yvalue))
        this_row.extend(file_dict[keyv].descriptors)
        big_mat.append(this_row)
    big_mat = np.array(big_mat)

    ##### let's do some regression
    ### standardize model:
    col_array = np.array(col_names)
    print('length of col array is ' + str(len(col_array)))
    n_tot = len(col_array)
    X = big_mat[:, 1:]
    print('dimension of data matrix is ' + str(big_mat.shape))
    n_obs = len(X[:, 1])
    Scaler = preprocessing.StandardScaler().fit(X)
    Xs = Scaler.transform(X)
    Y = big_mat[:, 0]

    ## find baseline model (all descriptors)
    Reg = linear_model.LinearRegression()
    Reg.fit(Xs, Y)
    Ypred_all_all = Reg.predict(Xs)
    rs_all_all = metrics.r2_score(Y, Ypred_all_all)
    loo = model_selection.LeaveOneOut()
    r_reduce = list()
    mse_reduce = list()

    ### stepwise reduce the feature set until only one is left
    for n in range(0, n_tot):
        # n_features_to_select is keyword-only in recent scikit-learn
        reductor = feature_selection.RFE(Reg, n_features_to_select=n_tot - n, step=1, verbose=0)
        reductor.fit(Xs, Y)
        Ypred_all = reductor.predict(Xs)
        rs_all = metrics.r2_score(Y, Ypred_all)
        mse_all = metrics.mean_squared_error(Y, Ypred_all)
        r_reduce.append(rs_all)
        mse_reduce.append(mse_all)

    ### reduce to one feature
    reductor_features = list()
    for i, ranks in enumerate(reductor.ranking_):
        reductor_features.append([col_array[i], ranks])
    reductor_features = sorted(reductor_features, key=lambda x: x[1])
    # print(reductor_features)
    print('****************************************')

    ### select best number using cv
    selector = feature_selection.RFECV(Reg, step=1, cv=loo, verbose=0,
                                       scoring='neg_mean_squared_error')
    selector.fit(Xs, Y)
    select_mse = selector.cv_results_['mean_test_score']  # grid_scores_ was removed from RFECV
    Ypred = selector.predict(Xs)
    rs = metrics.r2_score(Y, Ypred)
    n_opt = selector.n_features_
    opt_features = col_array[selector.support_]
    ranked_features = list()
    for i, ranks in enumerate(selector.ranking_):
        ranked_features.append([col_array[i], ranks])
    ranked_features = sorted(ranked_features, key=lambda x: x[1])
    print(ranked_features)

    if max_descriptors:
        ## check if we need to reduce further
        print('a max of ' + str(max_descriptors) + ' were requested')
        n_max = int(max_descriptors)
        if n_opt > n_max:
            print('the RFE process selected ' + str(n_opt) + ' variables as optimal')
            print('discarding an additional ' + str(n_max - n_opt) + ' variables')
            new_variables = list()
            new_mask = np.zeros(n_tot)
            for i in range(0, n_max):
                new_variables.append(ranked_features[i])

    ## report results to user
    print('analyzed ' + str(n_obs) + ' molecules')
    print('the full-space R2 is ' + str("%0.2f" % rs_all_all) + ' with ' + str(n_tot) + ' features')
    print('optimal number of features is ' + str(n_opt) + ' of total ' + str(n_tot))
    print('the opt R2 is ' + str("%0.2f" % rs))
    # print(ranked_features)

    X_r = selector.transform(Xs)
    reg_red = linear_model.LinearRegression()
    reg_red.fit(X_r, Y)
    Ypred_r = reg_red.predict(X_r)
    errors = [Y[i] - Ypred_r[i] for i in range(0, n_obs)]
    coefs = reg_red.coef_
    intercept = reg_red.intercept_
    mse_all = metrics.mean_squared_error(Y, Ypred_all_all)
    mse_r = metrics.mean_squared_error(Y, Ypred_r)

    if n_opt < 30:
        print('the optimal variables are: ' + str(opt_features))
        print('the coefficients are ' + str(coefs))
    else:
        print('the (first 30) optimal variables are: ' + str(opt_features[0:29]))
        print('the (first 30) coefficients are ' + str(coefs[0:29]))
    print('the intercept is ' + str("%0.2f" % intercept))
    print('the training MSE with the best feature set is ' + str("%0.2f" % mse_r))
    print('the MSE with all features is ' + str("%0.2f" % mse_all))
    print('by eliminating ' + str(n_tot - n_opt) + ' features,' +
          ' CV-prediction MSE decreased from ' + str("%0.0f" % abs(select_mse[0])) +
          ' to ' + str("%0.0f" % abs(select_mse[n_tot - n_opt])))

    with open(rootdir + 'RFECV_rankings.csv', 'w') as f:
        f.write('RFE_rank,RFE_col,RFECV_rank,RFECV_col, \n')
        for i, items in enumerate(reductor_features):
            f.write(str(items[0]) + ',' + str(items[1]) + ',' +
                    str(ranked_features[i][0]) + ',' + str(ranked_features[i][1]) + '\n')
    with open(rootdir + 'y_data.csv', 'w') as f:
        for items in Y:
            f.write(str(items) + '\n')
    with open(rootdir + 'y_pred_r.csv', 'w') as f:
        for items in Ypred_r:
            f.write(str(items) + '\n')
    with open(rootdir + 'optimal_decriptor_space.csv', 'w') as f:
        for i in range(0, n_obs):
            for j in range(0, n_opt):
                if j == (n_opt - 1):
                    f.write(str(X_r[i][j]) + '\n')
                else:
                    f.write(str(X_r[i][j]) + ',')
    with open(rootdir + 'full_descriptor_space.csv', 'w') as f:
        for names in col_names:
            f.write(names + ',')
        f.write('\n')
        for i in range(0, n_obs):
            for j in range(0, n_tot):
                if j == (n_tot - 1):
                    f.write(str(Xs[i][j]) + '\n')
                else:
                    f.write(str(Xs[i][j]) + ',')
    with open(rootdir + 'scaling.csv', 'w') as f:
        means = Scaler.mean_
        var = Scaler.var_
        f.write('name, mean,variance \n')
        for i in range(0, n_tot):
            f.write(str(col_names[i]) + ',' + str(means[i]) + ',' + str(var[i]) + ',' +
                    str(selector.ranking_[i]) + '\n')
    with open(rootdir + 'coeficients.csv', 'w') as f:
        f.write('intercept,' + str(intercept) + '\n')
        for i in range(0, n_opt):
            f.write(str(opt_features[i]) + ',' + str(coefs[i]) + '\n')
    with open(rootdir + 'rfe_mse.csv', 'w') as f:
        f.write('features removed,mean CV error,' + str(intercept) + '\n')
        count = 0
        for items in mse_reduce:
            f.write(str(count) + ',' + str(items) + '\n')
            count += 1