Example #1
from sklearn import datasets, model_selection

def shuffle_split():
    iris = datasets.load_iris()

    sp1 = model_selection.ShuffleSplit(train_size=0.6,
                                       test_size=0.4,
                                       n_splits=3)
    for train_index, test_index in sp1.split(iris.data, iris.target):
        print(len(train_index), len(test_index))
    print()

    sp2 = model_selection.ShuffleSplit(train_size=0.6, n_splits=3)
    for train_index, test_index in sp2.split(iris.data, iris.target):
        print(len(train_index), len(test_index))
    print()

    sp3 = model_selection.ShuffleSplit(test_size=0.4, n_splits=3)
    for train_index, test_index in sp3.split(iris.data, iris.target):
        print(len(train_index), len(test_index))
    print()

    total = []
    sp4 = model_selection.ShuffleSplit(train_size=100,
                                       test_size=50,
                                       n_splits=3)
    for train_index, test_index in sp4.split(iris.data, iris.target):
        print(len(test_index), test_index[:10])
        total += list(test_index)

    print(sorted(total))
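
A quick sanity check of the sizing rules above (a minimal sketch; iris has 150 samples, so a 0.6/0.4 split yields exactly 90 train and 60 test indices):

from sklearn import datasets, model_selection

iris = datasets.load_iris()
sp = model_selection.ShuffleSplit(train_size=0.6, test_size=0.4, n_splits=1)
train_index, test_index = next(sp.split(iris.data, iris.target))
# 0.6 * 150 = 90 train indices, 0.4 * 150 = 60 test indices
assert len(train_index) == 90 and len(test_index) == 60
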
def et1(train2, y, test2, v, z):
    # v and z accumulate out-of-fold and test-set predictions per model name;
    # pconvert() and now() are helpers defined elsewhere in this project.
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 3
    num_splits = 5
    base_seed = 13
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=15,
                                                random_state=seed,
                                                n_estimators=2500,
                                                n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:,1]
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:,1])

    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
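
The seed-and-split averaging in et1 reduces to this self-contained sketch (the dataset and the slimmed-down estimator settings are illustrative stand-ins, not from the original project):

import numpy as np
from sklearn import datasets, ensemble, metrics, model_selection

X, y = datasets.load_breast_cancer(return_X_y=True)
oof = np.zeros(len(y))     # accumulated out-of-fold probabilities
counts = np.zeros(len(y))  # times each sample landed in a validation split
for seed in (13, 14, 15):
    ss = model_selection.ShuffleSplit(n_splits=5, random_state=seed)
    for itrain, ival in ss.split(X, y):
        clf = ensemble.ExtraTreesClassifier(n_estimators=100, random_state=seed)
        clf.fit(X[itrain], y[itrain])
        oof[ival] += clf.predict_proba(X[ival])[:, 1]
        counts[ival] += 1
mask = counts > 0  # ShuffleSplit does not guarantee every sample is held out
print(metrics.log_loss(y[mask], oof[mask] / counts[mask]))
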
Example #3
def test_classifier(clf):
    # Score the classifier over 10 shuffled 80/20 train/test splits
    # (assumes module-level X, y and the alias ms = sklearn.model_selection).
    shuffle_validator = ms.ShuffleSplit(n_splits=10,
                                        test_size=0.2,
                                        random_state=0)
    scores = ms.cross_val_score(clf, X, y, cv=shuffle_validator)
    print("Accuracy: %0.4f (+/- %0.2f)" % (scores.mean(), scores.std()))
def apply_svm_cross_validation(
    X,
    y,
    svc_args={
        'loss': 'hinge',
        'penalty': 'elasticnet',
        'max_iter': 1000,
        'alpha': 0.001,
        'tol': 1e-3,
        'random_state': 123456,
        'class_weight': None
    }):
    clf = linear_model.SGDClassifier(**svc_args)
    cv = model_selection.ShuffleSplit(n_splits=10,
                                      test_size=0.1,
                                      random_state=123456)
    scores = model_selection.cross_validate(
        clf,
        X,
        y,
        cv=cv,
        scoring=['precision', 'recall', 'f1'],
        return_train_score=True)
    print(scores)
    return [
        np.mean(scores['test_precision']),
        np.mean(scores['test_recall']),
        np.mean(scores['test_f1'])
    ]
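
Called on a numeric, binary-labelled dataset, the function returns the mean precision, recall and F1 over the ten shuffled splits; a minimal sketch (the dataset is an illustrative stand-in):

import numpy as np
from sklearn import datasets, linear_model, model_selection

X, y = datasets.load_breast_cancer(return_X_y=True)
precision, recall, f1 = apply_svm_cross_validation(X, y)
print(precision, recall, f1)
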
Example #5
def __call__(self, table):
    if self.replace:
        # pylint: disable=no-member
        rgen = np.random.RandomState(self.random_state)
        sample = rgen.randint(0, len(table), self.n)
        o = np.ones(len(table))
        o[sample] = 0
        others = np.nonzero(o)[0]
        return others, sample
    if self.n == len(table):
        rgen = np.random.RandomState(self.random_state)
        sample = np.arange(self.n)
        rgen.shuffle(sample)
        return np.array([], dtype=int), sample
    elif self.stratified and table.domain.has_discrete_class:
        test_size = max(len(table.domain.class_var.values), self.n)
        splitter = skl.StratifiedShuffleSplit(
            n_splits=1,
            test_size=test_size,
            train_size=len(table) - test_size,
            random_state=self.random_state)
        splitter.get_n_splits(table.X, table.Y)
        ind = splitter.split(table.X, table.Y)
    else:
        splitter = skl.ShuffleSplit(n_splits=1,
                                    test_size=self.n,
                                    random_state=self.random_state)
        splitter.get_n_splits(table)
        ind = splitter.split(table)
    return next(iter(ind))
    def cv_models(self):
        nfold = 3
        cv_split = model_selection.ShuffleSplit(n_splits=nfold,
                                                test_size=0.3,
                                                train_size=0.7,
                                                random_state=43)
        index = 0
        for model_name, model in self.models.items():
            # n_jobs=5 below parallelises the CV folds to speed this up
            print(" ---> Work on CV for %s " % model_name)

            start = time.time()
            rmse = np.sqrt(-cross_val_score(model,
                                            self.train_x.values,
                                            self.train_y,
                                            scoring='neg_mean_squared_error',
                                            cv=cv_split,
                                            n_jobs=5))
            print(rmse)
            end = time.time()
            print("  time spent: ", end - start)

            self.MLA.loc[index, 'CVScoreMean'] = rmse.mean()
            self.MLA.loc[index, 'CVScoreSTD'] = rmse.std()
            index += 1
        print(self.MLA)
def compare_classifiers(X, y):
    # Initialise a ShuffleSplit that splits the dataset 10 times; each split
    # trains on 60% of the data and evaluates on a held-out 30%.
    # 'Rotating' the training data like this makes the comparison more
    # robust than a single train/test split.
    shuffle_split_class = model_selection.ShuffleSplit(n_splits=10,
                                                       test_size=0.3,
                                                       train_size=0.6,
                                                       random_state=0)

    classifier_comparison = []
    for classifier in classifiers:
        cross_validation = model_selection.cross_validate(
            classifier, X, y, cv=shuffle_split_class, return_train_score=True)

        classifier_output = {
            'Name': classifier.__class__.__name__,
            'Train Accuracy Mean': cross_validation['train_score'].mean(),
            'Dev Accuracy Mean': cross_validation['test_score'].mean(),
            'Dev Accuracy 3*STD': cross_validation['test_score'].std() * 3,
            'Time': cross_validation['fit_time'].mean(),
        }

        classifier_comparison.append(classifier_output)

    classifier_comparison = pd.DataFrame(
        classifier_comparison, columns=classifier_comparison[0].keys())
    classifier_comparison.sort_values(by=['Dev Accuracy Mean'],
                                      ascending=False,
                                      inplace=True)
    return classifier_comparison
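
compare_classifiers expects a module-level classifiers iterable; a minimal sketch of how it might be driven (the estimator choices are illustrative):

import pandas as pd
from sklearn import datasets, ensemble, linear_model, model_selection, svm

classifiers = [
    linear_model.LogisticRegression(max_iter=1000),
    ensemble.RandomForestClassifier(random_state=0),
    svm.SVC(),
]
X, y = datasets.load_iris(return_X_y=True)
print(compare_classifiers(X, y))
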
Example #8
def predict_age(dataset):
    '''
    Predict age.
    '''
    data_p = dataset[['Pclass', 'SibSp', 'Parch', 'Fare', 'Age']]
    x_train = data_p.loc[~data_p['Age'].isnull(), :].drop(columns='Age')
    y_train = data_p.loc[~data_p['Age'].isnull(), :]['Age']
    x_test = data_p.loc[data_p['Age'].isnull(), :].drop(columns='Age')
    print('Initial preprocessing done')
    param_grid = {
        'learning_rate': [.001, .005, .01, .05, .1],
        'max_depth': [2, 4, 6, 8],
        'n_estimators': [50, 100, 300, 500, 1000],
        'seed': [2018]
    }
    cv_split = model_selection.ShuffleSplit(n_splits=10,
                                            test_size=.3,
                                            train_size=.6,
                                            random_state=0)
    tune_model = model_selection.GridSearchCV(XGBRegressor(nthread=-1),
                                              param_grid=param_grid,
                                              scoring='neg_mean_squared_error',
                                              cv=cv_split)
    print('model tuned')
    tune_model.fit(x_train, y_train)
    print('model fitted')
    print(tune_model.best_params_)
    y_test = tune_model.best_estimator_.predict(x_test)
    print('model predicted')
    print(y_test[:5])  # predict() returns an ndarray, so slice instead of .head()
    return y_test
def apply_logreg_cross_validation_coeff(
    X,
    y,
    svc_args={
        'penalty': 'l2',
        'C': 1.0,
        'random_state': 123456,
        'multi_class': "auto",
        'class_weight': None,
        'solver': "lbfgs",
        'max_iter': 1000,
        'verbose': 1
    }):
    clf = linear_model.LogisticRegression(**svc_args)
    #cv = model_selection.ShuffleSplit(n_splits=10, test_size=0.1, random_state=123456) #for l2
    cv = model_selection.ShuffleSplit(n_splits=3,
                                      test_size=0.1,
                                      random_state=123456)  #for l1
    scores = model_selection.cross_validate(
        clf,
        X,
        y,
        cv=cv,
        scoring=['precision', 'recall', 'f1'],
        return_train_score=True,
        return_estimator=True)
    print(scores)
    return [
        np.mean(scores['test_precision']),
        np.mean(scores['test_recall']),
        np.mean(scores['test_f1']),
        np.mean([model.coef_[0] for model in scores['estimator']], axis=0)
    ]
Example #10
def identical_distribution_split(target,
                                 n_splits=3,
                                 train_size=0.7,
                                 test_size=0.3,
                                 cos_theta_lim=0.7):
    #
    cos_theta_list = [0.0]
    #
    while min(cos_theta_list) < cos_theta_lim:
        #
        del cos_theta_list[:]
        index = []
        cv = model_selection.ShuffleSplit(n_splits=n_splits,
                                          train_size=train_size,
                                          test_size=test_size)
        #
        for index1, index2 in cv.split(target):
            # The test set must be normalized separately from the training and
            # validation sets; do not normalize them together.
            dataset1 = target[index1][:, np.newaxis]
            dataset2 = target[index2][:, np.newaxis]
            #
            cos_theta, _, _ = data_similarity.hist_similarity(
                data_normalization(dataset1), data_normalization(dataset2))
            cos_theta_list.append(cos_theta)
            index.extend([index1, index2])
    #
    return cv, index, cos_theta_list
Example #11
def sample(table, n=0.7, stratified=False, replace=False, random_state=None):
    """
    Samples data instances from a data table. Returns the sample and
    a second table with the instances that are not in the sample. Also
    uses several sampling functions from
    `scikit-learn <http://scikit-learn.org>`_.

    Parameters
    ----------
    table : data table
        A data table from which to sample.

    n : float, int (default = 0.7)
        If float, should be between 0.0 and 1.0 and represents
        the proportion of data instances in the resulting sample. If
        int, n is the number of data instances in the resulting sample.

    stratified : bool, optional (default = False)
        If true, sampling will try to consider class values and
        match distribution of class values
        in train and test subsets.

    replace : bool, optional (default = False)
        sample with replacement

    random_state : int or RandomState
        Pseudo-random number generator state used for random sampling.
    """

    if isinstance(n, float):
        n = int(n * len(table))

    if replace:
        if random_state is None:
            rgen = np.random
        else:
            rgen = np.random.RandomState(random_state)
        sample = rgen.randint(0, len(table), n)
        o = np.ones(len(table))
        o[sample] = 0
        others = np.nonzero(o)[0]
        return table[sample], table[others]

    n = len(table) - n
    if stratified and table.domain.has_discrete_class:
        test_size = max(len(table.domain.class_var.values), n)
        splitter = skl.StratifiedShuffleSplit(
            n_splits=1,
            test_size=test_size,
            train_size=len(table) - test_size,
            random_state=random_state,
        )
        splitter.get_n_splits(table.X, table.Y)
        ind = splitter.split(table.X, table.Y)
    else:
        splitter = skl.ShuffleSplit(n_splits=1,
                                    test_size=n,
                                    random_state=random_state)
        splitter.get_n_splits(table)
        ind = splitter.split(table)
    ind = next(ind)
    return table[ind[0]], table[ind[1]]
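
The table argument follows Orange's data-table interface (.domain, .X, .Y, integer indexing); a hedged usage sketch, assuming Orange 3 is installed and skl aliases sklearn.model_selection at module level:

import Orange

data = Orange.data.Table("iris")
subset, rest = sample(data, n=0.7, stratified=True, random_state=42)
print(len(subset), len(rest))  # 105 45 -- a 0.7/0.3 split of 150 rows
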
def gridsearch_params(MLA_compare, X_train, y_train, top):
    """
    This function will return the best parameters for a model as a dictionary
    """
    best_classifiers = MLA_compare['MLA Name'].values[:top]
    best_cls_ind = MLA_compare['MLA Name'].index[:top]

    cv_split = model_selection.ShuffleSplit(n_splits=5,
                                            test_size=.2,
                                            train_size=.8,
                                            random_state=39)
    best_params_dict = {'cls': best_classifiers, 'param': [], 'score': []}
    start_total = time()

    for ind, clf in zip(best_cls_ind, best_classifiers):
        start = time()
        param = grid_param[ind]
        estimator = MLA[clf]
        best_search = model_selection.GridSearchCV(estimator=estimator,
                                                   param_grid=param,
                                                   cv=cv_split,
                                                   scoring='roc_auc',
                                                   n_jobs=-1)
        best_search.fit(X_train, y_train)
        run = time() - start
        best_param = best_search.best_params_
        best_params_dict['param'].append(MLA[clf].set_params(**best_param))
        best_params_dict['score'].append(best_search.best_score_)
        print(f'{clf}\nBest Parameters: {best_param}\nRuntime: {run:.2f} seconds.')
        print('-' * 10)

    run_total = time() - start_total
    print(f'Total optimization time was {(run_total / 60):.2f} minutes.')
    return best_params_dict
Example #13
def keras_common(train3, y, test3, v, z, num_splits, cname, build_model, seed=1234):
    v[cname], z[cname] = 0, 0
    np.random.seed(seed)
    build_model().summary(line_length=120)
    model_path = '../data/working/' + cname + '_keras_model.h5'
    ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=11, test_size=1/num_splits)
    scores = list()
    for n, (itrain, ival) in enumerate(ss.split(train3, y)):
        xtrain, xval = train3[itrain], train3[ival]
        ytrain, yval = y[itrain], y[ival]
        model = build_model()
        model.fit(
                xtrain, ytrain,
                batch_size = 128,
                epochs=10000,
                validation_data=(xval, yval),
                verbose=0,
                callbacks=build_keras_fit_callbacks(model_path),
                shuffle=True
            )
        model.load_weights(model_path)
        p = model.predict(xval)
        v.loc[ival, cname] += pconvert(p).ravel()
        score = metrics.log_loss(y[ival], p)
        print(cname, 'fold %d: '%(n+1), score, now())
        scores.append(score)
        z[cname] += pconvert(model.predict(test3)).ravel()
        del model
        for i in range(3): gc.collect(i)
    os.remove(model_path)

    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits
Example #14
    def _cross_validate_individual_classifier(self, classifier, test_size_ratio):
        cv_options = model_selection.ShuffleSplit(n_splits=10, test_size=test_size_ratio, random_state=RANDOM_SEED)
        X = self._all_predictor_variables
        y = self._all_response_variables

        out = model_selection.cross_validate(classifier, X, y, scoring="accuracy", cv=cv_options, return_train_score=True, verbose=1)
        return out
Example #15
def fit(self, X, y):
    print("Fitting a restricted ElasticNetCV regressor...")
    self.standardizer = preprocessing.StandardScaler()
    X = self.standardizer.fit_transform(X)
    cv = model_selection.ShuffleSplit(n_splits=5,
                                      test_size=0.2,
                                      random_state=0)
    alpha_range = [
        0.005, 0.007, 0.002, 0.0025, 0.004, 0.003, 0.0035877427142009029,
        0.01, 0.001
    ]
    param_grid = []
    param_grid.append(
        dict(alpha=alpha_range,
             l1_ratio=[.1, .2, .25, .3, .35, .4, .5, .6, .65, .7, .8],
             normalize=[True],
             max_iter=[10000]))
    print("Using param grid " + str(param_grid))
    self.clf = model_selection.GridSearchCV(linear_model.ElasticNet(),
                                            param_grid=param_grid,
                                            cv=cv,
                                            n_jobs=12)
    self.clf.fit(X, y)
    print("Best params: " + str(self.clf.best_params_) +
          " and corresponding score is " + str(self.clf.best_score_))
import sys

from sklearn import model_selection, svm
from sklearn.metrics import accuracy_score, precision_score, recall_score

def run_classification(K, C):
    clf = svm.SVC(kernel='precomputed', probability=True)
    ss = model_selection.ShuffleSplit(n_splits=10, test_size=0.5)
    accuracy_scores = list()
    precision_scores = list()
    recall_scores = list()

    for train, test in ss.split(K):
        kTrain = K[train][:, train]
        cTrain = C[train]

        # For the test kernel matrix, the values between *all* training
        # vectors and all test vectors must be provided.
        #kTest  = K[train, test]
        kTest = K[test]
        kTest = kTest[:, train]

        cTest = C[test]

        clf.fit(kTrain, cTrain)
        prediction = clf.predict(kTest)

        accuracy_scores.append(accuracy_score(cTest, prediction))
        precision_scores.append(precision_score(cTest, prediction))
        recall_scores.append(recall_score(cTest, prediction))

    print("Average accuracy: ",
          sum(accuracy_scores) / len(accuracy_scores),
          file=sys.stderr)
    print("Average precision:",
          sum(precision_scores) / len(precision_scores),
          file=sys.stderr)
    print("Average recall:   ",
          sum(recall_scores) / len(recall_scores),
          file=sys.stderr)
Example #17
    def getAndScoreVotingEnsemble(self,
                                  trainingDataFrame,
                                  predictorColumns,
                                  labelColumn,
                                  votingMethod="hard"):

        trainingInputs = trainingDataFrame[predictorColumns]
        #trainingInputs = preprocessing.normalize(trainingInputs, axis=0)

        trainingLabels = trainingDataFrame[labelColumn]

        cv_split = model_selection.ShuffleSplit(n_splits=10,
                                                test_size=.3,
                                                train_size=.7,
                                                random_state=0)

        voter = ensemble.VotingClassifier(estimators=self.MLA,
                                          voting=votingMethod)

        voter_cv = model_selection.cross_validate(voter,
                                                  trainingInputs,
                                                  trainingLabels,
                                                  cv=cv_split)

        voter.fit(trainingDataFrame[predictorColumns],
                  trainingDataFrame[labelColumn])

        print("{} Voting Training mean Score: {:.2f}".format(
            votingMethod, voter_cv['train_score'].mean() * 100))
        print("{} Voting Test mean Score: {:.2f}".format(
            votingMethod, voter_cv['test_score'].mean() * 100))
        print("{} Voting Test Score 3*std: +/- {:.2f}".format(
            votingMethod, voter_cv['test_score'].std() * 100 * 3))
        print('-' * 10)
def nationality_trainer_v1():
    # Load nationality list
    with codecs.open("nationalities.txt", "r", "utf-8") as f:
        nationalities = list(map(lambda x: x.strip().lower(), f.readlines()))
    # Load words list
    with codecs.open("words.txt", "r", "utf-8") as f:
        words = list(map(lambda x: x.strip().lower(), f.readlines()))

    random.shuffle(words)
    wordstrain = words[:len(nationalities)]
    # Results of nationalities list are true the others are false
    # These will be used in training
    ynat = [1 for x in nationalities]
    ywor = [0 for x in wordstrain]

    y = ynat + ywor

    # Combine all words into a big list and calculate length of each word
    totallist = nationalities + wordstrain
    X = [len(x) for x in totallist]

    # Prepare a matrix whose number of rows is the same as the big list
    # and whose number of cols is the same as the word with max length
    # separate them into letters, then two letters, then three letters
    longest = 25 + 24 + 23  # max(X)
    X = np.zeros((len(X), longest), dtype=np.int64)
    # convert each character of each word in the list and save them in the
    # value matrix we created above. For words of length less than max length,
    # remaining col values are zero. Do this for two- and three-letter pairs
    for i in range(len(totallist)):  #
        X[i, 0:len(totallist[i])] = np.array([ord(x) for x in totallist[i]])
        #two letters
        for j in range(0, len(totallist[i]) - 1):
            letter1 = ord(totallist[i][j])
            letter2 = ord(totallist[i][j + 1])
            X[i, (25 + j)] = int("{0}{1}".format(letter1, letter2))
        #three letters
        for j in range(0, len(totallist[i]) - 2):
            letter1 = ord(totallist[i][j])
            letter2 = ord(totallist[i][j + 1])
            letter3 = ord(totallist[i][j + 2])
            X[i,
              (25 + 24 + j)] = int("{0}{1}{2}".format(letter1, letter2,
                                                      letter3))

    # Load neural network classifier
    # hiddenlayers = (100,)
    # clf = neural_network.MLPClassifier(hidden_layer_sizes = hiddenlayers)

    # Load Support Vector Machine
    # clf = svm.SVC()

    # Load Naive Bayes
    clf = naive_bayes.MultinomialNB()

    # Use shuffle split for N way splitting
    cv = model_selection.ShuffleSplit(n_splits=5, test_size=0.3)
    # score the classifier
    print(model_selection.cross_val_score(clf, X, y, cv=cv))
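
The pair and triple encodings above concatenate ordinal values as decimal digits; a tiny illustration of what lands in the matrix:

# For the word "ab": ord('a') = 97, ord('b') = 98
print(int("{0}{1}".format(97, 98)))  # 9798, stored at column 25 + j
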
Example #19
def quick_fitted_tree(X,
                      Y,
                      model_type=['GridSearch', 'FeatureSelection'],
                      test_split=None,
                      random_state=None):
    from sklearn import tree, model_selection, feature_selection

    splitted_data = None
    sel_cols = None
    x = X.copy()
    y = Y.copy()
    if isinstance(test_split, (float)):
        x, x_test, y, y_test = model_selection.train_test_split(
            x, y, test_size=test_split, random_state=random_state)
    cv_split = model_selection.ShuffleSplit(n_splits=10,
                                            test_size=.3,
                                            train_size=.6,
                                            random_state=random_state)
    dtree = tree.DecisionTreeClassifier(random_state=random_state)
    model = dtree

    if 'FeatureSelection' in model_type:
        dtree_rfe = feature_selection.RFECV(
            tree.DecisionTreeClassifier(random_state=random_state),
            step=1,
            scoring='accuracy',
            cv=cv_split)
        dtree_rfe.fit(x, y)
        x = x[:, dtree_rfe.get_support()]
        if isinstance(test_split, (float)):
            x_test = x_test[:, dtree_rfe.get_support()]
        sel_cols = dtree_rfe.get_support()
    if 'GridSearch' in model_type:
        param_grid = {
            'criterion': ['gini', 'entropy'],
            'max_depth': [2, 4, 6, 8, 10, None],
            'random_state': [0],
            #'splitter': ['best', 'random'],
            #'min_samples_split': [2,5,10,.03,.05],
            #'min_samples_leaf': [1,5,10,.03,.05],
            #'max_features': [None, 'auto'],
        }
        model = model_selection.GridSearchCV(
            tree.DecisionTreeClassifier(random_state=random_state),
            param_grid=param_grid,
            scoring='accuracy',
            cv=cv_split)

    model.fit(x, y)
    if 'GridSearch' in model_type:
        # only the GridSearchCV path produces best_estimator_
        model = model.best_estimator_

    if isinstance(test_split, (float)):
        splitted_data = (x, x_test, y, y_test)
    else:
        splitted_data = (None, x, None, y)
    return model, sel_cols, splitted_data
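
A hedged usage sketch (NumPy arrays are assumed, since the feature-selection branch indexes columns with a boolean mask):

from sklearn import datasets

X, Y = datasets.load_iris(return_X_y=True)
model, sel_cols, (x_tr, x_te, y_tr, y_te) = quick_fitted_tree(
    X, Y, test_split=0.25, random_state=0)
print(type(model).__name__, sel_cols)
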
Example #20
def bayes_search():
    data, target = load_train()

    pipeline = create_pipeline()

    data = pipeline.fit_transform(data)

    nrmse_scorer = make_scorer(lambda x, y: rmse(x, y) * -1)

    bayes_cv_tuner = BayesSearchCV(
        estimator = XGBRegressor(),
        search_spaces = {
            'learning_rate': (0.01, 1.0, 'log-uniform'),
            'max_depth': (0, 50),
            'max_delta_step': (0, 20),
            'subsample': (0.01, 1.0, 'uniform'),
            'colsample_bytree': (0.01, 1.0, 'uniform'),
            'colsample_bylevel': (0.01, 1.0, 'uniform'),
            'reg_lambda': (1e-9, 1000, 'log-uniform'),
            'reg_alpha': (1e-9, 1.0, 'log-uniform'),
            'gamma': (1e-9, 0.5, 'log-uniform'),
            'min_child_weight': (0, 5),
            'n_estimators': (50, 2000),
            'scale_pos_weight': (1e-6, 500, 'log-uniform')
        },
        scoring = nrmse_scorer,
        cv = model_selection.ShuffleSplit(n_splits = 10, test_size = .3, train_size = .6, random_state = 0 ),
        n_jobs = 1,
        n_iter = 10000,
        verbose = 0,
        refit = True,
        random_state = 42
    )


    def status_print(optim_result):
        """Status callback during Bayesian hyperparameter search."""

        # Get all the models tested so far in DataFrame format
        all_models = pd.DataFrame(bayes_cv_tuner.cv_results_)

        # Report the best score (negative RMSE under nrmse_scorer) and params
        best_params = pd.Series(bayes_cv_tuner.best_params_)
        print('Model #{}\nBest score (neg. RMSE): {}\nBest params: {}\n'.format(
            len(all_models),
            np.round(bayes_cv_tuner.best_score_, 4),
            bayes_cv_tuner.best_params_
        ))

        # Save all model results
        clf_name = bayes_cv_tuner.estimator.__class__.__name__
        all_models.to_csv(clf_name+"_cv_results.csv")


    # Fit the model
    result = bayes_cv_tuner.fit(data, target, callback=status_print)
def cross_validation_split_2():
    # Split the dataset with this cross-validation splitter class:
    # http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html
    # Note: this is an alternative to train_test_split.
    # Runs the model 10 times on a 60/30 train/test split of the data,
    # intentionally leaving 10% out of every split.
    cv_split = model_selection.ShuffleSplit(n_splits=10,
                                            test_size=.3,
                                            train_size=.6,
                                            random_state=0)
    return cv_split
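
The returned splitter plugs into anything that accepts a cv argument; a minimal sketch (estimator and data are illustrative):

from sklearn import datasets, model_selection, tree

X, y = datasets.load_iris(return_X_y=True)
cv_split = cross_validation_split_2()
scores = model_selection.cross_val_score(tree.DecisionTreeClassifier(), X, y, cv=cv_split)
print(scores.mean(), scores.std())
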
Example #22
def split_train_test_shuffle(data, n_splits, test_ratio, seed=None):
    # ShuffleSplit yields positional indices, so index with iloc; note that
    # with n_splits > 1 only the last split is returned.
    train_set, test_set = None, None
    rs = ms.ShuffleSplit(n_splits=n_splits,
                         test_size=test_ratio,
                         random_state=seed)
    for train_idx, test_idx in rs.split(data):
        train_set = data.iloc[train_idx]
        test_set = data.iloc[test_idx]
    return train_set, test_set
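
Because only the last split survives the loop, n_splits=1 is the sensible call; a minimal sketch (the DataFrame is an illustrative stand-in):

import pandas as pd
import sklearn.model_selection as ms

df = pd.DataFrame({'a': range(10), 'b': range(10, 20)})
train_set, test_set = split_train_test_shuffle(df, n_splits=1, test_ratio=0.2, seed=42)
print(len(train_set), len(test_set))  # 8 2
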
Example #23
def plot_learning_curve_1(model,
                          X,
                          y,
                          n_splits=3,
                          train_size=0.8,
                          test_size=0.2,
                          scoring="neg_mean_squared_error",
                          train_sizes=np.linspace(0.1, 1.0, 5)):

    train_scores = []
    cv_scores = []
    #
    subset_sizes = X.shape[0] * train_sizes
    subset_sizes = subset_sizes.astype(int)
    cv = model_selection.ShuffleSplit(n_splits=n_splits,
                                      train_size=train_size,
                                      test_size=test_size)

    for m in subset_sizes:
        X_train_cv = X[:m]
        y_train_cv = y[:m]
        #
        scores = model_selection.cross_validate(model,
                                                X_train_cv,
                                                y_train_cv,
                                                scoring=scoring,
                                                cv=cv,
                                                n_jobs=-1,
                                                return_train_score=True)
        #
        train_score = scores.get("train_score")
        cv_score = scores.get("test_score")
        #
        train_scores.append(train_score)
        cv_scores.append(cv_score)
    #
    train_scores = np.mean(np.array(train_scores), axis=1)
    cv_scores = np.mean(np.array(cv_scores), axis=1)
    #
    fig = plt.figure()
    ax = fig.add_subplot(111)
    #ax.set_xscale("log")
    ax.set_xlabel("Training size")
    ax.set_ylabel('Score')
    ax.set_title('Learning curve')
    training_score, = ax.plot(subset_sizes, train_scores, lw=2)
    cv_score, = ax.plot(subset_sizes, cv_scores, lw=2)
    ax.legend(handles=[training_score, cv_score],
              labels=["Training score", "Cross validation score"],
              loc="upper right",
              prop={"size": 8})
    ###
    #
    plt.tight_layout()
    plt.show()
Example #24
def learning_curve_example():
	# REF [site] >> http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html
	digits = datasets.load_digits()
	X, y = digits.data, digits.target

	title = 'Learning Curves (Naive Bayes)'
	# Cross validation with 100 iterations to get smoother mean test and train score curves, each time with 20% data randomly selected as a validation set.
	cv = model_selection.ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)

	estimator = naive_bayes.GaussianNB()
	plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=4)

	title = r'Learning Curves (SVM, RBF kernel, $\gamma=0.001$)'
	# SVC is more expensive so we do a lower number of CV iterations:
	cv = model_selection.ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
	estimator = svm.SVC(gamma=0.001)
	plot_learning_curve(estimator, title, X, y, (0.7, 1.01), cv=cv, n_jobs=4)

	plt.show()

	# REF [site] >> http://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html
	digits = datasets.load_digits()
	X, y = digits.data, digits.target

	param_range = np.logspace(-6, -1, 5)
	train_scores, test_scores = model_selection.validation_curve(svm.SVC(), X, y, param_name='gamma', param_range=param_range, cv=10, scoring='accuracy', n_jobs=1)
	train_scores_mean = np.mean(train_scores, axis=1)
	train_scores_std = np.std(train_scores, axis=1)
	test_scores_mean = np.mean(test_scores, axis=1)
	test_scores_std = np.std(test_scores, axis=1)

	plt.title('Validation Curve with SVM')
	plt.xlabel(r'$\gamma$')
	plt.ylabel('Score')
	plt.ylim(0.0, 1.1)
	lw = 2
	plt.semilogx(param_range, train_scores_mean, label='Training score', color='darkorange', lw=lw)
	plt.fill_between(param_range, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.2, color='darkorange', lw=lw)
	plt.semilogx(param_range, test_scores_mean, label='Cross-validation score', color='navy', lw=lw)
	plt.fill_between(param_range, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.2, color='navy', lw=lw)
	plt.legend(loc='best')
	plt.show()
    def assertClassifierWorksWithCV(self, classifier):
        # all the nice stuff is tested here - whether the classifier is
        # clonable, etc.
        for X, y in self.get_multilabel_data_for_tests('dense'):
            n_iterations = 3
            cv = model_selection.ShuffleSplit(n_splits=n_iterations, test_size=0.5, random_state=0)

            scores = model_selection.cross_val_score(
                classifier, X, y=y, cv=cv, scoring='accuracy')

            self.assertEqual(len(scores), n_iterations)
Example #26
def fit(self, X, y):
    print("Fitting a RidgeCV regressor...")
    self.standardizer = preprocessing.StandardScaler()
    X = self.standardizer.fit_transform(X)
    cv = model_selection.ShuffleSplit(n_splits=5,
                                      test_size=0.2,
                                      random_state=0)
    # RidgeCV's normalize expects a single bool, not a list; the inputs are
    # already standardized above, so it is left at its default.
    self.clf = linear_model.RidgeCV(alphas=[0.01, 0.1, 1., 10.], cv=cv)
    self.clf.fit(X, y)
Example #27
def fit(self, X, y):
    print("Fitting a LassoCV regressor...")
    self.standardizer = preprocessing.StandardScaler()
    X = self.standardizer.fit_transform(X)
    cv = model_selection.ShuffleSplit(n_splits=5,
                                      test_size=0.2,
                                      random_state=0)
    # LassoCV's normalize expects a single bool, not a list; the inputs are
    # already standardized above, so it is left at its default.
    self.clf = linear_model.LassoCV(n_alphas=100,
                                    cv=cv,
                                    n_jobs=7)
    self.clf.fit(X, y)
Example #28
    def assertClassifierWorksWithCV(self, classifier):
        # all the nice stuff is tested here - whether the classifier is
        # clonable, etc.
        X, y = make_multilabel_classification(
            sparse=False, return_indicator='dense')
        n_iterations = 3
        cv = model_selection.ShuffleSplit(n_splits=n_iterations, test_size=0.5, random_state=0)

        scores = model_selection.cross_val_score(
            classifier, X, y=y, cv=cv, scoring='accuracy')

        self.assertEqual(len(scores), n_iterations)
Example #29
def _cross_validate_individual_classifier_number(self, classifier, number_of_training_observations):
    X = self._all_predictor_variables
    y = self._all_response_variables

    total_observations = len(X)

    test_size_count = total_observations - number_of_training_observations

    cv_options = model_selection.ShuffleSplit(n_splits=10, test_size=test_size_count, random_state=RANDOM_SEED)
    out = model_selection.cross_validate(classifier, X, y, scoring="accuracy", cv=cv_options, return_train_score=True, verbose=1)
    return out
Example #30
def apply_svm_cross_validation(
    X,
    y,
    svc_args={
        'loss': 'hinge',
        'penalty': 'elasticnet',
        'max_iter': 1000,
        'alpha': 1e-9,
        'tol': 1e-3,
        'random_state': 123456,
        'class_weight': None
    },
    kernel_args={
        'kernel': 'rbf',
        'gamma': None,
        'degree': None,
        'n_components': 100,
        'random_state': 123456
    }):
    #print("kernel_approx")
    #feature_map_nystroem = kernel_approximation.Nystroem(**kernel_args)
    #feature_map_nystroem.fit(X)
    #X_new = feature_map_nystroem.transform(X)

    print("SVM")
    clf = linear_model.SGDClassifier(**svc_args)
    cv = model_selection.ShuffleSplit(n_splits=10,
                                      test_size=0.1,
                                      random_state=123456)
    scores = model_selection.cross_validate(
        clf,
        X,
        y,
        cv=cv,
        scoring=['precision', 'recall', 'f1'],
        return_train_score=True)
    print(scores)
    return [
        np.mean(scores['test_precision']),
        np.mean(scores['test_recall']),
        np.mean(scores['test_f1'])
    ]