def _random_search(self, random_iter, x, y, kernel_cache_size):
        # Default Values
        c = 1.0
        gamma = 0.0
        nu = 0.5
        best_score = -sys.maxint

        if random_iter > 0:
            sys.stdout.write("Do a random search %d times" % random_iter)
            param_dist = {
                "C": numpy.power(2.0, range(-5, 16)),
                "gamma": numpy.power(2.0, range(-15, 4)),
                "nu": uniform(loc=0.0001, scale=1 - 0.0001)
            }
            param_list = [
                {
                    "C": c,
                    "gamma": gamma,
                    "nu": nu
                },
            ]
            param_list.extend(
                list(
                    ParameterSampler(param_dist,
                                     n_iter=random_iter - 1,
                                     random_state=self._rng)))
            for idx, d in enumerate(param_list):
                nusvr = NuSVR(kernel='rbf',
                              gamma=d['gamma'],
                              C=d['C'],
                              nu=d['nu'],
                              random_state=self._rng,
                              cache_size=kernel_cache_size)
                train_x, test_x, train_y, test_y = \
                    train_test_split(x, y, test_size=0.5, random_state=self._rng)
                self._check_scaling(scaled_x=train_x)
                nusvr.fit(train_x, train_y)
                sc = nusvr.score(test_x, test_y)
                # Tiny output
                m = "."
                if idx % 10 == 0:
                    m = "#"
                if sc > best_score:
                    m = "<"
                    best_score = sc
                    c = d['C']
                    gamma = d['gamma']
                    nu = d['nu']
                sys.stdout.write(m)
                sys.stdout.flush()
            sys.stdout.write("Using C: %f, nu: %f and Gamma: %f\n" %
                             (c, nu, gamma))
        return nu, c, gamma
예제 #2
0
def _test_diabetes_compare_with_sklearn(kernel):
    """Compare ``NuSVR`` with the reference ``SklearnNuSVR`` on the diabetes data.

    Both estimators are fitted with the same ``kernel``, ``nu=.25`` and
    ``C=10.`` on the full dataset and scored on that same data. The test
    passes when the score of ``NuSVR`` is not worse than the reference (within
    1e-5) and the intercept, support-vector shape and dual coefficients agree.

    :param kernel: kernel name forwarded to both estimators.
    """
    diabetes = datasets.load_diabetes()
    clf_onedal = NuSVR(kernel=kernel, nu=.25, C=10.)
    clf_onedal.fit(diabetes.data, diabetes.target)
    result = clf_onedal.score(diabetes.data, diabetes.target)

    clf_sklearn = SklearnNuSVR(kernel=kernel, nu=.25, C=10.)
    clf_sklearn.fit(diabetes.data, diabetes.target)
    expected = clf_sklearn.score(diabetes.data, diabetes.target)

    assert result > expected - 1e-5
    assert_allclose(clf_sklearn.intercept_, clf_onedal.intercept_, atol=1e-4)
    # Compare the two *different* models; checking the reference against
    # itself (as before) could never fail.
    assert_allclose(clf_sklearn.support_vectors_.shape,
                    clf_onedal.support_vectors_.shape)
    assert_allclose(clf_sklearn.dual_coef_, clf_onedal.dual_coef_, atol=1e-2)
예제 #3
0
def cv_nu_SVR(X, y, K, C_test, nu_test):
    """Grid-search ``C`` and ``nu`` for ``NuSVR`` using K-fold cross-validation.

    For every (C, nu) pair the model is refitted on each of the ``K`` train
    sets produced by ``create_train_set`` and the fold scores returned by
    ``NuSVR.score`` are averaged. The pair with the highest mean score is
    printed and returned; on ties the first pair in grid order wins.

    :param X: feature matrix passed to ``create_cv_set``.
    :param y: targets passed to ``create_cv_set``.
    :param K: number of folds.
    :param C_test: sequence of candidate ``C`` values.
    :param nu_test: sequence of candidate ``nu`` values.
    :return: tuple ``(C_opt, nu_opt)``; ``(0, 0)`` only if a grid is empty or
        no mean score is comparable (e.g. all NaN).
    """
    Accuracy = np.zeros((len(C_test), len(nu_test)))

    Xcv, Ycv = create_cv_set(X, y, K)

    for k1, c in enumerate(C_test):
        for k2, nu in enumerate(nu_test):
            current_acc = 0.0

            for n in range(K):
                svc = NuSVR(C=c, nu=nu)

                X_train, y_train, X_test, y_test = create_train_set(
                    Xcv, Ycv, n)

                # Train the SVM
                svc.fit(X_train, y_train)

                res_tmp = svc.score(X_test, y_test)

                # Accumulate the mean of the fold scores
                current_acc = current_acc + res_tmp / (1.0 * K)

            Accuracy[k1, k2] = current_acc

    # Start from -inf, not 0: a regressor's score can be negative, and with a
    # 0 threshold a grid whose mean scores are all <= 0 returned C=0, nu=0,
    # which are not even members of the searched grid.
    acc_test = float("-inf")
    C_opt = 0
    nu_opt = 0
    for k1 in range(Accuracy.shape[0]):
        for k2 in range(Accuracy.shape[1]):
            if Accuracy[k1, k2] > acc_test:
                acc_test = Accuracy[k1, k2]
                C_opt = C_test[k1]
                nu_opt = nu_test[k2]

    print("NuSVR, Parametres optimaux: C=", C_opt, " nu=", nu_opt)

    return C_opt, nu_opt
    def _random_search(self, random_iter, x, y, kernel_cache_size):
        # Default Values
        c = 1.0
        gamma = 0.0
        nu = 0.5
        best_score = -sys.maxint

        if random_iter > 0:
            sys.stdout.write("Do a random search %d times" % random_iter)
            param_dist = {"C": numpy.power(2.0, range(-5, 16)),
                          "gamma": numpy.power(2.0, range(-15, 4)),
                          "nu": uniform(loc=0.0001, scale=1-0.0001)}
            param_list = [{"C": c, "gamma": gamma, "nu": nu}, ]
            param_list.extend(list(ParameterSampler(param_dist,
                                                    n_iter=random_iter-1,
                                                    random_state=self._rng)))
            for idx, d in enumerate(param_list):
                nusvr = NuSVR(kernel='rbf',
                              gamma=d['gamma'],
                              C=d['C'],
                              nu=d['nu'],
                              random_state=self._rng,
                              cache_size=kernel_cache_size)
                train_x, test_x, train_y, test_y = \
                    train_test_split(x, y, test_size=0.5, random_state=self._rng)
                self._check_scaling(scaled_x=train_x)
                nusvr.fit(train_x, train_y)
                sc = nusvr.score(test_x, test_y)
                # Tiny output
                m = "."
                if idx % 10 == 0:
                    m = "#"
                if sc > best_score:
                    m = "<"
                    best_score = sc
                    c = d['C']
                    gamma = d['gamma']
                    nu = d['nu']
                sys.stdout.write(m)
                sys.stdout.flush()
            sys.stdout.write("Using C: %f, nu: %f and Gamma: %f\n" %
                             (c, nu, gamma))
        return nu, c, gamma
예제 #5
0
# Keep only the first 2949 rows, then round-trip the frame through a pickle file.
df = df.iloc[:2949, :]
import pickle
import pandas as pd
df.to_pickle("Final_Data")
# ``read_pickle`` is a module-level pandas function, not a DataFrame method;
# ``df.read_pickle(...)`` raised AttributeError and discarded nothing useful.
df = pd.read_pickle("Final_Data")

# Copy each row's average annual count from output_df into df.
# NOTE(review): this assumes df is indexed by the FIPS values -- confirm.
for idx, row in output_df.iterrows():
    df.loc[row['FIPS'], 'annual_count_avg'] = row['Average Annual Count']

# Features: every column up to and including 'WATR'; target: the copied count.
X = df.loc[:, :'WATR']
y = df['annual_count_avg']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Fit several regressors on the same split and evaluate each on the test part.
from sklearn.svm import LinearSVR
svr = LinearSVR(random_state=0, tol=1e-5).fit(X_train, y_train)
svr.score(X_test, y_test)

from sklearn import svm
# NOTE: from here on the name ``svm`` is the fitted SVR model, not the module.
svm = svm.SVR().fit(X_train, y_train)
svm.score(X_test, y_test)

from sklearn.svm import NuSVR
nuSVR = NuSVR().fit(X_train, y_train)
nuSVR.score(X_test, y_test)

from sklearn import linear_model
ridge = linear_model.Ridge(alpha=0.5).fit(X_train, y_train)
ridge.score(X_test, y_test)
# Position of the largest ridge coefficient
np.argmax(ridge.coef_)
예제 #6
0
class AllRegressionModels:
    """
    Wrapper class around all supported regression models: LinearRegression, RandomForest, SVR, NuSVR, LinearSVR, and
    XGBRegressor.
    AllRegressionModels runs every available regression algorithm on the given dataset and outputs the coefficient of
    determination and execution time of each successful model when all_regression_models() is run.
    """
    def __init__(self, attributes=None, labels=None, test_size=0.25, verbose=False):
        """
        Initializes an AllRegressionModels object.

        The following parameters are needed to use an AllRegressionModels object:

            – attributes: a numpy array of the desired independent variables (Default is None)
            – labels: a numpy array of the desired dependent variables (Default is None)
            – test_size: the proportion of the dataset to be used for testing the model;
            the proportion of the dataset to be used for training will be the complement of test_size (Default is 0.25)
            – verbose: specifies whether or not to output any and all logging during model training (Default is False)

            Note: These are the only parameters allowed. All other parameters for each model will use their default
            values. For more granular control, please instantiate each model individually.

        The following instance data is created here (unfitted) and fitted by all_regression_models():

            – linear_regression: a reference to the LinearRegression model
            – random_forest: a reference to the RandomForest model
            – SVR: a reference to the SVR model
            – nu_SVR: a reference to the NuSVR model
            – linear_SVR: a reference to the LinearSVR model
            – XGB_regressor: a reference to the XGBRegressor model

        After running all_regression_models(), the coefficient of determination and execution time for each model that
        ran successfully will be displayed in tabular form. Any models that failed to run will be listed.
        """
        self.attributes = attributes
        self.labels = labels
        self.test_size = test_size
        self.verbose = verbose

        # The verbosity flag is baked into the estimators here; a later set_verbose() call only changes whether
        # all_regression_models() redirects stdout/stderr.
        self.linear_regression = LinearRegression()
        self.random_forest = RandomForestRegressor(verbose=self.verbose)
        self.SVR = SVR(verbose=self.verbose)
        self.nu_SVR = NuSVR(verbose=self.verbose)
        self.linear_SVR = LinearSVR(verbose=self.verbose)
        self.XGB_regressor = XGBRegressor(verbosity=int(self.verbose))

        # Result table: the first entry is the header row, later entries map model name -> [score, fit time].
        self._regression_models = {"Model": ["R2 Score", "Time"]}
        self._failures = []

    # Accessor methods

    def get_attributes(self):
        """
        Accessor method for attributes.

        If an AllRegressionModels object is initialized without specifying attributes, attributes will be None.
        all_regression_models() cannot be called until attributes is a populated numpy array of independent variables;
        call set_attributes(new_attributes) to fix this.
        """
        return self.attributes

    def get_labels(self):
        """
        Accessor method for labels.

        If an AllRegressionModels object is initialized without specifying labels, labels will be None.
        all_regression_models() cannot be called until labels is a populated numpy array of dependent variables;
        call set_labels(new_labels) to fix this.
        """
        return self.labels

    def get_test_size(self):
        """
        Accessor method for test_size.

        Should return a number or None.
        """
        return self.test_size

    def get_verbose(self):
        """
        Accessor method for verbose.

        Will default to False if not set by the user.
        """
        return self.verbose

    def get_all_regression_models(self):
        """
        Accessor method that returns a list of all models.

        The models are created in __init__ and remain unfitted until all_regression_models() has been called.
        """
        return [self.linear_regression, self.random_forest, self.SVR, self.nu_SVR, self.linear_SVR, self.XGB_regressor]

    def get_linear_regression(self):
        """
        Accessor method for linear_regression.

        The model is unfitted until all_regression_models() has been called.
        """
        return self.linear_regression

    def get_random_forest(self):
        """
        Accessor method for random_forest.

        The model is unfitted until all_regression_models() has been called.
        """
        return self.random_forest

    def get_SVR(self):
        """
        Accessor method for SVR.

        The model is unfitted until all_regression_models() has been called.
        """
        return self.SVR

    def get_nu_SVR(self):
        """
        Accessor method for nu_SVR.

        The model is unfitted until all_regression_models() has been called.
        """
        return self.nu_SVR

    def get_linear_SVR(self):
        """
        Accessor method for linear_SVR.

        The model is unfitted until all_regression_models() has been called.
        """
        return self.linear_SVR

    def get_XGB_regressor(self):
        """
        Accessor method for XGB_regressor.

        The model is unfitted until all_regression_models() has been called.
        """
        return self.XGB_regressor

    # Modifier methods

    def set_attributes(self, new_attributes=None):
        """
        Modifier method for attributes.

        Input should be a numpy array of independent variables. Defaults to None.
        """
        self.attributes = new_attributes

    def set_labels(self, new_labels=None):
        """
        Modifier method for labels.

        Input should be a numpy array of dependent variables. Defaults to None.
        """
        self.labels = new_labels

    def set_test_size(self, new_test_size=0.25):
        """
        Modifier method for test_size.

        Input should be a number or None. Defaults to 0.25.
        """
        self.test_size = new_test_size

    def set_verbose(self, new_verbose=False):
        """
        Modifier method for verbose.

        Input should be a truthy/falsy value. Defaults to False.
        """
        self.verbose = new_verbose

    # Regression functionality

    def all_regression_models(self):
        """
        Driver method for running all regression models with given attributes and labels.
        all_regression_models() first trains the models and determines their coefficients of determination and
        execution time via _all_regression_models_runner(). Then, all_regression_models() calls _print_results() to
        format and print each successful model's measurements, while also listing any failed models.

        If verbose is True, nothing is redirected and whatever the models log is shown.
        If verbose is False, all logging to stdout and stderr during training will be suppressed.
        """

        # Call helper method for running all regression models; suppress output, if needed
        if self.verbose:
            self._all_regression_models_runner()
        else:
            suppress_output = io.StringIO()
            with redirect_stderr(suppress_output), redirect_stdout(suppress_output):
                self._all_regression_models_runner()

        # Print results (outside the redirection, so the table is always visible)
        self._print_results()

    # Helper methods

    def _all_regression_models_runner(self):
        """
        Helper method that runs all models using the given dataset and all default parameters.
        After running all models, each model is determined to be either a success or failure, and relevant data
        (R2 score, execution time) is recorded.

        _all_regression_models_runner() may only be called by all_regression_models().
        """

        # Start every run from a clean slate so repeated calls neither report stale scores nor pile up duplicate
        # failure entries.
        self._regression_models = {"Model": ["R2 Score", "Time"]}
        self._failures = []

        # Split dataset
        dataset_X_train, dataset_X_test, dataset_y_train, dataset_y_test =\
            train_test_split(self.attributes, self.labels, test_size=self.test_size)

        # Run and time all models; identify each as success or failure
        models = (
            ("LinearRegression", self.linear_regression),
            ("RandomForest", self.random_forest),
            ("SVR", self.SVR),
            ("NuSVR", self.nu_SVR),
            ("LinearSVR", self.linear_SVR),
            ("XGBRegressor", self.XGB_regressor),
        )
        for name, model in models:
            self._run_model(name, model, dataset_X_train, dataset_X_test, dataset_y_train, dataset_y_test)

    def _run_model(self, name, model, X_train, X_test, y_train, y_test):
        """
        Helper method that fits and scores a single model, recording either its result or its failure.

        On success, [score on the test split, seconds spent in fit()] is stored under name in the result table.
        Any Exception raised while fitting or scoring marks the model as failed instead. KeyboardInterrupt and
        SystemExit are deliberately not caught, so the user can still abort a long-running fit.
        """
        try:
            start_time = time.time()
            model.fit(X_train, y_train)
            end_time = time.time()
            # Only fit() is timed; scoring happens after the clock has stopped.
            self._regression_models[name] = [model.score(X_test, y_test), end_time - start_time]
        except Exception:
            self._failures.append(name)

    def _print_results(self):
        """
        Helper method that prints results of _all_regression_models_runner() in tabular form.

        _print_results() may only be called by all_regression_models() after all models have attempted to run.
        """

        # Print models that didn't fail
        print("\nResults:\n")

        for model, data in self._regression_models.items():
            print("{:<20} {:<20} {:<20}".format(model, data[0], data[1]))

        print()

        # Print failures, if any
        if self._failures:
            print("The following models failed to run:\n")

            for entry in self._failures:
                print(entry)

        print()
예제 #7
0
# Report configuration and train/test scores for each model in turn.
# Single-argument print(...) calls give the same output on Python 2 and 3.
print('NuSVC config:')
print(nusvc.get_params())
nusvc.fit(smr_train.feature_matrix, smr_train.labels)
nusvc_score_train = nusvc.score(smr_train.feature_matrix, smr_train.labels)
print('NuSVC precision train: {}'.format(nusvc_score_train))
nusvc_score_test = nusvc.score(smr_test.feature_matrix, smr_test.labels)
print('NuSVC precision test: {}'.format(nusvc_score_test))
print('')

nusvr = NuSVR()
print('NuSVR config:')
print(nusvr.get_params())
nusvr.fit(smr_train.feature_matrix, smr_train.labels)
# Score the NuSVR model itself; this line used to call ``svc.score`` and so
# reported another model's training score under the NuSVR label.
nusvr_score_train = nusvr.score(smr_train.feature_matrix, smr_train.labels)
print('NuSVR precision train: {}'.format(nusvr_score_train))
nusvr_score_test = nusvr.score(smr_test.feature_matrix, smr_test.labels)
print('NuSVR precision test: {}'.format(nusvr_score_test))
print('')


dtc = DecisionTreeClassifier()
print('DecisionTreeClassifier config:')
print(dtc.get_params())
dtc.fit(smr_train.feature_matrix, smr_train.labels)
dtc_score_train = dtc.score(smr_train.feature_matrix, smr_train.labels)
print('DecisionTreeClassifier precision train: {}'.format(dtc_score_train))
dtc_score_test = dtc.score(smr_test.feature_matrix, smr_test.labels)
print('DecisionTreeClassifier precision test: {}'.format(dtc_score_test))
print(classification_report(smr_test.labels, dtc.predict(smr_test.feature_matrix)))
print('')
예제 #8
0
def runTcheby():
    """Run the model-assisted optimisation loop driven by the global ``param``.

    Reads the module-level ``param`` sequence (24 entries, unpacked pairwise
    below), ``archiveOK`` and ``NO_FILE_TO_WRITE``, and publishes the current
    best decisions through the global ``approx_pareto_front``.

    Each of the ``nb_iterations`` iterations:

    1. rebuilds the training set with ``train_to.getTrainingSet`` and refits
       the ``NuSVR`` model ``clf`` on it;
    2. if an R2 output file is configured, runs a shuffled 10-fold
       cross-validation with a fresh ``NuSVR`` and writes R2/MSE/MAE/MDAE/RMSE
       through ``iot.printR2``;
    3. visits every direction in random order: samples offspring from its
       neighbourhood, lets ``filt_to.model_based_filtring`` pick one
       candidate, evaluates it, updates the reference point (refitting
       ``clf`` when it moved) and replaces at most ``max_decisions_maj``
       neighbouring best decisions whose ``g_tcheby`` value it improves;
    4. maintains the archive and writes the objectives file when enabled.

    NOTE(review): the structure looks like a decomposition-based
    multi-objective search with a Tchebycheff scalarisation -- confirm against
    the helper modules, which are not visible here.

    :return: None; results are exposed through globals and output files.
    """
    global param, approx_pareto_front, archiveOK, NO_FILE_TO_WRITE

    ############################################################################
    # PARAMETER

    #clf = SVR(C=1.0, epsilon=0.1, kernel="rbf")
    clf = NuSVR(cache_size=2000, shrinking=True,verbose=True)
    # No second model is used; -1 is only a placeholder handed to the filter.
    clf2 = -1
    two_models_bool = False

    isReals = True
    start_fct, nb_functions                = param[0:2]
    nb_iterations, neighboring_size        = param[2:4]
    init_decisions, problem_size           = param[4:6]
    max_decisions_maj, delta_neighbourhood = param[6:8]
    CR, search_space                       = param[8:10]
    F, distrib_index_n                     = param[10:12]
    pm, operator_fct                       = param[12:14]
    nb_samples, training_neighborhood_size = param[14:16]
    strategy, file_to_write                = param[16:18]
    filter_strat, free_eval                = param[18:20]
    param_print_every, file_to_writeR2     = param[20:22]
    filenameDIR, filenameSCORE             = param[22:24]


    nb_objectives = len(start_fct)

    # split the offspring operators into their three functions
    crossover_fct, mutation_fct, repair_fct = operator_fct

    # work on a copy so the caller's initial decisions are not modified
    best_decisions = copy.deepcopy(init_decisions)

    sampling_param = [crossover_fct, mutation_fct, repair_fct, best_decisions, F, problem_size, CR, search_space, distrib_index_n, pm]


    ############################################################################
    # INITIALISATION

    qual_tools.resetGlobalVariables(filenameDIR, filenameSCORE, nb_iterations, nb_functions)

    eval_to.resetEval()

    # get the direction weights for the starting functions
    directions = dec.getDirections(nb_functions, nb_objectives)

    # init the neighbouring table
    nt.initNeighboringTab(nb_functions, neighboring_size, directions, nb_objectives)

    # give global visibility to best_decisions so the result can be read at the end
    approx_pareto_front = best_decisions

    # initial best decisions scores
    best_decisions_scores = [eval_to.free_eval(start_fct, best_decisions[i], problem_size) for i in range(nb_functions)]

    pop_size = nb_functions

    # current optimal scores for each objective
    z_opt_scores = gt.getMinTabOf(best_decisions_scores)

    eval_to.initZstar(z_opt_scores)

    # get the first training part of the items we will learn on
    model_directions = train_to.getDirectionsTrainingMatrix(directions)

    # whether the objectives shall be written to a file
    writeOK = False
    if(file_to_write != NO_FILE_TO_WRITE):
        writeOK = True

    # whether the model-quality measures shall be written to a file
    writeR2OK = False
    if(file_to_writeR2 != NO_FILE_TO_WRITE):
        writeR2OK = True

    ############################################################################
    # MAIN ALGORITHM

    if(writeOK):
        iot.printObjectives(file_to_write, eval_to.getNbEvals(), 0,best_decisions_scores, problem_size, nb_objectives)

    # set of all the solutions evaluated so far
    all_decisions        = copy.deepcopy(best_decisions)
    all_decisions_scores = copy.deepcopy(best_decisions_scores)
    all_len = nb_functions

    # ID table allowing a random walk through the directions in the main loop
    id_directions = [i for i in range(nb_functions)]

    # iterations loop
    for itera in range(nb_iterations):
        # Update model
        training_inputs, training_outputs, training_set_size, training_scores = train_to.getTrainingSet(model_directions, all_decisions, all_decisions_scores ,eval_to.getZstar_with_decal(), strategy, nb_functions, training_neighborhood_size)
        print(len(training_outputs))
        clf.fit(training_inputs, training_outputs)
        if(writeR2OK):
            training_inputs_tcheby      = eval_to.getManyTcheby(training_inputs, training_scores, eval_to.getZstar_with_decal(), training_set_size)

            # Shuffled 10-fold split: the first (size % n_folds) folds get one extra sample.
            random_index = numpy.arange(0,training_set_size)
            numpy.random.shuffle(random_index)
            n_folds = 10
            # Builtin int instead of numpy.int: that alias of int was removed in NumPy 1.24.
            folds_sizes = (training_set_size // n_folds) * numpy.ones(n_folds, dtype=int)
            folds_sizes[:training_set_size % n_folds] += 1

            training_inputs_array = numpy.array(training_inputs)
            training_tcheby_array = numpy.array(training_inputs_tcheby)

            R2_cv = []
            MSE_cv = []
            MAE_cv = []
            MDAE_cv = []

            clfCV = NuSVR()

            current = 0
            for fold_size in folds_sizes:
                start, stop = current, current + fold_size
                # mask is True for training positions, False for the held-out fold
                mask = numpy.ones(training_set_size, dtype=bool)
                mask[start:stop] = 0
                current = stop

                clfCV.fit(training_inputs_array[random_index[mask]], training_tcheby_array[random_index[mask]])

                test_fold_tcheby = training_tcheby_array[random_index[start:stop]]
                test_fold_predict = clfCV.predict(training_inputs_array[random_index[start:stop]])

                R2_cv  .append(r2_score             (test_fold_tcheby, test_fold_predict))
                MSE_cv .append(mean_squared_error   (test_fold_tcheby, test_fold_predict))
                MAE_cv .append(mean_absolute_error  (test_fold_tcheby, test_fold_predict))
                MDAE_cv.append(median_absolute_error(test_fold_tcheby, test_fold_predict))

            # R2 is the score of the main model on its own training data;
            # the *_cv_mean values are averages over the folds.
            R2 = clf.score(training_inputs, training_outputs)
            MSE_cv_mean = numpy.mean(MSE_cv)
            RMSE_cv_mean = math.sqrt(MSE_cv_mean)
            MAE_cv_mean = numpy.mean(MAE_cv)
            MDAE_cv_mean = numpy.mean(MDAE_cv)
            R2_cv_mean = numpy.mean(R2_cv)

            iot.printR2(file_to_writeR2, eval_to.getNbEvals(), itera,  R2, R2_cv_mean, MSE_cv_mean , MAE_cv_mean, MDAE_cv_mean, RMSE_cv_mean, problem_size, print_every=1)

        # random walk through the directions
        random.shuffle(id_directions)

        # functions loop
        for f in id_directions:

            # get the indices of the neighbours of f within a certain distance, f included
            f_neighbors, current_neighbourhing_size = nt.getNeighborsOf(f, delta_neighbourhood)

            # get a list of offspring from the neighbours
            list_offspring = samp_to.extended_sampling(f, f_neighbors, sampling_param, nb_samples)

            # apply a filter on the offspring list and select the best one
            filter_param = [itera, f, clf, clf2, two_models_bool, f_neighbors, list_offspring, model_directions, start_fct, problem_size, eval_to.getZstar_with_decal(), best_decisions_scores, best_decisions, nb_objectives]
            best_candidate = filt_to.model_based_filtring(filter_strat, free_eval, filter_param)

            # evaluation of the newly made solution
            mix_scores = eval_to.eval(start_fct, best_candidate, problem_size)

            # update of the z_star point
            has_changed = eval_to.min_update_Z_star(mix_scores, nb_objectives)

            # retraining of the model with the new z_star
            if(has_changed):
                train_to.updateTrainingZstar(eval_to.getZstar_with_decal())
                training_outputs = train_to.retrainSet(training_inputs, training_scores, eval_to.getZstar_with_decal(), training_set_size, nb_objectives)
                clf.fit(training_inputs, training_outputs)

            # add to training input
            new_input = []
            new_input.extend(best_candidate)
            all_decisions.append(new_input)
            all_decisions_scores.append(mix_scores)
            all_len += 1

            # True once the offspring has been added to the archive
            added_to_S = False

            # counts how many best decisions have been replaced by the new offspring
            cmpt_best_maj = 0

            # random walk through the neighbours list
            random.shuffle(f_neighbors)

            # walk through the neighbours list
            for j in f_neighbors:

                # stop once the maximum number of replacements is reached
                if(cmpt_best_maj >= max_decisions_maj):
                    break


                # compute g_tcheby
                #wj = (directions[0][j],directions[1][j])
                wj = [directions[obj][j] for obj in range(0,nb_objectives)]
                g_mix = eval_to.g_tcheby(wj, mix_scores, eval_to.getZstar_with_decal())
                g_best = eval_to.g_tcheby(wj, best_decisions_scores[j], eval_to.getZstar_with_decal())


                # replace the best decision of direction j when the new solution has a smaller g_tcheby value
                if(g_mix < g_best):
                    cmpt_best_maj += 1
                    best_decisions[j] = best_candidate
                    best_decisions_scores[j] = mix_scores

                    # if the archive is managed and the solution has not been added already
                    if(archiveOK and not(added_to_S)):
                       arch_to.archivePut(best_candidate, mix_scores)
                       added_to_S = True

        #print("Update", itera, "done.")

        # if the archive is managed
        if(archiveOK):
           arch_to.maintain_archive()

        # if the result is written to a file
        if(writeOK):
            iot.printObjectives(file_to_write, eval_to.getNbEvals(), itera+1, best_decisions_scores, problem_size, nb_objectives, print_every=param_print_every)
        #graphic update
        #yield arch_to.getArchiveScore(), best_decisions_scores, itera+1, eval_to.getNbEvals(), eval_to.getZstar_with_decal(), pop_size, isReals
    if(not free_eval and writeR2OK):
        qual_tools.computeQualityEvaluation()
        qual_tools.generateDiffPredFreeFile()
예제 #9
0
# ``[:, None]`` adds a trailing axis, so each 1-D array is handed to the
# estimators as a single-column 2-D array. Build those views once.
x_col = x[:, None]
xx = np.linspace(-4, 4, 100)
xx_col = xx[:, None]

# Epsilon-SVR with an RBF kernel, then its predictions on the evaluation grid
svr2 = SVR(kernel='rbf', gamma=1, epsilon=0.1)
svr2.fit(x_col, y)
yy = svr2.predict(xx_col)

# Nu-SVR with the same kernel settings, predicted on the same grid
svr3 = NuSVR(kernel='rbf', gamma=1, nu=0.9)
svr3.fit(x_col, y)
yy2 = svr3.predict(xx_col)

# Score both fitted models on the data they were trained on
svr2.score(x_col, y)
svr3.score(x_col, y)

#%%

# Plot the samples together with both fitted curves

plt.scatter(x, y)
plt.plot(xx, yy, 'k', label='SVR')
plt.plot(xx, yy2, 'r:', label="NuSVR")
plt.legend(fontsize=14)

#%%
예제 #10
0
class SVM:
    """
    Wrapper class around scikit-learn's support vector machine functionality.
    This class supports binary and multi-class classification on a dataset, along with regression via Support Vector
    Regression (SVR).
    Per scikit-learn's documentation:

    Support vector machines (SVMs) are a set of supervised learning methods used for classification, regression and
    outliers detection.

    The advantages of support vector machines are:

        – Effective in high dimensional spaces.
        – Still effective in cases where number of dimensions is greater than the number of samples.
        – Uses a subset of training points in the decision function (called support vectors), so it is also memory
        efficient.
        – Versatile: different Kernel functions can be specified for the decision function. Common kernels are provided,
        but it is also possible to specify custom kernels.

    The disadvantages of support vector machines include:

        – If the number of features is much greater than the number of samples, avoid over-fitting in choosing Kernel
        functions and regularization term is crucial.
        – SVMs do not directly provide probability estimates, these are calculated using an expensive five-fold
        cross-validation.
    """
    def __init__(self, attributes=None, labels=None, test_size=0.25):
        """
        Initializes a SVM object.

        The following parameters are needed to use a SVM:

            – attributes: a numpy array of the independent variables
            – labels: a numpy array of the classes (for classification) or dependent variables (for regression)
            – test_size: the proportion of the dataset to be used for testing the model (defaults to 0.25);
            the proportion of the dataset to be used for training will be the complement of test_size

        After successfully running one of the classifier methods (SVC(), nu_SVC(), or linear_SVC()), the corresponding
        classifier below will be trained:

            – classifier_SVC: a classifier trained using scikit-learn's SVC implementation
            – accuracy_SVC: the accuracy of the SVC model, based on its predictions for dataset_X_test
            – roc_auc_SVC: the area under the ROC curve for the SVC model
            – classifier_nu_SVC: a classifier trained using scikit-learn's NuSVC implementation
            – accuracy_nu_SVC: the accuracy of the NuSVC model, based on its predictions for dataset_X_test
            – roc_auc_nu_SVC: the area under the ROC curve for the NuSVC model
            – classifier_linear_SVC: a classifier trained using scikit-learn's LinearSVC implementation
            – accuracy_linear_SVC: the accuracy of the LinearSVC model, based on its predictions for dataset_X_test

        After successfully running one of the regression methods (SVR(), nu_SVR(), or linear_SVR()), the corresponding
        regression model below will be trained:

            – regression_SVR: a regression model trained using scikit-learn's SVR implementation
            – r2_score_SVR: the coefficient of determination for the SVR model
            – r_score_SVR: the correlation coefficient for the SVR model
            – regression_nu_SVR: a regression model trained using scikit-learn's NuSVR implementation
            – r2_score_nu_SVR: the coefficient of determination for the NuSVR model
            – r_score_nu_SVR: the correlation coefficient for the NuSVR model
            – regression_linear_SVR: a regression model trained using scikit-learn's LinearSVR implementation
            – r2_score_linear_SVR: the coefficient of determination for the LinearSVR model
            – r_score_linear_SVR: the correlation coefficient for the LinearSVR model
        """
        self.attributes = attributes
        self.labels = labels
        self.test_size = 0.25

        self.classifier_SVC = None
        self.accuracy_SVC = None
        self.roc_auc_SVC = None
        self.classifier_nu_SVC = None
        self.accuracy_nu_SVC = None
        self.roc_auc_nu_SVC = None
        self.classifier_linear_SVC = None
        self.accuracy_linear_SVC = None

        self.regression_SVR = None
        self.r2_score_SVR = None
        self.r_score_SVR = None
        self.regression_nu_SVR = None
        self.r2_score_nu_SVR = None
        self.r_score_nu_SVR = None
        self.regression_linear_SVR = None
        self.r2_score_linear_SVR = None
        self.r_score_linear_SVR = None

        # References to training and testing subsets of dataset; instance data for re-use purposes
        self.dataset_X_train = None
        self.dataset_y_train = None
        self.dataset_X_test = None
        self.dataset_y_test = None

    # Accessor Methods

    def get_attributes(self):
        """
        Accessor method for attributes.

        If a SVM object is initialized without specifying attributes, attributes will be None. No SVM functionality can
        be used until attributes is a populated numpy array. Call set_attributes(new_attributes) to fix this.
        """
        return self.attributes

    def get_labels(self):
        """
        Accessor method for labels.

        If a SVM object is initialized without specifying labels, labels will be None. No SVM functionality can be used
        until labels is a populated numpy array. Call set_labels(new_labels) to fix this.
        """
        return self.labels

    def get_test_size(self):
        """
        Accessor method for test_size.

        Should return a number or None.
        """
        return self.test_size

    def get_classifier_SVC(self):
        """
        Accessor method for classifier_SVC.

        Will return None if SVC() hasn't successfully run, yet.
        """
        return self.classifier_SVC

    def get_accuracy_SVC(self):
        """
        Accessor method for accuracy_SVC.

        Will return None if SVC() hasn't successfully run, yet.
        """
        return self.accuracy_SVC

    def get_roc_auc_SVC(self):
        """
        Accessor method for roc_auc_SVC.

        Will return None if SVC() hasn't successfully run, yet.
        """
        return self.roc_auc_SVC

    def get_classifier_nu_SVC(self):
        """
        Accessor method for classifier_nu_SVC.

        Will return None if nu_SVC() hasn't successfully run, yet.
        """
        return self.classifier_nu_SVC

    def get_accuracy_nu_SVC(self):
        """
        Accessor method for accuracy_nu_SVC.

        Will return None if nu_SVC() hasn't successfully run, yet.
        """
        return self.accuracy_nu_SVC

    def get_roc_auc_nu_SVC(self):
        """
        Accessor method for roc_auc_nu_SVC.

        Will return None if nu_SVC() hasn't successfully run, yet.
        """
        return self.roc_auc_nu_SVC

    def get_classifier_linear_SVC(self):
        """
        Accessor method for classifier_linear_SVC.

        Will return None if linear_SVC() hasn't successfully run, yet.
        """
        return self.classifier_linear_SVC

    def get_accuracy_linear_SVC(self):
        """
        Accessor method for accuracy_linear_SVC.

        Will return None if linear_SVC() hasn't successfully run, yet.
        """
        return self.accuracy_linear_SVC

    def get_regression_SVR(self):
        """
        Accessor method for regression_SVR.

        Will return None if SVR() hasn't successfully run, yet.
        """
        return self.regression_SVR

    def get_r2_score_SVR(self):
        """
        Accessor method for r2_score_SVR.

        Will return None if SVR() hasn't successfully run, yet.
        """
        return self.r2_score_SVR

    def get_r_score_SVR(self):
        """
        Accessor method for r_score_SVR.

        Will return None if SVR() hasn't successfully run, yet.
        """
        return self.r_score_SVR

    def get_regression_nu_SVR(self):
        """
        Accessor method for regression_nu_SVR.

        Will return None if nu_SVR() hasn't successfully run, yet.
        """
        return self.regression_nu_SVR

    def get_r2_score_nu_SVR(self):
        """
        Accessor method for r2_score_nu_SVR.

        Will return None if nu_SVR() hasn't successfully run, yet.
        """
        return self.r2_score_nu_SVR

    def get_r_score_nu_SVR(self):
        """
        Accessor method for r_score_nu_SVR.

        Will return None if nu_SVR() hasn't successfully run, yet.
        """
        return self.r_score_nu_SVR

    def get_regression_linear_SVR(self):
        """
        Accessor method for regression_linear_SVR.

        Will return None if linear_SVR() hasn't successfully run, yet.
        """
        return self.regression_linear_SVR

    def get_r2_score_linear_SVR(self):
        """
        Accessor method for r2_score_linear_SVR.

        Will return None if linear_SVR() hasn't successfully run, yet.
        """
        return self.r2_score_linear_SVR

    def get_r_score_linear_SVR(self):
        """
        Accessor method for r_score_linear_SVR.

        Will return None if linear_SVR() hasn't successfully run, yet.
        """
        return self.r_score_linear_SVR

    # Modifier Methods

    def set_attributes(self, new_attributes=None):
        """
        Modifier method for attributes.

        Input should be a populated numpy array. Defaults to None.
        """
        self.attributes = new_attributes

    def set_labels(self, new_labels=None):
        """
        Modifier method for labels.

        Input should be a populated numpy array. Defaults to None.
        """
        self.labels = new_labels

    def set_test_size(self, new_test_size=0.25):
        """
        Modifier method for test_size.

        Input should be a float between 0.0 and 1.0 or None. Defaults to 0.25. The training size will be set to the
        complement of test_size.
        """
        self.test_size = new_test_size

    # Wrappers for SVM classification classes

    def SVC(self,
            C=1.0,
            kernel="rbf",
            degree=3,
            gamma="scale",
            coef0=0.0,
            shrinking=True,
            probability=False,
            tol=0.001,
            cache_size=200,
            class_weight=None,
            verbose=False,
            max_iter=-1,
            decision_function_shape="ovr",
            break_ties=False,
            random_state=None):
        """
        Wrapper for scikit-learn's C-Support Vector Classification implementation.

        Does nothing unless _check_inputs() returns a truthy value. Otherwise a classifier is built from the
        arguments below and stored in classifier_SVC, the data is split with _split_data() if no testing subset is
        stored yet, and the classifier is fitted on the training subset. On success accuracy_SVC is set to the
        classifier's score on the testing subset and, when probability is True, roc_auc_SVC is set to the area
        under the ROC curve of the true test labels against the predicted probability of the second class.

        If fitting raises, the exception message is printed, classifier_SVC is reset to None and the method
        returns without touching the metrics. The method always returns None.

        Parameters per scikit-learn's documentation:

            – C: Regularization parameter. The strength of the regularization is inversely proportional to C.
            Must be strictly positive. The penalty is a squared l2 penalty. (Default is 1.0)

            – kernel: Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’, ‘rbf’,
            ‘sigmoid’, ‘precomputed’ or a callable. If none is given, ‘rbf’ will be used. If a callable is given it is
            used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape
            (n_samples, n_samples). (Default is "rbf")

            – degree: Degree of the polynomial kernel function ("poly"). Ignored by all other kernels. (Default is 3)

            – gamma: Kernel coefficient for "rbf", "poly", and "sigmoid". If gamma="scale", then it uses
            1 / (n_features * training_samples.var()) as value of gamma. If gamma="auto", it uses 1 / n_features.
            (Default is "scale")

            – coef0: Independent term in kernel function. It is only significant in "poly" and "sigmoid". (Default is 0.0)

            – shrinking: Whether to use the shrinking heuristic. (Default is True)

            – probability: Whether to enable probability estimates. This must be enabled prior to calling fit, will slow
            down that method as it internally uses 5-fold cross-validation, and predict_proba may be inconsistent with
            predict. (Default is False)

            – tol: Tolerance for stopping criterion. (Default is 1e-3, or 0.001)

            – cache_size: Specify the size of the kernel cache in MB. (Default is 200)

            – class_weight: Set the parameter C of class i to class_weight[i]*C for SVC. If not given, all classes are
            supposed to have weight one. The “balanced” mode uses the values of y to automatically adjust weights
            inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)).
            (Default is None)

            – verbose: Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in
            libsvm that, if enabled, may not work properly in a multithreaded context. (Default is False)

            – max_iter: Hard limit on iterations within solver, or -1 for no limit. (Default is -1)

            – decision_function_shape: Whether to return a one-vs-rest (‘ovr’) decision function of shape
            (n_samples, n_classes) as all other classifiers, or the original one-vs-one (‘ovo’) decision function of
            libsvm which has shape (n_samples, n_classes * (n_classes - 1) / 2). However, one-vs-one (‘ovo’) is always
            used as multi-class strategy. The parameter is ignored for binary classification. (Default is "ovr")

            – break_ties: If true, decision_function_shape='ovr', and number of classes > 2, predict will break ties
            according to the confidence values of decision_function; otherwise the first class among the tied classes is
            returned. Please note that breaking ties comes at a relatively high computational cost compared to a simple
            predict. (Default is False)

            – random_state: Controls the pseudo random number generation for shuffling the data for probability
            estimates. Ignored when probability is False. Pass an int for reproducible output across multiple function
            calls. (Default is None)

        The implementation is based on libsvm. The fit time scales at least quadratically with the number of samples
        and may be impractical beyond tens of thousands of samples.
        """
        if self._check_inputs():
            # Initialize classifier (inside this method the name SVC resolves to the
            # module-level scikit-learn class, not to this wrapper method)
            self.classifier_SVC =\
                SVC(C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, shrinking=shrinking,
                    probability=probability, tol=tol, cache_size=cache_size, class_weight=class_weight, verbose=verbose,
                    max_iter=max_iter, decision_function_shape=decision_function_shape, break_ties=break_ties,
                    random_state=random_state)

            # Split data, if needed; if testing/training sets are still None, call _split_data()
            if self.dataset_X_test is None:
                self._split_data()

            # Train classifier; handle exception if arguments are incorrect
            try:
                self.classifier_SVC.fit(self.dataset_X_train,
                                        self.dataset_y_train)
            except Exception as e:
                print(
                    "An exception occurred while training the SVC model. Check your arguments and try again."
                )
                print("Here is the exception message:")
                print(e)
                self.classifier_SVC = None
                return

            # Evaluate accuracy and ROC-AUC of model using testing set and actual classification
            self.accuracy_SVC = self.classifier_SVC.score(
                self.dataset_X_test, self.dataset_y_test)

            if probability:
                # roc_auc_score takes (y_true, y_score). The first argument must be the
                # actual test labels; passing the model's own predictions there would
                # measure the model against itself rather than against the ground truth.
                # NOTE(review): taking column 1 of predict_proba only suits binary
                # problems -- confirm whether multi-class ROC-AUC is needed here.
                positive_class_scores = self.classifier_SVC.predict_proba(
                    self.dataset_X_test)[:, 1]
                self.roc_auc_SVC = roc_auc_score(self.dataset_y_test,
                                                 positive_class_scores)

    def nu_SVC(self,
               nu=0.5,
               kernel="rbf",
               degree=3,
               gamma="scale",
               coef0=0.0,
               shrinking=True,
               probability=False,
               tol=0.001,
               cache_size=200,
               class_weight=None,
               verbose=False,
               max_iter=-1,
               decision_function_shape="ovr",
               break_ties=False,
               random_state=None):
        """
        Wrapper for scikit-learn's Nu-Support Vector Classification implementation.
        Per scikit-learn's documentation, NuSVC is similar to SVC, but uses a parameter, nu, to set the number of
        support vectors.

        Does nothing unless _check_inputs() returns a truthy value. Otherwise a classifier is built from the
        arguments below and stored in classifier_nu_SVC, the data is split with _split_data() if no testing subset
        is stored yet, and the classifier is fitted on the training subset. On success accuracy_nu_SVC is set to
        the classifier's score on the testing subset and, when probability is True, roc_auc_nu_SVC is set to the
        area under the ROC curve of the true test labels against the predicted probability of the second class.

        If fitting raises, the exception message is printed, classifier_nu_SVC is reset to None and the method
        returns without touching the metrics. The method always returns None.

        Parameters per scikit-learn's documentation:

            – nu: An upper bound on the fraction of margin errors and a lower bound of the fraction of support vectors.
            Should be in the interval (0, 1]. (Default is 0.5)

            – kernel: Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’, ‘rbf’,
            ‘sigmoid’, ‘precomputed’ or a callable. If none is given, ‘rbf’ will be used. If a callable is given it is
            used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape
            (n_samples, n_samples). (Default is "rbf")

            – degree: Degree of the polynomial kernel function ("poly"). Ignored by all other kernels. (Default is 3)

            – gamma: Kernel coefficient for "rbf", "poly", and "sigmoid". If gamma="scale", then it uses
            1 / (n_features * training_samples.var()) as value of gamma. If gamma="auto", it uses 1 / n_features.
            (Default is "scale")

            – coef0: Independent term in kernel function. It is only significant in "poly" and "sigmoid". (Default is 0.0)

            – shrinking: Whether to use the shrinking heuristic. (Default is True)

            – probability: Whether to enable probability estimates. This must be enabled prior to calling fit, will slow
            down that method as it internally uses 5-fold cross-validation, and predict_proba may be inconsistent with
            predict. (Default is False)

            – tol: Tolerance for stopping criterion. (Default is 1e-3, or 0.001)

            – cache_size: Specify the size of the kernel cache in MB. (Default is 200)

            – class_weight: Set the parameter C of class i to class_weight[i]*C for SVC. If not given, all classes are
            supposed to have weight one. The “balanced” mode uses the values of y to automatically adjust weights
            inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)).
            (Default is None)

            – verbose: Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in
            libsvm that, if enabled, may not work properly in a multithreaded context. (Default is False)

            – max_iter: Hard limit on iterations within solver, or -1 for no limit. (Default is -1)

            – decision_function_shape: Whether to return a one-vs-rest (‘ovr’) decision function of shape
            (n_samples, n_classes) as all other classifiers, or the original one-vs-one (‘ovo’) decision function of
            libsvm which has shape (n_samples, n_classes * (n_classes - 1) / 2). However, one-vs-one (‘ovo’) is always
            used as multi-class strategy. The parameter is ignored for binary classification. (Default is "ovr")

            – break_ties: If true, decision_function_shape='ovr', and number of classes > 2, predict will break ties
            according to the confidence values of decision_function; otherwise the first class among the tied classes is
            returned. Please note that breaking ties comes at a relatively high computational cost compared to a simple
            predict. (Default is False)

            – random_state: Controls the pseudo random number generation for shuffling the data for probability
            estimates. Ignored when probability is False. Pass an int for reproducible output across multiple function
            calls. (Default is None)

        The implementation is based on libsvm.
        """
        if self._check_inputs():
            # Initialize classifier
            self.classifier_nu_SVC =\
                NuSVC(nu=nu, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, shrinking=shrinking,
                      probability=probability, tol=tol, cache_size=cache_size, class_weight=class_weight,
                      verbose=verbose, max_iter=max_iter, decision_function_shape=decision_function_shape,
                      break_ties=break_ties, random_state=random_state)

            # Split data, if needed; if testing/training sets are still None, call _split_data()
            if self.dataset_X_test is None:
                self._split_data()

            # Train classifier; handle exception if arguments are incorrect
            try:
                self.classifier_nu_SVC.fit(self.dataset_X_train,
                                           self.dataset_y_train)
            except Exception as e:
                print(
                    "An exception occurred while training the NuSVC model. Check your arguments and try again."
                )
                print("Here is the exception message:")
                print(e)
                self.classifier_nu_SVC = None
                return

            # Evaluate accuracy and ROC-AUC of model using testing set and actual classification
            self.accuracy_nu_SVC = self.classifier_nu_SVC.score(
                self.dataset_X_test, self.dataset_y_test)

            if probability:
                # roc_auc_score takes (y_true, y_score). The first argument must be the
                # actual test labels; passing the model's own predictions there would
                # measure the model against itself rather than against the ground truth.
                # NOTE(review): taking column 1 of predict_proba only suits binary
                # problems -- confirm whether multi-class ROC-AUC is needed here.
                positive_class_scores = self.classifier_nu_SVC.predict_proba(
                    self.dataset_X_test)[:, 1]
                self.roc_auc_nu_SVC = roc_auc_score(self.dataset_y_test,
                                                    positive_class_scores)

    def linear_SVC(self,
                   penalty="l2",
                   loss="squared_hinge",
                   dual=True,
                   tol=0.0001,
                   C=1.0,
                   multi_class='ovr',
                   fit_intercept=True,
                   intercept_scaling=1,
                   class_weight=None,
                   verbose=0,
                   random_state=None,
                   max_iter=1000):
        """
        Train scikit-learn's LinearSVC on the stored dataset and record its accuracy.

        Per scikit-learn's documentation, LinearSVC behaves like SVC with a linear kernel but is built on liblinear
        rather than libsvm, which gives more freedom in the choice of penalty and loss and should scale better to
        large sample sizes. It accepts dense and sparse input and handles multiclass problems with a
        one-vs-the-rest scheme.

        Nothing happens unless _check_inputs() returns a truthy value. Otherwise the classifier is created and
        stored in classifier_linear_SVC, _split_data() is called if no testing subset is stored yet, and the
        classifier is fitted on the training subset. On success accuracy_linear_SVC is set to the classifier's
        score on the testing subset. If fitting raises, the exception message is printed and
        classifier_linear_SVC is reset to None. The method always returns None.

        Parameters (descriptions per scikit-learn's documentation):

            – penalty: Norm used in the penalization. ‘l2’ is the standard used in SVC; ‘l1’ leads to sparse coef_
            vectors. (Default is "l2")

            – loss: Loss function. ‘hinge’ is the standard SVM loss (used e.g. by the SVC class); ‘squared_hinge’ is
            the square of the hinge loss. (Default is "squared_hinge")

            – dual: Whether to solve the dual or the primal optimization problem. Prefer dual=False when
            n_samples > n_features. (Default is True)

            – tol: Tolerance for stopping criteria. (Default is 1e-4, or 0.0001)

            – C: Regularization parameter; regularization strength is inversely proportional to C. Must be strictly
            positive. (Default is 1.0)

            – multi_class: Multi-class strategy when y has more than two classes. "ovr" trains n_classes
            one-vs-rest classifiers; "crammer_singer" optimizes a joint objective over all classes. The latter is
            consistent and therefore of theoretical interest, but is seldom used in practice because it rarely
            improves accuracy and costs more to compute. With "crammer_singer" the options loss, penalty and dual
            are ignored. (Default is "ovr")

            – fit_intercept: Whether to calculate the intercept for this model. If False, no intercept is used in
            calculations (i.e. the data is expected to be already centered). (Default is True)

            – intercept_scaling: When fit_intercept is True, the instance vector x becomes
            [x, self.intercept_scaling], i.e. a “synthetic” feature with constant value intercept_scaling is
            appended. The intercept becomes intercept_scaling * synthetic feature weight. The synthetic feature
            weight is subject to l1/l2 regularization like every other feature, so intercept_scaling has to be
            increased to lessen the effect of regularization on the intercept. (Default is 1)

            – class_weight: Set the parameter C of class i to class_weight[i]*C. If not given, all classes are
            supposed to have weight one. The “balanced” mode uses the values of y to adjust weights inversely
            proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)).
            (Default is None)

            – verbose: Enable verbose output. This relies on a per-process runtime setting in liblinear that, if
            enabled, may not work properly in a multithreaded context. (Default is 0)

            – random_state: Controls the pseudo random number generation for shuffling the data for the dual
            coordinate descent (if dual=True). When dual=False the underlying implementation is not random and
            random_state has no effect. Pass an int for reproducible output across multiple function calls.
            (Default is None)

            – max_iter: The maximum number of iterations to be run. (Default is 1000)
        """
        # Guard clause: without valid inputs there is nothing to train.
        if not self._check_inputs():
            return

        # Build the estimator and publish it on the instance before any further work.
        model = LinearSVC(
            penalty=penalty,
            loss=loss,
            dual=dual,
            tol=tol,
            C=C,
            multi_class=multi_class,
            fit_intercept=fit_intercept,
            intercept_scaling=intercept_scaling,
            class_weight=class_weight,
            verbose=verbose,
            random_state=random_state,
            max_iter=max_iter)
        self.classifier_linear_SVC = model

        # Create the train/test subsets only if no testing subset is stored yet.
        if self.dataset_X_test is None:
            self._split_data()

        # Fit on the training subset; on failure report the problem and drop the model.
        try:
            model.fit(self.dataset_X_train, self.dataset_y_train)
        except Exception as e:
            print(
                "An exception occurred while training the LinearSVC model. Check your arguments and try again."
            )
            print("Here is the exception message:")
            print(e)
            self.classifier_linear_SVC = None
            return

        # Score the fitted model on the testing subset.
        self.accuracy_linear_SVC = model.score(self.dataset_X_test,
                                               self.dataset_y_test)

    # Wrappers for SVM regression classes

    def SVR(self,
            kernel='rbf',
            degree=3,
            gamma='scale',
            coef0=0.0,
            tol=0.001,
            C=1.0,
            epsilon=0.1,
            shrinking=True,
            cache_size=200,
            verbose=False,
            max_iter=-1):
        """
        Wrapper for scikit-learn's Epsilon-Support Vector Regression implementation. Per scikit-learn's documentation,
        this implementation is based on libsvm. Scaling to tens of thousands of samples is difficult, as the fit time
        complexity is more than quadratic with the number of samples. For large datasets, consider using LinearSVR by
        calling linear_SVR().

        Does nothing unless _check_inputs() returns a truthy value. Otherwise a regression model is built from the
        arguments below and stored in regression_SVR, the data is split with _split_data() if no testing subset is
        stored yet, and the model is fitted on the training subset. On success r2_score_SVR is set to the model's
        score on the testing subset and r_score_SVR to its square root. When that score is negative no real square
        root exists, so r_score_SVR is set to NaN instead.

        If fitting raises, the exception message is printed, regression_SVR is reset to None and the method returns
        without touching the metrics. The method always returns None.

        Parameters per scikit-learn's documentation:

            – kernel: Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’, ‘rbf’,
            ‘sigmoid’, ‘precomputed’ or a callable. If none is given, ‘rbf’ will be used. If a callable is given it is
            used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape
            (n_samples, n_samples). (Default is "rbf")

            – degree: Degree of the polynomial kernel function ("poly"). Ignored by all other kernels. (Default is 3)

            – gamma: Kernel coefficient for "rbf", "poly", and "sigmoid". If gamma="scale", then it uses
            1 / (n_features * training_samples.var()) as value of gamma. If gamma="auto", it uses 1 / n_features.
            (Default is "scale")

            – coef0: Independent term in kernel function. It is only significant in "poly" and "sigmoid". (Default is 0.0)

            – tol: Tolerance for stopping criterion. (Default is 1e-3, or 0.001)

            – C: Regularization parameter. The strength of the regularization is inversely proportional to C.
            Must be strictly positive. The penalty is a squared l2 penalty. (Default is 1.0)

            – epsilon: Epsilon in the epsilon-SVR model. It specifies the epsilon-tube within which no penalty is
            associated in the training loss function with points predicted within a distance epsilon from the actual
            value. (Default is 0.1)

            – shrinking: Whether to use the shrinking heuristic. (Default is True)

            – cache_size: Specify the size of the kernel cache in MB. (Default is 200)

            – verbose: Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in
            libsvm that, if enabled, may not work properly in a multithreaded context. (Default is False)

            – max_iter: Hard limit on iterations within solver, or -1 for no limit. (Default is -1)
        """
        if self._check_inputs():
            # Initialize regression model (inside this method the name SVR resolves to the
            # module-level scikit-learn class, not to this wrapper method)
            self.regression_SVR =\
                SVR(kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, tol=tol, C=C, epsilon=epsilon,
                    shrinking=shrinking, cache_size=cache_size, verbose=verbose, max_iter=max_iter)

            # Split data, if needed; if testing/training sets are still None, call _split_data()
            if self.dataset_X_test is None:
                self._split_data()

            # Train regression model; handle exception if arguments are incorrect and/or if labels isn't
            # quantitative data
            try:
                self.regression_SVR.fit(self.dataset_X_train,
                                        self.dataset_y_train)
            except Exception as e:
                print(
                    "An exception occurred while training the SVR model. Check your arguments and try again."
                )
                print("Does labels only contain quantitative data?")
                print("Here is the exception message:")
                print(e)
                self.regression_SVR = None
                return

            # Get coefficient of determination for model
            r2 = self.regression_SVR.score(self.dataset_X_test,
                                           self.dataset_y_test)
            self.r2_score_SVR = r2

            # The score on held-out data can be negative (a fit worse than predicting the
            # mean). Taking the square root of a negative value would either raise or warn
            # depending on which sqrt is in scope, so report NaN explicitly in that case.
            self.r_score_SVR = sqrt(r2) if r2 >= 0 else float("nan")

    def nu_SVR(self,
               nu=0.5,
               C=1.0,
               kernel='rbf',
               degree=3,
               gamma='scale',
               coef0=0.0,
               shrinking=True,
               tol=0.001,
               cache_size=200,
               verbose=False,
               max_iter=-1):
        """
        Wrapper for scikit-learn's Nu Support Vector Regression implementation. Per scikit-learn's documentation,
        NuSVR uses the parameter nu to control the number of support vectors, similar to NuSVC. Yet unlike NuSVC,
        nu replaces the parameter epsilon of epsilon-SVR, not C. This implementation is based on libsvm.

        Nothing happens unless _check_inputs() returns True. In that case a NuSVR model is stored in
        self.regression_nu_SVR, the data is split with _split_data() if self.dataset_X_test is None, and the model
        is fitted on the training split. On success, the score on the testing split is stored in
        self.r2_score_nu_SVR and its square root in self.r_score_nu_SVR (NaN when the score is negative). If fitting
        raises, the exception is printed, self.regression_nu_SVR is set to None and the score attributes are left
        untouched. The method always returns None.

        Parameters per scikit-learn's documentation:

            – nu: An upper bound on the fraction of margin errors and a lower bound of the fraction of support vectors.
            Should be in the interval (0, 1]. (Default is 0.5)

            – C: Regularization parameter. The strength of the regularization is inversely proportional to C.
            Must be strictly positive. The penalty is a squared l2 penalty. (Default is 1.0)

            – kernel: Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’, ‘rbf’,
            ‘sigmoid’, ‘precomputed’ or a callable. If none is given, ‘rbf’ will be used. If a callable is given it is
            used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape
            (n_samples, n_samples). (Default is "rbf")

            – degree: Degree of the polynomial kernel function ("poly"). Ignored by all other kernels. (Default is 3)

            – gamma: Kernel coefficient for "rbf", "poly", and "sigmoid". If gamma="scale", then it uses
            1 / (n_features * training_samples.var()) as value of gamma. If gamma="auto", it uses 1 / n_features.
            (Default is "scale")

            – coef0: Independent term in kernel function. It is only significant in "poly" and "sigmoid". (Default is 0.0)

            – shrinking: Whether to use the shrinking heuristic. (Default is True)

            – tol: Tolerance for stopping criterion. (Default is 1e-3, or 0.001)

            – cache_size: Specify the size of the kernel cache in MB. (Default is 200)

            – verbose: Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in
            libsvm that, if enabled, may not work properly in a multithreaded context. (Default is False)

            – max_iter: Hard limit on iterations within solver, or -1 for no limit. (Default is -1)
        """
        if self._check_inputs():
            # Initialize regression model
            self.regression_nu_SVR =\
                NuSVR(nu=nu, C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, shrinking=shrinking, tol=tol,
                      cache_size=cache_size, verbose=verbose, max_iter=max_iter)

            # Split data, if needed; if testing/training sets are still None, call _split_data()
            if self.dataset_X_test is None:
                self._split_data()

            # Train regression model; handle exception if arguments are incorrect and/or if labels isn't
            # quantitative data
            try:
                self.regression_nu_SVR.fit(self.dataset_X_train,
                                           self.dataset_y_train)
            except Exception as e:
                print(
                    "An exception occurred while training the NuSVR model. Check you arguments and try again."
                )
                print("Does labels only contain quantitative data?")
                print("Here is the exception message:")
                print(e)
                self.regression_nu_SVR = None
                return

            # Get coefficient of determination for model
            self.r2_score_nu_SVR = self.regression_nu_SVR.score(
                self.dataset_X_test, self.dataset_y_test)

            # The coefficient of determination is negative whenever the model does worse than always
            # predicting the mean, and a negative value has no real square root. Store NaN in that case
            # rather than letting sqrt() fail after the model has already been trained successfully.
            if self.r2_score_nu_SVR >= 0:
                self.r_score_nu_SVR = sqrt(self.r2_score_nu_SVR)
            else:
                self.r_score_nu_SVR = float("nan")

    def linear_SVR(self,
                   epsilon=0.0,
                   tol=0.0001,
                   C=1.0,
                   loss='epsilon_insensitive',
                   fit_intercept=True,
                   intercept_scaling=1.0,
                   dual=True,
                   verbose=0,
                   random_state=None,
                   max_iter=1000):
        """
        Wrapper for scikit-learn's Linear Support Vector Regression implementation. Per scikit-learn's documentation,
        LinearSVR is similar to SVR with a linear kernel, but is implemented with liblinear instead of libsvm. This
        provides greater flexibility in choice of penalties and loss functions, and should scale better to large sample
        sizes. LinearSVR supports both dense and sparse input.

        Nothing happens unless _check_inputs() returns True. In that case a LinearSVR model is stored in
        self.regression_linear_SVR, the data is split with _split_data() if self.dataset_X_test is None, and the
        model is fitted on the training split. On success, the score on the testing split is stored in
        self.r2_score_linear_SVR and its square root in self.r_score_linear_SVR (NaN when the score is negative). If
        fitting raises, the exception is printed, self.regression_linear_SVR is set to None and the score attributes
        are left untouched. The method always returns None.

        Parameters per scikit-learn's documentation:

            – epsilon: Epsilon in the epsilon-SVR model. It specifies the epsilon-tube within which no penalty is
            associated in the training loss function with points predicted within a distance epsilon from the actual
            value. (Default is 0.0)

            – tol: Tolerance for stopping criterion. (Default is 1e-4, or 0.0001)

            – C: Regularization parameter. The strength of the regularization is inversely proportional to C.
            Must be strictly positive. The penalty is a squared l2 penalty. (Default is 1.0)

            – loss: Specifies the loss function. The epsilon-insensitive loss (standard SVR) is the L1 loss, while the
            squared epsilon-insensitive loss (‘squared_epsilon_insensitive’) is the L2 loss.
            (Default is "epsilon_insensitive")

            – fit_intercept: Whether to calculate the intercept for this model. If set to false, no intercept will be
            used in calculations (i.e. data is expected to be already centered). (Default is True)

            – intercept_scaling: When self.fit_intercept is True, instance vector x becomes [x, self.intercept_scaling],
            i.e. a “synthetic” feature with constant value equals to intercept_scaling is appended to the instance
            vector. The intercept becomes intercept_scaling * synthetic feature weight. Note! the synthetic feature
            weight is subject to l1/l2 regularization as all other features. To lessen the effect of regularization on
            synthetic feature weight (and therefore on the intercept) intercept_scaling has to be increased.
            (Default is 1.0)

            – dual: Select the algorithm to either solve the dual or primal optimization problem.
            Prefer dual=False when n_samples > n_features. (Default is True)

            – verbose: Enable verbose output. Note that this setting takes advantage of a per-process runtime setting
            in liblinear that, if enabled, may not work properly in a multithreaded context. (Default is 0)

            – random_state: Controls the pseudo random number generation for shuffling the data for the dual coordinate
            descent (if dual=True). When dual=False the underlying implementation is not random and random_state has
            no effect on the results. Pass an int for reproducible output across multiple function calls.
            (Default is None)

            – max_iter: The maximum number of iterations to be run. (Default is 1000)
        """
        if self._check_inputs():
            # Initialize regression model
            self.regression_linear_SVR =\
                LinearSVR(epsilon=epsilon, tol=tol, C=C, loss=loss, fit_intercept=fit_intercept,
                          intercept_scaling=intercept_scaling, dual=dual, verbose=verbose, random_state=random_state,
                          max_iter=max_iter)

            # Split data, if needed; if testing/training sets are still None, call _split_data()
            if self.dataset_X_test is None:
                self._split_data()

            # Train regression model; handle exception if arguments are incorrect and/or labels isn't
            # quantitative data
            try:
                self.regression_linear_SVR.fit(self.dataset_X_train,
                                               self.dataset_y_train)
            except Exception as e:
                print(
                    "An exception occurred while training the LinearSVR model. Check you arguments and try again."
                )
                print("Does labels only contain quantitative data?")
                print("Here is the exception message:")
                print(e)
                self.regression_linear_SVR = None
                return

            # Get coefficient of determination and correlation coefficient for model
            self.r2_score_linear_SVR = self.regression_linear_SVR.score(
                self.dataset_X_test, self.dataset_y_test)

            # The coefficient of determination is negative whenever the model does worse than always
            # predicting the mean, and a negative value has no real square root. Store NaN in that case
            # rather than letting sqrt() fail after the model has already been trained successfully.
            if self.r2_score_linear_SVR >= 0:
                self.r_score_linear_SVR = sqrt(self.r2_score_linear_SVR)
            else:
                self.r_score_linear_SVR = float("nan")

    # Helper methods

    def _split_data(self):
        """
        Partition self.attributes and self.labels into training and testing subsets.

        The four pieces returned by train_test_split (called with test_size=self.test_size) are stored, in order, in
        self.dataset_X_train, self.dataset_X_test, self.dataset_y_train and self.dataset_y_test.

        No validation is done here; callers are expected to have checked the instance data beforehand.
        """
        partitions = train_test_split(self.attributes,
                                      self.labels,
                                      test_size=self.test_size)

        (self.dataset_X_train, self.dataset_X_test,
         self.dataset_y_train, self.dataset_y_test) = partitions

    def _check_inputs(self):
        """
        Verifies if instance data is ready for use in SVM model.
        """

        # Check if attributes exists
        if self.attributes is None:
            print(
                "attributes is missing; call set_attributes(new_attributes) to fix this! new_attributes should be a",
                "populated dataset of independent variables.")
            return False

        # Check if labels exists
        if self.labels is None:
            print(
                "labels is missing; call set_labels(new_labels) to fix this! new_labels should be a populated dataset",
                "of classes.")
            return False

        # Check if attributes and labels have same number of rows (samples)
        if self.attributes.shape[0] != self.labels.shape[0]:
            print(
                "attributes and labels don't have the same number of rows. Make sure the number of samples in each",
                "dataset matches!")
            return False

        # Check if test_size is a number
        if self.test_size is not None and not isinstance(
                self.test_size, (int, float)):
            print(
                "test_size must be None or a number; call set_test_size(new_test_size) to fix this!"
            )
            return False

        return True
예제 #11
0
# Run both x and y through min_max_scaler (defined earlier in the script); y is flattened to 1-D right away.
x = min_max_scaler.fit_transform(x)
y = min_max_scaler.fit_transform(y).ravel()

# Ten rounds of random hold-out validation (the bare string below is the original section marker).
'''
十次随机验证
'''
scores1, scores2 = [], []
for i in range(10):
    # Fresh random 80/20 split and fresh default models on every round
    x_t, x_v, y_t, y_v = train_test_split(x, y, test_size=0.2)
    svr1, svr2 = SVR(), NuSVR()
    # Relies on fit() returning the estimator itself (scikit-learn convention), so each model is
    # trained on the training part and scored on the held-out part in a single expression.
    score1 = svr1.fit(x_t, y_t).score(x_v, y_v)
    score2 = svr2.fit(x_t, y_t).score(x_v, y_v)
    # Scores are kept rounded to two decimals
    scores1.append(round(score1, 2))
    scores2.append(round(score2, 2))

# Per-round scores first, then the two averages
print('svr十次r方为:\n', scores1, '\nnusvr十次r方为:\n', scores2)
score1_m, score2_m = np.mean(scores1), np.mean(scores2)
print('{:.2f},{:.2f}'.format(score1_m, score2_m))

#x_t,x_v,y_t,y_v=train_test_split(x,y,test_size=0.2)
#svr = GridSearchCV(SVR(), param_grid={"kernel": ("poly", 'rbf'),\
#      "C": np.logspace(1,20, 5), "gamma": np.logspace(0, 1, 5)},scoring='r2')

#svr.fit(x_t,y_t)
#svr=SVR()
#scores = cross_val_score(svr, x, y, cv=5, scoring='r2')
#print(scores)