Exemplo n.º 1
0
    def fit_model_7(self,toWrite=False):
        """Fit a linear-kernel NuSVC on every CV split and print its log loss.

        One estimator instance is refit on each ``(X_train, X_test, Y_train,
        Y_test)`` tuple found in ``self.cv_data``, so after the loop it holds
        the fit from the last split only.

        Args:
            toWrite: when true, pickle the estimator (in whatever state the
                loop left it) to ``model7/model.pkl``.
        """
        model = NuSVC(probability=True, kernel='linear')

        for X_train, X_test, Y_train, Y_test in self.cv_data:
            model.fit(X_train, Y_train)
            # Score on the second predict_proba column only.
            pred = model.predict_proba(X_test)[:, 1]
            print("Model 7 score %f" % (logloss(Y_test, pred),))

        if toWrite:
            # pickle emits bytes, so the file must be opened in binary mode
            # (text mode 'w' raises TypeError on Python 3). The context
            # manager closes the handle even if dumping fails.
            with open('model7/model.pkl', 'wb') as f2:
                pickle.dump(model, f2)
Exemplo n.º 2
0
def svm(train_feature, train_label):
    """Fit an RBF NuSVC and evaluate it on the data it was trained on.

    Prints the ROC AUC of the second ``predict_proba`` column against
    ``train_label``.

    Args:
        train_feature: feature matrix passed to ``fit``/``predict_proba``.
        train_label: labels passed to ``fit`` and used as the AUC reference.

    Returns:
        A ``(score, scores)`` tuple: ``clf.score`` on the training data and
        the list of second-column probabilities, one per sample.
    """
    clf = NuSVC(kernel='rbf', gamma='scale', probability=True)
    clf.fit(train_feature, train_label)
    # Only the second probability column is used; the hard-label list the
    # previous version also built was never read, so it is gone.
    positive_scores = [row[1] for row in clf.predict_proba(train_feature)]
    print ('The auc is: {}'.format(roc_auc_score(train_label, positive_scores)))
    return clf.score(train_feature, train_label), positive_scores
Exemplo n.º 3
0
def _make_nu_svc(nu):
    """Build the RBF NuSVC configuration shared by the search and the result."""
    return NuSVC(nu=nu,
                 kernel='rbf',
                 degree=3,
                 gamma=0.5,
                 shrinking=False,
                 probability=True,
                 tol=0.001,
                 cache_size=1000,
                 verbose=0)


def NuSVMClass(TrainX, TrainY):
    """Choose ``nu`` on one hold-out split and return an unfitted NuSVC.

    Reads the module-level ``Param`` mapping: ``Param['SVM_CVP']`` is the
    ``test_size`` of the hold-out split and ``Param['Nu']`` is the sequence
    of candidate ``nu`` values.

    Each candidate is fitted on the sub-training part, its ``predict_proba``
    output is passed through ``PostProcessing`` and scored with
    ``PerMeasure``; the candidate with the LOWEST score wins.

    Args:
        TrainX: training features.
        TrainY: training targets.

    Returns:
        A new NuSVC configured with the winning ``nu``. It is NOT fitted;
        the caller has to call ``fit`` on it.
    """
    # Hold-out split (a single split, not k-fold cross validation).
    SubTrainX, SubTestsX, SubTrainY, SubTestsY = train_test_split(
        TrainX, TrainY, test_size=Param['SVM_CVP'])

    # Evaluate every candidate nu on the held-out part.
    CandidateNu = Param['Nu']
    Score = np.zeros(len(CandidateNu))
    for i, nu in enumerate(CandidateNu):
        NuSupport = _make_nu_svc(nu)
        NuSupport.fit(SubTrainX, SubTrainY)
        TestOutput = NuSupport.predict_proba(SubTestsX)
        Prediction = PostProcessing(TestOutput, NuSupport)

        Score[i] = PerMeasure(SubTestsY, Prediction)

    # Lower PerMeasure value is treated as better.
    BestNuIndex = np.argmin(Score)
    BestNu = CandidateNu[BestNuIndex]
    BestPerformance = Score[BestNuIndex]

    # Single-argument print: valid on Python 2 and 3, and reproduces the
    # spacing the old Python 2 print statement emitted.
    print("BestNu:  %s  Best Performance:  %s" % (BestNu, BestPerformance))

    # Final model (left unfitted, as before).
    return _make_nu_svc(BestNu)
Exemplo n.º 4
0
Arquivo: nu_svc.py Projeto: lnxpy/lale
class NuSVCImpl:
    """Thin adapter that forwards estimator calls to a wrapped ``Op`` instance."""

    def __init__(self, **hyperparams):
        """Keep the keyword hyperparameters and build ``Op`` from them."""
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model and return ``self``.

        ``y`` is forwarded only when it is not None; otherwise the wrapped
        ``fit`` is called with ``X`` alone.
        """
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the wrapped model's ``predict(X)``."""
        wrapped = self._wrapped_model
        return wrapped.predict(X)

    def predict_proba(self, X):
        """Return the wrapped model's ``predict_proba(X)``."""
        wrapped = self._wrapped_model
        return wrapped.predict_proba(X)

    def decision_function(self, X):
        """Return the wrapped model's ``decision_function(X)``."""
        wrapped = self._wrapped_model
        return wrapped.decision_function(X)
Exemplo n.º 5
0
    lr.fit(model_mat_train[:, np.where(rfe.support_)[0]], ACTION)
    pred = lr.predict_proba(model_mat_test[:, np.where(rfe.support_)[0]])
    pd.DataFrame({"Id": test_data.index, "Action": pred[:, 1]}).to_csv(
        "../lr2_submission.csv", header=True, index=False
    )

    ## svms
    svc = SVC(C=1.0, kernel="rbf", probability=True, class_weight="auto", verbose=2)
    svc.fit(model_mat_train[:27000, np.where(rfe.support_)[0]], ACTION[:27000])
    pred = svc.predict_proba(model_mat_train[27000:, np.where(rfe.support_)[0]])
    auc_score(ACTION[27000:], pred[:, 1])

    nusvc = NuSVC(nu=0.11, kernel="rbf", degree=3, probability=True, cache_size=1024, verbose=2)
    nusvc.fit(model_mat_train[:27000, np.where(rfe.support_)[0]], ACTION[:27000])
    svc_pred = nusvc.predict_proba(model_mat_train[27000:, np.where(rfe.support_)[0]])
    auc_score(ACTION[27000:], svc_pred[:, 1])

    nusvc = NuSVC(nu=0.11, kernel="rbf", degree=3, probability=True, cache_size=1024, verbose=2)
    nusvc.fit(model_mat_train[:27000], ACTION[:27000])
    svc_pred = nusvc.predict_proba(model_mat_train[27000:])
    auc_score(ACTION[27000:], svc_pred[:, 1])

    nusvc.fit(model_mat_train[:, np.where(rfe.support_)[0]], ACTION)
    svc_pred = nusvc.predict_proba(model_mat_test[:, np.where(rfe.support_)[0]])
    pd.DataFrame({"Id": test_data.index, "Action": svc_pred[:, 1]}).to_csv(
        "../nusvc_submission.csv", header=True, index=False
    )

    ## random forest
Exemplo n.º 6
0
    def _evaluation_test_helper(self,
                                class_labels,
                                use_probability_estimates,
                                allow_slow,
                                allowed_prob_delta=0.00001):
        """Fit NuSVC models over a parameter grid and check the converted spec.

        Fifty random 3-feature rows are generated with a fixed seed, each
        model is fitted with ``max_iter=10`` and converted with
        ``scikit_converter.convert``. On macOS 10.13 or later the converted
        spec is evaluated against the scikit-learn output.

        Args:
            class_labels: labels to draw targets from; the first
                ``len(class_labels)`` targets are forced to follow this order.
            use_probability_estimates: passed as NuSVC ``probability``; when
                true, class probabilities are compared instead of labels.
            allow_slow: when false, only the first parameter combination is
                exercised.
            allowed_prob_delta: upper bound for ``max_probability_error``.
        """
        # Parameters to test
        kernel_parameters = [{}, {
            'kernel': 'rbf',
            'gamma': 1.2
        }, {
            'kernel': 'linear'
        }, {
            'kernel': 'poly'
        }, {
            'kernel': 'poly',
            'degree': 2
        }, {
            'kernel': 'poly',
            'gamma': 0.75
        }, {
            'kernel': 'poly',
            'degree': 0,
            'gamma': 0.9,
            'coef0': 2
        }, {
            'kernel': 'sigmoid'
        }, {
            'kernel': 'sigmoid',
            'gamma': 1.3
        }, {
            'kernel': 'sigmoid',
            'coef0': 0.8
        }, {
            'kernel': 'sigmoid',
            'coef0': 0.8,
            'gamma': 0.5
        }]
        non_kernel_parameters = [{}, {
            'nu': 0.75
        }, {
            'nu': 0.25,
            'shrinking': True
        }, {
            'shrinking': False
        }]

        # Generate some random data
        x, y = [], []
        random.seed(42)
        for _ in range(50):
            x.append([
                random.gauss(200, 30),
                random.gauss(-100, 22),
                random.gauss(100, 42)
            ])
            y.append(random.choice(class_labels))
        column_names = ['x1', 'x2', 'x3']
        # make sure first label is seen first, second is seen second, and so on.
        for i, val in enumerate(class_labels):
            y[i] = val
        df = pd.DataFrame(x, columns=column_names)

        # Test
        for param1 in non_kernel_parameters:
            for param2 in kernel_parameters:
                cur_params = param1.copy()
                cur_params.update(param2)
                cur_params['probability'] = use_probability_estimates
                cur_params['max_iter'] = 10  # Don't want test to take too long

                cur_model = NuSVC(**cur_params)
                cur_model.fit(x, y)

                spec = scikit_converter.convert(cur_model, column_names,
                                                'target')

                if macos_version() >= (10, 13):
                    if use_probability_estimates:
                        probability_lists = cur_model.predict_proba(x)
                        df['classProbability'] = [
                            dict(zip(cur_model.classes_, cur_vals))
                            for cur_vals in probability_lists
                        ]
                        metrics = evaluate_classifier_with_probabilities(
                            spec, df, probabilities='classProbability')
                        # assertEqual: the assertEquals alias is deprecated
                        # and was removed in Python 3.12.
                        self.assertEqual(metrics['num_key_mismatch'], 0)
                        self.assertLess(metrics['max_probability_error'],
                                        allowed_prob_delta)
                    else:
                        df['prediction'] = cur_model.predict(x)
                        metrics = evaluate_classifier(spec, df, verbose=False)
                        self.assertEqual(metrics['num_errors'], 0)

                # Fast mode: stop after the first combination.
                if not allow_slow:
                    break

            if not allow_slow:
                break
Exemplo n.º 7
0
    train2 = train[train['wheezy-copper-turtle-magic']==i]
    test2 = test[test['wheezy-copper-turtle-magic']==i]
    idx1 = train2.index; idx2 = test2.index
    train2.reset_index(drop=True,inplace=True)

    data = pd.concat([pd.DataFrame(train2[cols]), pd.DataFrame(test2[cols])])
    data2 = StandardScaler().fit_transform(PCA(n_components=40, random_state=4).fit_transform(data[cols]))
    train3 = data2[:train2.shape[0]]; test3 = data2[train2.shape[0]:]
    
    # STRATIFIED K FOLD (Using splits=25 scores 0.002 better but is slower)
    skf = StratifiedKFold(n_splits=5, random_state=42)
    for train_index, test_index in skf.split(train2, train2['target']):

        clf = NuSVC(probability=True, kernel='poly', degree=4, gamma='auto', random_state=4, nu=0.59, coef0=0.053)
        clf.fit(train3[train_index,:],train2.loc[train_index]['target'])
        oof[idx1[test_index]] = clf.predict_proba(train3[test_index,:])[:,1]
        preds[idx2] += clf.predict_proba(test3)[:,1] / skf.n_splits
        
        clf = neighbors.KNeighborsClassifier(n_neighbors=17, p=2.9)
        clf.fit(train3[train_index,:],train2.loc[train_index]['target'])
        oof_2[idx1[test_index]] = clf.predict_proba(train3[test_index,:])[:,1]
        preds_2[idx2] += clf.predict_proba(test3)[:,1] / skf.n_splits
        
    #if i%15==0: print(i)
        
# ROC AUC of the out-of-fold predictions against train['target']:
# first each prediction array on its own (oof, oof_2), then three
# linear blends of the two (0.8/0.2, 0.95/0.05 and 1.05/-0.05).
print(roc_auc_score(train['target'], oof))
print(roc_auc_score(train['target'], oof_2))
print(roc_auc_score(train['target'], 0.8*oof+0.2*oof_2))
print(roc_auc_score(train['target'], 0.95*oof+0.05*oof_2))
print(roc_auc_score(train['target'], 1.05*oof-0.05*oof_2))
# ROC plot for Model10. fpr, tpr and model10_roc are expected to have been
# set earlier (not shown in this excerpt).
plt.figure()
plt.plot(fpr,tpr,label=f'Model10 (area={model10_roc})')
plt.plot([0,1],[0,1])  # diagonal reference line from (0,0) to (1,1)
plt.legend()
plt.show()

# Model 11: Nu SVC

# probability=True is required for the predict_proba call used for the ROC curve below.
model11 = NuSVC(nu=0.2,probability=True,gamma='scale')
res11 = model11.fit(X_train,y_train)
pred11 = model11.predict(X_test)
conf11 = confusion_matrix(y_test,pred11)
conf11  # bare expression; presumably for notebook cell display

# NOTE(review): model11_roc is computed from the hard class predictions
# (pred11), while the curve plotted below is built from predict_proba scores,
# so the area shown in the legend may differ from the area under the plotted
# curve. Confirm this is intended.
model11_roc=roc_auc_score(y_test,pred11)
fpr,tpr,thresholds=roc_curve(y_test, model11.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr,tpr,label=f'Model11 (area={model11_roc})')
plt.plot([0,1],[0,1])  # diagonal reference line from (0,0) to (1,1)
plt.legend()
plt.show()


## Compare models
## OBJECTIVE: MINIMIZE FN (Type II Error)

lst=[]
for i in range(1,13):
    FP = eval(f'conf{i}')[0][1]
    FN = eval(f'conf{i}')[1][0]
    TP = eval(f'conf{i}')[1][1]
Exemplo n.º 9
0
    def _evaluation_test_helper(
        self,
        class_labels,
        use_probability_estimates,
        allow_slow,
        allowed_prob_delta=0.00001,
    ):
        """Convert NuSVC models trained over a parameter grid and verify them.

        Builds 50 seeded random samples with three features, fits one NuSVC
        per parameter combination (capped at 10 iterations), converts it with
        ``scikit_converter.convert`` and, on macOS 10.13+, compares the
        converted model against scikit-learn's output.

        class_labels: labels targets are drawn from; the leading targets are
            overwritten so the labels first appear in this order.
        use_probability_estimates: NuSVC ``probability`` flag; selects the
            probability comparison over the label comparison.
        allow_slow: when false only the first combination is run.
        allowed_prob_delta: strict upper bound on ``max_probability_error``.
        """
        kernel_options = [
            {},
            {"kernel": "rbf", "gamma": 1.2},
            {"kernel": "linear"},
            {"kernel": "poly"},
            {"kernel": "poly", "degree": 2},
            {"kernel": "poly", "gamma": 0.75},
            {"kernel": "poly", "degree": 0, "gamma": 0.9, "coef0": 2},
            {"kernel": "sigmoid"},
            {"kernel": "sigmoid", "gamma": 1.3},
            {"kernel": "sigmoid", "coef0": 0.8},
            {"kernel": "sigmoid", "coef0": 0.8, "gamma": 0.5},
        ]
        solver_options = [
            {},
            {"nu": 0.75},
            {"nu": 0.25, "shrinking": True},
            {"shrinking": False},
        ]

        # Seeded synthetic data: three gaussian features, random label.
        random.seed(42)
        features, targets = [], []
        for _ in range(50):
            row = [
                random.gauss(200, 30),
                random.gauss(-100, 22),
                random.gauss(100, 42),
            ]
            features.append(row)
            targets.append(random.choice(class_labels))
        column_names = ["x1", "x2", "x3"]
        # Force the labels to first appear in the order given by class_labels.
        for position, label in enumerate(class_labels):
            targets[position] = label
        df = pd.DataFrame(features, columns=column_names)

        # Solver options vary slowest, kernel options fastest.
        param_grid = [
            dict(solver_opts, **kernel_opts)
            for solver_opts in solver_options
            for kernel_opts in kernel_options
        ]
        if not allow_slow:
            # Fast mode exercises the first combination only.
            param_grid = param_grid[:1]

        for cur_params in param_grid:
            cur_params["probability"] = use_probability_estimates
            cur_params["max_iter"] = 10  # keep the test quick

            cur_model = NuSVC(**cur_params)
            cur_model.fit(features, targets)

            spec = scikit_converter.convert(cur_model, column_names, "target")

            if not (_is_macos() and _macos_version() >= (10, 13)):
                continue

            if not use_probability_estimates:
                df["prediction"] = cur_model.predict(features)
                metrics = evaluate_classifier(spec, df, verbose=False)
                self.assertEqual(metrics["num_errors"], 0)
                continue

            class_probabilities = cur_model.predict_proba(features)
            df["classProbability"] = [
                dict(zip(cur_model.classes_, sample_probs))
                for sample_probs in class_probabilities
            ]
            metrics = evaluate_classifier_with_probabilities(
                spec, df, probabilities="classProbability"
            )
            self.assertEqual(metrics["num_key_mismatch"], 0)
            self.assertLess(metrics["max_probability_error"], allowed_prob_delta)
Exemplo n.º 10
0
class SVM:
    """
    Wrapper class around scikit-learn's support vector machine functionality.
    This class supports binary and multi-class classification on a dataset, along with regression via Support Vector
    Regression (SVR).
    Per scikit-learn's documentation:

    Support vector machines (SVMs) are a set of supervised learning methods used for classification, regression and
    outliers detection.

    The advantages of support vector machines are:

        – Effective in high dimensional spaces.
        – Still effective in cases where number of dimensions is greater than the number of samples.
        – Uses a subset of training points in the decision function (called support vectors), so it is also memory
        efficient.
        – Versatile: different Kernel functions can be specified for the decision function. Common kernels are provided,
        but it is also possible to specify custom kernels.

    The disadvantages of support vector machines include:

        – If the number of features is much greater than the number of samples, avoid over-fitting in choosing Kernel
        functions and regularization term is crucial.
        – SVMs do not directly provide probability estimates, these are calculated using an expensive five-fold
        cross-validation.
    """
    def __init__(self, attributes=None, labels=None, test_size=0.25):
        """
        Initializes a SVM object.

        The following parameters are needed to use a SVM:

            – attributes: a numpy array of the independent variables
            – labels: a numpy array of the classes (for classification) or dependent variables (for regression)
            – test_size: the proportion of the dataset to be used for testing the model (defaults to 0.25);
            the proportion of the dataset to be used for training will be the complement of test_size

        After successfully running one of the classifier methods (SVC(), nu_SVC(), or linear_SVC()), the corresponding
        classifier below will be trained:

            – classifier_SVC: a classifier trained using scikit-learn's SVC implementation
            – accuracy_SVC: the accuracy of the SVC model, based on its predictions for dataset_X_test
            – roc_auc_SVC: the area under the ROC curve for the SVC model
            – classifier_nu_SVC: a classifier trained using scikit-learn's NuSVC implementation
            – accuracy_nu_SVC: the accuracy of the NuSVC model, based on its predictions for dataset_X_test
            – roc_auc_nu_SVC: the area under the ROC curve for the NuSVC model
            – classifier_linear_SVC: a classifier trained using scikit-learn's LinearSVC implementation
            – accuracy_linear_SVC: the accuracy of the LinearSVC model, based on its predictions for dataset_X_test

        After successfully running one of the regression methods (SVR(), nu_SVR(), or linear_SVR()), the corresponding
        regression model below will be trained:

            – regression_SVR: a regression model trained using scikit-learn's SVR implementation
            – r2_score_SVR: the coefficient of determination for the SVR model
            – r_score_SVR: the correlation coefficient for the SVR model
            – regression_nu_SVR: a regression model trained using scikit-learn's NuSVR implementation
            – r2_score_nu_SVR: the coefficient of determination for the NuSVR model
            – r_score_nu_SVR: the correlation coefficient for the NuSVR model
            – regression_linear_SVR: a regression model trained using scikit-learn's LinearSVR implementation
            – r2_score_linear_SVR: the coefficient of determination for the LinearSVR model
            – r_score_linear_SVR: the correlation coefficient for the LinearSVR model
        """
        self.attributes = attributes
        self.labels = labels
        self.test_size = 0.25

        self.classifier_SVC = None
        self.accuracy_SVC = None
        self.roc_auc_SVC = None
        self.classifier_nu_SVC = None
        self.accuracy_nu_SVC = None
        self.roc_auc_nu_SVC = None
        self.classifier_linear_SVC = None
        self.accuracy_linear_SVC = None

        self.regression_SVR = None
        self.r2_score_SVR = None
        self.r_score_SVR = None
        self.regression_nu_SVR = None
        self.r2_score_nu_SVR = None
        self.r_score_nu_SVR = None
        self.regression_linear_SVR = None
        self.r2_score_linear_SVR = None
        self.r_score_linear_SVR = None

        # References to training and testing subsets of dataset; instance data for re-use purposes
        self.dataset_X_train = None
        self.dataset_y_train = None
        self.dataset_X_test = None
        self.dataset_y_test = None

    # Accessor Methods

    def get_attributes(self):
        """
        Accessor method for attributes.

        If a SVM object is initialized without specifying attributes, attributes will be None. No SVM functionality can
        be used until attributes is a populated numpy array. Call set_attributes(new_attributes) to fix this.
        """
        return self.attributes

    def get_labels(self):
        """
        Accessor method for labels.

        If a SVM object is initialized without specifying labels, labels will be None. No SVM functionality can be used
        until labels is a populated numpy array. Call set_labels(new_labels) to fix this.
        """
        return self.labels

    def get_test_size(self):
        """
        Accessor method for test_size.

        Should return a number or None.
        """
        return self.test_size

    def get_classifier_SVC(self):
        """
        Accessor method for classifier_SVC.

        Will return None if SVC() hasn't successfully run, yet.
        """
        return self.classifier_SVC

    def get_accuracy_SVC(self):
        """
        Accessor method for accuracy_SVC.

        Will return None if SVC() hasn't successfully run, yet.
        """
        return self.accuracy_SVC

    def get_roc_auc_SVC(self):
        """
        Accessor method for roc_auc_SVC.

        Will return None if SVC() hasn't successfully run, yet.
        """
        return self.roc_auc_SVC

    def get_classifier_nu_SVC(self):
        """
        Accessor method for classifier_nu_SVC.

        Will return None if nu_SVC() hasn't successfully run, yet.
        """
        return self.classifier_nu_SVC

    def get_accuracy_nu_SVC(self):
        """
        Accessor method for accuracy_nu_SVC.

        Will return None if nu_SVC() hasn't successfully run, yet.
        """
        return self.accuracy_nu_SVC

    def get_roc_auc_nu_SVC(self):
        """
        Accessor method for roc_auc_nu_SVC.

        Will return None if nu_SVC() hasn't successfully run, yet.
        """
        return self.roc_auc_nu_SVC

    def get_classifier_linear_SVC(self):
        """
        Accessor method for classifier_linear_SVC.

        Will return None if linear_SVC() hasn't successfully run, yet.
        """
        return self.classifier_linear_SVC

    def get_accuracy_linear_SVC(self):
        """
        Accessor method for accuracy_linear_SVC.

        Will return None if linear_SVC() hasn't successfully run, yet.
        """
        return self.accuracy_linear_SVC

    def get_regression_SVR(self):
        """
        Accessor method for regression_SVR.

        Will return None if SVR() hasn't successfully run, yet.
        """
        return self.regression_SVR

    def get_r2_score_SVR(self):
        """
        Accessor method for r2_score_SVR.

        Will return None if SVR() hasn't successfully run, yet.
        """
        return self.r2_score_SVR

    def get_r_score_SVR(self):
        """
        Accessor method for r_score_SVR.

        Will return None if SVR() hasn't successfully run, yet.
        """
        return self.r_score_SVR

    def get_regression_nu_SVR(self):
        """
        Accessor method for regression_nu_SVR.

        Will return None if nu_SVR() hasn't successfully run, yet.
        """
        return self.regression_nu_SVR

    def get_r2_score_nu_SVR(self):
        """
        Accessor method for r2_score_nu_SVR.

        Will return None if nu_SVR() hasn't successfully run, yet.
        """
        return self.r2_score_nu_SVR

    def get_r_score_nu_SVR(self):
        """
        Accessor method for r_score_nu_SVR.

        Will return None if nu_SVR() hasn't successfully run, yet.
        """
        return self.r_score_nu_SVR

    def get_regression_linear_SVR(self):
        """
        Accessor method for regression_linear_SVR.

        Will return None if linear_SVR() hasn't successfully run, yet.
        """
        return self.regression_linear_SVR

    def get_r2_score_linear_SVR(self):
        """
        Accessor method for r2_score_linear_SVR.

        Will return None if linear_SVR() hasn't successfully run, yet.
        """
        return self.r2_score_linear_SVR

    def get_r_score_linear_SVR(self):
        """
        Accessor method for r_score_linear_SVR.

        Will return None if linear_SVR() hasn't successfully run, yet.
        """
        return self.r_score_linear_SVR

    # Modifier Methods

    def set_attributes(self, new_attributes=None):
        """
        Modifier method for attributes.

        Input should be a populated numpy array. Defaults to None.
        """
        self.attributes = new_attributes

    def set_labels(self, new_labels=None):
        """
        Modifier method for labels.

        Input should be a populated numpy array. Defaults to None.
        """
        self.labels = new_labels

    def set_test_size(self, new_test_size=0.25):
        """
        Modifier method for test_size.

        Input should be a float between 0.0 and 1.0 or None. Defaults to 0.25. The training size will be set to the
        complement of test_size.
        """
        self.test_size = new_test_size

    # Wrappers for SVM classification classes

    def SVC(self,
            C=1.0,
            kernel="rbf",
            degree=3,
            gamma="scale",
            coef0=0.0,
            shrinking=True,
            probability=False,
            tol=0.001,
            cache_size=200,
            class_weight=None,
            verbose=False,
            max_iter=-1,
            decision_function_shape="ovr",
            break_ties=False,
            random_state=None):
        """
        Wrapper for scikit-learn's C-Support Vector Classification implementation.

        Does nothing unless _check_inputs() passes. Otherwise builds the classifier, splits the data via
        _split_data() if no test set is cached yet, fits on the training subset and stores the result in
        classifier_SVC, accuracy_SVC and (only when probability is True) roc_auc_SVC. If fitting raises, the
        exception is printed, classifier_SVC is reset to None and the method returns without updating the metrics.

        Parameters per scikit-learn's documentation:

            – C: Regularization parameter. The strength of the regularization is inversely proportional to C.
            Must be strictly positive. The penalty is a squared l2 penalty. (Default is 1.0)

            – kernel: Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’, ‘rbf’,
            ‘sigmoid’, ‘precomputed’ or a callable. If none is given, ‘rbf’ will be used. If a callable is given it is
            used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape
            (n_samples, n_samples). (Default is "rbf")

            – degree: Degree of the polynomial kernel function ("poly"). Ignored by all other kernels. (Default is 3)

            – gamma: Kernel coefficient for "rbf", "poly", and "sigmoid". If gamma="scale", then it uses
            1 / (n_features * training_samples.var()) as value of gamma. If gamma="auto", it uses 1 / n_features.
            (Default is "scale")

            – coef0: Independent term in kernel function. It is only significant in "poly" and "sigmoid". (Default is 0.0)

            – shrinking: Whether to use the shrinking heuristic. (Default is True)

            – probability: Whether to enable probability estimates. This must be enabled prior to calling fit, will slow
            down that method as it internally uses 5-fold cross-validation, and predict_proba may be inconsistent with
            predict. (Default is False)

            – tol: Tolerance for stopping criterion. (Default is 1e-3, or 0.001)

            – cache_size: Specify the size of the kernel cache in MB. (Default is 200)

            – class_weight: Set the parameter C of class i to class_weight[i]*C for SVC. If not given, all classes are
            supposed to have weight one. The “balanced” mode uses the values of y to automatically adjust weights
            inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)).
            (Default is None)

            – verbose: Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in
            libsvm that, if enabled, may not work properly in a multithreaded context. (Default is False)

            – max_iter: Hard limit on iterations within solver, or -1 for no limit. (Default is -1)

            – decision_function_shape: Whether to return a one-vs-rest (‘ovr’) decision function of shape
            (n_samples, n_classes) as all other classifiers, or the original one-vs-one (‘ovo’) decision function of
            libsvm which has shape (n_samples, n_classes * (n_classes - 1) / 2). However, one-vs-one (‘ovo’) is always
            used as multi-class strategy. The parameter is ignored for binary classification. (Default is "ovr")

            – break_ties: If true, decision_function_shape='ovr', and number of classes > 2, predict will break ties
            according to the confidence values of decision_function; otherwise the first class among the tied classes is
            returned. Please note that breaking ties comes at a relatively high computational cost compared to a simple
            predict. (Default is False)

            – random_state: Controls the pseudo random number generation for shuffling the data for probability
            estimates. Ignored when probability is False. Pass an int for reproducible output across multiple function
            calls. (Default is None)

        The implementation is based on libsvm. The fit time scales at least quadratically with the number of samples
        and may be impractical beyond tens of thousands of samples.
        """
        if not self._check_inputs():
            return

        # Initialize classifier. Inside this method the bare name SVC resolves to the module-level estimator,
        # not to this method.
        self.classifier_SVC =\
            SVC(C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, shrinking=shrinking,
                probability=probability, tol=tol, cache_size=cache_size, class_weight=class_weight, verbose=verbose,
                max_iter=max_iter, decision_function_shape=decision_function_shape, break_ties=break_ties,
                random_state=random_state)

        # Split data, if needed; if testing/training sets are still None, call _split_data()
        if self.dataset_X_test is None:
            self._split_data()

        # Train classifier; handle exception if arguments are incorrect
        try:
            self.classifier_SVC.fit(self.dataset_X_train,
                                    self.dataset_y_train)
        except Exception as e:
            print(
                "An exception occurred while training the SVC model. Check your arguments and try again."
            )
            print("Here is the exception message:")
            print(e)
            self.classifier_SVC = None
            return

        # Evaluate accuracy and ROC-AUC of model using testing set and actual classification
        self.accuracy_SVC = self.classifier_SVC.score(
            self.dataset_X_test, self.dataset_y_test)

        if probability:
            # y_true has to be the held-out labels. Previously the model's own predictions were passed here,
            # which measured agreement of the scores with the model's hard decisions rather than with the truth.
            self.roc_auc_SVC = roc_auc_score(
                self.dataset_y_test,
                self.classifier_SVC.predict_proba(self.dataset_X_test)[:, 1])

    def nu_SVC(self,
               nu=0.5,
               kernel="rbf",
               degree=3,
               gamma="scale",
               coef0=0.0,
               shrinking=True,
               probability=False,
               tol=0.001,
               cache_size=200,
               class_weight=None,
               verbose=False,
               max_iter=-1,
               decision_function_shape="ovr",
               break_ties=False,
               random_state=None):
        """
        Wrapper for scikit-learn's Nu-Support Vector Classification implementation.
        Per scikit-learn's documentation, NuSVC is similar to SVC, but uses a parameter, nu, to set the number of
        support vectors.
        Parameters per scikit-learn's documentation:

            – nu: An upper bound on the fraction of margin errors and a lower bound of the fraction of support vectors.
            Should be in the interval (0, 1]. (Default is 0.5)
            
            – C: Regularization parameter. The strength of the regularization is inversely proportional to C.
            Must be strictly positive. The penalty is a squared l2 penalty. (Default is 1.0)

            – kernel: Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’, ‘rbf’,
            ‘sigmoid’, ‘precomputed’ or a callable. If none is given, ‘rbf’ will be used. If a callable is given it is
            used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape
            (n_samples, n_samples). (Default is "rbf")
            
            – degree: Degree of the polynomial kernel function ("poly"). Ignored by all other kernels. (Default is 3)
            
            – gamma: Kernel coefficient for "rbf", "poly", and "sigmoid". If gamma="scale", then it uses
            1 / (n_features * training_samples.var()) as value of gamma. IF gamma="auto", it uses 1 / n_features.
            (Default is "scale")
            
            – coef0: Independent term in kernel function. It is only significant in "poly" and "sigmoid". (Default is 0.0)
            
            – shrinking: Whether to use the shrinking heuristic. (Default is True)
            
            – probability: Whether to enable probability estimates. This must be enabled prior to calling fit, will slow
            down that method as it internally uses 5-fold cross-validation, and predict_proba may be inconsistent with
            predict. (Default is False)
            
            – tol: Tolerance for stopping criterion. (Default is 1e-3, or 0.001)
            
            – cache_size: Specify the size of the kernel cache in MB. (Default is 200)
            
            – class_weight: Set the parameter C of class i to class_weight[i]*C for SVC. If not given, all classes are
            supposed to have weight one. The “balanced” mode uses the values of y to automatically adjust weights
            inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)).
            (Default is None)
            
            – verbose: Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in
            libsvm that, if enabled, may not work properly in a multithreaded context. (Default is False)
            
            – max_iter: Hard limit on iterations within solver, or -1 for no limit. (Default is -1)
            
            – decision_function_shape: Whether to return a one-vs-rest (‘ovr’) decision function of shape
            (n_samples, n_classes) as all other classifiers, or the original one-vs-one (‘ovo’) decision function of
            libsvm which has shape (n_samples, n_classes * (n_classes - 1) / 2). However, one-vs-one (‘ovo’) is always
            used as multi-class strategy. The parameter is ignored for binary classification. (Default is "ovr")
            
            – break_ties: If true, decision_function_shape='ovr', and number of classes > 2, predict will break ties
            according to the confidence values of decision_function; otherwise the first class among the tied classes is
            returned. Please note that breaking ties comes at a relatively high computational cost compared to a simple
            predict. (Default is False)
            
            – random_state: Controls the pseudo random number generation for shuffling the data for probability
            estimates. Ignored when probability is False. Pass an int for reproducible output across multiple function
            calls. (Default is None)

        The implementation is based on libsvm.
        """
        if self._check_inputs():
            # Initialize classifier
            self.classifier_nu_SVC =\
                NuSVC(nu=nu, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, shrinking=shrinking,
                      probability=probability, tol=tol, cache_size=cache_size, class_weight=class_weight,
                      verbose=verbose, max_iter=max_iter, decision_function_shape=decision_function_shape,
                      break_ties=break_ties, random_state=random_state)

            # Split data, if needed; if testing/training sets are still None, call _split_data()
            if self.dataset_X_test is None:
                self._split_data()

            # Train classifier; handle exception if arguments are incorrect
            try:
                self.classifier_nu_SVC.fit(self.dataset_X_train,
                                           self.dataset_y_train)
            except Exception as e:
                print(
                    "An exception occurred while training the NuSVC model. Check your arguments and try again."
                )
                print("Here is the exception message:")
                print(e)
                self.classifier_nu_SVC = None
                return

            # Evaluate accuracy and ROC-AUC of model using testing set and actual classification
            self.accuracy_nu_SVC = self.classifier_nu_SVC.score(
                self.dataset_X_test, self.dataset_y_test)

            if probability:
                self.roc_auc_nu_SVC = roc_auc_score(
                    self.classifier_nu_SVC.predict(self.dataset_X_test),
                    self.classifier_nu_SVC.predict_proba(
                        self.dataset_X_test)[::, 1])

    def linear_SVC(self,
                   penalty="l2",
                   loss="squared_hinge",
                   dual=True,
                   tol=0.0001,
                   C=1.0,
                   multi_class='ovr',
                   fit_intercept=True,
                   intercept_scaling=1,
                   class_weight=None,
                   verbose=0,
                   random_state=None,
                   max_iter=1000):
        """
        Train scikit-learn's LinearSVC on this object's data and record its accuracy on the testing set.

        Per scikit-learn's documentation, LinearSVC resembles SVC with a linear kernel but is built on liblinear
        rather than libsvm, which gives more freedom in the choice of penalty and loss and should scale better to
        large sample sizes. Dense and sparse input are both supported; multiclass problems use a one-vs-the-rest
        scheme.

        Parameters (all forwarded unchanged to LinearSVC):

            – penalty: Norm used in the penalization; "l2" is the SVC standard, "l1" yields sparse coef_ vectors.
            (Default is "l2")

            – loss: Loss function; "hinge" is the standard SVM loss, "squared_hinge" is its square.
            (Default is "squared_hinge")

            – dual: Whether to solve the dual or the primal optimization problem. Prefer dual=False when
            n_samples > n_features. (Default is True)

            – tol: Tolerance for the stopping criterion. (Default is 1e-4, or 0.0001)

            – C: Regularization parameter; regularization strength is inversely proportional to C. Must be
            strictly positive. (Default is 1.0)

            – multi_class: Multi-class strategy when y has more than two classes: "ovr" trains n_classes
            one-vs-rest classifiers, "crammer_singer" optimizes a joint objective over all classes (loss, penalty
            and dual are then ignored). (Default is "ovr")

            – fit_intercept: Whether to calculate an intercept; if False the data is expected to be centered
            already. (Default is True)

            – intercept_scaling: Constant value of the synthetic feature appended to each instance vector when
            fit_intercept is True. That feature's weight is regularized like any other, so increase this value to
            lessen the effect of regularization on the intercept. (Default is 1)

            – class_weight: Sets C of class i to class_weight[i]*C. If not given, every class has weight one;
            "balanced" weights classes inversely proportional to their frequencies in y. (Default is None)

            – verbose: Enable verbose output; relies on a per-process liblinear setting that may misbehave in a
            multithreaded context. (Default is 0)

            – random_state: Controls the pseudo random number generation used to shuffle data for the dual
            coordinate descent (dual=True); has no effect when dual=False. (Default is None)

            – max_iter: The maximum number of iterations to be run. (Default is 1000)

        Outcome: if _check_inputs() fails, nothing happens. Otherwise the classifier is stored in
        classifier_linear_SVC and, after a successful fit, its mean accuracy on the testing set is stored in
        accuracy_linear_SVC. If fitting raises, the exception is printed, classifier_linear_SVC is reset to None and
        accuracy_linear_SVC is left untouched.
        """
        # Bail out early when the instance data is not usable
        if not self._check_inputs():
            return

        # Collect the constructor arguments and build the classifier from them
        settings = dict(penalty=penalty,
                        loss=loss,
                        dual=dual,
                        tol=tol,
                        C=C,
                        multi_class=multi_class,
                        fit_intercept=fit_intercept,
                        intercept_scaling=intercept_scaling,
                        class_weight=class_weight,
                        verbose=verbose,
                        random_state=random_state,
                        max_iter=max_iter)
        self.classifier_linear_SVC = LinearSVC(**settings)

        # A missing testing set means the data has not been split yet
        if self.dataset_X_test is None:
            self._split_data()

        # Fit on the training set; bad arguments surface here as exceptions
        try:
            self.classifier_linear_SVC.fit(self.dataset_X_train, self.dataset_y_train)
        except Exception as e:
            print("An exception occurred while training the LinearSVC model. Check your arguments and try again.")
            print("Here is the exception message:")
            print(e)
            self.classifier_linear_SVC = None
            return

        # Score the fitted model against the held-out testing set
        fitted = self.classifier_linear_SVC
        self.accuracy_linear_SVC = fitted.score(self.dataset_X_test, self.dataset_y_test)

    # Wrappers for SVM regression classes

    def SVR(self,
            kernel='rbf',
            degree=3,
            gamma='scale',
            coef0=0.0,
            tol=0.001,
            C=1.0,
            epsilon=0.1,
            shrinking=True,
            cache_size=200,
            verbose=False,
            max_iter=-1):
        """
        Wrapper for scikit-learn's Epsilon-Support Vector Regression implementation. Per scikit-learn's documentation,
        this implementation is based on libsvm. Scaling to tens of thousands of samples is difficult, as the fit time
        complexity is more than quadratic with the number of samples. For large datasets, consider using LinearSVR by
        calling linear_SVR().
        Parameters per scikit-learn's documentation:

            – kernel: Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’, ‘rbf’,
            ‘sigmoid’, ‘precomputed’ or a callable. If a callable is given it is used to pre-compute the kernel matrix
            from data matrices; that matrix should be an array of shape (n_samples, n_samples). (Default is "rbf")

            – degree: Degree of the polynomial kernel function ("poly"). Ignored by all other kernels. (Default is 3)

            – gamma: Kernel coefficient for "rbf", "poly", and "sigmoid". If gamma="scale", then it uses
            1 / (n_features * training_samples.var()) as value of gamma. If gamma="auto", it uses 1 / n_features.
            (Default is "scale")

            – coef0: Independent term in kernel function. It is only significant in "poly" and "sigmoid".
            (Default is 0.0)

            – tol: Tolerance for stopping criterion. (Default is 1e-3, or 0.001)

            – C: Regularization parameter. The strength of the regularization is inversely proportional to C.
            Must be strictly positive. The penalty is a squared l2 penalty. (Default is 1.0)

            – epsilon: Epsilon in the epsilon-SVR model. It specifies the epsilon-tube within which no penalty is
            associated in the training loss function with points predicted within a distance epsilon from the actual
            value. (Default is 0.1)

            – shrinking: Whether to use the shrinking heuristic. (Default is True)

            – cache_size: Specify the size of the kernel cache in MB. (Default is 200)

            – verbose: Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in
            libsvm that, if enabled, may not work properly in a multithreaded context. (Default is False)

            – max_iter: Hard limit on iterations within solver, or -1 for no limit. (Default is -1)

        Results: when _check_inputs() fails nothing is done. Otherwise the model is stored in regression_SVR; after a
        successful fit, r2_score_SVR holds the coefficient of determination on the testing set and r_score_SVR its
        square root (NaN when r2_score_SVR is negative, since no real square root exists). If fitting raises, the
        exception is printed, regression_SVR is set to None and the scores are left untouched.
        """
        if self._check_inputs():
            # Initialize regression model
            self.regression_SVR =\
                SVR(kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, tol=tol, C=C, epsilon=epsilon,
                    shrinking=shrinking, cache_size=cache_size, verbose=verbose, max_iter=max_iter)

            # Split data, if needed; if testing/training sets are still None, call _split_data()
            if self.dataset_X_test is None:
                self._split_data()

            # Train regression model; handle exception if arguments are incorrect and/or if labels isn't
            # quantitative data
            try:
                self.regression_SVR.fit(self.dataset_X_train,
                                        self.dataset_y_train)
            except Exception as e:
                print(
                    "An exception occurred while training the SVR model. Check you arguments and try again."
                )
                print("Does labels only contain quantitative data?")
                print("Here is the exception message:")
                print(e)
                self.regression_SVR = None
                return

            # Get coefficient of determination for model
            self.r2_score_SVR = self.regression_SVR.score(
                self.dataset_X_test, self.dataset_y_test)

            # R^2 measured on held-out data can be negative (model worse than predicting the mean). A negative
            # value has no real square root, so report NaN instead of letting sqrt() fail on it.
            if self.r2_score_SVR >= 0:
                self.r_score_SVR = sqrt(self.r2_score_SVR)
            else:
                self.r_score_SVR = float("nan")

    def nu_SVR(self,
               nu=0.5,
               C=1.0,
               kernel='rbf',
               degree=3,
               gamma='scale',
               coef0=0.0,
               shrinking=True,
               tol=0.001,
               cache_size=200,
               verbose=False,
               max_iter=-1):
        """
        Wrapper for scikit-learn's Nu Support Vector Regression implementation. Per scikit-learn's documentation,
        NuSVR uses the parameter nu to control the number of support vectors, similar to NuSVC. Yet unlike NuSVC,
        nu replaces the parameter epsilon of epsilon-SVR, not C. This implementation is based on libsvm.
        Parameters per scikit-learn's documentation:

            – nu: An upper bound on the fraction of margin errors and a lower bound of the fraction of support vectors.
            Should be in the interval (0, 1]. (Default is 0.5)

            – C: Regularization parameter. The strength of the regularization is inversely proportional to C.
            Must be strictly positive. The penalty is a squared l2 penalty. (Default is 1.0)

            – kernel: Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’, ‘rbf’,
            ‘sigmoid’, ‘precomputed’ or a callable. If a callable is given it is used to pre-compute the kernel matrix
            from data matrices; that matrix should be an array of shape (n_samples, n_samples). (Default is "rbf")

            – degree: Degree of the polynomial kernel function ("poly"). Ignored by all other kernels. (Default is 3)

            – gamma: Kernel coefficient for "rbf", "poly", and "sigmoid". If gamma="scale", then it uses
            1 / (n_features * training_samples.var()) as value of gamma. If gamma="auto", it uses 1 / n_features.
            (Default is "scale")

            – coef0: Independent term in kernel function. It is only significant in "poly" and "sigmoid".
            (Default is 0.0)

            – shrinking: Whether to use the shrinking heuristic. (Default is True)

            – tol: Tolerance for stopping criterion. (Default is 1e-3, or 0.001)

            – cache_size: Specify the size of the kernel cache in MB. (Default is 200)

            – verbose: Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in
            libsvm that, if enabled, may not work properly in a multithreaded context. (Default is False)

            – max_iter: Hard limit on iterations within solver, or -1 for no limit. (Default is -1)

        Results: when _check_inputs() fails nothing is done. Otherwise the model is stored in regression_nu_SVR; after
        a successful fit, r2_score_nu_SVR holds the coefficient of determination on the testing set and
        r_score_nu_SVR its square root (NaN when r2_score_nu_SVR is negative, since no real square root exists). If
        fitting raises, the exception is printed, regression_nu_SVR is set to None and the scores are left untouched.
        """
        if self._check_inputs():
            # Initialize regression model
            self.regression_nu_SVR =\
                NuSVR(nu=nu, C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, shrinking=shrinking, tol=tol,
                      cache_size=cache_size, verbose=verbose, max_iter=max_iter)

            # Split data, if needed; if testing/training sets are still None, call _split_data()
            if self.dataset_X_test is None:
                self._split_data()

            # Train regression model; handle exception if arguments are incorrect and/or if labels isn't
            # quantitative data
            try:
                self.regression_nu_SVR.fit(self.dataset_X_train,
                                           self.dataset_y_train)
            except Exception as e:
                print(
                    "An exception occurred while training the NuSVR model. Check you arguments and try again."
                )
                print("Does labels only contain quantitative data?")
                print("Here is the exception message:")
                print(e)
                self.regression_nu_SVR = None
                return

            # Get coefficient of determination for model
            self.r2_score_nu_SVR = self.regression_nu_SVR.score(
                self.dataset_X_test, self.dataset_y_test)

            # R^2 measured on held-out data can be negative (model worse than predicting the mean). A negative
            # value has no real square root, so report NaN instead of letting sqrt() fail on it.
            if self.r2_score_nu_SVR >= 0:
                self.r_score_nu_SVR = sqrt(self.r2_score_nu_SVR)
            else:
                self.r_score_nu_SVR = float("nan")

    def linear_SVR(self,
                   epsilon=0.0,
                   tol=0.0001,
                   C=1.0,
                   loss='epsilon_insensitive',
                   fit_intercept=True,
                   intercept_scaling=1.0,
                   dual=True,
                   verbose=0,
                   random_state=None,
                   max_iter=1000):
        """
        Wrapper for scikit-learn's Linear Support Vector Regression implementation. Per scikit-learn's documentation,
        LinearSVR is similar to SVR with a linear kernel, but is implemented with liblinear instead of libsvm. This
        provides greater flexibility in choice of penalties and loss functions, and should scale better to large sample
        sizes. LinearSVR supports both dense and sparse input.
        Parameters per scikit-learn's documentation:

            – epsilon: Epsilon parameter in the epsilon-insensitive loss function: predictions within a distance
            epsilon of the actual value incur no penalty in the training loss. (Default is 0.0)

            – tol: Tolerance for stopping criterion. (Default is 1e-4, or 0.0001)

            – C: Regularization parameter. The strength of the regularization is inversely proportional to C.
            Must be strictly positive. (Default is 1.0)

            – loss: Specifies the loss function. The epsilon-insensitive loss (standard SVR) is the L1 loss, while the
            squared epsilon-insensitive loss (‘squared_epsilon_insensitive’) is the L2 loss.
            (Default is "epsilon_insensitive")

            – fit_intercept: Whether to calculate the intercept for this model. If set to false, no intercept will be
            used in calculations (i.e. data is expected to be already centered). (Default is True)

            – intercept_scaling: When fit_intercept is True, instance vector x becomes [x, intercept_scaling],
            i.e. a “synthetic” feature with constant value equal to intercept_scaling is appended to the instance
            vector. The intercept becomes intercept_scaling * synthetic feature weight. Note that the synthetic
            feature weight is subject to l1/l2 regularization like all other features; to lessen the effect of
            regularization on it (and therefore on the intercept), intercept_scaling has to be increased.
            (Default is 1.0)

            – dual: Select the algorithm to either solve the dual or primal optimization problem.
            Prefer dual=False when n_samples > n_features. (Default is True)

            – verbose: Enable verbose output. Note that this setting takes advantage of a per-process runtime setting
            in liblinear that, if enabled, may not work properly in a multithreaded context. (Default is 0)

            – random_state: Controls the pseudo random number generation for shuffling the data for the dual
            coordinate descent (if dual=True). Pass an int for reproducible output across multiple function calls.
            (Default is None)

            – max_iter: The maximum number of iterations to be run. (Default is 1000)

        Results: when _check_inputs() fails nothing is done. Otherwise the model is stored in regression_linear_SVR;
        after a successful fit, r2_score_linear_SVR holds the coefficient of determination on the testing set and
        r_score_linear_SVR its square root (NaN when r2_score_linear_SVR is negative, since no real square root
        exists). If fitting raises, the exception is printed, regression_linear_SVR is set to None and the scores
        are left untouched.
        """
        if self._check_inputs():
            # Initialize regression model
            self.regression_linear_SVR =\
                LinearSVR(epsilon=epsilon, tol=tol, C=C, loss=loss, fit_intercept=fit_intercept,
                          intercept_scaling=intercept_scaling, dual=dual, verbose=verbose, random_state=random_state,
                          max_iter=max_iter)

            # Split data, if needed; if testing/training sets are still None, call _split_data()
            if self.dataset_X_test is None:
                self._split_data()

            # Train regression model; handle exception if arguments are incorrect and/or labels isn't
            # quantitative data
            try:
                self.regression_linear_SVR.fit(self.dataset_X_train,
                                               self.dataset_y_train)
            except Exception as e:
                print(
                    "An exception occurred while training the LinearSVR model. Check you arguments and try again."
                )
                print("Does labels only contain quantitative data?")
                print("Here is the exception message:")
                print(e)
                self.regression_linear_SVR = None
                return

            # Get coefficient of determination and correlation coefficient for model
            self.r2_score_linear_SVR = self.regression_linear_SVR.score(
                self.dataset_X_test, self.dataset_y_test)

            # R^2 measured on held-out data can be negative (model worse than predicting the mean). A negative
            # value has no real square root, so report NaN instead of letting sqrt() fail on it.
            if self.r2_score_linear_SVR >= 0:
                self.r_score_linear_SVR = sqrt(self.r2_score_linear_SVR)
            else:
                self.r_score_linear_SVR = float("nan")

    # Helper methods

    def _split_data(self):
        """
        Helper that fills in the training and testing sets from attributes and labels.

        train_test_split is called with test_size=self.test_size and its four results are stored, in order, in
        dataset_X_train, dataset_X_test, dataset_y_train and dataset_y_test. No validation happens here; the
        instance data is assumed to have been checked already.
        """
        pieces = train_test_split(self.attributes, self.labels, test_size=self.test_size)
        (self.dataset_X_train, self.dataset_X_test,
         self.dataset_y_train, self.dataset_y_test) = pieces

    def _check_inputs(self):
        """
        Reports whether the instance data can be used to build an SVM model.

        The checks run in order and stop at the first failure: attributes must be set, labels must be set, both
        must have the same number of rows (shape[0]), and test_size must be None or an int/float. A failed check
        prints a message explaining the problem and the method returns False; if every check passes it returns
        True.
        """
        complaint = None

        if self.attributes is None:
            # No independent variables supplied
            complaint = (
                "attributes is missing; call set_attributes(new_attributes) to fix this! new_attributes should be a",
                "populated dataset of independent variables.")
        elif self.labels is None:
            # No target values supplied
            complaint = (
                "labels is missing; call set_labels(new_labels) to fix this! new_labels should be a populated dataset",
                "of classes.")
        elif self.attributes.shape[0] != self.labels.shape[0]:
            # Sample counts disagree
            complaint = (
                "attributes and labels don't have the same number of rows. Make sure the number of samples in each",
                "dataset matches!")
        elif not (self.test_size is None or isinstance(self.test_size, (int, float))):
            # test_size is neither None nor numeric
            complaint = (
                "test_size must be None or a number; call set_test_size(new_test_size) to fix this!",
            )

        if complaint is None:
            return True

        print(*complaint)
        return False
Exemplo n.º 11
0
    train4 = data2[:train2.shape[0]]
    test4 = data2[train2.shape[0]:]

    # STRATIFIED K FOLD (Using splits=25 scores 0.002 better but is slower)
    skf = StratifiedKFold(n_splits=5, random_state=42)
    for train_index, test_index in skf.split(train2, train2['target']):

        clf = NuSVC(probability=True,
                    kernel='poly',
                    degree=4,
                    gamma='auto',
                    random_state=4,
                    nu=0.59,
                    coef0=0.053)
        clf.fit(train3[train_index, :], train2.loc[train_index]['target'])
        oof_svnu[idx1[test_index]] = clf.predict_proba(
            train3[test_index, :])[:, 1]
        pred_te_svnu[idx2] += clf.predict_proba(test3)[:, 1] / skf.n_splits

        clf = neighbors.KNeighborsClassifier(n_neighbors=17, p=2.9)
        clf.fit(train3[train_index, :], train2.loc[train_index]['target'])
        oof_knn[idx1[test_index]] = clf.predict_proba(train3[test_index, :])[:,
                                                                             1]
        pred_te_knn[idx2] += clf.predict_proba(test3)[:, 1] / skf.n_splits

        clf = linear_model.LogisticRegression(solver='saga',
                                              penalty='l1',
                                              C=0.1)
        clf.fit(train3[train_index, :], train2.loc[train_index]['target'])
        oof_lr[idx1[test_index]] = clf.predict_proba(train3[test_index, :])[:,
                                                                            1]
        pred_te_lr[idx2] += clf.predict_proba(test3)[:, 1] / skf.n_splits
Exemplo n.º 12
0
def train(obj,
          training_errors,
          validation_errors,
          epochs,
          batch_size,
          valid_size,
          batchs_num,
          validation_size,
          train_size,
          model_type,
          windowSize,
          decisionValue,
          noiseUpdate=0.05,
          nu=0.9,
          gamma=0.005,
          kernel="rbf",
          C=0.8):
    """
    Train a classifier batch by batch and track training/validation accuracy per epoch.

    A NuSVC is used when model_type is "svm"; any other value selects a LogisticRegression (solver "sag",
    warm_start=True). Each batch refits a scratch model; the scratch model is accepted as the returned model when
    its batch accuracy, perturbed by Gaussian noise scaled by noiseUpdate, beats the best accuracy seen so far.

    Parameters:
        obj: data provider; must offer resetCounters(), loadDescriptorsAndLabels(count, windowSize) and
            getBOW(des, labels, False). The arrays it returns are indexed with numpy index arrays.
        training_errors, validation_errors: lists that receive one accuracy value per epoch (appended in place).
        epochs: number of passes over the data.
        batch_size: nominal batch size; int(batch_size / 2) is passed to the loader for each batch.
        valid_size: number of samples requested per validation batch.
        batchs_num: number of regular training batches per epoch.
        validation_size: target number of validation predictions per epoch.
        train_size: total training size; a final remainder batch is run when batchs_num * batch_size falls short.
        model_type: "svm" for NuSVC, anything else for LogisticRegression.
        windowSize: forwarded to obj.loadDescriptorsAndLabels.
        decisionValue: class-0 probability threshold used when labels are derived from predict_proba.
        noiseUpdate: scale of the random perturbation added to the batch accuracy in the acceptance test.
        nu, gamma, kernel: NuSVC hyper-parameters.
        C: LogisticRegression regularization parameter.

    Returns:
        (model, training_errors, validation_errors) where model is the last accepted classifier.
    """
    import copy  # local import: used to snapshot the accepted model

    # cat = 0, dog = 1
    np.random.seed(42)
    if model_type == "svm":
        model = NuSVC(kernel=kernel, nu=nu, gamma=gamma)
        modelTemp = NuSVC(kernel=kernel, nu=nu, gamma=gamma)
    else:
        model = LogisticRegression(solver='sag',
                                   penalty="l2",
                                   C=C,
                                   warm_start=True,
                                   max_iter=400,
                                   n_jobs=2)
        modelTemp = LogisticRegression(solver='sag',
                                       penalty="l2",
                                       C=C,
                                       warm_start=True,
                                       max_iter=400,
                                       n_jobs=2)

    # Kept from the original: training thresholds probabilities for every non-SVM model, while validation
    # does so only when model_type is exactly "logistic".
    train_threshold = model_type != "svm"
    valid_threshold = model_type == "logistic"

    counter = 0
    prevError = np.finfo(np.float32).min

    for epoch in range(0, epochs):
        training_error_avg = []
        # Clear clusters at each epoch.
        obj.resetCounters()
        for batch in range(0, batchs_num):
            des, labels = obj.loadDescriptorsAndLabels(int(
                batch_size / 2), windowSize)  # batch_sizex128
            des, labels = obj.getBOW(des, labels, False)
            indeces = np.arange(len(des))
            np.random.shuffle(indeces)
            modelTemp.fit(des[indeces], labels[indeces])
            y_pred = _predicted_labels(modelTemp, des[indeces],
                                       train_threshold, decisionValue)

            scored = _confusion_accuracy(y_pred, labels[indeces])
            if scored is None:
                # No accuracy for this batch, so there is nothing to compare against prevError.
                continue
            cnf, accuracy = scored
            training_error_avg.append(accuracy)
            print(f"{epoch}-{batch} the accuracy is {accuracy}")
            print(cnf)

            if accuracy + np.random.normal(0) * noiseUpdate > prevError:
                print("Update model")
                # Snapshot the scratch model: a plain assignment would alias it, and the next
                # modelTemp.fit() would silently overwrite the accepted model.
                model = copy.deepcopy(modelTemp)
                prevError = accuracy
            else:
                print("Previous model is better")
                counter += 1

            if counter > 15:
                print(f"No update for the parameter for {counter} passes")
                break

        if batchs_num * batch_size < train_size - 1 and counter <= 15:
            # Remainder batch covering the samples the regular batches did not reach
            des, labels = obj.loadDescriptorsAndLabels(
                int(train_size - batchs_num * batch_size), windowSize)
            des, labels = obj.getBOW(des, labels, False)
            indeces = np.arange(len(des))
            np.random.shuffle(indeces)
            modelTemp.fit(des[indeces], labels[indeces])
            y_pred = _predicted_labels(modelTemp, des[indeces],
                                       train_threshold, decisionValue)

            scored = _confusion_accuracy(y_pred, labels[indeces])
            if scored is not None:
                cnf, accuracy = scored
                training_error_avg.append(accuracy)
                print(f"Last accuracy is {accuracy}")
                print(cnf)
                if accuracy > prevError:
                    print("Update support Vectors")
                    # The accepted model is intentionally not replaced here; only the best score moves.
                    prevError = accuracy
        training_errors.append(np.mean(training_error_avg))

        # For validation set
        labels = []
        y_pred = []
        for valid_batch in range(0, int(validation_size / valid_size)):
            labels_r, y_pred_r = _validation_batch(obj, model, valid_size,
                                                   windowSize, valid_threshold,
                                                   decisionValue)
            labels.extend(labels_r)
            y_pred.extend(y_pred_r)
        if len(y_pred) < validation_size:
            # One extra batch when the full batches did not reach validation_size predictions
            labels_r, y_pred_r = _validation_batch(obj, model, valid_size,
                                                   windowSize, valid_threshold,
                                                   decisionValue)
            labels.extend(labels_r)
            y_pred.extend(y_pred_r)

        validation_accuracy = None
        scored = _confusion_accuracy(y_pred, labels)
        if scored is not None:
            cnf, validation_accuracy = scored
            validation_errors.append(validation_accuracy)
            print(cnf)
            print(f"{epoch} the validation accuracy is {validation_accuracy}")

        if epoch % 2 == 0:
            # Report this epoch's values directly: indexing the caller's lists by epoch is wrong when
            # they were non-empty on entry or when a validation score could not be computed.
            print(
                f"#epoch: {epoch} and the accuracy training is {training_errors[-1]}"
            )
            if validation_accuracy is not None:
                print(
                    f"#epoch: {epoch} and the accuracy validation is {validation_accuracy}"
                )

    return model, training_errors, validation_errors


def _predicted_labels(model, des, use_threshold, decisionValue):
    """Return labels for des: thresholded class-0 probabilities if use_threshold, else model.predict."""
    if use_threshold:
        return [0 if row[0] > decisionValue else 1
                for row in model.predict_proba(des)]
    return model.predict(des)


def _confusion_accuracy(y_pred, labels):
    """Return (confusion matrix, accuracy) for the predictions, or None after printing any error."""
    try:
        cnf = confusion_matrix(y_pred, labels)
        return cnf, np.sum(np.diag(cnf)) / np.sum(cnf)
    except Exception as e:
        print(e)
        return None


def _validation_batch(obj, model, valid_size, windowSize, use_threshold, decisionValue):
    """Load one validation batch from obj and return (labels, predicted labels) for it."""
    des, labels_r = obj.loadDescriptorsAndLabels(valid_size, windowSize)  # numb_key_ptx128
    des, labels_r = obj.getBOW(des, labels_r, False)
    return labels_r, _predicted_labels(model, des, use_threshold, decisionValue)
class AllClassificationModels:
    """
    Wrapper class around all supported classification models: LogisticRegression, MLPClassifier, RandomForest, SVC,
    NuSVC, LinearSVC, and XGBClassifier.
    AllClassificationModels runs every available classification algorithm on the given dataset and outputs the mean
    accuracy, ROC-AUC, and execution time of each successful model when all_classification_models() is run.
    """
    def __init__(self,
                 attributes=None,
                 labels=None,
                 test_size=0.25,
                 verbose=False):
        """
        Initializes an AllClassificationModels object.

        The following parameters are needed to use an AllClassificationModels object:

            – attributes: a numpy array of the desired independent variables (Default is None)
            – labels: a numpy array of the classes (Default is None)
            – test_size: the proportion of the dataset to be used for testing the model;
            the proportion of the dataset to be used for training will be the complement of test_size (Default is 0.25)
            – verbose: specifies whether or not to output any and all logging during model training (Default is False)

            Note: These are the only parameters allowed. All other parameters for each model will use their default
            values. For more granular control, please instantiate each model individually.

        The following instance data is created here (unfitted) and trained by all_classification_models():

            – logistic_regression: a reference to the LogisticRegression model
            – MLP: a reference to the MLPClassifier model
            – random_forest: a reference to the RandomForest model
            – SVC: a reference to the SVC model
            – nu_SVC: a reference to the NuSVC model
            – linear_SVC: a reference to the LinearSVC model
            – XGB_classifier: a reference to the XGBClassifier model

        After running all_classification_models(), the mean accuracy, ROC-AUC (if available), and execution time for
        each model that ran successfully will be displayed in tabular form. Any models that failed to run will be listed.
        """
        self.attributes = attributes
        self.labels = labels
        self.test_size = test_size
        self.verbose = verbose

        # Every estimator is built with its defaults; only the verbosity is forwarded.
        # SVC/NuSVC need probability=True so that predict_proba() exists for the ROC-AUC.
        self.logistic_regression = LogisticRegression(verbose=self.verbose)
        self.MLP = MLPClassifier(verbose=self.verbose)
        self.random_forest = RandomForestClassifier(verbose=self.verbose)
        self.SVC = SVC(verbose=self.verbose, probability=True)
        self.nu_SVC = NuSVC(verbose=self.verbose, probability=True)
        self.linear_SVC = LinearSVC(verbose=self.verbose)
        self.XGB_classifier = XGBClassifier(verbosity=int(self.verbose))

        # Result table keyed by model name; the "Model" entry doubles as the header row.
        self._classification_models = {
            "Model": ["Accuracy", "ROC-AUC", "Time"]
        }
        # Names of the models whose training or evaluation raised an exception.
        self._failures = []

    # Accessor methods

    def get_attributes(self):
        """
        Accessor method for attributes.

        If an AllClassificationModels object is initialized without specifying attributes, attributes will be None.
        all_classification_models() cannot be called until attributes is a populated numpy array of independent variables;
        call set_attributes(new_attributes) to fix this.
        """
        return self.attributes

    def get_labels(self):
        """
        Accessor method for labels.

        If an AllClassificationModels object is initialized without specifying labels, labels will be None.
        all_classification_models() cannot be called until labels is a populated numpy array of classes;
        call set_labels(new_labels) to fix this.
        """
        return self.labels

    def get_test_size(self):
        """
        Accessor method for test_size.

        Should return a number or None.
        """
        return self.test_size

    def get_verbose(self):
        """
        Accessor method for verbose.

        Will default to False if not set by the user.
        """
        return self.verbose

    def get_all_classification_models(self):
        """
        Accessor method that returns a list of all models.

        The models are created in __init__ and are only trained once all_classification_models() has been called;
        a model that failed to run may be left untrained.
        """
        return [
            self.logistic_regression, self.MLP, self.random_forest, self.SVC,
            self.nu_SVC, self.linear_SVC, self.XGB_classifier
        ]

    def get_logistic_regression(self):
        """
        Accessor method for logistic_regression.

        Returns the LogisticRegression model created in __init__; it is only trained once
        all_classification_models() has been called.
        """
        return self.logistic_regression

    def get_MLP(self):
        """
        Accessor method for MLP.

        Returns the MLPClassifier model created in __init__; it is only trained once
        all_classification_models() has been called.
        """
        return self.MLP

    def get_random_forest(self):
        """
        Accessor method for random_forest.

        Returns the RandomForestClassifier model created in __init__; it is only trained once
        all_classification_models() has been called.
        """
        return self.random_forest

    def get_SVC(self):
        """
        Accessor method for SVC.

        Returns the SVC model created in __init__; it is only trained once
        all_classification_models() has been called.
        """
        return self.SVC

    def get_nu_SVC(self):
        """
        Accessor method for nu_SVC.

        Returns the NuSVC model created in __init__; it is only trained once
        all_classification_models() has been called.
        """
        return self.nu_SVC

    def get_linear_SVC(self):
        """
        Accessor method for linear_SVC.

        Returns the LinearSVC model created in __init__; it is only trained once
        all_classification_models() has been called.
        """
        return self.linear_SVC

    def get_XGB_classifier(self):
        """
        Accessor method for XGB_classifier.

        Returns the XGBClassifier model created in __init__; it is only trained once
        all_classification_models() has been called.
        """
        return self.XGB_classifier

    # Modifier methods

    def set_attributes(self, new_attributes=None):
        """
        Modifier method for attributes.

        Input should be a numpy array of independent variables. Defaults to None.
        """
        self.attributes = new_attributes

    def set_labels(self, new_labels=None):
        """
        Modifier method for labels.

        Input should be a numpy array of classes. Defaults to None.
        """
        self.labels = new_labels

    def set_test_size(self, new_test_size=0.25):
        """
        Modifier method for test_size.

        Input should be a number or None. Defaults to 0.25.
        """
        self.test_size = new_test_size

    def set_verbose(self, new_verbose=False):
        """
        Modifier method for verbose.

        Input should be a truthy/falsy value. Defaults to False.

        Note: the models were constructed in __init__ with the verbosity given at that time and are not updated
        here; this setter only changes whether all_classification_models() suppresses stdout and stderr.
        """
        self.verbose = new_verbose

    # Classification functionality

    def all_classification_models(self):
        """
        Driver method for running all classification models with given attributes and labels.
        all_classification_models() first trains the models and determines their mean accuracy, ROC-AUC, and execution
        time via _all_classification_models_runner(). Then, all_classification_models() calls _print_results() to
        format and print each successful model's measurements, while also listing any failed models.

        If verbose is True, nothing is suppressed, so whatever the models log is shown.
        If verbose is False, all logging to stdout and stderr will be suppressed while the models run.
        """

        # Call helper method for running all classification models; suppress output, if needed
        if self.verbose:
            self._all_classification_models_runner()
        else:
            suppress_output = io.StringIO()
            with redirect_stderr(suppress_output), redirect_stdout(
                    suppress_output):
                self._all_classification_models_runner()

        # Print results (always outside the redirection so the table is visible)
        self._print_results()

    # Helper methods

    def _all_classification_models_runner(self):
        """
        Helper method that runs all models using the given dataset and all default parameters.
        After running all models, each model is determined to be either a success or failure, and relevant data
        (accuracy, ROC-AUC, execution time) is recorded.

        Results of any previous run are discarded first, so the table and the failure list always describe
        the most recent run only.

        _all_classification_models_runner() may only be called by all_classification_models().
        """

        # Reset bookkeeping so repeated runs neither duplicate failures nor show stale rows
        self._classification_models = {
            "Model": ["Accuracy", "ROC-AUC", "Time"]
        }
        self._failures = []

        # Split dataset
        dataset_X_train, dataset_X_test, dataset_y_train, dataset_y_test =\
            train_test_split(self.attributes, self.labels, test_size=self.test_size)

        # (display name, estimator, whether a ROC-AUC is computed via predict_proba)
        models = (
            ("LogisticRegression", self.logistic_regression, True),
            ("MLPClassifier", self.MLP, True),
            ("RandomForest", self.random_forest, True),
            ("SVC", self.SVC, True),
            ("NuSVC", self.nu_SVC, True),
            # No ROC-AUC is computed for LinearSVC; it is reported as "Not Available"
            ("LinearSVC", self.linear_SVC, False),
            ("XGBClassifier", self.XGB_classifier, True),
        )

        # Run and time all models; identify each as success or failure
        for name, model, with_roc_auc in models:
            self._run_model(name,
                            model,
                            dataset_X_train,
                            dataset_X_test,
                            dataset_y_train,
                            dataset_y_test,
                            with_roc_auc=with_roc_auc)

    def _run_model(self,
                   name,
                   model,
                   X_train,
                   X_test,
                   y_train,
                   y_test,
                   with_roc_auc=True):
        """
        Helper method that fits, times, and evaluates a single model.

            – name: the label under which the model is recorded in the results or in the failures
            – model: an estimator exposing fit() and score() (and predict_proba() when with_roc_auc is True)
            – X_train, y_train: the data the model is fitted on
            – X_test, y_test: the held-out data the model is evaluated on
            – with_roc_auc: if False, "Not Available" is recorded instead of a ROC-AUC (Default is True)

        On success, [accuracy, ROC-AUC, fit time in seconds] is stored under name. If any step raises an
        Exception, name is appended to the failures instead. KeyboardInterrupt and SystemExit are deliberately
        not caught, so a run can still be aborted.
        """
        try:
            # Only the call to fit() is timed
            start_time = time.time()
            model.fit(X_train, y_train)
            end_time = time.time()

            accuracy = model.score(X_test, y_test)
            if with_roc_auc:
                # ROC-AUC compares the true held-out labels against the score of the second class column
                roc_auc = roc_auc_score(y_test,
                                        model.predict_proba(X_test)[:, 1])
            else:
                roc_auc = "Not Available"
        except Exception:
            self._failures.append(name)
        else:
            self._classification_models[name] = [
                accuracy, roc_auc, end_time - start_time
            ]

    def _print_results(self):
        """
        Helper method that prints results of _all_classification_models_runner() in tabular form.

        _print_results() may only be called by all_classification_models() after all models have attempted to run.
        """

        # Print models that didn't fail
        print("\nResults:\n")

        for model, data in self._classification_models.items():
            print("{:<20} {:<20} {:<20} {:<20}".format(model, data[0], data[1],
                                                       data[2]))

        print()

        # Print failures, if any
        if self._failures:
            print("The following models failed to run:\n")

            for entry in self._failures:
                print(entry)

        print()