def sigmoidNuSVC():
    maxRandomPerformance = []
    for gamma in range(1, 200):
        clf = NuSVC(kernel="sigmoid", gamma=gamma)
        clf.fit(trainData, trainLabel)
        maxRandomPerformance.append(clf.score(validationData, validationLabel))

    gammaValue = maxRandomPerformance.index(max(maxRandomPerformance)) + 1
    clfFinal = NuSVC(kernel='sigmoid', gamma=gammaValue)
    clfFinal.fit(trainData, trainLabel)
    score = clfFinal.score(testData, testLabel)

    guideToGraph['Sigmoid Nu-SVC'] = score
def polyNuSVC():
    maxRandomPerformance = []

    for deg in range(1, 200):
        clf = NuSVC(kernel="poly", degree=deg)
        clf.fit(trainData, trainLabel)
        maxRandomPerformance.append(clf.score(validationData, validationLabel))

    # Map the index of the best validation score back to its degree value.
    degreeValue = maxRandomPerformance.index(max(maxRandomPerformance)) + 1
    clfFinal = NuSVC(kernel='poly', degree=degreeValue)
    clfFinal.fit(trainData, trainLabel)
    score = clfFinal.score(testData, testLabel)

    guideToGraph['Polynomial Nu-SVC'] = score
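# The two validation-set searches above can be written more compactly with
# scikit-learn's GridSearchCV. A minimal sketch, assuming the same global
# trainData/trainLabel arrays; cv=3 and the degree range are illustrative
# choices, not taken from the original code.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import NuSVC

def gridSearchNuSVC():
    param_grid = [
        {"kernel": ["sigmoid"], "gamma": list(range(1, 200))},
        {"kernel": ["poly"], "degree": list(range(1, 10))},
    ]
    search = GridSearchCV(NuSVC(), param_grid, cv=3)
    search.fit(trainData, trainLabel)
    return search.best_estimator_, search.best_params_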
Example #5
def svm_nu(training, labels, test, real):
    # Iterate over several kernels with the other NuSVC parameters held fixed.
    kern = ['rbf', 'poly', 'linear']
    for kernel in kern:
        model = NuSVC(kernel=kernel, nu=0.38, degree=3, gamma=0.00005, coef0=1)
        model.fit(training, labels)
        accuracy = model.score(test, real)
        print("kernel:", kernel, ", accuracy:", accuracy)
Example #6
def svmClassifier():
    for deg in range(1, 200):
        print(deg)
        print("RBF Nu-SVC")
        clf = NuSVC(gamma=deg)
        clf.fit(trainData, trainLabel)
        print(clf.score(testData, testLabel))

        # The linear kernel ignores gamma, so this block refits an identical
        # model on every iteration.
        print("LINEAR Nu-SVC")
        clf = NuSVC(kernel="linear")
        clf.fit(trainData, trainLabel)
        print(clf.score(testData, testLabel))

        print("POLYNOMIAL Nu-SVC")
        clf = NuSVC(kernel="poly", gamma=deg)
        clf.fit(trainData, trainLabel)
        print(clf.score(testData, testLabel))

        print("SIGMOID Nu-SVC")
        clf = NuSVC(kernel="sigmoid", gamma=deg)
        clf.fit(trainData, trainLabel)
        print(clf.score(testData, testLabel))
Example #7
def svm(train_feature, train_label):
    clf = NuSVC(kernel='rbf', gamma='scale', probability=True)
    clf.fit(train_feature, train_label)
    # Probability estimates on the training set; column 1 is the
    # positive-class probability.
    pre_score = clf.predict_proba(train_feature)
    _pre_label = []
    _pre_score = []
    for item in pre_score:
        if item[0] > item[1]:
            _pre_label.append('0')
        else:
            _pre_label.append('1')
        _pre_score.append(item[1])
    print('The auc is: {}'.format(roc_auc_score(train_label, _pre_score)))
    return clf.score(train_feature, train_label), _pre_score
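# An equivalent, vectorized form of the labeling loop above (a sketch;
# predict_proba columns are ordered by clf.classes_, so column 1 is the
# positive class for binary labels):
import numpy as np

def svm_proba_outputs(clf, features):
    probs = clf.predict_proba(features)
    pre_score = probs[:, 1]
    pre_label = clf.classes_[np.argmax(probs, axis=1)]
    return pre_label, pre_score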
def runClassifier(classifier, trainData, trainLabel, testData, testLabel, bestParameters):
    if classifier[0] == 'KNN':
        neighTest = KNeighborsClassifier(n_neighbors=int(bestParameters['KNN'][0]), algorithm='auto', p=2, weights=bestParameters['KNN'][1])
        neighTest.fit(trainData, trainLabel)
        scoreTest = neighTest.score(testData, testLabel)
        return scoreTest - classifier[1]
    elif classifier[0] == 'Random Forests':
        neighTest = RandomForestClassifier(n_estimators=int(bestParameters['Random Forests'][0]), criterion=bestParameters['Random Forests'][1])
        neighTest.fit(trainData, trainLabel)
        scoreTest = neighTest.score(testData, testLabel)
        return scoreTest - classifier[1]
    elif classifier[0] == 'Linear Nu-SVC':
        clf = NuSVC(kernel="linear")
        clf.fit(trainData, trainLabel)
        scoreTest = clf.score(testData, testLabel)
        return scoreTest - classifier[1]
    elif classifier[0] == 'RBF Nu-SVC':
        clfFinal = NuSVC(gamma=bestParameters['RBF Nu-SVC'])
        clfFinal.fit(trainData, trainLabel)
        score = clfFinal.score(testData, testLabel)
        return score - classifier[1]
    elif classifier[0] == 'Gradient Boosting':
        neighTest = GradientBoostingClassifier(n_estimators=int(bestParameters['Gradient Boosting'][0]), loss='deviance')
        neighTest.fit(trainData, trainLabel)
        scoreTest = neighTest.score(testData, testLabel)
        return scoreTest - classifier[1]
    elif classifier[0] == 'Multinomial Naive Bayes':
        clfTest = MultinomialNB(alpha=bestParameters['Multinomial Naive Bayes'], fit_prior=True)
        clfTest.fit(trainData, trainLabel)
        scoreTest = clfTest.score(testData, testLabel)
        return scoreTest - classifier[1]
    elif classifier[0] == 'Decision (IG)':
        clf = tree.DecisionTreeClassifier(criterion='entropy')
        clf.fit(trainData, trainLabel)
        scoreTest = clf.score(testData, testLabel)
        return scoreTest - classifier[1]
def nusvc_classifier(dir_models, ticket, x, x_test, y, y_test):
    print('getting model...NuSVC')
    clf = NuSVC(nu=0.8)

    print('training...')
    clf.fit(x, y)

    print('predicting...')
    predicted = clf.predict(x_test)
    print(classification_report(y_test, predicted))

    model_id = len(os.listdir(dir_models))
    joblib.dump(clf, os.path.join(dir_models, ticket + '_nusvc_' + str(model_id) + '.pkl'))

    return clf.score(x_test, y_test)
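# Reloading a persisted model for reuse, mirroring the naming scheme used
# above (a sketch; model_id is whatever counter value the dump used):
import os
import joblib

def load_nusvc_classifier(dir_models, ticket, model_id):
    return joblib.load(os.path.join(dir_models, ticket + '_nusvc_' + str(model_id) + '.pkl'))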
Example #10
def fd_svm_time(train, test, ytrain, ytest, seq):
    # Sum each window of `seq` consecutive rows so that sample i aggregates
    # rows i .. i+seq-1; the labels are aligned to the end of each window.
    for i in range(len(train) - seq + 1):
        for j in range(1, seq):
            train[i] = train[i] + train[i + j]
    train = train[:-seq + 1]
    train = np.array(train).astype('float64')
    train_y = np.array(ytrain[seq - 1:]).astype('float64')
    for i in range(len(test) - seq + 1):
        for j in range(1, seq):
            test[i] = test[i] + test[i + j]
    test = test[:-seq + 1]
    test = np.array(test).astype('float64')
    test_y = np.array(ytest[seq - 1:]).astype('float64')
    clf = NuSVC()
    clf.fit(train, train_y)
    return clf.score(test, test_y)
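# An equivalent, vectorized form of the window summing above (a sketch,
# assuming 2-D numeric input): row i of the result is the sum of rows
# i .. i+seq-1, computed via a cumulative-sum difference.
import numpy as np

def sum_windows(x, seq):
    x = np.asarray(x, dtype='float64')
    padded = np.vstack([np.zeros((1, x.shape[1])), x])
    c = np.cumsum(padded, axis=0)
    return c[seq:] - c[:-seq]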
Example #11
def svm_models(x_train, y_train):
    from sklearn.svm import SVC
    classifier1 = SVC(kernel='rbf', random_state=0)
    classifier1.fit(x_train, y_train)

    from sklearn.svm import NuSVC
    classifier2 = NuSVC(kernel='rbf', random_state=0)
    classifier2.fit(x_train, y_train)

    from sklearn.svm import LinearSVC
    classifier3 = LinearSVC(dual=False)
    classifier3.fit(x_train, y_train)

    print('SVC training accuracy: ', classifier1.score(x_train, y_train))
    print('NuSVC training accuracy: ', classifier2.score(x_train, y_train))
    print('LinearSVC training accuracy: ', classifier3.score(x_train, y_train))

    return classifier1, classifier2, classifier3
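# Training-set accuracy (as printed above) tends to overstate performance;
# a quick held-out check, sketched with 5-fold cross-validation:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

def svm_cv_accuracy(x_train, y_train):
    return cross_val_score(SVC(kernel='rbf', random_state=0), x_train, y_train, cv=5).mean()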
class AdaBoostClassifier(Classifier):
    def __init__(self,
                 train_set=None,
                 val_set=None,
                 data_file=None,
                 header=0,
                 test_size=0.2,
                 feature_col_range=[1, 9],
                 label_col=-1,
                 features_degree=1,
                 features_scaling=False):
        Classifier.__init__(self, train_set, val_set, data_file, header,
                            test_size, feature_col_range, label_col,
                            features_degree, features_scaling)

    def fit(self):

        print('Using AdaBoost Classifier...')

        # `Model` is defined elsewhere in the source repository; given the
        # `nu` keyword, it is presumably an SVM class such as NuSVC, despite
        # this class's AdaBoost name.
        self.model = Model(nu=0.000001)

        self.model.fit(self.X_train, self.y_train)

        print('\nTrain Set Accuracy: ',
              self.model.score(self.X_train, self.y_train) * 100)

        # Predicting the Test set results
        if len(self.X_test) > 0:
            print('\nEvaluating on test set...')
            y_pred = self.predict(self.X_test)
            self.score = self.evaluate(X=self.X_test, y=self.y_test)

            # Making the Confusion Matrix
            self.cm = confusion_matrix(self.y_test,
                                       y_pred,
                                       labels=list(range(11)))

    def save_model(self, file_name=None):
        if file_name is None:
            file_name = 'adaboost_model_' + str(
                int(round(self.score * 10000, 1)))
        joblib.dump(self.model, file_name)
def linearNuSVC():
    clf = NuSVC(kernel="linear")
    clf.fit(trainData, trainLabel)
    guideToGraph['Linear Nu-SVC'] = clf.score(validationData, validationLabel)
Пример #14
0
class SVM:
    """
    Wrapper class around scikit-learn's support vector machine functionality.
    This class supports binary and multi-class classification on a dataset, along with regression via Support Vector
    Regression (SVR).
    Per scikit-learn's documentation:

    Support vector machines (SVMs) are a set of supervised learning methods used for classification, regression, and
    outlier detection.

    The advantages of support vector machines are:

        – Effective in high dimensional spaces.
        – Still effective in cases where number of dimensions is greater than the number of samples.
        – Uses a subset of training points in the decision function (called support vectors), so it is also memory
        efficient.
        – Versatile: different Kernel functions can be specified for the decision function. Common kernels are provided,
        but it is also possible to specify custom kernels.

    The disadvantages of support vector machines include:

        – If the number of features is much greater than the number of samples, avoiding over-fitting when choosing
        kernel functions and the regularization term is crucial.
        – SVMs do not directly provide probability estimates; these are calculated using an expensive five-fold
        cross-validation.
    """
    def __init__(self, attributes=None, labels=None, test_size=0.25):
        """
        Initializes an SVM object.

        The following parameters are needed to use an SVM:

            – attributes: a numpy array of the independent variables
            – labels: a numpy array of the classes (for classification) or dependent variables (for regression)
            – test_size: the proportion of the dataset to be used for testing the model (defaults to 0.25);
            the proportion of the dataset to be used for training will be the complement of test_size

        After successfully running one of the classifier methods (SVC(), nu_SVC(), or linear_SVC()), the corresponding
        classifier below will be trained:

            – classifier_SVC: a classifier trained using scikit-learn's SVC implementation
            – accuracy_SVC: the accuracy of the SVC model, based on its predictions for dataset_X_test
            – roc_auc_SVC: the area under the ROC curve for the SVC model
            – classifier_nu_SVC: a classifier trained using scikit-learn's NuSVC implementation
            – accuracy_nu_SVC: the accuracy of the NuSVC model, based on its predictions for dataset_X_test
            – roc_auc_nu_SVC: the area under the ROC curve for the NuSVC model
            – classifier_linear_SVC: a classifier trained using scikit-learn's LinearSVC implementation
            – accuracy_linear_SVC: the accuracy of the LinearSVC model, based on its predictions for dataset_X_test

        After successfully running one of the regression methods (SVR(), nu_SVR(), or linear_SVR()), the corresponding
        regression model below will be trained:

            – regression_SVR: a regression model trained using scikit-learn's SVR implementation
            – r2_score_SVR: the coefficient of determination for the SVR model
            – r_score_SVR: the correlation coefficient for the SVR model
            – regression_nu_SVR: a regression model trained using scikit-learn's NuSVR implementation
            – r2_score_nu_SVR: the coefficient of determination for the NuSVR model
            – r_score_nu_SVR: the correlation coefficient for the NuSVR model
            – regression_linear_SVR: a regression model trained using scikit-learn's LinearSVR implementation
            – r2_score_linear_SVR: the coefficient of determination for the LinearSVR model
            – r_score_linear_SVR: the correlation coefficient for the LinearSVR model
        """
        self.attributes = attributes
        self.labels = labels
        self.test_size = test_size

        self.classifier_SVC = None
        self.accuracy_SVC = None
        self.roc_auc_SVC = None
        self.classifier_nu_SVC = None
        self.accuracy_nu_SVC = None
        self.roc_auc_nu_SVC = None
        self.classifier_linear_SVC = None
        self.accuracy_linear_SVC = None

        self.regression_SVR = None
        self.r2_score_SVR = None
        self.r_score_SVR = None
        self.regression_nu_SVR = None
        self.r2_score_nu_SVR = None
        self.r_score_nu_SVR = None
        self.regression_linear_SVR = None
        self.r2_score_linear_SVR = None
        self.r_score_linear_SVR = None

        # References to training and testing subsets of dataset; instance data for re-use purposes
        self.dataset_X_train = None
        self.dataset_y_train = None
        self.dataset_X_test = None
        self.dataset_y_test = None

    # Accessor Methods

    def get_attributes(self):
        """
        Accessor method for attributes.

        If an SVM object is initialized without specifying attributes, attributes will be None. No SVM functionality can
        be used until attributes is a populated numpy array. Call set_attributes(new_attributes) to fix this.
        """
        return self.attributes

    def get_labels(self):
        """
        Accessor method for labels.

        If an SVM object is initialized without specifying labels, labels will be None. No SVM functionality can be used
        until labels is a populated numpy array. Call set_labels(new_labels) to fix this.
        """
        return self.labels

    def get_test_size(self):
        """
        Accessor method for test_size.

        Should return a number or None.
        """
        return self.test_size

    def get_classifier_SVC(self):
        """
        Accessor method for classifier_SVC.

        Will return None if SVC() hasn't successfully run, yet.
        """
        return self.classifier_SVC

    def get_accuracy_SVC(self):
        """
        Accessor method for accuracy_SVC.

        Will return None if SVC() hasn't successfully run, yet.
        """
        return self.accuracy_SVC

    def get_roc_auc_SVC(self):
        """
        Accessor method for roc_auc_SVC.

        Will return None if SVC() hasn't successfully run, yet.
        """
        return self.roc_auc_SVC

    def get_classifier_nu_SVC(self):
        """
        Accessor method for classifier_nu_SVC.

        Will return None if nu_SVC() hasn't successfully run, yet.
        """
        return self.classifier_nu_SVC

    def get_accuracy_nu_SVC(self):
        """
        Accessor method for accuracy_nu_SVC.

        Will return None if nu_SVC() hasn't successfully run, yet.
        """
        return self.accuracy_nu_SVC

    def get_roc_auc_nu_SVC(self):
        """
        Accessor method for roc_auc_nu_SVC.

        Will return None if nu_SVC() hasn't successfully run, yet.
        """
        return self.roc_auc_nu_SVC

    def get_classifier_linear_SVC(self):
        """
        Accessor method for classifier_linear_SVC.

        Will return None if linear_SVC() hasn't successfully run, yet.
        """
        return self.classifier_linear_SVC

    def get_accuracy_linear_SVC(self):
        """
        Accessor method for accuracy_linear_SVC.

        Will return None if linear_SVC() hasn't successfully run, yet.
        """
        return self.accuracy_linear_SVC

    def get_regression_SVR(self):
        """
        Accessor method for regression_SVR.

        Will return None if SVR() hasn't successfully run, yet.
        """
        return self.regression_SVR

    def get_r2_score_SVR(self):
        """
        Accessor method for r2_score_SVR.

        Will return None if SVR() hasn't successfully run, yet.
        """
        return self.r2_score_SVR

    def get_r_score_SVR(self):
        """
        Accessor method for r_score_SVR.

        Will return None if SVR() hasn't successfully run, yet.
        """
        return self.r_score_SVR

    def get_regression_nu_SVR(self):
        """
        Accessor method for regression_nu_SVR.

        Will return None if nu_SVR() hasn't successfully run, yet.
        """
        return self.regression_nu_SVR

    def get_r2_score_nu_SVR(self):
        """
        Accessor method for r2_score_nu_SVR.

        Will return None if nu_SVR() hasn't successfully run, yet.
        """
        return self.r2_score_nu_SVR

    def get_r_score_nu_SVR(self):
        """
        Accessor method for r_score_nu_SVR.

        Will return None if nu_SVR() hasn't successfully run, yet.
        """
        return self.r_score_nu_SVR

    def get_regression_linear_SVR(self):
        """
        Accessor method for regression_linear_SVR.

        Will return None if linear_SVR() hasn't successfully run, yet.
        """
        return self.regression_linear_SVR

    def get_r2_score_linear_SVR(self):
        """
        Accessor method for r2_score_linear_SVR.

        Will return None if linear_SVR() hasn't successfully run, yet.
        """
        return self.r2_score_linear_SVR

    def get_r_score_linear_SVR(self):
        """
        Accessor method for r_score_linear_SVR.

        Will return None if linear_SVR() hasn't successfully run, yet.
        """
        return self.r_score_linear_SVR

    # Modifier Methods

    def set_attributes(self, new_attributes=None):
        """
        Modifier method for attributes.

        Input should be a populated numpy array. Defaults to None.
        """
        self.attributes = new_attributes

    def set_labels(self, new_labels=None):
        """
        Modifier method for labels.

        Input should be a populated numpy array. Defaults to None.
        """
        self.labels = new_labels

    def set_test_size(self, new_test_size=0.25):
        """
        Modifier method for test_size.

        Input should be a float between 0.0 and 1.0 or None. Defaults to 0.25. The training size will be set to the
        complement of test_size.
        """
        self.test_size = new_test_size

    # Wrappers for SVM classification classes

    def SVC(self,
            C=1.0,
            kernel="rbf",
            degree=3,
            gamma="scale",
            coef0=0.0,
            shrinking=True,
            probability=False,
            tol=0.001,
            cache_size=200,
            class_weight=None,
            verbose=False,
            max_iter=-1,
            decision_function_shape="ovr",
            break_ties=False,
            random_state=None):
        """
        Wrapper for scikit-learn's C-Support Vector Classification implementation.
        Parameters per scikit-learn's documentation:

            – C: Regularization parameter. The strength of the regularization is inversely proportional to C.
            Must be strictly positive. The penalty is a squared l2 penalty. (Default is 1.0)

            – kernel: Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’, ‘rbf’,
            ‘sigmoid’, ‘precomputed’ or a callable. If none is given, ‘rbf’ will be used. If a callable is given it is
            used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape
            (n_samples, n_samples). (Default is "rbf")
            
            – degree: Degree of the polynomial kernel function ("poly"). Ignored by all other kernels. (Default is 3)
            
            – gamma: Kernel coefficient for "rbf", "poly", and "sigmoid". If gamma="scale", then it uses
            1 / (n_features * training_samples.var()) as value of gamma. If gamma="auto", it uses 1 / n_features.
            (Default is "scale")
            
            – coef0: Independent term in kernel function. It is only significant in "poly" and "sigmoid". (Default is 0.0)
            
            – shrinking: Whether to use the shrinking heuristic. (Default is True)
            
            – probability: Whether to enable probability estimates. This must be enabled prior to calling fit, will slow
            down that method as it internally uses 5-fold cross-validation, and predict_proba may be inconsistent with
            predict. (Default is False)
            
            – tol: Tolerance for stopping criterion. (Default is 1e-3, or 0.001)
            
            – cache_size: Specify the size of the kernel cache in MB. (Default is 200)
            
            – class_weight: Set the parameter C of class i to class_weight[i]*C for SVC. If not given, all classes are
            supposed to have weight one. The “balanced” mode uses the values of y to automatically adjust weights
            inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)).
            (Default is None)
            
            – verbose: Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in
            libsvm that, if enabled, may not work properly in a multithreaded context. (Default is False)
            
            – max_iter: Hard limit on iterations within solver, or -1 for no limit. (Default is -1)
            
            – decision_function_shape: Whether to return a one-vs-rest (‘ovr’) decision function of shape
            (n_samples, n_classes) as all other classifiers, or the original one-vs-one (‘ovo’) decision function of
            libsvm which has shape (n_samples, n_classes * (n_classes - 1) / 2). However, one-vs-one (‘ovo’) is always
            used as multi-class strategy. The parameter is ignored for binary classification. (Default is "ovr")
            
            – break_ties: If true, decision_function_shape='ovr', and number of classes > 2, predict will break ties
            according to the confidence values of decision_function; otherwise the first class among the tied classes is
            returned. Please note that breaking ties comes at a relatively high computational cost compared to a simple
            predict. (Default is False)
            
            – random_state: Controls the pseudo random number generation for shuffling the data for probability
            estimates. Ignored when probability is False. Pass an int for reproducible output across multiple function
            calls. (Default is None)

        The implementation is based on libsvm. The fit time scales at least quadratically with the number of samples
        and may be impractical beyond tens of thousands of samples.
        """
        if self._check_inputs():
            # Initialize classifier
            self.classifier_SVC =\
                SVC(C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, shrinking=shrinking,
                    probability=probability, tol=tol, cache_size=cache_size, class_weight=class_weight, verbose=verbose,
                    max_iter=max_iter, decision_function_shape=decision_function_shape, break_ties=break_ties,
                    random_state=random_state)

            # Split data, if needed; if testing/training sets are still None, call _split_data()
            if self.dataset_X_test is None:
                self._split_data()

            # Train classifier; handle exception if arguments are incorrect
            try:
                self.classifier_SVC.fit(self.dataset_X_train,
                                        self.dataset_y_train)
            except Exception as e:
                print(
                    "An exception occurred while training the SVC model. Check your arguments and try again."
                )
                print("Here is the exception message:")
                print(e)
                self.classifier_SVC = None
                return

            # Evaluate accuracy and ROC-AUC of model using testing set and actual classification
            self.accuracy_SVC = self.classifier_SVC.score(
                self.dataset_X_test, self.dataset_y_test)

            if probability:
                self.roc_auc_SVC = roc_auc_score(
                    self.dataset_y_test,
                    self.classifier_SVC.predict_proba(self.dataset_X_test)[:, 1])

    def nu_SVC(self,
               nu=0.5,
               kernel="rbf",
               degree=3,
               gamma="scale",
               coef0=0.0,
               shrinking=True,
               probability=False,
               tol=0.001,
               cache_size=200,
               class_weight=None,
               verbose=False,
               max_iter=-1,
               decision_function_shape="ovr",
               break_ties=False,
               random_state=None):
        """
        Wrapper for scikit-learn's Nu-Support Vector Classification implementation.
        Per scikit-learn's documentation, NuSVC is similar to SVC, but uses a parameter, nu, to control the number of
        support vectors.
        Parameters per scikit-learn's documentation:

            – nu: An upper bound on the fraction of margin errors and a lower bound of the fraction of support vectors.
            Should be in the interval (0, 1]. (Default is 0.5)
            
            – kernel: Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’, ‘rbf’,
            ‘sigmoid’, ‘precomputed’ or a callable. If none is given, ‘rbf’ will be used. If a callable is given it is
            used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape
            (n_samples, n_samples). (Default is "rbf")
            
            – degree: Degree of the polynomial kernel function ("poly"). Ignored by all other kernels. (Default is 3)
            
            – gamma: Kernel coefficient for "rbf", "poly", and "sigmoid". If gamma="scale", then it uses
            1 / (n_features * training_samples.var()) as value of gamma. If gamma="auto", it uses 1 / n_features.
            (Default is "scale")
            
            – coef0: Independent term in kernel function. It is only significant in "poly" and "sigmoid". (Default is 0.0)
            
            – shrinking: Whether to use the shrinking heuristic. (Default is True)
            
            – probability: Whether to enable probability estimates. This must be enabled prior to calling fit, will slow
            down that method as it internally uses 5-fold cross-validation, and predict_proba may be inconsistent with
            predict. (Default is False)
            
            – tol: Tolerance for stopping criterion. (Default is 1e-3, or 0.001)
            
            – cache_size: Specify the size of the kernel cache in MB. (Default is 200)
            
            – class_weight: Set the parameter C of class i to class_weight[i]*C for SVC. If not given, all classes are
            supposed to have weight one. The “balanced” mode uses the values of y to automatically adjust weights
            inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)).
            (Default is None)
            
            – verbose: Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in
            libsvm that, if enabled, may not work properly in a multithreaded context. (Default is False)
            
            – max_iter: Hard limit on iterations within solver, or -1 for no limit. (Default is -1)
            
            – decision_function_shape: Whether to return a one-vs-rest (‘ovr’) decision function of shape
            (n_samples, n_classes) as all other classifiers, or the original one-vs-one (‘ovo’) decision function of
            libsvm which has shape (n_samples, n_classes * (n_classes - 1) / 2). However, one-vs-one (‘ovo’) is always
            used as multi-class strategy. The parameter is ignored for binary classification. (Default is "ovr")
            
            – break_ties: If true, decision_function_shape='ovr', and number of classes > 2, predict will break ties
            according to the confidence values of decision_function; otherwise the first class among the tied classes is
            returned. Please note that breaking ties comes at a relatively high computational cost compared to a simple
            predict. (Default is False)
            
            – random_state: Controls the pseudo random number generation for shuffling the data for probability
            estimates. Ignored when probability is False. Pass an int for reproducible output across multiple function
            calls. (Default is None)

        The implementation is based on libsvm.
        """
        if self._check_inputs():
            # Initialize classifier
            self.classifier_nu_SVC =\
                NuSVC(nu=nu, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, shrinking=shrinking,
                      probability=probability, tol=tol, cache_size=cache_size, class_weight=class_weight,
                      verbose=verbose, max_iter=max_iter, decision_function_shape=decision_function_shape,
                      break_ties=break_ties, random_state=random_state)

            # Split data, if needed; if testing/training sets are still None, call _split_data()
            if self.dataset_X_test is None:
                self._split_data()

            # Train classifier; handle exception if arguments are incorrect
            try:
                self.classifier_nu_SVC.fit(self.dataset_X_train,
                                           self.dataset_y_train)
            except Exception as e:
                print(
                    "An exception occurred while training the NuSVC model. Check your arguments and try again."
                )
                print("Here is the exception message:")
                print(e)
                self.classifier_nu_SVC = None
                return

            # Evaluate accuracy and ROC-AUC of model using testing set and actual classification
            self.accuracy_nu_SVC = self.classifier_nu_SVC.score(
                self.dataset_X_test, self.dataset_y_test)

            if probability:
                self.roc_auc_nu_SVC = roc_auc_score(
                    self.dataset_y_test,
                    self.classifier_nu_SVC.predict_proba(self.dataset_X_test)[:, 1])

    def linear_SVC(self,
                   penalty="l2",
                   loss="squared_hinge",
                   dual=True,
                   tol=0.0001,
                   C=1.0,
                   multi_class='ovr',
                   fit_intercept=True,
                   intercept_scaling=1,
                   class_weight=None,
                   verbose=0,
                   random_state=None,
                   max_iter=1000):
        """
        Wrapper for scikit-learn's Linear Support Vector Classification implementation. Per scikit-learn's documentation,
        LinearSVC is similar to SVC with a linear kernel, but implemented with liblinear instead of libsvm, providing
        more flexibility in choice of penalties and loss functions. LinearSVC should also scale better to large sample
        sizes. LinearSVC supports both dense and sparse input, and the multiclass support is handled according to a
        one-vs-the-rest scheme.
        Parameters per scikit-learn's documentation:

            – penalty: Specifies the norm used in the penalization. The ‘l2’ penalty is the standard used in SVC. The
            ‘l1’ leads to coef_ vectors that are sparse. (Default is "l2")

            – loss: Specifies the loss function. ‘hinge’ is the standard SVM loss (used e.g. by the SVC class) while
            ‘squared_hinge’ is the square of the hinge loss. (Default is "squared_hinge")

            – dual: Select the algorithm to either solve the dual or primal optimization problem.
            Prefer dual=False when n_samples > n_features. (Default is True)
            
            – tol: Tolerance for stopping criteria. (Default is 1e-4, or 0.0001)
            
            – C: Regularization parameter. The strength of the regularization is inversely proportional to C. Must be
            strictly positive. (Default is 1.0)
            
            – multi_class: Determines the multi-class strategy if y contains more than two classes. "ovr" trains
            n_classes one-vs-rest classifiers, while "crammer_singer" optimizes a joint objective over all classes.
            While crammer_singer is interesting from a theoretical perspective as it is consistent, it is seldom used
            in practice as it rarely leads to better accuracy and is more expensive to compute. If "crammer_singer" is
            chosen, the options loss, penalty and dual will be ignored. (Default is "ovr")
            
            – fit_intercept: Whether to calculate the intercept for this model. If set to false, no intercept will be
            used in calculations (i.e. data is expected to be already centered). (Default is True)
            
            – intercept_scaling: When self.fit_intercept is True, instance vector x becomes [x, self.intercept_scaling],
            i.e. a “synthetic” feature with constant value equals to intercept_scaling is appended to the instance
            vector. The intercept becomes intercept_scaling * synthetic feature weight. Note! The synthetic feature
            weight is subject to l1/l2 regularization as all other features. To lessen the effect of regularization on
            synthetic feature weight (and therefore on the intercept) intercept_scaling has to be increased.
            (Default is 1)
            
            – class_weight: Set the parameter C of class i to class_weight[i]*C for SVC. If not given, all classes are
            supposed to have weight one. The “balanced” mode uses the values of y to automatically adjust weights
            inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)).
            (Default is None)
            
            – verbose: Enable verbose output. Note that this setting takes advantage of a per-process runtime setting
            in liblinear that, if enabled, may not work properly in a multithreaded context. (Default is 0)
            
            – random_state: Controls the pseudo random number generation for shuffling the data for the dual coordinate
            descent (if dual=True). When dual=False the underlying implementation of LinearSVC is not random and
            random_state has no effect on the results. Pass an int for reproducible output across multiple function
            calls. (Default is None)
            
            – max_iter: The maximum number of iterations to be run. (Default is 1000)
        """
        if self._check_inputs():
            # Initialize classifier
            self.classifier_linear_SVC =\
                LinearSVC(penalty=penalty, loss=loss, dual=dual, tol=tol, C=C, multi_class=multi_class,
                          fit_intercept=fit_intercept, intercept_scaling=intercept_scaling, class_weight=class_weight,
                          verbose=verbose, random_state=random_state, max_iter=max_iter)

            # Split data, if needed; if testing/training sets are still None, call _split_data()
            if self.dataset_X_test is None:
                self._split_data()

            # Train classifier; handle exception if arguments are incorrect
            try:
                self.classifier_linear_SVC.fit(self.dataset_X_train,
                                               self.dataset_y_train)
            except Exception as e:
                print(
                    "An exception occurred while training the LinearSVC model. Check your arguments and try again."
                )
                print("Here is the exception message:")
                print(e)
                self.classifier_linear_SVC = None
                return

            # Evaluate accuracy of model using testing set and actual classification
            self.accuracy_linear_SVC = self.classifier_linear_SVC.score(
                self.dataset_X_test, self.dataset_y_test)

    # Wrappers for SVM regression classes

    def SVR(self,
            kernel='rbf',
            degree=3,
            gamma='scale',
            coef0=0.0,
            tol=0.001,
            C=1.0,
            epsilon=0.1,
            shrinking=True,
            cache_size=200,
            verbose=False,
            max_iter=-1):
        """
        Wrapper for scikit-learn's Epsilon-Support Vector Regression implementation. Per scikit-learn's documentation,
        this implementation is based on libsvm. Scaling to tens of thousands of samples is difficult, as the fit time
        complexity is more than quadratic with the number of samples. For large datasets, consider using LinearSVR by
        calling linear_SVR().
        Parameters per scikit-learn's documentation:

            – kernel: Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’, ‘rbf’,
            ‘sigmoid’, ‘precomputed’ or a callable. If none is given, ‘rbf’ will be used. If a callable is given it is
            used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape
            (n_samples, n_samples). (Default is "rbf")
            
            – degree: Degree of the polynomial kernel function ("poly"). Ignored by all other kernels. (Default is 3)
            
            – gamma: Kernel coefficient for "rbf", "poly", and "sigmoid". If gamma="scale", then it uses
            1 / (n_features * training_samples.var()) as value of gamma. If gamma="auto", it uses 1 / n_features.
            (Default is "scale")
            
            – coef0: Independent term in kernel function. It is only significant in "poly" and "sigmoid". (Default is 0.0)

            – tol: Tolerance for stopping criterion. (Default is 1e-3, or 0.001)

            – C: Regularization parameter. The strength of the regularization is inversely proportional to C.
            Must be strictly positive. The penalty is a squared l2 penalty. (Default is 1.0)

            – epsilon: Epsilon in the epsilon-SVR model. It specifies the epsilon-tube within which no penalty is
            associated in the training loss function with points predicted within a distance epsilon from the actual
            value. (Default is 0.1)

            – shrinking: Whether to use the shrinking heuristic. (Default is True)

            – cache_size: Specify the size of the kernel cache in MB. (Default is 200)

            – verbose: Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in
            libsvm that, if enabled, may not work properly in a multithreaded context. (Default is False)

            – max_iter: Hard limit on iterations within solver, or -1 for no limit. (Default is -1)
        """
        if self._check_inputs():
            # Initialize regression model
            self.regression_SVR =\
                SVR(kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, tol=tol, C=C, epsilon=epsilon,
                    shrinking=shrinking, cache_size=cache_size, verbose=verbose, max_iter=max_iter)

            # Split data, if needed; if testing/training sets are still None, call _split_data()
            if self.dataset_X_test is None:
                self._split_data()

            # Train regression model; handle exception if arguments are incorrect and/or if labels isn't
            # quantitative data
            try:
                self.regression_SVR.fit(self.dataset_X_train,
                                        self.dataset_y_train)
            except Exception as e:
                print(
                    "An exception occurred while training the SVR model. Check you arguments and try again."
                )
                print("Does labels only contain quantitative data?")
                print("Here is the exception message:")
                print(e)
                self.regression_SVR = None
                return

            # Get coefficient of determination and correlation coefficient for model
            self.r2_score_SVR = self.regression_SVR.score(
                self.dataset_X_test, self.dataset_y_test)
            self.r_score_SVR = sqrt(self.r2_score_SVR)

    def nu_SVR(self,
               nu=0.5,
               C=1.0,
               kernel='rbf',
               degree=3,
               gamma='scale',
               coef0=0.0,
               shrinking=True,
               tol=0.001,
               cache_size=200,
               verbose=False,
               max_iter=-1):
        """
        Wrapper for scikit-learn's Nu Support Vector Regression implementation. Per scikit-learn's documentation,
        NuSVR uses the parameter nu to control the number of support vectors, similar to NuSVC. Yet unlike in NuSVC,
        where nu replaces C, in NuSVR nu replaces the parameter epsilon of epsilon-SVR. This implementation is based on libsvm.
        Parameters per scikit-learn's documentation:

            – nu: An upper bound on the fraction of margin errors and a lower bound of the fraction of support vectors.
            Should be in the interval (0, 1]. (Default is 0.5)
            
            – C: Regularization parameter. The strength of the regularization is inversely proportional to C.
            Must be strictly positive. The penalty is a squared l2 penalty. (Default is 1.0)

            – kernel: Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’, ‘rbf’,
            ‘sigmoid’, ‘precomputed’ or a callable. If none is given, ‘rbf’ will be used. If a callable is given it is
            used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape
            (n_samples, n_samples). (Default is "rbf")
            
            – degree: Degree of the polynomial kernel function ("poly"). Ignored by all other kernels. (Default is 3)
            
            – gamma: Kernel coefficient for "rbf", "poly", and "sigmoid". If gamma="scale", then it uses
            1 / (n_features * training_samples.var()) as value of gamma. If gamma="auto", it uses 1 / n_features.
            (Default is "scale")
            
            – coef0: Independent term in kernel function. It is only significant in "poly" and "sigmoid". (Default is 0.0)
            
            – shrinking: Whether to use the shrinking heuristic. (Default is True)
                        
            – tol: Tolerance for stopping criterion. (Default is 1e-3, or 0.001)
            
            – cache_size: Specify the size of the kernel cache in MB. (Default is 200)
            
            – verbose: Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in
            libsvm that, if enabled, may not work properly in a multithreaded context. (Default is False)
            
            – max_iter: Hard limit on iterations within solver, or -1 for no limit. (Default is -1)
        """
        if self._check_inputs():
            # Initialize regression model
            self.regression_nu_SVR =\
                NuSVR(nu=nu, C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, shrinking=shrinking, tol=tol,
                      cache_size=cache_size, verbose=verbose, max_iter=max_iter)

            # Split data, if needed; if testing/training sets are still None, call _split_data()
            if self.dataset_X_test is None:
                self._split_data()

            # Train regression model; handle exception if arguments are incorrect and/or if labels isn't
            # quantitative data
            try:
                self.regression_nu_SVR.fit(self.dataset_X_train,
                                           self.dataset_y_train)
            except Exception as e:
                print(
                    "An exception occurred while training the NuSVR model. Check you arguments and try again."
                )
                print("Does labels only contain quantitative data?")
                print("Here is the exception message:")
                print(e)
                self.regression_nu_SVR = None
                return

            # Get coefficient of determination and correlation coefficient for model
            self.r2_score_nu_SVR = self.regression_nu_SVR.score(
                self.dataset_X_test, self.dataset_y_test)
            self.r_score_nu_SVR = sqrt(self.r2_score_nu_SVR)

    def linear_SVR(self,
                   epsilon=0.0,
                   tol=0.0001,
                   C=1.0,
                   loss='epsilon_insensitive',
                   fit_intercept=True,
                   intercept_scaling=1.0,
                   dual=True,
                   verbose=0,
                   random_state=None,
                   max_iter=1000):
        """
        Wrapper for scikit-learn's Linear Support Vector Regression implementation. Per scikit-learn's documentation,
        LinearSVR is similar to SVR with a linear kernel, but is implemented with liblinear instead of libsvm. This
        provides greater flexibility in choice of penalties and loss functions, and should scale better to large sample
        sizes. LinearSVR supports both dense and sparse input.
        Parameters per scikit-learn's documentation:

            – epsilon: Epsilon in the epsilon-SVR model. It specifies the epsilon-tube within which no penalty is
            associated in the training loss function with points predicted within a distance epsilon from the actual
            value. (Default is 0.0)

            – tol: Tolerance for stopping criterion. (Default is 1e-4, or 0.0001)

            – C: Regularization parameter. The strength of the regularization is inversely proportional to C.
            Must be strictly positive. The penalty is a squared l2 penalty. (Default is 1.0)

            – loss: Specifies the loss function. The epsilon-insensitive loss (standard SVR) is the L1 loss, while the
            squared epsilon-insensitive loss (‘squared_epsilon_insensitive’) is the L2 loss.
            (Default is "epsilon_insensitive")

            – fit_intercept: Whether to calculate the intercept for this model. If set to false, no intercept will be
            used in calculations (i.e. data is expected to be already centered). (Default is True)
            
            – intercept_scaling: When self.fit_intercept is True, instance vector x becomes [x, self.intercept_scaling],
            i.e. a “synthetic” feature with constant value equals to intercept_scaling is appended to the instance
            vector. The intercept becomes intercept_scaling * synthetic feature weight. Note! The synthetic feature
            weight is subject to l1/l2 regularization as all other features. To lessen the effect of regularization on
            synthetic feature weight (and therefore on the intercept) intercept_scaling has to be increased.
            (Default is 1)

            – dual: Select the algorithm to either solve the dual or primal optimization problem.
            Prefer dual=False when n_samples > n_features. (Default is True)

            – verbose: Enable verbose output. Note that this setting takes advantage of a per-process runtime setting
            in liblinear that, if enabled, may not work properly in a multithreaded context. (Default is 0)
            
            – random_state: Controls the pseudo random number generation for shuffling the data for the dual coordinate
            descent (if dual=True). When dual=False the underlying implementation of LinearSVR is not random and
            random_state has no effect on the results. Pass an int for reproducible output across multiple function
            calls. (Default is None)
            
            – max_iter: The maximum number of iterations to be run. (Default is 1000)
        """
        if self._check_inputs():
            # Initialize regression model
            self.regression_linear_SVR =\
                LinearSVR(epsilon=epsilon, tol=tol, C=C, loss=loss, fit_intercept=fit_intercept,
                          intercept_scaling=intercept_scaling, dual=dual, verbose=verbose, random_state=random_state,
                          max_iter=max_iter)

            # Split data, if needed; if testing/training sets are still None, call _split_data()
            if self.dataset_X_test is None:
                self._split_data()

            # Train regression model; handle exception if arguments are incorrect and/or labels isn't
            # quantitative data
            try:
                self.regression_linear_SVR.fit(self.dataset_X_train,
                                               self.dataset_y_train)
            except Exception as e:
                print(
                    "An exception occurred while training the LinearSVR model. Check you arguments and try again."
                )
                print("Does labels only contain quantitative data?")
                print("Here is the exception message:")
                print(e)
                self.regression_linear_SVR = None
                return

            # Get coefficient of determination and correlation coefficient for model
            self.r2_score_linear_SVR = self.regression_linear_SVR.score(
                self.dataset_X_test, self.dataset_y_test)
            self.r_score_linear_SVR = sqrt(self.r2_score_linear_SVR)

    # Helper methods

    def _split_data(self):
        """
        Helper method for splitting attributes and labels into training and testing sets.

        This method runs under the assumption that all relevant instance data has been checked for correctness.
        """

        self.dataset_X_train, self.dataset_X_test, self.dataset_y_train, self.dataset_y_test =\
            train_test_split(self.attributes, self.labels, test_size=self.test_size)

    def _check_inputs(self):
        """
        Verifies if instance data is ready for use in SVM model.
        """

        # Check if attributes exists
        if self.attributes is None:
            print(
                "attributes is missing; call set_attributes(new_attributes) to fix this! new_attributes should be a",
                "populated dataset of independent variables.")
            return False

        # Check if labels exists
        if self.labels is None:
            print(
                "labels is missing; call set_labels(new_labels) to fix this! new_labels should be a populated dataset",
                "of classes.")
            return False

        # Check if attributes and labels have same number of rows (samples)
        if self.attributes.shape[0] != self.labels.shape[0]:
            print(
                "attributes and labels don't have the same number of rows. Make sure the number of samples in each",
                "dataset matches!")
            return False

        # Check if test_size is a number
        if self.test_size is not None and not isinstance(
                self.test_size, (int, float)):
            print(
                "test_size must be None or a number; call set_test_size(new_test_size) to fix this!"
            )
            return False

        return True
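# A minimal usage sketch of the SVM wrapper above. load_iris is illustrative
# only; the class's module-level imports (SVC, NuSVC, train_test_split,
# roc_auc_score, sqrt, etc.) are assumed to be in scope as in the original
# module.
from sklearn.datasets import load_iris

iris = load_iris()
model = SVM(attributes=iris.data, labels=iris.target, test_size=0.25)
model.nu_SVC(nu=0.5, kernel="rbf")
print("NuSVC accuracy:", model.get_accuracy_nu_SVC())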
Example #15
                    LinearSVC(random_state=rs, tol=1e-5)).fit(X_train, y_train)

##############################################################################
DTC = accuracy_score(y_test, dtc.predict(X_test))
RFC = accuracy_score(y_test, rfc.predict(X_test))
BAG = accuracy_score(y_test, bag.predict(X_test))
EXT = accuracy_score(y_test, ext.predict(X_test))
ADA = accuracy_score(y_test, ada.predict(X_test))
KNN = accuracy_score(y_test, knn_clf.predict(X_test))
NBG = accuracy_score(y_test, gaussian_nb.predict(X_test))
NBB = accuracy_score(y_test, bernoulli_nb.predict(X_test))
MLP = accuracy_score(y_train, mlp.predict(X_train))
MLPs = mlp_clf.score(X_train, y_train)
LOG = lr_clf.score(X_train, y_train)
SVM = accuracy_score(y_test, svm_clf.predict(X_test))
NSVC = nsvc.score(X_train, y_train)
RDG = accuracy_score(y_test, rdg_clf.predict(X_test))
GB = accuracy_score(y_test, gb.predict(X_test))
SGD = sgd_clf.score(X_train, y_train)
LSVC = accuracy_score(y_test, lsvc.predict(X_test))
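# Note: MLP, MLPs, LOG, NSVC, and SGD above are computed on the *training*
# set, unlike the other accuracies, so the values are not directly comparable.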
###################################################
dt = dtc.score(X_test, y_test)
rf = rfc.score(X_test, y_test)
bg = bag.score(X_test, y_test)
et = ext.score(X_test, y_test)
ad = ada.score(X_test, y_test)
kn = knn_clf.score(X_test, y_test)
ng = gaussian_nb.score(X_test, y_test)
nb = bernoulli_nb.score(X_test, y_test)
m1 = mlp.score(X_test, y_test)
m2 = mlp_clf.score(X_test, y_test)
Example #16
wine = load_wine()
X_train, X_test, y_train, y_test = train_test_split(wine.data,
                                                    wine.target,
                                                    test_size=0.3)

model_svc = SVC(kernel='linear')
model_linearsvc = LinearSVC(loss='squared_hinge', multi_class='ovr')
model_nusvc = NuSVC(nu=0.5, kernel='rbf')

model_svc.fit(X_train, y_train)
model_linearsvc.fit(X_train, y_train)
model_nusvc.fit(X_train, y_train)

print("svc train score", model_svc.score(X_train, y_train))
print("linearsvc train score", model_linearsvc.score(X_train, y_train))
print("nusvc train score", model_nusvc.score(X_train, y_train))

# Save the model
joblib.dump(model_svc, "svm.model")
# Load the model
#model_svc = joblib.load("svm.model")

# Get the support vectors
print("support vectors: ", model_svc.support_vectors_)
# Get the indices of the support vectors
print("support vector indices: ", model_svc.support_)
# Get the number of support vectors for each class
print("number of support vectors per class: ", model_svc.n_support_)

# 对测试集做预测
y_pred = model_svc.predict(X_test)
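# Evaluate the held-out predictions computed above (an added sketch):
from sklearn.metrics import accuracy_score
print("svc test accuracy:", accuracy_score(y_test, y_pred))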
import joblib  # sklearn.externals.joblib was removed in newer scikit-learn releases
import os
import face_detector
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

#test_images = np.load('test_images.npy')
#test_target_ages = np.load('test_target_ages.npy')
#test_target_genders = np.load('test_target_genders.npy')

if __name__ == '__main__':
    print('Loading training data and labels')
    train_images = np.load('train_images.npy')
    train_targets = np.load('train_targets.npy')
    print(train_images.shape)

    print('Loading validation data and labels')
    val_images = np.load('val_images.npy')
    val_targets = np.load('val_targets.npy')
    print(val_images.shape)

    nu_svm = NuSVC(nu=0.3, kernel='rbf')
    print('starting train')
    nu_svm.fit(train_images, train_targets)

    print('validation acc:')
    print(nu_svm.score(val_images, val_targets))
    #print gender_nu_svm.score(val_images_genders, val_target_genders)

    joblib.dump(nu_svm, 'SVM_8496_3.pkl')
Example #18
MNB_y_predict = MNB.predict(X_test)
KNC.fit(X_train, y_train)
KNC_y_predict = KNC.predict(X_test)
DTC.fit(X_train, y_train)
DTC_y_predict = DTC.predict(X_test)
RFC.fit(X_train, y_train)
RFC_y_predict = RFC.predict(X_test)
GBDT.fit(X_train, y_train)
GBDT_y_predict = GBDT.predict(X_test)
xgbc.fit(X_train, y_train)
xgbc_y_predict = xgbc.predict(X_test)
#rating
from sklearn.metrics import f1_score
print 'The accuracy of LR is:', LR.score(X_test, y_test)
print 'The accuracy of SGDClassifier is:', SGDC.score(X_test, y_test)
print 'The accuracy of Linear_SVC is:', Linear_SVC.score(X_test, y_test)
print 'The accuracy of Poly_SVC is:', Poly_SVC.score(X_test, y_test)
print 'The accuracy of Rbf_SVC is:', Rbf_SVC.score(X_test, y_test)
print 'The accuracy of Sigmoid_SVC is:', Sigmoid_SVC.score(X_test, y_test)
print 'The accuracy of Naive Bayes is:', MNB.score(X_test, y_test)
print 'The accuracy of KNC is:', KNC.score(X_test, y_test)
print 'The accuracy of decision tree is:', DTC.score(X_test, y_test)
print 'The accuracy of Random Forest is:', RFC.score(X_test, y_test)
print 'The accuracy of GBDT is:', GBDT.score(X_test, y_test)
print 'The accuracy of XGBoost is', xgbc.score(X_test, y_test)

print 'The F1-score of LR is:', f1_score(y_test, LR_y_predict)
print 'The F1-score of SGDClassifier is:', f1_score(y_test, SGDC_y_predict)
print 'The F1-score of Linear_SVC is:', f1_score(y_test, Linear_SVC_y_predict)
print 'The F1-score of Poly_SVC is:', f1_score(y_test, Poly_SVC_y_predict)
print 'The F1-score of Rbf_SVC is:', f1_score(y_test, Rbf_SVC_y_predict)
0.874      | (+/-0.026)       | {'C': 50, 'gamma': 'auto', 'kernel': 'rbf'}
0.870      | (+/-0.033)       | {'C': 50, 'gamma': 'auto', 'kernel': 'sigmoid'}
0.866      | (+/-0.020)       | {'C': 10, 'gamma': 'auto', 'kernel': 'sigmoid'}
0.864      | (+/-0.019)       | {'C': 10, 'gamma': 0.05, 'kernel': 'rbf'}
0.864      | (+/-0.019)       | {'C': 50, 'gamma': 0.05, 'kernel': 'rbf'}
0.856      | (+/-0.014)       | {'C': 1.0, 'gamma': 0.05, 'kernel': 'rbf'}
0.822      | (+/-0.057)       | {'C': 1.0, 'gamma': 'auto', 'kernel': 'rbf'}
0.754      | (+/-0.053)       | {'C': 1.0, 'gamma': 0.02, 'kernel': 'sigmoid'}

Conclusions:
1. rbf outperforms sigmoid.
2. gamma='auto' does not work well; judging by the results, the best score came at gamma = 0.02.
3. C = 10, 50, or even 100 makes no obvious difference to the mean score, but with C = 1 the standard deviation is noticeably larger.
4. 1000 samples is too few, so treat these numbers as indicative only.
"""

svc_clf.fit(X_train_small.astype('float') / 256, y_train_small)

# keep predictions in their own variable so y_test isn't overwritten
y_pred = svc_clf.predict(X_test)
print("\nsvc_clf.score(X_test, y_test):")
print(y_pred)
print(svc_clf.score(X_test, y_test))

gs_clf = GridSearchCV(svc_clf, parameters, n_jobs=1, verbose=1)
gs_clf.fit(X_train_small.astype('float') / 256, y_train_small)
print_grid_mean(gs_clf.grid_scores_)

# end time
elapsed = (time.clock() - start)
print("Time used:", elapsed)
print "Cross-domain error for kitchen-electronics (trained PCA kitchen data to predict test PCA electroincs data)", kitchenSVC.score(
    electronics_test_matrix.todense(), test_label
)
print "In-domain error for kitchen-kitchen (trained PCA kitchen data to predict test PCA kitchen data)", kitchenSVC.score(
    kitchen_test_matrix.todense(), test_label
)


print "----PCA + NON-linear SVM"

# NuSVC results book->all
book_clf = NuSVC()
book_clf.fit(book_train_new_fit, train_label)
print "In-domain error for book-book (trained PCA book data to predict test PCA book data)", book_clf.score(
    book_test_new_fit, test_label
)
print "Cross-domain error for book-DVD (trained PCA book data to predict test PCA DVD data)", book_clf.score(
    DVD_test_new_fit, test_label
)
print "Cross-domain error for book-electronics (trained PCA book data to predict test PCA electroincs data)", book_clf.score(
    electronics_test_new_fit, test_label
)
print "Cross-domain error for book-kitchen (trained PCA book data to predict test PCA kitchen data)", book_clf.score(
    kitchen_test_new_fit, test_label
)

# NuSVC results DVD->all
DVD_clf = NuSVC()
DVD_clf.fit(DVD_train_new_fit, train_label)
print "Cross-domain error for DVD-book (trained PCA DVD data to predict test PCA book data) ", DVD_clf.score(
Example #21
lsvr = LinearSVR()
print 'LinearSVR config:'
print lsvr.get_params()
lsvr.fit(smr_train.feature_matrix, smr_train.labels)
lsvr_score_train = lsvr.score(smr_train.feature_matrix, smr_train.labels)
print 'LinearSVR score train: {}'.format(lsvr_score_train)
lsvr_score_test = lsvr.score(smr_test.feature_matrix, smr_test.labels)
print 'LinearSVR score test: {}'.format(lsvr_score_test)
print ''

nusvc = NuSVC()
print 'NuSVC config:'
print nusvc.get_params()
nusvc.fit(smr_train.feature_matrix, smr_train.labels)
nusvc_score_train = nusvc.score(smr_train.feature_matrix, smr_train.labels)
print 'NuSVC score train: {}'.format(nusvc_score_train)
nusvc_score_test = nusvc.score(smr_test.feature_matrix, smr_test.labels)
print 'NuSVC score test: {}'.format(nusvc_score_test)
print ''

nusvr = NuSVR()
print 'NuSVR config:'
print nusvr.get_params()
nusvr.fit(smr_train.feature_matrix, smr_train.labels)
nusvr_score_train = nusvr.score(smr_train.feature_matrix, smr_train.labels)
print 'NuSVR score train: {}'.format(nusvr_score_train)
nusvr_score_test = nusvr.score(smr_test.feature_matrix, smr_test.labels)
print 'NuSVR score test: {}'.format(nusvr_score_test)
print ''
Example #22
def fd_svm(train, test, ytrain, ytest):
    clf = NuSVC()
    clf.fit(train, ytrain)
    return clf.score(test, ytest)
def linearNuSVC():
    clf = NuSVC(kernel="linear")
    clf.fit(trainData, trainLabel)
    guideToGraph['Linear Nu-SVC'] = clf.score(validationData, validationLabel)
class AllClassificationModels:
    """
    Wrapper class around all supported classification models: LogisticRegression, MLPClassifier, RandomForest, SVC,
    NuSVC, LinearSVC, and XGBClassifier.
    AllClassificationModels runs every available classification algorithm on the given dataset and outputs the mean
    accuracy, ROC-AUC, and execution time of each successful model when all_classification_models() is run.
    """
    def __init__(self,
                 attributes=None,
                 labels=None,
                 test_size=0.25,
                 verbose=False):
        """
        Initializes an AllClassificationModels object.

        The following parameters are needed to use an AllClassificationModels object:

            – attributes: a numpy array of the desired independent variables (Default is None)
            – labels: a numpy array of the classes (Default is None)
            – test_size: the proportion of the dataset to be used for testing the model;
            the proportion of the dataset to be used for training will be the complement of test_size (Default is 0.25)
            – verbose: specifies whether or not to output any and all logging during model training (Default is False)

            Note: These are the only parameters allowed. All other parameters for each model will use their default
            values. For more granular control, please instantiate each model individually.

        The following instance data is found after running all_classification_models() successfully:

            – logistic_regression: a reference to the LogisticRegression model
            – MLP: a reference to the MLPClassifier model
            – random_forest: a reference to the RandomForest model
            – SVC: a reference to the SVC model
            – nu_SVC: a reference to the NuSVC model
            – linear_SVC: a reference to the LinearSVC model
            – XGB_classifier: a reference to the XGBClassifier model

        After running all_classification_models(), the mean accuracy, ROC-AUC (if available), and execution time for
        each model that ran successfully will be displayed in tabular form. Any models that failed to run will be listed.
        """
        self.attributes = attributes
        self.labels = labels
        self.test_size = test_size
        self.verbose = verbose

        self.logistic_regression = LogisticRegression(verbose=self.verbose)
        self.MLP = MLPClassifier(verbose=self.verbose)
        self.random_forest = RandomForestClassifier(verbose=self.verbose)
        self.SVC = SVC(verbose=self.verbose, probability=True)
        self.nu_SVC = NuSVC(verbose=self.verbose, probability=True)
        self.linear_SVC = LinearSVC(verbose=self.verbose)
        self.XGB_classifier = XGBClassifier(verbosity=int(self.verbose))

        self._classification_models = {
            "Model": ["Accuracy", "ROC-AUC", "Time"]
        }
        self._failures = []

    # Accessor methods

    def get_attributes(self):
        """
        Accessor method for attributes.

        If an AllClassificationModels object is initialized without specifying attributes, attributes will be None.
        all_classification_models() cannot be called until attributes is a populated numpy array of independent variables;
        call set_attributes(new_attributes) to fix this.
        """
        return self.attributes

    def get_labels(self):
        """
        Accessor method for labels.

        If an AllClassificationModels object is initialized without specifying labels, labels will be None.
        all_classification_models() cannot be called until labels is a populated numpy array of classes;
        call set_labels(new_labels) to fix this.
        """
        return self.labels

    def get_test_size(self):
        """
        Accessor method for test_size.

        Should return a number or None.
        """
        return self.test_size

    def get_verbose(self):
        """
        Accessor method for verbose.

        Will default to False if not set by the user.
        """
        return self.verbose

    def get_all_classification_models(self):
        """
        Accessor method that returns a list of all models.

        All models in the list will be unfitted if all_classification_models() hasn't been called yet.
        """
        return [
            self.logistic_regression, self.MLP, self.random_forest, self.SVC,
            self.nu_SVC, self.linear_SVC, self.XGB_classifier
        ]

    def get_logistic_regression(self):
        """
        Accessor method for logistic_regression.

        Will return the unfitted model if all_classification_models() hasn't been called yet.
        """
        return self.logistic_regression

    def get_MLP(self):
        """
        Accessor method for MLP.

        Will return the unfitted model if all_classification_models() hasn't been called yet.
        """
        return self.MLP

    def get_random_forest(self):
        """
        Accessor method for random_forest.

        Will return the unfitted model if all_classification_models() hasn't been called yet.
        """
        return self.random_forest

    def get_SVC(self):
        """
        Accessor method for SVC.

        Will return the unfitted model if all_classification_models() hasn't been called yet.
        """
        return self.SVC

    def get_nu_SVC(self):
        """
        Accessor method for nu_SVC.

        Will return the unfitted model if all_classification_models() hasn't been called yet.
        """
        return self.nu_SVC

    def get_linear_SVC(self):
        """
        Accessor method for linear_SVC.

        Will return the unfitted model if all_classification_models() hasn't been called yet.
        """
        return self.linear_SVC

    def get_XGB_classifier(self):
        """
        Accessor method for XGB_classifier.

        Will return the unfitted model if all_classification_models() hasn't been called yet.
        """
        return self.XGB_classifier

    # Modifier methods

    def set_attributes(self, new_attributes=None):
        """
        Modifier method for attributes.

        Input should be a numpy array of independent variables. Defaults to None.
        """
        self.attributes = new_attributes

    def set_labels(self, new_labels=None):
        """
        Modifier method for labels.

        Input should be a numpy array of classes. Defaults to None.
        """
        self.labels = new_labels

    def set_test_size(self, new_test_size=0.25):
        """
        Modifier method for test_size.

        Input should be a number or None. Defaults to 0.25.
        """
        self.test_size = new_test_size

    def set_verbose(self, new_verbose=False):
        """
        Modifier method for verbose.

        Input should be a truthy/falsy value. Defaults to False.
        """
        self.verbose = new_verbose

    # Classification functionality

    def all_classification_models(self):
        """
        Driver method for running all classification models with given attributes and labels.
        all_classification_models() first trains the models and determines their mean accuracy, ROC-AUC, and execution
        time via _all_classification_models_runner(). Then, all_classification_models() calls _print_results() to
        format and print each successful model's measurements, while also listing any failed models.

        If verbose is True, all verbose logging for each model will be enabled.
        If verbose is False, all logging to stdout and stderr will be suppressed.
        """

        # Call helper method for running all classification models; suppress output, if needed
        if not self.verbose:
            suppress_output = io.StringIO()
            with redirect_stderr(suppress_output), redirect_stdout(
                    suppress_output):
                self._all_classification_models_runner()
        else:
            self._all_classification_models_runner()

        # Print results
        self._print_results()

    # Helper methods

    def _all_classification_models_runner(self):
        """
        Helper method that runs all models using the given dataset and all default parameters.
        After running all models, each model is determined to be either a success or failure, and relevant data
        (accuracy, ROC-AUC, execution time) is recorded.

        _all_classification_models_runner() may only be called by all_classification_models().
        """

        # Split dataset
        dataset_X_train, dataset_X_test, dataset_y_train, dataset_y_test =\
            train_test_split(self.attributes, self.labels, test_size=self.test_size)

        # Run and time all models; identify each as success or failure
        try:
            start_time = time.time()
            self.logistic_regression.fit(dataset_X_train, dataset_y_train)
            end_time = time.time()
            self._classification_models["LogisticRegression"] =\
                [self.logistic_regression.score(dataset_X_test, dataset_y_test),
                roc_auc_score(self.logistic_regression.predict(dataset_X_test),
                              self.logistic_regression.predict_proba(dataset_X_test)[::, 1]),
                end_time - start_time]
        except:
            self._failures.append("LogisticRegression")

        try:
            start_time = time.time()
            self.MLP.fit(dataset_X_train, dataset_y_train)
            end_time = time.time()
            self._classification_models["MLPClassifier"] =\
                [self.MLP.score(dataset_X_test, dataset_y_test),
                    roc_auc_score(self.MLP.predict(dataset_X_test), self.MLP.predict_proba(dataset_X_test)[::, 1]),
                    end_time - start_time]
        except:
            self._failures.append("MLPClassifier")

        try:
            start_time = time.time()
            self.random_forest.fit(dataset_X_train, dataset_y_train)
            end_time = time.time()
            self._classification_models["RandomForest"] =\
                [self.random_forest.score(dataset_X_test, dataset_y_test),
                    roc_auc_score(self.random_forest.predict(dataset_X_test),
                                self.random_forest.predict_proba(dataset_X_test)[::, 1]),
                    end_time - start_time]
        except:
            self._failures.append("RandomForest")

        try:
            start_time = time.time()
            self.SVC.fit(dataset_X_train, dataset_y_train)
            end_time = time.time()
            self._classification_models["SVC"] =\
                [self.SVC.score(dataset_X_test, dataset_y_test),
                    roc_auc_score(self.SVC.predict(dataset_X_test), self.SVC.predict_proba(dataset_X_test)[::, 1]),
                    end_time - start_time]
        except:
            self._failures.append("SVC")

        try:
            start_time = time.time()
            self.nu_SVC.fit(dataset_X_train, dataset_y_train)
            end_time = time.time()
            self._classification_models["NuSVC"] =\
                [self.nu_SVC.score(dataset_X_test, dataset_y_test),
                    roc_auc_score(self.nu_SVC.predict(dataset_X_test), self.nu_SVC.predict_proba(dataset_X_test)[::, 1]),
                    end_time - start_time]
        except:
            self._failures.append("NuSVC")

        try:
            start_time = time.time()
            self.linear_SVC.fit(dataset_X_train, dataset_y_train)
            end_time = time.time()
            self._classification_models["LinearSVC"] =\
                [self.linear_SVC.score(dataset_X_test, dataset_y_test), "Not Available", end_time - start_time]
        except:
            self._failures.append("LinearSVC")

        try:
            start_time = time.time()
            self.XGB_classifier.fit(dataset_X_train, dataset_y_train)
            end_time = time.time()
            self._classification_models["XGBClassifier"] =\
                [self.XGB_classifier.score(dataset_X_test, dataset_y_test),
                    roc_auc_score(self.XGB_classifier.predict(dataset_X_test),
                                  self.XGB_classifier.predict_proba(dataset_X_test)[::, 1]),
                    end_time - start_time]
        except:
            self._failures.append("XGBClassifier")

    def _print_results(self):
        """
        Helper method that prints results of _all_classification_models_runner() in tabular form.

        _print_results() may only be called by all_classification_models() after all models have attempted to run.
        """

        # Print models that didn't fail
        print("\nResults:\n")

        for model, data in self._classification_models.items():
            print("{:<20} {:<20} {:<20} {:<20}".format(model, data[0], data[1],
                                                       data[2]))

        print()

        # Print failures, if any
        if len(self._failures) > 0:
            print("The following models failed to run:\n")

            for entry in self._failures:
                print(entry)

        print()
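# A minimal usage sketch for the wrapper above; the dataset is illustrative,
# and a binary-label problem is chosen because the ROC-AUC computation above
# uses predict_proba(...)[:, 1], which assumes two classes.
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True)
models = AllClassificationModels(attributes=X, labels=y, test_size=0.25)
models.all_classification_models()  # prints accuracy, ROC-AUC, and runtime per model
print(models.get_nu_SVC())          # the NuSVC instance, fitted during the run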
Example #25
def SVC():
    actions = [
        'change_lane', 'pull_over', 'slow', 'stop', 'straight', 'turn_left',
        'turn_right', 'wait_to_turn_left'
    ]

    data_points = []
    with open('/Users/zephyryau/Documents/study/INF552/Project/data.csv',
              'r') as fd:
        for row in fd:
            row_list = row[:-1].split(',')
            sample = [float(i) for i in row_list[:-1]]
            sample.append(actions.index(row_list[-1]))
            if len(sample) == 76:
                data_points.append(sample)

    data_points_xycl = np.array(data_points)
    data_points_xyc = data_points_xycl[:, :-1]
    y = data_points_xycl[:, -1]

    # centralize datapoints and normalize
    data_points_xy_cent = []
    for row in data_points_xyc:
        # print(row)
        avg_x = row[3]
        avg_y = row[4]
        head_length = ((row[0] - row[3])**2 + (row[1] - row[4])**2)**0.5
        shoulder_length = ((row[3] - row[6])**2 + (row[4] - row[7])**2)**0.5
        new_row = []
        for i in range(16):  # first 16 points
            new_row.append((row[3 * i] - avg_x) / shoulder_length)
            new_row.append((row[3 * i + 1] - avg_y) / head_length)
            new_row.append(row[3 * i + 2])  # conf

        data_points_xy_cent.append(new_row)

    result_point = []
    with open(
            '/Users/zephyryau/Documents/study/INF552/Project/input_picture/result.csv',
            'r') as fd:
        for row in fd:
            row_list = row[:-1].split(',')
            sample = [float(i) for i in row_list[:-1]]
            sample.append(actions.index(row_list[-1]))
            if len(sample) == 76:
                result_point.append(sample)

    result_point_xycl = np.array(result_point)
    result_point_xyc = result_point_xycl[:, :-1]
    result_point_y = result_point_xycl[:, -1]

    result_point_xy_cent = []
    for row in result_point_xyc:
        # print(row)
        avg_x = row[3]
        avg_y = row[4]
        head_length = ((row[0] - row[3])**2 + (row[1] - row[4])**2)**0.5
        shoulder_length = ((row[3] - row[6])**2 + (row[4] - row[7])**2)**0.5
        new_row = []
        for i in range(16):  # first 16 points
            new_row.append((row[3 * i] - avg_x) / shoulder_length)
            new_row.append((row[3 * i + 1] - avg_y) / head_length)
            new_row.append(row[3 * i + 2])  # conf

        result_point_xy_cent.append(new_row)

    X_train, X_test, y_train, y_test = train_test_split(data_points_xy_cent,
                                                        y,
                                                        test_size=0.4)
    scaler = preprocessing.StandardScaler().fit(X_train)
    #print(scaler.mean_)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    score_sum = 0  # avoid shadowing the built-in sum
    clf = NuSVC(nu=0.2)
    for i in range(10):
        X_train, X_test, y_train, y_test = train_test_split(
            data_points_xy_cent, y, test_size=0.4)
        scaler = preprocessing.StandardScaler().fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        clf.fit(X_train_scaled, y_train)
        #print(clf.n_iter_, end=" ")
        score_train = clf.score(X_train_scaled, y_train)
        #print(score_train, end=" ")
        score_test = clf.score(X_test_scaled, y_test)
        score_sum += score_test  #print(score_test)

    # transform the query point with the scaler fitted on the final training split
    r_X_scaled = scaler.transform(result_point_xy_cent)
    tf = (clf.predict([r_X_scaled[0]])[0] == result_point_y[0])

    return clf.predict([r_X_scaled[0]])[0], score_sum / 10, tf
Example #26
import numpy as np
import scipy.io as sio
import tensorflow as tf
import tensorflow.contrib as skflow
from tensorflow.examples.tutorials.mnist import input_data
from sklearn.cross_validation import *
from sklearn.svm import SVC, LinearSVC, NuSVC

data = sio.loadmat("/home/hardik/Desktop/MTech_Project/MTechData/MingData/SURFFeatures.mat")
X = data['X']
y = data['y']
y2 = data['y2']

X_train, X_test, y_train, y_test = train_test_split(X, y2, test_size=0.25)

y_train = y_train.flatten()
y_test = y_test.flatten()

print(y_train.flatten().shape)

model1 = NuSVC()
model1.fit(X_train, y_train)
print(model1.score(X_test, y_test))
SVC_classifier.fit(train_arrays, train_labels)
print('SVC Accuracy: %.2f' %SVC_classifier.score(test_arrays, test_labels))

LinearSVC_classifier = LinearSVC(multi_class='ovr')
LinearSVC_classifier.fit(train_arrays, train_labels)
print('LinearSVC Accuracy: %.2f' %LinearSVC_classifier.score(test_arrays, test_labels))

SGD_classifier = SGDClassifier()
SGD_classifier.fit(train_arrays, train_labels)
print('SGDClassifier Accuracy: %.2f' %SGD_classifier.score(test_arrays, test_labels))


try:
    NuSVC_classifier = NuSVC()
    NuSVC_classifier.fit(train_arrays, train_labels)
    print('NuSVC Accuracy: %.2f' %NuSVC_classifier.score(test_arrays, test_labels))
except:
    pass

try:
    MultinomialNB_classifier = MultinomialNB()
    MultinomialNB_classifier.fit(train_arrays, train_labels)
    print('MultinomialNB Accuracy: %.2f' %MultinomialNB_classifier.score(test_arrays, test_labels))
except:
    pass

try:
    BernoulliNB_classifier = BernoulliNB()
    BernoulliNB_classifier.fit(train_arrays, train_labels)
    print('BernoulliNB Accuracy: %.2f' %BernoulliNB_classifier.score(test_arrays, test_labels))
except:
Example #28
X_test_features_1_ttb_kernel = cuda_kernel_features_test(
    one_dim_ttb_train_, one_dim_ttb_test_, index_train_1_ttb, index_test_1_ttb,
    s)
X_test_features_1_btt_kernel = cuda_kernel_features_test(
    one_dim_btt_train_, one_dim_btt_test_, index_train_1_btt, index_test_1_btt,
    s)

print("Cuda time: ", time.time() - start)

train_kernel = X_train_features_1_btt_kernel + X_train_features_1_ttb_kernel + X_train_features_1_rtl_kernel + X_train_features_1_ltr_kernel + X_train_features_0_btt_kernel + X_train_features_0_ttb_kernel + X_train_features_0_rtl_kernel + X_train_features_0_ltr_kernel
test_kernel = X_test_features_1_btt_kernel + X_test_features_1_ttb_kernel + X_test_features_1_rtl_kernel + X_test_features_1_ltr_kernel + X_test_features_0_btt_kernel + X_test_features_0_ttb_kernel + X_test_features_0_rtl_kernel + X_test_features_0_ltr_kernel

svc_model = NuSVC(kernel='precomputed')
svc_model.fit(train_kernel, y_train)

kernel_numba_train_accuracy_svm = svc_model.score(train_kernel, y_train)
print("score numba train:", kernel_numba_train_accuracy_svm)
kernel_numba_test_accuracy_svm = svc_model.score(test_kernel, y_test)
print("score numba test:", kernel_numba_test_accuracy_svm)

kernel_numba_train_accuracy_svm = np.array(kernel_numba_train_accuracy_svm)
kernel_numba_train_accuracy_svm = np.reshape(kernel_numba_train_accuracy_svm,
                                             newshape=(-1, 1))
kernel_numba_train_mean = np.mean(kernel_numba_train_accuracy_svm)
kernel_numba_train_std = np.std(kernel_numba_train_accuracy_svm)

kernel_numba_test_accuracy_svm = np.array(kernel_numba_test_accuracy_svm)
kernel_numba_test_accuracy_svm = np.reshape(kernel_numba_test_accuracy_svm,
                                            newshape=(-1, 1))
kernel_numba_test_mean = np.mean(kernel_numba_test_accuracy_svm)
kernel_numba_test_std = np.std(kernel_numba_test_accuracy_svm)
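# For reference, a minimal self-contained sketch of NuSVC with a precomputed
# kernel, as used above: fit() takes the train-vs-train Gram matrix, and
# predict()/score() take the test-vs-train matrix (the data here is illustrative).
import numpy as np
from sklearn.svm import NuSVC

rng = np.random.RandomState(0)
X_tr = rng.randn(40, 5)
y_tr = rng.randint(0, 2, 40)
X_te = rng.randn(10, 5)

gram_train = X_tr @ X_tr.T  # shape (n_train, n_train); linear kernel for brevity
gram_test = X_te @ X_tr.T   # shape (n_test, n_train)

clf = NuSVC(kernel='precomputed', nu=0.3)
clf.fit(gram_train, y_tr)
print(clf.predict(gram_test))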
Example #29
    clf = svm.SVC(probability=True)
else:
    clf = NuSVC()

clf.fit(train_data, train_labels)
print('Training done')

print('PREDICTION FOR VALIDATION DATA')
Y = clf.predict(validation_data)

correct = 0
for i in range(len(Y)):
    # print(i,Y[i])
    if (Y[i] == validation_labels[i]):
        correct = correct + 1
print("TRAINING SCORE BY SVM = ", 100 * clf.score(train_data, train_labels))
print('validation_correctly_predicted = ', correct, ' OUT OF ', len(Y),
      ' ACCURACY = ', float(correct * 100) / len(Y))
print('TRAINING DATA Prediction done')

print('saving model')

if (sys.argv[1] == '0'):
    s = joblib.dump(clf, 'model_svm_linear.pkl')
    # s = joblib.dump(clf,'model.pkl')

elif (sys.argv[1] == '1'):
    s = joblib.dump(clf, 'model_svm_svc.pkl')
else:
    s = joblib.dump(clf, 'model_svm_nu.pkl')
Example #30
# Y is just the class or the diagnosis column
y = np.array(dataframe['val'])

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2)

clf_lsvc = LinearSVC()
clf = svm.SVC()
clf_nu = NuSVC()

clf.fit(X_train, y_train)
clf_nu.fit(X_train, y_train)
clf_lsvc.fit(X_train, y_train)

accuracy_svm = clf.score(X_test, y_test)
accuracy_nu = clf_nu.score(X_test, y_test)
accuracy_lsvc = clf_lsvc.score(X_test, y_test)

print("SVC", accuracy_svm)
print("NuSVC", accuracy_nu)
print("Linear SVC", accuracy_lsvc)

# Cross Validation
predicted_svm = cross_val_predict(clf, X, y, cv=10)
predicted_nu = cross_val_predict(clf_nu, X, y, cv=10)
predicted_lsvc = cross_val_predict(clf_lsvc, X, y, cv=10)

print("SVC Cross-validation", metrics.accuracy_score(y, predicted_svm))
print("NuSVC Cross-validation", metrics.accuracy_score(y, predicted_nu))
print("Linear Cross-validation", metrics.accuracy_score(y, predicted_lsvc))
Example #31
nuSVC = NuSVC(nu=0.5,
              kernel='rbf',
              degree=3,
              gamma='scale',
              coef0=0,
              shrinking=True,
              probability=False,
              max_iter=-1,
              decision_function_shape='ovr')
nuSVC.fit(X_train, y_train)

# make predictions for the validation data
y_pred = nuSVC.predict(X_val)

# EVALUATE PREDICTIONS
score = nuSVC.score(X, y)  # accuracy over the full dataset (training rows included)
accuracy = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)  # store under a new name so f1_score isn't shadowed
print("Score: %.2f%%" % (score * 100.0))
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("f1 score: %.2f%%" % (f1 * 100.0))

# PREDICT TEST
test_X = df_test
y_pred_test = nuSVC.predict(test_X)

print(len(test_id))
print(test_X.shape)

# SAVE AND SUBMIT PREDICTION
submission = pd.DataFrame({'Row ID': test_id, 'Predict-Qualified': y_pred_test})
Example #32
def main():
    # filepath: sentence data file path
    # vecfile: word vector file path pre-generated from other
    # vectype: compression methods. Average, avg+tf-idf one line, agg+tf-idf whole data
    # vec_path: vector file save path

    filepath = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/stem_testdata'  # 'data/data_test'
    vecfile = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.6B.50d.txt'

    vec_files = [
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.6B.50d.txt',
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.6B.100d.txt',
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.6B.200d.txt',
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.6B.300d.txt',
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.42B.300d.txt',
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.840B.300d.txt'
    ]
    # relative file paths are hitting permission-denied errors (cause unknown),
    # so we use absolute paths for now
    vec_path = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/word_vector/'

    # Here, we can choose type of vectorization
    # there are 6 word vector file downloaded from glove
    """
    vectype = 1
    for v in vec_files:
        start_time = time.time()
        name = v.split('/')[-1][:-4] + '_vec'
        print(name, 'vectorization in process')
        word_vec_gen(filepath, v, vectype, vec_path+name)
        print("--- %s seconds ---" % (time.time() - start_time))

    vectype = 2
    for v in vec_files:
        start_time = time.time()
        name = v.split('/')[-1][:-4] + '_vec_OnelineTF'
        print(name, 'vectorization in process')
        word_vec_gen(filepath, v, vectype, vec_path + name)
        print("--- %s seconds ---" % (time.time() - start_time))

    vectype = 3
    for v in vec_files:
        start_time = time.time()
        name = v.split('/')[-1][:-4] + '_vec_WholeDataTF'
        print(name, 'vectorization in process')
        word_vec_gen(filepath, v, vectype, vec_path + name)
        print("--- %s seconds ---" % (time.time() - start_time))
    """

    # from here on, to be erased later.

    filepath = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/data_test'  # 'data/stem_testdata'
    #filepath = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/hyp1-hyp2-ref'
    vectype = 1
    start_time = time.time()
    name = vecfile.split('/')[-1][:-4] + '_vec_diffOrder'
    #print(name, 'vectorization in process')
    #word_vec_gen(filepath, vecfile, vectype, vec_path + name)
    #print("--- %s seconds ---" % (time.time() - start_time))

    filepath = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/data_test'  # 'data/stem_testdata'
    vectype = 2
    start_time = time.time()
    name = vecfile.split('/')[-1][:-4] + '_vec_OnelineTF'
    #print(name, 'vectorization in process')
    #word_vec_gen(filepath, vecfile, vectype, vec_path + name)
    #print("--- %s seconds ---" % (time.time() - start_time))

    filepath = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/data_test'  # 'data/stem_testdata'
    vectype = 3
    start_time = time.time()
    name = vecfile.split('/')[-1][:-4] + '_vec_WholeDataTF'
    #print(name, 'vectorization in process')
    #word_vec_gen(filepath, vecfile, vectype, vec_path + name)
    #print("--- %s seconds ---" % (time.time() - start_time))

    vec_path = 'data/word_vector/glove.6B.50d_vec_diffOrder'
    wvec = load_wordvec(vec_path)
    target_path = 'data/dev.answers'
    answer = load_target(target_path)

    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.tree import ExtraTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.neural_network import MLPClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import NuSVC
    from sklearn.multiclass import OneVsOneClassifier
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.svm import LinearSVC

    clf1 = KNeighborsClassifier()
    clf2 = DecisionTreeClassifier()
    clf3 = ExtraTreeClassifier()
    clf4 = MLPClassifier()
    clf5nu = NuSVC()
    clf6lin = LinearSVC()
    # 'sag', 'saga' and 'lbfgs'

    print("Training Starts")
    X_train, X_test, y_train, y_test = train_test_split(wvec,
                                                        answer,
                                                        test_size=0.10,
                                                        random_state=42)
    #clf1.fit(X_train, y_train)
    clf1.fit(X_train, y_train)
    print('KNeighborsClassifier score 50d', clf1.score(X_test, y_test))
    clf2.fit(X_train, y_train)
    print('DecisionTreeClassifier score 50d', clf2.score(X_test, y_test))
    clf3.fit(X_train, y_train)
    print('ExtraTreeClassifier score 50d', clf3.score(X_test, y_test))
    clf4.fit(X_train, y_train)
    print('MLPClassifier score 50d', clf4.score(X_test, y_test))

    clf1 = OneVsRestClassifier(KNeighborsClassifier())
    clf2 = OneVsRestClassifier(DecisionTreeClassifier())
    clf3 = OneVsRestClassifier(ExtraTreeClassifier())
    clf4 = OneVsRestClassifier(MLPClassifier())
    clf5 = OneVsOneClassifier(NuSVC())
    clf6 = OneVsRestClassifier(LinearSVC())

    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import Perceptron
    from sklearn.linear_model import PassiveAggressiveClassifier
    clf7 = OneVsRestClassifier(SGDClassifier())
    clf8 = OneVsRestClassifier(Perceptron())
    clf9 = OneVsRestClassifier(PassiveAggressiveClassifier())

    print('One vs Rest methods case::')
    print('KNeighborsClassifier score 50d',
          clf1.fit(X_train, y_train).score(X_test, y_test))
    print('DecisionTreeClassifier score 50d',
          clf2.fit(X_train, y_train).score(X_test, y_test))
    print('ExtraTreeClassifier score 50d',
          clf3.fit(X_train, y_train).score(X_test, y_test))
    print('MLPClassifier score 50d',
          clf4.fit(X_train, y_train).score(X_test, y_test))

    print('SGDClassifier score 50d',
          clf7.fit(X_train, y_train).score(X_test, y_test))
    print('Perceptron score 50d',
          clf8.fit(X_train, y_train).score(X_test, y_test))
    print('PassiveAggressiveClassifier score 50d',
          clf9.fit(X_train, y_train).score(X_test, y_test))

    print('NuSVC score 50d', clf5.fit(X_train, y_train).score(X_test, y_test))
    print('LinearSVC score 50d',
          clf6.fit(X_train, y_train).score(X_test, y_test))

    clf5nu.fit(X_train, y_train)
    print('NuSVC score 50d', clf5nu.score(X_test, y_test))
    clf6lin.fit(X_train, y_train)
    print('LinearSVC score 50d', clf6lin.score(X_test, y_test))

    from sklearn.datasets import make_friedman1
    from sklearn.feature_selection import RFECV
    from sklearn.neighbors import KNeighborsClassifier
    estimator = DecisionTreeClassifier()
Example #33
svc_new.fit(train_x_reduced, train_y_practice)
print svc_new.score(test_x_reduced, test_y_practice)
"""
"""
parameters = {'degree':(1, 3, 6)}
svclass = NuSVC(kernel='poly', probability=True, gamma=0, nu=.5852, tol=.00001)
clf = GridSearchCV(svclass, parameters, cv=10)
clf.fit(train_x_reduced, train_y_practice)
print "SVC"
print clf.best_estimator_
print clf.best_score_
print clf.best_params_
"""
svc_new = NuSVC(kernel='poly', probability=True, gamma=0, nu=.5852, tol=.00001)
svc_new.fit(train_x_reduced, train_y_practice)
print svc_new.score(test_x_reduced, test_y_practice)


print 'Predicting'
estimator = SelectKBest(score_func=f_classif, k=components)
estimator.fit(train_x, train_y_leaderboard)
train_x_reduced = estimator.transform(train_x)
test_x_reduced = estimator.transform(test_x)
print train_x.shape
print train_x_reduced.shape

#svc_new = SVC(probability=True, C=.000001, kernel='poly', gamma=4,
#                  degree=4)
svc_new = NuSVC(kernel='poly', probability=True, gamma=0, nu=.5852, tol=.00001)
svc_new.fit(train_x_reduced, train_y_leaderboard)
output = svc_new.predict(test_x_reduced)
Example #34
clf_SVML = LinearSVC(random_state=0, tol=1e-5)
ST.Reset()
clf_SVML.fit(X, y)
ST.RP()
clf_SVML.score(X, y)
ST.Print()
print(" Linear SVM Accuarcy for Digits : "+ str(clf_SVML.score(X, y)*100))

from sklearn.svm import NuSVC

clf_SVMN = NuSVC(gamma=0.001)
ST.Reset()
clf_SVMN.fit(X, y)
ST.RP()
clf_SVMN.score(X, y)
ST.Print()
print(" RBF Accuarcy for Digits : "+ str(clf_SVMN.score(X, y)*100))


from sklearn import tree
clf_T = tree.DecisionTreeClassifier()
ST.Reset()
clf_T.fit(X, y)
ST.RP()
clf_T.score(X, y)
ST.Print()
print(" Decision Tree Accuarcy for Digits : "+ str(clf_T.score(X, y)*100))

from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=1)