Example #1
import numpy as np
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.naive_bayes import BernoulliNB


class NaiveBayes:
    def __init__(self):
        """
        Bernoulli Naive Bayes algorithm
        """
        self.lissage = 1.1  # Smoothing (alpha) applied to the input data

    def recherche_hyper(self, x_tr, t_tr):
        """
        Hyperparameter search for Bernoulli Naive Bayes

        x_tr: Numpy array with training data
        t_tr: Numpy array with training targets

        Grid-search method:
            alpha: smoothing value swept from 0.0 to 1.0 in steps of 0.01

        Returns a dictionary with the best hyperparameters
        """
        valeurs_liss = np.arange(0.0, 1.0, 0.01)
        p_grid = [{'alpha': valeurs_liss}]

        cross_v = KFold(n_splits=10, shuffle=True)  # cross-validation

        # Recherche d'hyperparamètres
        self.classif = GridSearchCV(estimator=BernoulliNB(),
                                    param_grid=p_grid, cv=cross_v)
        self.classif.fit(x_tr, t_tr)

        mei_param = self.classif.best_params_

        return mei_param

    def entrainement(self, x_train, t_train, cherche_hyp):
        """
        Training with Bernoulli Naive Bayes

        x_train: Numpy array with training data
        t_train: Numpy array with training targets
        cherche_hyp: whether to search for the best hyperparameters

        Returns the fitted model object
        """

        if cherche_hyp:
            print('Starting NB training with hyperparameter search', '\n')
            parametres = self.recherche_hyper(x_train, t_train)
        else:
            print('Starting NB training without hyperparameter search', '\n')
            parametres = {'alpha': self.lissage}

        self.classif = BernoulliNB(**parametres)

        print('Parameters used for NB training:',
              self.classif.get_params(), '\n')

        return self.classif.fit(x_train, t_train)

    def prediction(self, x_p):
        """
        Prediction with Bernoulli Naive Bayes

        x_p: Numpy array with data to predict on

        Returns the predicted targets t_p for x_p
        """
        self.t_p = self.classif.predict(x_p)
        return self.t_p
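
A minimal usage sketch for this class; the synthetic binary data below is illustrative only and not part of the original code:

rng = np.random.RandomState(0)
x_train = rng.randint(0, 2, size=(100, 20))   # hypothetical binary features
t_train = rng.randint(0, 2, size=100)         # hypothetical labels
x_test = rng.randint(0, 2, size=(10, 20))

nb = NaiveBayes()
nb.entrainement(x_train, t_train, cherche_hyp=False)  # fixed alpha = 1.1
print(nb.prediction(x_test))
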
Example #2
File: suba.py  Project: myhrbeu/py-har
class SubspaceAlignedClassifier(object):
    """
    Class of classifiers based on Subspace Alignment.

    Methods contain the alignment itself, classifiers and general utilities.
    """
    def __init__(self, loss='logistic', l2=1.0, num_components=1):
        """
        Select a particular type of subspace aligned classifier.

        INPUT   (1) str 'loss': loss function for the classifier, options:
                    'logistic', 'quadratic', 'hinge', 'dtree', 'berno'
                    (def: 'logistic')
                (2) float 'l2': l2-regularization parameter value (def: 1.0)
                (3) int 'num_components': number of transfer components to
                    maintain (def: 1)
        """
        self.loss = loss
        self.l2 = l2
        self.num_components = num_components

        # Initialize untrained classifiers
        if self.loss == 'logistic':
            # Logistic regression model
            self.clf = LogisticRegression()
        elif self.loss == 'quadratic':
            # Least-squares model
            self.clf = LinearRegression()
        elif self.loss == 'hinge':
            # Linear support vector machine
            self.clf = LinearSVC()
        elif self.loss == 'dtree':
            # DecisionTreeClassifier
            self.clf = tree.DecisionTreeClassifier()
        elif self.loss == 'berno':
            # BernoulliNB
            self.clf = BernoulliNB()
        else:
            # Other loss functions are not implemented
            raise NotImplementedError

        # Maintain target principal component coefficients
        self.CZ = ''

        # Whether model has been trained
        self.is_trained = False

        # Dimensionality of training data
        self.train_data_dim = ''

    def subspace_alignment(self, X, Z, num_components=1):
        """
        Compute subspace and alignment matrix.

        INPUT   (1) array 'X': source data set (N samples by D features)
                (2) array 'Z': target data set (M samples by D features)
                (3) int 'num_components': number of components (def: 1)
        OUTPUT  (1) array 'V': transformation matrix (D features by D features)
                (2) array 'CX': source principal component coefficients
                (3) array 'CZ': target principal component coefficients
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Compute principal components
        CX = PCA(n_components=num_components, whiten=True).fit(X).components_.T
        CZ = PCA(n_components=num_components, whiten=True).fit(Z).components_.T

        # Aligned source components
        V = np.dot(CX.T, CZ)

        # Return transformation matrix and principal component coefficients
        return V, CX, CZ

    def fit(self, X, y, Z):
        """
        Fit/train a classifier on data mapped onto transfer components.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'y': source labels (N samples by 1)
                (3) array 'Z': target data (M samples by D features)
        OUTPUT  None
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Transfer component analysis (store target subspace)
        V, CX, self.CZ = self.subspace_alignment(
            X, Z, num_components=self.num_components)

        # Map source data onto source principal components
        X = np.dot(X, CX)

        # Align source data to target subspace
        X = np.dot(X, V)

        # Train the classifier selected in __init__ on the aligned source data
        self.clf.fit(X, y)

        # Mark classifier as trained
        self.is_trained = True

        # Store training data dimensionality
        self.train_data_dim = DX

    def predict(self, Z_, whiten=False):
        """
        Make predictions on new dataset.

        INPUT   (1) array 'Z_': new data set (M samples by D features)
                (2) boolean 'whiten': whether to whiten new data (def: false)
        OUTPUT  (1) array 'preds': label predictions (M samples by 1)
        """
        # Data shape
        M, D = Z_.shape

        # If classifier is trained, check for same dimensionality
        if self.is_trained:
            assert self.train_data_dim == D

        # Check for need to whiten data beforehand
        if whiten:
            Z_ = st.zscore(Z_)

        # Map new target data onto target subspace
        Z_ = np.dot(Z_, self.CZ)

        # Call scikit's predict function
        preds = self.clf.predict(Z_)

        # For quadratic loss function, correct predictions
        if self.loss == 'quadratic':
            preds = (np.sign(preds) + 1) / 2.

        # Return predictions array
        return preds

    def get_params(self):
        """Get classifier parameters."""
        return self.clf.get_params()

    def is_trained(self):
        """Check whether classifier is trained."""
        # Note: the boolean attribute self.is_trained set in __init__ shadows
        # this method on instances, so instances expose the flag directly.
        return self.is_trained

    # add. by pb
    def predict_proba(self, Z_, whiten=False):
        """
        Return class probabilities for a new dataset.

        INPUT   (1) array 'Z_': new data set (M samples by D features)
                (2) boolean 'whiten': whether to whiten new data (def: False)
        OUTPUT  (1) array 'preds': class probabilities (M samples by K classes)
        """
        # Data shape
        M, D = Z_.shape

        # If classifier is trained, check for same dimensionality
        if self.is_trained:
            assert self.train_data_dim == D

        # Check for need to whiten data beforehand
        if whiten:
            Z_ = st.zscore(Z_)

        # Map new target data onto target subspace
        Z_ = np.dot(Z_, self.CZ)

        # Call scikit's predict_proba function
        preds = self.clf.predict_proba(Z_)

        # For quadratic loss function, correct predictions
        #if self.loss == 'quadratic':
        #    preds = (np.sign(preds)+1)/2.

        # Return predictions array
        return preds
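
A minimal, hypothetical usage sketch for this class (synthetic data; it assumes the module-level imports of suba.py, e.g. numpy and the scikit-learn estimators, are in place):

rng = np.random.RandomState(0)
X = rng.randn(100, 5)           # source samples
y = rng.randint(0, 2, 100)      # source labels
Z = rng.randn(80, 5) + 0.5      # shifted target samples

suba = SubspaceAlignedClassifier(loss='logistic', num_components=2)
suba.fit(X, y, Z)               # align source subspace to target subspace, then fit
print(suba.predict(Z))          # predictions for the target samples
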
Example #3
import json

from sklearn.naive_bayes import BernoulliNB


class NaiveBayesModel():

    #Constructor
    def __init__(self, trainFilePath, testFilePath):
        #Initialize model variables
        with open(trainFilePath) as train:
            self.trainData = json.load(train)
        with open(testFilePath) as test:
            self.testData = json.load(test)
        self.uniqueIngredients, self.numUnique = self.getUniqueIngredients(
            self.trainData)
        self.trainVectors, self.trainLabels = self.getTrainVectors(
            self.trainData)
        # self.testVectors, self.testLabels = self.getTestVectors(self.testData)
        self.model = BernoulliNB()

    #Get Model Hyperparameters
    def getInfo(self):
        print(self.model.get_params())

    #Remove all samples with 5 or fewer ingredients
    def cleanData(self, data):
        newData = []
        for entry in data:
            if (len(entry['ingredients']) > 5):
                newData.append(entry)
        return newData

    #Map each unique ingredient to an integer id (ids start at 1, so index 0 of
    #the feature vectors below is never used)
    def getUniqueIngredients(self, trainData):
        ingredientsDictionary = {}
        uniqueIdentifier = 1
        for item in trainData:
            for ingredient in item["ingredients"]:
                if ingredient not in ingredientsDictionary:
                    ingredientsDictionary[ingredient] = uniqueIdentifier
                    uniqueIdentifier += 1
        return ingredientsDictionary, uniqueIdentifier

    #Get train vectors
    def getTrainVectors(self, trainData):
        #Create trainVectors list and labels list
        trainLabels = [vector["cuisine"] for vector in self.trainData]
        trainVectors = []
        for item in trainData:
            featureVector = [0.0] * self.numUnique
            for ingredient in item["ingredients"]:
                uniqueId = self.uniqueIngredients[ingredient]
                featureVector[uniqueId] = 1
            trainVectors.append(featureVector)
        return trainVectors, trainLabels

    #Get test vectors
    def getTestVectors(self, testData):
        testVectors = []
        testLabels = [vector["cuisine"] for vector in testData]
        for item in testData:
            featureVector = [0.0] * self.numUnique
            for ingredient in item["ingredients"]:
                if ingredient in self.uniqueIngredients:
                    uniqueId = self.uniqueIngredients[ingredient]
                    featureVector[uniqueId] = 1
            testVectors.append(featureVector)
        return testVectors, testLabels

    #Train model on trainData
    def trainModel(self):
        self.model.fit(self.trainVectors, self.trainLabels)

    #Make predictions on testData
    def predict(self):
        predictions = self.model.predict(self.testVectors)
        numCorrect = 0
        totalSamples = len(self.testLabels)
        for prediction, trueLabel in zip(predictions, self.testLabels):
            if (prediction == trueLabel):
                numCorrect += 1
        print("Accuracy on validation set: %.2f%%" %
              (100 * (numCorrect / totalSamples)))

    #Predict the cuisine for a single list of ingredients
    def predictOnSample(self, testVector):
        totalTests = []
        featureVector = [0.0] * self.numUnique
        for ingredient in testVector:
            if ingredient in self.uniqueIngredients:
                uniqueId = self.uniqueIngredients[ingredient]
                featureVector[uniqueId] = 1
        totalTests.append(featureVector)
        return self.model.predict(totalTests)
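
A hypothetical usage sketch; the JSON paths and the ingredient list are placeholders, and the data is assumed to follow the Kaggle "What's Cooking" format ("cuisine" and "ingredients" fields) that this class expects:

model = NaiveBayesModel('train.json', 'test.json')   # placeholder file paths
model.getInfo()                                       # print BernoulliNB hyperparameters
model.trainModel()
print(model.predictOnSample(['salt', 'olive oil', 'garlic']))
# Note: model.predict() additionally needs the testVectors/testLabels line in
# __init__ to be uncommented.
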
Example #4
class ImportanceWeightedClassifier(object):
    """
    Class of importance-weighted classifiers.

    Methods contain different importance-weight estimators and different loss
    functions.
    """
    def __init__(self,
                 loss='logistic',
                 l2=1.0,
                 iwe='lr',
                 smoothing=True,
                 clip=-1,
                 kernel_type='rbf',
                 bandwidth=1):
        """
        Select a particular type of importance-weighted classifier.

        INPUT   (1) str 'loss': loss function for weighted classifier, options:
                    'logistic', 'quadratic', 'hinge', 'dtree', 'berno'
                    (def: 'logistic')
                (2) float 'l2': l2-regularization parameter value (def: 1.0)
                (3) str 'iwe': importance weight estimator, options: 'lr',
                    'nn', 'rg', 'kmm', 'kde' (def: 'lr')
                (4) boolean 'smoothing': whether to apply Laplace smoothing to
                    the nearest-neighbour importance-weight estimator
                    (def: True)
                (5) float 'clip': maximum allowable importance-weight value; if
                    set to -1, then the weights are not clipped (def:-1)
                (6) str 'kernel_type': what type of kernel to use for kernel
                    density estimation or kernel mean matching, options:
                    'diste', 'rbf' (def: 'rbf')
                (7) float 'bandwidth': kernel bandwidth parameter value for
                    kernel-based weight estimators (def: 1)
        """
        self.loss = loss
        self.l2 = l2
        self.iwe = iwe
        self.smoothing = smoothing
        self.clip = clip
        self.kernel_type = kernel_type
        self.bandwidth = bandwidth

        # Initialize untrained classifiers based on choice of loss function
        if self.loss == 'logistic':
            # Logistic regression model
            self.clf = LogisticRegression()
        elif self.loss == 'quadratic':
            # Least-squares model
            self.clf = LinearRegression()
        elif self.loss == 'hinge':
            # Linear support vector machine
            self.clf = LinearSVC()
        elif self.loss == 'dtree':
            # DecisionTreeClassifier
            self.clf = tree.DecisionTreeClassifier()
        elif self.loss == 'berno':
            # BernoulliNB
            self.clf = BernoulliNB()
        else:
            # Other loss functions are not implemented
            raise NotImplementedError

        # Whether model has been trained
        self.is_trained = False

        # Dimensionality of training data
        self.train_data_dim = ''

    def iwe_ratio_gaussians(self, X, Z):
        """
        Estimate importance weights based on a ratio of Gaussian distributions.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'Z': target data (M samples by D features)
        OUTPUT  (1) array: importance weights (N samples by 1)
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Sample means in each domain
        mu_X = np.mean(X, axis=0)
        mu_Z = np.mean(Z, axis=0)

        # Sample covariances
        Si_X = np.cov(X.T)
        Si_Z = np.cov(Z.T)

        # Check that both covariance matrices are positive-definite
        if not (is_pos_def(Si_X) and is_pos_def(Si_Z)):
            print('Warning: covariance matrices not PSD.')

            regct = -6
            while not (is_pos_def(Si_X) and is_pos_def(Si_Z)):
                print('Adding regularization: ' + str(10.**regct))

                # Add regularization
                Si_X += np.eye(DX) * 10.**regct
                Si_Z += np.eye(DZ) * 10.**regct

                # Increment regularization counter
                regct += 1

        # Compute probability of X under each domain
        pT = st.multivariate_normal.pdf(X, mu_Z, Si_Z)
        pS = st.multivariate_normal.pdf(X, mu_X, Si_X)

        # Check for numerical problems (NaNs or zero densities)
        assert not (np.any(np.isnan(pT)) or np.any(pT == 0))
        assert not (np.any(np.isnan(pS)) or np.any(pS == 0))

        # Return the ratio of probabilities
        return pT / pS

    def iwe_kernel_densities(self, X, Z):
        """
        Estimate importance weights based on kernel density estimation.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'Z': target data (M samples by D features)
        OUTPUT  (1) array: importance weights (N samples by 1)
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Compute probabilities based on source kernel densities
        pT = st.gaussian_kde(Z.T).pdf(X.T)
        pS = st.gaussian_kde(X.T).pdf(X.T)

        # Check for numerical problems (NaNs or zero densities)
        assert not (np.any(np.isnan(pT)) or np.any(pT == 0))
        assert not (np.any(np.isnan(pS)) or np.any(pS == 0))

        # Return the ratio of probabilities
        return pT / pS

    def iwe_logistic_discrimination(self, X, Z):
        """
        Estimate importance weights based on logistic regression.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'Z': target data (M samples by D features)
        OUTPUT  (1) array: importance weights (N samples by 1)
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Make domain-label variable
        y = np.concatenate((np.zeros((N, 1)), np.ones((M, 1))), axis=0)

        # Concatenate data
        XZ = np.concatenate((X, Z), axis=0)

        # Call a logistic regressor
        lr = LogisticRegression(C=self.l2)

        # Predict the probability of belonging to the target domain
        # using cross-validation
        probs = cross_val_predict(lr, XZ, y[:, 0], method='predict_proba')

        # Return the target-domain probabilities of the source samples
        return probs[:N, 1]

    def iwe_nearest_neighbours(self, X, Z):
        """
        Estimate importance weights based on nearest-neighbours.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'Z': target data (M samples by D features)
        OUTPUT  (1) array: importance weights (N samples by 1)
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Compute Euclidean distance between samples
        d = cdist(X, Z, metric='euclidean')

        # Count target samples within each source Voronoi cell:
        # for each target sample, find the index of its nearest source sample
        ix = np.argmin(d, axis=0)
        iw, _ = np.histogram(ix, np.arange(N + 1))

        # Laplace smoothing
        if self.smoothing:
            iw = (iw + 1.) / (N + 1)

        # Weight clipping
        if self.clip > 0:
            iw = np.minimum(self.clip, np.maximum(0, iw))

        # Return weights
        return iw

    def iwe_kernel_mean_matching(self, X, Z):
        """
        Estimate importance weights based on kernel mean matching.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'Z': target data (M samples by D features)
        OUTPUT  (1) array: importance weights (N samples by 1)
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Compute sample pairwise distances
        KXX = cdist(X, X, metric='euclidean')
        KXZ = cdist(X, Z, metric='euclidean')

        # Assert non-negative distances
        assert np.all(KXX >= 0)
        assert np.all(KXZ >= 0)

        # Compute kernels
        if self.kernel_type == 'rbf':
            # Radial basis functions
            KXX = np.exp(-KXX / (2 * self.bandwidth**2))
            KXZ = np.exp(-KXZ / (2 * self.bandwidth**2))

        # Collapse second kernel and normalize
        KXZ = N / M * np.sum(KXZ, axis=1)

        # Prepare for CVXOPT
        Q = matrix(KXX, tc='d')
        p = matrix(KXZ, tc='d')
        G = matrix(np.concatenate((np.ones((1, N)), -1 * np.ones(
            (1, N)), -1. * np.eye(N)),
                                  axis=0),
                   tc='d')
        h = matrix(np.concatenate(
            (np.array([N / np.sqrt(N) + N], ndmin=2),
             np.array([N / np.sqrt(N) - N], ndmin=2), np.zeros((N, 1))),
            axis=0),
                   tc='d')

        # Call quadratic program solver
        sol = solvers.qp(Q, p, G, h)

        # Return optimal coefficients as importance weights
        return np.array(sol['x'])[:, 0]

    def fit(self, X, y, Z):
        """
        Fit/train an importance-weighted classifier.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'y': source labels (N samples by 1)
                (3) array 'Z': target data (M samples by D features)
        OUTPUT  None
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Find importance-weights
        if self.iwe == 'lr':
            w = self.iwe_logistic_discrimination(X, Z)
        elif self.iwe == 'rg':
            w = self.iwe_ratio_gaussians(X, Z)
        elif self.iwe == 'nn':
            w = self.iwe_nearest_neighbours(X, Z)
        elif self.iwe == 'kde':
            w = self.iwe_kernel_densities(X, Z)
        elif self.iwe == 'kmm':
            w = self.iwe_kernel_mean_matching(X, Z)
        else:
            raise NotImplementedError
        print("self.loss=", str(self.loss))
        # Train a weighted classifier
        if self.loss == 'logistic':
            # Logistic regression model with sample weights
            self.clf.fit(X, y, w)
        elif self.loss == 'quadratic':
            # Least-squares model with sample weights
            self.clf.fit(X, y, w)
        elif self.loss == 'hinge':
            # Linear support vector machine with sample weights
            self.clf.fit(X, y, w)
        elif self.loss == 'dtree':
            # DecisionTreeClassifier
            self.clf.fit(X, y, w)
        elif self.loss == 'berno':
            # BernoulliNB
            self.clf.fit(X, y, w)
        else:
            # Other loss functions are not implemented
            raise NotImplementedError

        # Mark classifier as trained
        self.is_trained = True

        # Store training data dimensionality
        self.train_data_dim = DX

    def predict(self, Z_):
        """
        Make predictions on new dataset.

        INPUT   (1) array 'Z_': new data set (M samples by D features)
        OUTPUT  (2) array 'preds': label predictions (M samples by 1)
        """
        # Data shape
        M, D = Z_.shape

        # If classifier is trained, check for same dimensionality
        if self.is_trained:
            assert self.train_data_dim == D

        # Call scikit's predict function
        preds = self.clf.predict(Z_)

        # For quadratic loss function, correct predictions
        if self.loss == 'quadratic':
            preds = (np.sign(preds) + 1) / 2.

        # Return predictions array
        return preds

    def get_params(self):
        """Get classifier parameters."""
        return self.clf.get_params()

    def is_trained(self):
        """Check whether classifier is trained."""
        return self.is_trained
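
A minimal, hypothetical usage sketch (synthetic data; the module's own imports, e.g. numpy, scipy and the scikit-learn estimators, are assumed to be in place):

rng = np.random.RandomState(1)
X = rng.randn(100, 3)            # source samples
y = rng.randint(0, 2, 100)       # source labels
Z = rng.randn(50, 3) + 0.3       # shifted target samples

iwc = ImportanceWeightedClassifier(loss='logistic', iwe='lr')
iwc.fit(X, y, Z)                 # estimate importance weights, then fit the weighted model
print(iwc.predict(Z))
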
Example #5
File: tca.py  Project: myhrbeu/py-har
class TransferComponentClassifier(object):
    """
    Class of classifiers based on Transfer Component Analysis.

    Methods contain component analysis and general utilities.
    """

    def __init__(self, loss='logistic', l2=1.0, mu=1.0, num_components=1,
                 kernel_type='rbf', bandwidth=1.0, order=2.0):
        """
        Select a particular type of transfer component classifier.

        INPUT   (1) str 'loss': loss function for weighted classifier, options:
                    'logistic', 'quadratic', 'hinge', 'dtree', 'berno'
                    (def: 'logistic')
                (2) float 'l2': l2-regularization parameter value (def: 1.0)
                (3) float 'mu': trade-off parameter (def: 1.0)
                (4) int 'num_components': number of transfer components to
                    maintain (def: 1)
                (5) str 'kernel_type': type of kernel to use, options: 'rbf'
                    (def: 'rbf')
                (6) float 'bandwidth': kernel bandwidth for transfer component
                    analysis (def: 1.0)
                (7) float 'order': order of polynomial for kernel (def: 2.0)
        """
        self.loss = loss
        self.l2 = l2
        self.mu = mu
        self.num_components = num_components

        self.kernel_type = kernel_type
        self.bandwidth = bandwidth
        self.order = order

        # Initialize untrained classifiers
        if self.loss == 'logistic':
            # Logistic regression model
            self.clf = LogisticRegression()
        elif self.loss == 'quadratic':
            # Least-squares model
            self.clf = LinearRegression()
        elif self.loss == 'hinge':
            # Linear support vector machine
            self.clf = LinearSVC()
        elif self.loss == 'dtree':
            # DecisionTreeClassifier
            self.clf = tree.DecisionTreeClassifier()
        elif self.loss == 'berno':
            # BernoulliNB
            self.clf = BernoulliNB()
        else:
            # Other loss functions are not implemented
            raise NotImplementedError

        # Maintain source and transfer data for computing kernels
        self.XZ = ''

        # Maintain transfer components
        self.C = ''

        # Whether model has been trained
        self.is_trained = False

        # Dimensionality of training data
        self.train_data_dim = ''

    def kernel(self, X, Z, type='rbf', order=2, bandwidth=1.0):
        """
        Compute kernel for given data set.

        INPUT   (1) array 'X': data set (N samples by D features)
                (2) array 'Z': data set (M samples by D features)
                (3) str 'type': type of kernel, options: 'linear',
                    'polynomial', 'rbf', 'sigmoid' (def: 'rbf')
                (4) float 'order': order of polynomial to use for the
                    polynomial kernel (def: 2.0)
                (5) float 'bandwidth': kernel bandwidth (def: 1.0)
        OUTPUT  (1) array: kernel matrix (N+M by N+M)
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Select type of kernel to compute
        if type == 'linear':
            # Linear kernel is data outer product
            return np.dot(X, Z.T)
        elif type == 'polynomial':
            # Polynomial kernel is an exponentiated data outer product
            return (np.dot(X, Z.T) + 1)**order
        elif type == 'rbf':
            # Radial basis function kernel
            return np.exp(-cdist(X, Z) / (2.*bandwidth**2))
        elif type == 'sigmoid':
            # Sigmoidal kernel
            return 1./(1 + np.exp(np.dot(X, Z.T)))
        else:
            raise NotImplementedError

    def transfer_component_analysis(self, X, Z):
        """
        Transfer Component Analysis.

        INPUT   (1) array 'X': source data set (N samples by D features)
                (2) array 'Z': target data set (M samples by D features)
        OUTPUT  (1) array 'C': transfer components (D features
                    by num_components)
                (2) array 'K': source and target data kernel distances
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Compute kernel matrix
        XZ = np.concatenate((X, Z), axis=0)
        K = self.kernel(XZ, XZ, type=self.kernel_type,
                        bandwidth=self.bandwidth)

        # Ensure positive-definiteness
        if not is_pos_def(K):
            print('Warning: kernel matrix is not PSD.')

            regct = -6
            while not is_pos_def(K):
                print('Adding regularization: ' + str(10**regct))

                # Add regularization
                K += np.eye(N + M)*10.**regct

                # Increment regularization counter
                regct += 1

        # Normalization matrix
        L = np.vstack((np.hstack((np.ones((N, N))/N**2,
                                  -1*np.ones((N, M))/(N*M))),
                       np.hstack((-1*np.ones((M, N))/(N*M),
                                  np.ones((M, M))/M**2))))

        # Centering matrix
        H = np.eye(N + M) - np.ones((N + M, N + M)) / float(N + M)

        # Matrix Lagrangian objective function: (I + mu*K*L*K)^{-1}*K*H*K
        J = np.dot(np.linalg.inv(np.eye(N + M) +
                   self.mu*np.dot(np.dot(K, L), K)),
                   np.dot(np.dot(K, H), K))

        # Eigenvector decomposition as solution to trace minimization
        _, C = eigs(J, k=self.num_components)

        # Discard imaginary numbers (possible computation issue)
        return np.real(C), K

    def fit(self, X, y, Z):
        """
        Fit/train a classifier on data mapped onto transfer components.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'y': source labels (N samples by 1)
                (3) array 'Z': target data (M samples by D features)
        OUTPUT  None
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Assert correct number of components for given dataset
        assert self.num_components <= N + M - 1

        # Maintain source and target data for later kernel computations
        self.XZ = np.concatenate((X, Z), axis=0)

        # Transfer component analysis
        self.C, K = self.transfer_component_analysis(X, Z)

        # Map source data onto transfer components
        X = np.dot(K[:N, :], self.C)
        
        print("self.loss:", str(self.loss))
        # Train the classifier selected in __init__ on the mapped source data
        self.clf.fit(X, y)

        # Mark classifier as trained
        self.is_trained = True

        # Store training data dimensionality
        self.train_data_dim = DX

    def predict(self, Z_):
        """
        Make predictions on new dataset.

        INPUT   (1) array 'Z_': new data set (M samples by D features)
        OUTPUT  (2) array 'preds': label predictions (M samples by 1)
        """
        # Data shape
        M, D = Z_.shape

        # If classifier is trained, check for same dimensionality
        if self.is_trained:
            assert self.train_data_dim == D

        # Compute kernel for new data
        K = self.kernel(Z_, self.XZ, type=self.kernel_type,
                        bandwidth=self.bandwidth, order=self.order)

        # Map new data onto transfer components
        Z_ = np.dot(K, self.C)

        # Call scikit's predict function
        preds = self.clf.predict(Z_)

        # For quadratic loss function, correct predictions
        if self.loss == 'quadratic':
            preds = (np.sign(preds)+1)/2.

        # Return predictions array
        return preds

    def get_params(self):
        """Get classifier parameters."""
        return self.clf.get_params()

    def is_trained(self):
        """Check whether classifier is trained."""
        return self.is_trained
    
    # add. by pb
    def predict_proba(self, Z_, whiten=False):
        """
        Return class probabilities for a new dataset.

        INPUT   (1) array 'Z_': new data set (M samples by D features)
                (2) boolean 'whiten': accepted for API consistency, unused here
        OUTPUT  (1) array 'preds': class probabilities (M samples by K classes)
        """
        # Data shape
        M, D = Z_.shape

        # If classifier is trained, check for same dimensionality
        if self.is_trained:
            assert self.train_data_dim == D

        # Compute kernel for new data
        K = self.kernel(Z_, self.XZ, type=self.kernel_type,
                        bandwidth=self.bandwidth, order=self.order)

        # Map new data onto transfer components
        Z_ = np.dot(K, self.C)

        # Call scikit's predict_proba function
        preds = self.clf.predict_proba(Z_)

        # For quadratic loss function, correct predictions
        if self.loss == 'quadratic':
            preds = (np.sign(preds)+1)/2.

        # Return predictions array
        return preds
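
A minimal, hypothetical usage sketch (synthetic data; the module-level imports of tca.py, e.g. numpy, scipy and the scikit-learn estimators, are assumed to be in place):

rng = np.random.RandomState(2)
X = rng.randn(60, 4)             # source samples
y = rng.randint(0, 2, 60)        # source labels
Z = rng.randn(40, 4) + 0.2       # shifted target samples

tca = TransferComponentClassifier(loss='logistic', num_components=2)
tca.fit(X, y, Z)                 # learn transfer components, then fit on the mapped source data
print(tca.predict(Z))
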
Example #6
arrTrainLabels = arrTrain[:,1].astype(int)  # 1-D label vector (avoids a shape warning in fit)
# print(arrTrainLabels)
# arrTrainFeatures = arrTrain[:,[2,3,4,5]]
arrTrainFeatures = arrTrain[:,[2,3,4,5,6,7,8,9,10,11,12]]
# print(arrTrainFeatures)

model = BernoulliNB(
      alpha=1.0
    , binarize=0.0
    , class_prior=None
    , fit_prior=True
    ).fit(arrTrainFeatures, arrTrainLabels)

print(model)
print(model.get_params())
arrTest = dfTest.to_numpy().astype(int)  # DataFrame.as_matrix() was removed from pandas; use to_numpy()
arrTestLabels = arrTest[0:100,1]
# arrTestFeatures = arrTest[0:10000,[2,3,4,5]]
arrTestFeatures = arrTest[0:100,[2,3,4,5,6,7,8,9,10,11,12]]

pred_proba = model.predict_proba(arrTestFeatures)
print(pred_proba)

pred = model.predict(arrTestFeatures)
print(pred)

for i in range(len(arrTestLabels)):
    print(arrTestLabels[i], "-", pred[i])

pred_score = model.score(arrTestFeatures, arrTestLabels)
print(pred_score)
Example #7
cm = confusion_matrix(Yset_test, y_pred)
Accuracy = ((cm[0, 0] + cm[1, 1]) /
            (cm[0, 0] + cm[0, 1] + cm[1, 0] + cm[1, 1])) * 100
Precision = ((cm[0, 0]) / (cm[0, 0] + cm[1, 0])) * 100  # precision of the first class (column 0 of cm)
Recall = ((cm[0, 0]) / (cm[0, 0] + cm[0, 1])) * 100  # recall of the first class (row 0 of cm)
print("Confusion Matrix: \n%s " % (cm))
print("Accuracy: %.2f%% ; Precision: %.2f%% ; Recall: %.2f%%" %
      (Accuracy, Precision, Recall))

plot_confusion_matrix(
    Yset_test,
    y_pred,
    classes=class_names,
    title='Confusion matrix, Bernoulli NB important features')

bnb.get_params()

rndF = RandomForestClassifier()
y_pred = rndF.fit(Xset_train, Yset_train).predict(Xset_test)
print(
    "RandomForestClassifier: Number of mislabeled points out of a total %d points : %d"
    % (Xset_train.shape[0], (Yset_test != y_pred).sum()))

cm = confusion_matrix(Yset_test, y_pred)
Accuracy = ((cm[0, 0] + cm[1, 1]) /
            (cm[0, 0] + cm[0, 1] + cm[1, 0] + cm[1, 1])) * 100
Precision = ((cm[0, 0]) / (cm[0, 0] + cm[1, 0])) * 100
Recall = ((cm[0, 0]) / (cm[0, 0] + cm[0, 1])) * 100
print("Confusion Matrix: \n%s " % (cm))
print("Accuracy: %.2f%% ; Precision: %.2f%% ; Recall: %.2f%%" %
      (Accuracy, Precision, Recall))
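
For reference, the same quantities can be computed with scikit-learn's metric helpers; this sketch assumes the same Yset_test / y_pred arrays and binary labels 0 and 1, with class 0 treated as the positive class:

from sklearn.metrics import accuracy_score, precision_score, recall_score

acc = accuracy_score(Yset_test, y_pred) * 100
prec = precision_score(Yset_test, y_pred, pos_label=0) * 100
rec = recall_score(Yset_test, y_pred, pos_label=0) * 100
print("Accuracy: %.2f%% ; Precision: %.2f%% ; Recall: %.2f%%" % (acc, prec, rec))
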
import numpy as np
from sklearn.naive_bayes import BernoulliNB
X = np.array([[1, 2, 3, 4], [1, 3, 4, 4], [2, 4, 5, 5]])
y = np.array([1, 1, 2])
clf = BernoulliNB(alpha=1, class_prior=None, binarize=2.0, fit_prior=False)
clf.fit(X, y, sample_weight=None)  # fit on the training samples: X feature vectors, y class labels, sample_weight optional per-sample weights
print(clf.class_log_prior_)
print(X)
# class_log_prior_: log of the smoothed prior probability of each class; it depends on fit_prior and class_prior, with three cases:
# - if class_prior is given, class_log_prior_ is log(class_prior), regardless of fit_prior
# - if fit_prior=False and class_prior=None, every class gets the same prior, 1/n_classes
# - if fit_prior=True and class_prior=None, each class prior is its sample count divided by the total number of samples
print(clf.class_count_)  # class_count_: number of training samples per class
print(clf.feature_count_)  # feature_count_: per-class occurrence counts of each feature, shape (n_classes, n_features)
print(clf.get_params(deep=True))  # get_params(deep=True): returns the estimator's parameters as a dict
print(clf.predict_log_proba([[3, 4, 5, 4], [1, 3, 5, 6]]))  # predict_log_proba(X): log of the predicted probability of each class
print(clf.predict_proba([[3, 4, 5, 4], [1, 3, 5, 6]]))  # predict_proba(X): predicted probability of each class
print(clf.score([[3, 4, 5, 4], [1, 3, 5, 6]], [1, 1]))  # score(X, y, sample_weight=None): mean prediction accuracy on the given test samples
clf.set_params(alpha=2.0)  # set_params(**params): set the estimator's parameters
print(clf.get_params(deep=True))
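
A small illustrative check of the three class_log_prior_ cases described above, reusing the same X and y (expected values noted in the comments):

print(BernoulliNB(class_prior=[0.25, 0.75]).fit(X, y).class_log_prior_)  # case 1: log([0.25, 0.75])
print(BernoulliNB(fit_prior=False).fit(X, y).class_log_prior_)           # case 2: uniform, log([0.5, 0.5])
print(BernoulliNB(fit_prior=True).fit(X, y).class_log_prior_)            # case 3: empirical, log([2/3, 1/3]) since y = [1, 1, 2]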