import numpy as np
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.naive_bayes import BernoulliNB


class NaiveBayes:
    def __init__(self):
        """ Bernoulli Naive Bayes algorithm """
        self.lissage = 1.1  # Smoothing applied to the input data

    def recherche_hyper(self, x_tr, t_tr):
        """
        Hyperparameter search for Bernoulli Naive Bayes

        x_tr: Numpy array with training data
        t_tr: Numpy array with training targets

        Grid-search method:
            alpha: smoothing parameter, searched between 0.0 and 1.0
                   in steps of 0.01

        Returns a dictionary with the best hyperparameters
        """
        valeurs_liss = np.arange(0.0, 1.0, 0.01)
        p_grid = [{'alpha': valeurs_liss}]
        cross_v = KFold(n_splits=10, shuffle=True)  # cross-validation

        # Hyperparameter search
        self.classif = GridSearchCV(estimator=BernoulliNB(),
                                    param_grid=p_grid, cv=cross_v)
        self.classif.fit(x_tr, t_tr)
        mei_param = self.classif.best_params_
        return mei_param

    def entrainement(self, x_train, t_train, cherche_hyp):
        """
        Training with Bernoulli Naive Bayes

        x_train: Numpy array with training data
        t_train: Numpy array with training targets
        cherche_hyp: whether or not to search for the best hyperparameters

        Returns an object with the trained model
        """
        if cherche_hyp:
            print('Starting NB training with hyperparameter search', '\n')
            parametres = self.recherche_hyper(x_train, t_train)
        else:
            print('Starting NB training without hyperparameter search', '\n')
            parametres = {'alpha': self.lissage}

        self.classif = BernoulliNB(**parametres)
        print('Parameters used for NB training:',
              self.classif.get_params(), '\n')
        return self.classif.fit(x_train, t_train)

    def prediction(self, x_p):
        """
        Prediction with Bernoulli Naive Bayes

        x_p: Numpy array with data to predict on

        Returns the predicted targets t_p for x_p
        """
        self.t_p = self.classif.predict(x_p)
        return self.t_p
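# A minimal usage sketch for the NaiveBayes wrapper above. The binary
# feature matrix and targets below are synthetic placeholders, assumed
# only for illustration.
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    x_demo = rng.randint(0, 2, size=(100, 20))   # binary features
    t_demo = rng.randint(0, 2, size=100)         # binary targets

    nb = NaiveBayes()
    nb.entrainement(x_demo, t_demo, cherche_hyp=False)
    print(nb.prediction(x_demo[:5]))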
import numpy as np
import scipy.stats as st
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import LinearSVC
from sklearn import tree
from sklearn.naive_bayes import BernoulliNB


class SubspaceAlignedClassifier(object):
    """
    Class of classifiers based on Subspace Alignment.

    Methods contain the alignment itself, classifiers and general utilities.
    """

    def __init__(self, loss='logistic', l2=1.0, num_components=1):
        """
        Select a particular type of subspace aligned classifier.

        INPUT   (1) str 'loss': loss function for weighted classifier,
                    options: 'logistic', 'quadratic', 'hinge', 'dtree',
                    'berno' (def: 'logistic')
                (2) float 'l2': l2-regularization parameter value (def: 1.0)
                (3) int 'num_components': number of transfer components to
                    maintain (def: 1)
        """
        self.loss = loss
        self.l2 = l2
        self.num_components = num_components

        # Initialize untrained classifiers
        if self.loss == 'logistic':
            # Logistic regression model
            self.clf = LogisticRegression()
        elif self.loss == 'quadratic':
            # Least-squares model
            self.clf = LinearRegression()
        elif self.loss == 'hinge':
            # Linear support vector machine
            self.clf = LinearSVC()
        elif self.loss == 'dtree':
            # Decision tree classifier
            self.clf = tree.DecisionTreeClassifier()
        elif self.loss == 'berno':
            # Bernoulli naive Bayes
            self.clf = BernoulliNB()
        else:
            # Other loss functions are not implemented
            raise NotImplementedError

        # Maintain target principal component coefficients
        self.CZ = None

        # Whether model has been trained (query this attribute directly)
        self.is_trained = False

        # Dimensionality of training data
        self.train_data_dim = None

    def subspace_alignment(self, X, Z, num_components=1):
        """
        Compute subspace and alignment matrix.

        INPUT   (1) array 'X': source data set (N samples by D features)
                (2) array 'Z': target data set (M samples by D features)
                (3) int 'num_components': number of components (def: 1)
        OUTPUT  (1) array 'V': transformation matrix (D features by D features)
                (2) array 'CX': source principal component coefficients
                (3) array 'CZ': target principal component coefficients
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Compute principal components
        CX = PCA(n_components=num_components, whiten=True).fit(X).components_.T
        CZ = PCA(n_components=num_components, whiten=True).fit(Z).components_.T

        # Aligned source components
        V = np.dot(CX.T, CZ)

        # Return transformation matrix and principal component coefficients
        return V, CX, CZ

    def fit(self, X, y, Z):
        """
        Fit/train a classifier on data mapped onto transfer components.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'y': source labels (N samples by 1)
                (3) array 'Z': target data (M samples by D features)
        OUTPUT  None
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Subspace alignment (store target subspace)
        V, CX, self.CZ = self.subspace_alignment(
            X, Z, num_components=self.num_components)

        # Map source data onto source principal components
        X = np.dot(X, CX)

        # Align source data to target subspace
        X = np.dot(X, V)

        # Train the classifier chosen in __init__ on the aligned source data
        self.clf.fit(X, y)

        # Mark classifier as trained
        self.is_trained = True

        # Store training data dimensionality
        self.train_data_dim = DX

    def predict(self, Z_, whiten=False):
        """
        Make predictions on new dataset.

        INPUT   (1) array 'Z_': new data set (M samples by D features)
                (2) boolean 'whiten': whether to whiten new data (def: False)
        OUTPUT  (1) array 'preds': label predictions (M samples by 1)
        """
        # Data shape
        M, D = Z_.shape

        # If classifier is trained, check for same dimensionality
        if self.is_trained:
            assert self.train_data_dim == D

        # Check for need to whiten data beforehand
        if whiten:
            Z_ = st.zscore(Z_)

        # Map new target data onto target subspace
        Z_ = np.dot(Z_, self.CZ)

        # Call scikit's predict function
        preds = self.clf.predict(Z_)

        # For quadratic loss function, correct predictions
        if self.loss == 'quadratic':
            preds = (np.sign(preds) + 1) / 2.

        # Return predictions array
        return preds

    def get_params(self):
        """Get classifier parameters."""
        return self.clf.get_params()

    # add. by pb
    def predict_proba(self, Z_, whiten=False):
        """
        Predict class probabilities on a new dataset.

        INPUT   (1) array 'Z_': new data set (M samples by D features)
                (2) boolean 'whiten': whether to whiten new data (def: False)
        OUTPUT  (1) array 'preds': class probabilities (M samples by K classes)
        """
        # Data shape
        M, D = Z_.shape

        # If classifier is trained, check for same dimensionality
        if self.is_trained:
            assert self.train_data_dim == D

        # Check for need to whiten data beforehand
        if whiten:
            Z_ = st.zscore(Z_)

        # Map new target data onto target subspace
        Z_ = np.dot(Z_, self.CZ)

        # Call scikit's predict_proba function
        preds = self.clf.predict_proba(Z_)

        # Return probabilities array
        return preds
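# A minimal usage sketch for SubspaceAlignedClassifier, assuming synthetic
# source/target domains drawn from shifted Gaussians; the data, shapes and
# parameter values below are illustrative placeholders only.
if __name__ == '__main__':
    rng = np.random.RandomState(1)
    X = rng.normal(0, 1, size=(200, 10))        # source data
    y = (X[:, 0] > 0).astype(int)               # source labels
    Z = rng.normal(0.5, 1, size=(150, 10))      # shifted target data

    sa = SubspaceAlignedClassifier(loss='logistic', num_components=5)
    sa.fit(X, y, Z)
    print(sa.predict(Z)[:10])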
import json

from sklearn.naive_bayes import BernoulliNB


class NaiveBayesModel():

    # Constructor
    def __init__(self, trainFilePath, testFilePath):
        # Initialize model variables
        with open(trainFilePath) as train:
            self.trainData = json.load(train)
        with open(testFilePath) as test:
            self.testData = json.load(test)
        self.uniqueIngredients, self.numUnique = self.getUniqueIngredients(
            self.trainData)
        self.trainVectors, self.trainLabels = self.getTrainVectors(
            self.trainData)
        # self.testVectors, self.testLabels = self.getTestVectors(self.testData)
        self.model = BernoulliNB()

    # Get model hyperparameters
    def getInfo(self):
        print(self.model.get_params())

    # Remove all samples with 5 or fewer ingredients
    def cleanData(self, data):
        newData = []
        for entry in data:
            if len(entry['ingredients']) > 5:
                newData.append(entry)
        return newData

    # Map each ingredient to a unique identifier and count the unique ones
    def getUniqueIngredients(self, trainData):
        ingredientsDictionary = {}
        uniqueIdentifier = 1
        for item in trainData:
            for ingredient in item["ingredients"]:
                if ingredient not in ingredientsDictionary:
                    ingredientsDictionary[ingredient] = uniqueIdentifier
                    uniqueIdentifier += 1
        return ingredientsDictionary, uniqueIdentifier

    # Get train vectors
    def getTrainVectors(self, trainData):
        # Create trainVectors list and labels list
        trainLabels = [vector["cuisine"] for vector in trainData]
        trainVectors = []
        for item in trainData:
            featureVector = [0.0] * self.numUnique
            for ingredient in item["ingredients"]:
                uniqueId = self.uniqueIngredients[ingredient]
                featureVector[uniqueId] = 1
            trainVectors.append(featureVector)
        return trainVectors, trainLabels

    # Get test vectors
    def getTestVectors(self, testData):
        testVectors = []
        testLabels = [vector["cuisine"] for vector in testData]
        for item in testData:
            featureVector = [0.0] * self.numUnique
            for ingredient in item["ingredients"]:
                # Ignore ingredients never seen during training
                if ingredient in self.uniqueIngredients:
                    uniqueId = self.uniqueIngredients[ingredient]
                    featureVector[uniqueId] = 1
            testVectors.append(featureVector)
        return testVectors, testLabels

    # Train model on trainData
    def trainModel(self):
        self.model.fit(self.trainVectors, self.trainLabels)

    # Make predictions on testData
    def predict(self):
        predictions = self.model.predict(self.testVectors)
        numCorrect = 0
        totalSamples = len(self.testLabels)
        for prediction, trueLabel in zip(predictions, self.testLabels):
            if prediction == trueLabel:
                numCorrect += 1
        print("Accuracy on validation set: %.2f%%" %
              (100 * (numCorrect / totalSamples)))

    # Predict given one single sample (a list of ingredient names)
    def predictOnSample(self, testVector):
        totalTests = []
        featureVector = [0.0] * self.numUnique
        for ingredient in testVector:
            if ingredient in self.uniqueIngredients:
                uniqueId = self.uniqueIngredients[ingredient]
                featureVector[uniqueId] = 1
        totalTests.append(featureVector)
        return self.model.predict(totalTests)
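# A minimal usage sketch for NaiveBayesModel. The JSON paths below are
# hypothetical; the files are assumed to follow the Kaggle "What's Cooking"
# schema, i.e. a list of {"id": ..., "cuisine": ..., "ingredients": [...]}.
if __name__ == '__main__':
    nbm = NaiveBayesModel('train.json', 'test.json')
    nbm.getInfo()
    nbm.trainModel()
    print(nbm.predictOnSample(['salt', 'olive oil', 'garlic']))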
import numpy as np
import scipy.stats as st
from scipy.spatial.distance import cdist
from cvxopt import matrix, solvers
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import LinearSVC
from sklearn import tree
from sklearn.naive_bayes import BernoulliNB


def is_pos_def(A):
    """Check for positive-definiteness (minimal stand-in for the
    repository's utility helper)."""
    return np.all(np.real(np.linalg.eigvals(A)) > 0)


class ImportanceWeightedClassifier(object):
    """
    Class of importance-weighted classifiers.

    Methods contain different importance-weight estimators and
    different loss functions.
    """

    def __init__(self, loss='logistic', l2=1.0, iwe='lr', smoothing=True,
                 clip=-1, kernel_type='rbf', bandwidth=1):
        """
        Select a particular type of importance-weighted classifier.

        INPUT   (1) str 'loss': loss function for weighted classifier,
                    options: 'logistic', 'quadratic', 'hinge', 'dtree',
                    'berno' (def: 'logistic')
                (2) float 'l2': l2-regularization parameter value (def: 1.0)
                (3) str 'iwe': importance weight estimator, options: 'lr',
                    'nn', 'rg', 'kmm', 'kde' (def: 'lr')
                (4) boolean 'smoothing': whether to apply Laplace smoothing
                    to the nearest-neighbour importance-weight estimator
                    (def: True)
                (5) float 'clip': maximum allowable importance-weight value;
                    if set to -1, the weights are not clipped (def: -1)
                (6) str 'kernel_type': type of kernel to use for kernel
                    density estimation or kernel mean matching, options:
                    'diste', 'rbf' (def: 'rbf')
                (7) float 'bandwidth': kernel bandwidth parameter value for
                    kernel-based weight estimators (def: 1)
        """
        self.loss = loss
        self.l2 = l2
        self.iwe = iwe
        self.smoothing = smoothing
        self.clip = clip
        self.kernel_type = kernel_type
        self.bandwidth = bandwidth

        # Initialize untrained classifiers based on choice of loss function
        if self.loss == 'logistic':
            # Logistic regression model
            self.clf = LogisticRegression()
        elif self.loss == 'quadratic':
            # Least-squares model
            self.clf = LinearRegression()
        elif self.loss == 'hinge':
            # Linear support vector machine
            self.clf = LinearSVC()
        elif self.loss == 'dtree':
            # Decision tree classifier
            self.clf = tree.DecisionTreeClassifier()
        elif self.loss == 'berno':
            # Bernoulli naive Bayes
            self.clf = BernoulliNB()
        else:
            # Other loss functions are not implemented
            raise NotImplementedError

        # Whether model has been trained (query this attribute directly)
        self.is_trained = False

        # Dimensionality of training data
        self.train_data_dim = None

    def iwe_ratio_gaussians(self, X, Z):
        """
        Estimate importance weights based on a ratio of Gaussian
        distributions.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'Z': target data (M samples by D features)
        OUTPUT  (1) array: importance weights (N samples by 1)
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Sample means in each domain
        mu_X = np.mean(X, axis=0)
        mu_Z = np.mean(Z, axis=0)

        # Sample covariances
        Si_X = np.cov(X.T)
        Si_Z = np.cov(Z.T)

        # Check for positive-definiteness of both covariance matrices
        if not (is_pos_def(Si_X) and is_pos_def(Si_Z)):
            print('Warning: covariate matrices not PSD.')

            regct = -6
            while not (is_pos_def(Si_X) and is_pos_def(Si_Z)):
                print('Adding regularization: ' + str(10.**regct))

                # Add regularization
                Si_X += np.eye(DX) * 10.**regct
                Si_Z += np.eye(DZ) * 10.**regct

                # Increment regularization counter
                regct += 1

        # Compute probability of X under each domain
        pT = st.multivariate_normal.pdf(X, mu_Z, Si_Z)
        pS = st.multivariate_normal.pdf(X, mu_X, Si_X)

        # Check for numerical problems (NaNs or zero densities)
        assert not (np.any(np.isnan(pT)) or np.any(pT == 0))
        assert not (np.any(np.isnan(pS)) or np.any(pS == 0))

        # Return the ratio of probabilities
        return pT / pS

    def iwe_kernel_densities(self, X, Z):
        """
        Estimate importance weights based on kernel density estimation.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'Z': target data (M samples by D features)
        OUTPUT  (1) array: importance weights (N samples by 1)
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Compute probabilities based on source kernel densities
        pT = st.gaussian_kde(Z.T).pdf(X.T)
        pS = st.gaussian_kde(X.T).pdf(X.T)

        # Check for numerical problems (NaNs or zero densities)
        assert not (np.any(np.isnan(pT)) or np.any(pT == 0))
        assert not (np.any(np.isnan(pS)) or np.any(pS == 0))

        # Return the ratio of probabilities
        return pT / pS

    def iwe_logistic_discrimination(self, X, Z):
        """
        Estimate importance weights based on logistic regression.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'Z': target data (M samples by D features)
        OUTPUT  (1) array: importance weights (N samples by 1)
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Make domain-label variable
        y = np.concatenate((np.zeros((N, 1)), np.ones((M, 1))), axis=0)

        # Concatenate data
        XZ = np.concatenate((X, Z), axis=0)

        # Call a logistic regressor
        lr = LogisticRegression(C=self.l2)

        # Predict probability of belonging to target using cross-validation
        preds = cross_val_predict(lr, XZ, y[:, 0])

        # Return predictions for source samples
        return preds[:N]

    def iwe_nearest_neighbours(self, X, Z):
        """
        Estimate importance weights based on nearest-neighbours.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'Z': target data (M samples by D features)
        OUTPUT  (1) array: importance weights (N samples by 1)
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Compute Euclidean distance between samples
        d = cdist(X, Z, metric='euclidean')

        # Count target samples within each source Voronoi cell
        ix = np.argmin(d, axis=0)
        iw, _ = np.histogram(ix, np.arange(N + 1))

        # Laplace smoothing
        if self.smoothing:
            iw = (iw + 1.) / (N + 1)

        # Weight clipping
        if self.clip > 0:
            iw = np.minimum(self.clip, np.maximum(0, iw))

        # Return weights
        return iw

    def iwe_kernel_mean_matching(self, X, Z):
        """
        Estimate importance weights based on kernel mean matching.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'Z': target data (M samples by D features)
        OUTPUT  (1) array: importance weights (N samples by 1)
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Compute sample pairwise distances
        KXX = cdist(X, X, metric='euclidean')
        KXZ = cdist(X, Z, metric='euclidean')

        # Assert non-negative distances
        assert np.all(KXX >= 0)
        assert np.all(KXZ >= 0)

        # Compute kernels
        if self.kernel_type == 'rbf':
            # Radial basis functions
            KXX = np.exp(-KXX / (2 * self.bandwidth**2))
            KXZ = np.exp(-KXZ / (2 * self.bandwidth**2))

        # Collapse second kernel and normalize
        KXZ = N / M * np.sum(KXZ, axis=1)

        # Prepare for CVXOPT
        Q = matrix(KXX, tc='d')
        p = matrix(KXZ, tc='d')
        G = matrix(np.concatenate((np.ones((1, N)),
                                   -1 * np.ones((1, N)),
                                   -1. * np.eye(N)), axis=0), tc='d')
        h = matrix(np.concatenate((np.array([N / np.sqrt(N) + N], ndmin=2),
                                   np.array([N / np.sqrt(N) - N], ndmin=2),
                                   np.zeros((N, 1))), axis=0), tc='d')

        # Call quadratic program solver
        sol = solvers.qp(Q, p, G, h)

        # Return optimal coefficients as importance weights
        return np.array(sol['x'])[:, 0]

    def fit(self, X, y, Z):
        """
        Fit/train an importance-weighted classifier.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'y': source labels (N samples by 1)
                (3) array 'Z': target data (M samples by D features)
        OUTPUT  None
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Find importance-weights
        if self.iwe == 'lr':
            w = self.iwe_logistic_discrimination(X, Z)
        elif self.iwe == 'rg':
            w = self.iwe_ratio_gaussians(X, Z)
        elif self.iwe == 'nn':
            w = self.iwe_nearest_neighbours(X, Z)
        elif self.iwe == 'kde':
            w = self.iwe_kernel_densities(X, Z)
        elif self.iwe == 'kmm':
            w = self.iwe_kernel_mean_matching(X, Z)
        else:
            raise NotImplementedError

        print("self.loss=", str(self.loss))

        # Train the classifier chosen in __init__ with sample weights
        self.clf.fit(X, y, w)

        # Mark classifier as trained
        self.is_trained = True

        # Store training data dimensionality
        self.train_data_dim = DX

    def predict(self, Z_):
        """
        Make predictions on new dataset.

        INPUT   (1) array 'Z_': new data set (M samples by D features)
        OUTPUT  (1) array 'preds': label predictions (M samples by 1)
        """
        # Data shape
        M, D = Z_.shape

        # If classifier is trained, check for same dimensionality
        if self.is_trained:
            assert self.train_data_dim == D

        # Call scikit's predict function
        preds = self.clf.predict(Z_)

        # For quadratic loss function, correct predictions
        if self.loss == 'quadratic':
            preds = (np.sign(preds) + 1) / 2.

        # Return predictions array
        return preds

    def get_params(self):
        """Get classifier parameters."""
        return self.clf.get_params()
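# A minimal usage sketch for ImportanceWeightedClassifier, using the
# nearest-neighbour weight estimator on synthetic, covariate-shifted data;
# all data and parameter values below are illustrative placeholders.
if __name__ == '__main__':
    rng = np.random.RandomState(2)
    X = rng.normal(0, 1, size=(200, 5))         # source data
    y = (X.sum(axis=1) > 0).astype(int)         # source labels
    Z = rng.normal(0.3, 1, size=(150, 5))       # shifted target data

    iwc = ImportanceWeightedClassifier(loss='logistic', iwe='nn')
    iwc.fit(X, y, Z)
    print(iwc.predict(Z)[:10])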
import numpy as np
from scipy.spatial.distance import cdist
from scipy.sparse.linalg import eigs
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import LinearSVC
from sklearn import tree
from sklearn.naive_bayes import BernoulliNB


def is_pos_def(A):
    """Check for positive-definiteness (minimal stand-in for the
    repository's utility helper)."""
    return np.all(np.real(np.linalg.eigvals(A)) > 0)


class TransferComponentClassifier(object):
    """
    Class of classifiers based on Transfer Component Analysis.

    Methods contain component analysis and general utilities.
    """

    def __init__(self, loss='logistic', l2=1.0, mu=1.0, num_components=1,
                 kernel_type='rbf', bandwidth=1.0, order=2.0):
        """
        Select a particular type of transfer component classifier.

        INPUT   (1) str 'loss': loss function for weighted classifier,
                    options: 'logistic', 'quadratic', 'hinge', 'dtree',
                    'berno' (def: 'logistic')
                (2) float 'l2': l2-regularization parameter value (def: 1.0)
                (3) float 'mu': trade-off parameter (def: 1.0)
                (4) int 'num_components': number of transfer components to
                    maintain (def: 1)
                (5) str 'kernel_type': type of kernel to use, options:
                    'linear', 'polynomial', 'rbf', 'sigmoid' (def: 'rbf')
                (6) float 'bandwidth': kernel bandwidth for transfer
                    component analysis (def: 1.0)
                (7) float 'order': order of polynomial for kernel (def: 2.0)
        """
        self.loss = loss
        self.l2 = l2
        self.mu = mu
        self.num_components = num_components
        self.kernel_type = kernel_type
        self.bandwidth = bandwidth
        self.order = order

        # Initialize untrained classifiers
        if self.loss == 'logistic':
            # Logistic regression model
            self.clf = LogisticRegression()
        elif self.loss == 'quadratic':
            # Least-squares model
            self.clf = LinearRegression()
        elif self.loss == 'hinge':
            # Linear support vector machine
            self.clf = LinearSVC()
        elif self.loss == 'dtree':
            # Decision tree classifier
            self.clf = tree.DecisionTreeClassifier()
        elif self.loss == 'berno':
            # Bernoulli naive Bayes
            self.clf = BernoulliNB()
        else:
            # Other loss functions are not implemented
            raise NotImplementedError

        # Maintain source and transfer data for computing kernels
        self.XZ = None

        # Maintain transfer components
        self.C = None

        # Whether model has been trained (query this attribute directly)
        self.is_trained = False

        # Dimensionality of training data
        self.train_data_dim = None

    def kernel(self, X, Z, type='rbf', order=2, bandwidth=1.0):
        """
        Compute kernel for given data set.

        INPUT   (1) array 'X': data set (N samples by D features)
                (2) array 'Z': data set (M samples by D features)
                (3) str 'type': type of kernel, options: 'linear',
                    'polynomial', 'rbf', 'sigmoid' (def: 'rbf')
                (4) float 'order': order of polynomial to use for the
                    polynomial kernel (def: 2.0)
                (5) float 'bandwidth': kernel bandwidth (def: 1.0)
        OUTPUT  (1) array: kernel matrix (N samples by M samples)
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Select type of kernel to compute
        if type == 'linear':
            # Linear kernel is data outer product
            return np.dot(X, Z.T)
        elif type == 'polynomial':
            # Polynomial kernel is an exponentiated data outer product
            return (np.dot(X, Z.T) + 1)**order
        elif type == 'rbf':
            # Radial basis function kernel
            return np.exp(-cdist(X, Z) / (2. * bandwidth**2))
        elif type == 'sigmoid':
            # Sigmoidal kernel
            return 1. / (1 + np.exp(np.dot(X, Z.T)))
        else:
            raise NotImplementedError

    def transfer_component_analysis(self, X, Z):
        """
        Transfer Component Analysis.

        INPUT   (1) array 'X': source data set (N samples by D features)
                (2) array 'Z': target data set (M samples by D features)
        OUTPUT  (1) array 'C': transfer components (N+M samples by
                    num_components)
                (2) array 'K': source and target data kernel matrix
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Compute kernel matrix
        XZ = np.concatenate((X, Z), axis=0)
        K = self.kernel(XZ, XZ, type=self.kernel_type,
                        bandwidth=self.bandwidth)

        # Ensure positive-definiteness
        if not is_pos_def(K):
            print('Warning: kernel matrix not PSD.')

            regct = -6
            while not is_pos_def(K):
                print('Adding regularization: ' + str(10**regct))

                # Add regularization
                K += np.eye(N + M) * 10.**regct

                # Increment regularization counter
                regct += 1

        # Normalization matrix
        L = np.vstack((np.hstack((np.ones((N, N)) / N**2,
                                  -1 * np.ones((N, M)) / (N * M))),
                       np.hstack((-1 * np.ones((M, N)) / (N * M),
                                  np.ones((M, M)) / M**2))))

        # Centering matrix
        H = np.eye(N + M) - np.ones((N + M, N + M)) / float(N + M)

        # Matrix Lagrangian objective function: (I + mu*K*L*K)^{-1}*K*H*K
        J = np.dot(np.linalg.inv(np.eye(N + M) +
                                 self.mu * np.dot(np.dot(K, L), K)),
                   np.dot(np.dot(K, H), K))

        # Eigenvector decomposition as solution to trace minimization
        _, C = eigs(J, k=self.num_components)

        # Discard imaginary parts (possible numerical artifacts)
        return np.real(C), K

    def fit(self, X, y, Z):
        """
        Fit/train a classifier on data mapped onto transfer components.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'y': source labels (N samples by 1)
                (3) array 'Z': target data (M samples by D features)
        OUTPUT  None
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Assert correct number of components for given dataset
        assert self.num_components <= N + M - 1

        # Maintain source and target data for later kernel computations
        self.XZ = np.concatenate((X, Z), axis=0)

        # Transfer component analysis
        self.C, K = self.transfer_component_analysis(X, Z)

        # Map source data onto transfer components
        X = np.dot(K[:N, :], self.C)

        print("self.loss:", str(self.loss))

        # Train the classifier chosen in __init__ on the mapped source data
        self.clf.fit(X, y)

        # Mark classifier as trained
        self.is_trained = True

        # Store training data dimensionality
        self.train_data_dim = DX

    def predict(self, Z_):
        """
        Make predictions on new dataset.

        INPUT   (1) array 'Z_': new data set (M samples by D features)
        OUTPUT  (1) array 'preds': label predictions (M samples by 1)
        """
        # Data shape
        M, D = Z_.shape

        # If classifier is trained, check for same dimensionality
        if self.is_trained:
            assert self.train_data_dim == D

        # Compute kernel for new data
        K = self.kernel(Z_, self.XZ, type=self.kernel_type,
                        bandwidth=self.bandwidth, order=self.order)

        # Map new data onto transfer components
        Z_ = np.dot(K, self.C)

        # Call scikit's predict function
        preds = self.clf.predict(Z_)

        # For quadratic loss function, correct predictions
        if self.loss == 'quadratic':
            preds = (np.sign(preds) + 1) / 2.

        # Return predictions array
        return preds

    def get_params(self):
        """Get classifier parameters."""
        return self.clf.get_params()

    # add. by pb
    def predict_proba(self, Z_, whiten=False):
        """
        Predict class probabilities on a new dataset.

        INPUT   (1) array 'Z_': new data set (M samples by D features)
                (2) boolean 'whiten': kept for interface compatibility;
                    not used here
        OUTPUT  (1) array 'preds': class probabilities (M samples by
                    K classes)
        """
        # Data shape
        M, D = Z_.shape

        # If classifier is trained, check for same dimensionality
        if self.is_trained:
            assert self.train_data_dim == D

        # Compute kernel for new data
        K = self.kernel(Z_, self.XZ, type=self.kernel_type,
                        bandwidth=self.bandwidth, order=self.order)

        # Map new data onto transfer components
        Z_ = np.dot(K, self.C)

        # Call scikit's predict_proba function
        preds = self.clf.predict_proba(Z_)

        # Return probabilities array
        return preds
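# A minimal usage sketch for TransferComponentClassifier on synthetic,
# covariate-shifted data; sizes and parameter values are illustrative only.
if __name__ == '__main__':
    rng = np.random.RandomState(3)
    X = rng.normal(0, 1, size=(50, 5))          # source data
    y = (X[:, 0] > 0).astype(int)               # source labels
    Z = rng.normal(0.3, 1, size=(40, 5))        # shifted target data

    tcc = TransferComponentClassifier(loss='logistic', num_components=2)
    tcc.fit(X, y, Z)
    print(tcc.predict(Z)[:10])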
# arrTrain and dfTest are assumed to be defined earlier in the script
# (training array and test DataFrame, respectively).
arrTrainLabels = arrTrain[:, 1].astype(int)
# print(arrTrainLabels)
# arrTrainFeatures = arrTrain[:, [2, 3, 4, 5]]
arrTrainFeatures = arrTrain[:, [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]
# print(arrTrainFeatures)

model = BernoulliNB(alpha=1.0,
                    binarize=0.0,
                    class_prior=None,
                    fit_prior=True).fit(arrTrainFeatures, arrTrainLabels)
print(model)
print(model.get_params())

# .as_matrix() was removed from pandas; .to_numpy() is the replacement
arrTest = dfTest.to_numpy().astype(int)
arrTestLabels = arrTest[0:100, 1]
# arrTestFeatures = arrTest[0:10000, [2, 3, 4, 5]]
arrTestFeatures = arrTest[0:100, [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]

pred_proba = model.predict_proba(arrTestFeatures)
print(pred_proba)

pred = model.predict(arrTestFeatures)
print(pred)

for i in range(len(arrTestLabels)):
    print(arrTestLabels[i], "-", pred[i])

pred_score = model.score(arrTestFeatures, arrTestLabels)
print(pred_score)
# Metrics below treat class 0 (first row/column of the confusion matrix)
# as the positive class.
cm = confusion_matrix(Yset_test, y_pred)
Accuracy = ((cm[0, 0] + cm[1, 1]) /
            (cm[0, 0] + cm[0, 1] + cm[1, 0] + cm[1, 1])) * 100
Precision = (cm[0, 0] / (cm[0, 0] + cm[1, 0])) * 100
Recall = (cm[0, 0] / (cm[0, 0] + cm[0, 1])) * 100
print("Confusion Matrix: \n%s " % (cm))
print("Accuracy: %.2f%% ; Precision: %.2f%% ; Recall: %.2f%%" %
      (Accuracy, Precision, Recall))

plot_confusion_matrix(
    Yset_test,
    y_pred,
    classes=class_names,
    title='Confusion matrix, Bernoulli NB important features')

print(bnb.get_params())

rndF = RandomForestClassifier()
y_pred = rndF.fit(Xset_train, Yset_train).predict(Xset_test)
print("RandomForestClassifier: Number of mislabeled points out of a total"
      " %d points : %d"
      % (Xset_test.shape[0], (Yset_test != y_pred).sum()))

cm = confusion_matrix(Yset_test, y_pred)
Accuracy = ((cm[0, 0] + cm[1, 1]) /
            (cm[0, 0] + cm[0, 1] + cm[1, 0] + cm[1, 1])) * 100
Precision = (cm[0, 0] / (cm[0, 0] + cm[1, 0])) * 100
Recall = (cm[0, 0] / (cm[0, 0] + cm[0, 1])) * 100
print("Confusion Matrix: \n%s " % (cm))
print("Accuracy: %.2f%% ; Precision: %.2f%% ; Recall: %.2f%%" %
      (Accuracy, Precision, Recall))
import numpy as np
from sklearn.naive_bayes import BernoulliNB

X = np.array([[1, 2, 3, 4], [1, 3, 4, 4], [2, 4, 5, 5]])
y = np.array([1, 1, 2])

clf = BernoulliNB(alpha=1, class_prior=None, binarize=2.0, fit_prior=False)
# Fit on the training samples: X holds the feature vectors, y the class
# labels; sample_weight is an optional array of per-sample weights
clf.fit(X, y, sample_weight=None)

# class_log_prior_: log of the smoothed prior probability of each class.
# Its value depends on the fit_prior and class_prior parameters:
# - if class_prior is given, class_log_prior_ is log(class_prior),
#   regardless of fit_prior
# - if fit_prior is False and class_prior is None, all classes share a
#   uniform prior of 1/n_classes
# - if fit_prior is True and class_prior is None, each class prior equals
#   its sample count divided by the total sample count
print(clf.class_log_prior_)
print(X)

# class_count_: number of training samples in each class
print(clf.class_count_)

# feature_count_: per-class occurrence counts of each feature, an array of
# shape (n_classes, n_features)
print(clf.feature_count_)

# get_params(deep=True): returns the estimator's parameters as a dictionary
print(clf.get_params(deep=True))

# predict_log_proba(X): log of the predicted probability of each class for
# the test samples
print(clf.predict_log_proba([[3, 4, 5, 4], [1, 3, 5, 6]]))

# predict_proba(X): predicted probability of each class for the test samples
print(clf.predict_proba([[3, 4, 5, 4], [1, 3, 5, 6]]))

# score(X, y, sample_weight=None): mean prediction accuracy on the test
# samples
print(clf.score([[3, 4, 5, 4], [1, 3, 5, 6]], [1, 1]))

# set_params(**params): set the estimator's parameters
clf.set_params(alpha=2.0)
print(clf.get_params(deep=True))