def test_ovo_partial_fit_predict():
    X, y = shuffle(iris.data, iris.target)
    ovo1 = OneVsOneClassifier(MultinomialNB())
    ovo1.partial_fit(X[:100], y[:100], np.unique(y))
    ovo1.partial_fit(X[100:], y[100:])
    pred1 = ovo1.predict(X)

    ovo2 = OneVsOneClassifier(MultinomialNB())
    ovo2.fit(X, y)
    pred2 = ovo2.predict(X)
    assert_equal(len(ovo1.estimators_), n_classes * (n_classes - 1) / 2)
    assert_greater(np.mean(y == pred1), 0.65)
    assert_almost_equal(pred1, pred2)

    # Test when mini-batches don't have all target classes
    ovo1 = OneVsOneClassifier(MultinomialNB())
    ovo1.partial_fit(iris.data[:60], iris.target[:60], np.unique(iris.target))
    ovo1.partial_fit(iris.data[60:], iris.target[60:])
    pred1 = ovo1.predict(iris.data)
    ovo2 = OneVsOneClassifier(MultinomialNB())
    pred2 = ovo2.fit(iris.data, iris.target).predict(iris.data)

    assert_almost_equal(pred1, pred2)
    assert_equal(len(ovo1.estimators_), len(np.unique(iris.target)))
    assert_greater(np.mean(iris.target == pred1), 0.65)
def test_ovo_partial_fit_predict():
    temp = datasets.load_iris()
    X, y = temp.data, temp.target

    ovo1 = OneVsOneClassifier(MultinomialNB())
    ovo1.partial_fit(X[:100], y[:100], np.unique(y))
    ovo1.partial_fit(X[100:], y[100:])
    pred1 = ovo1.predict(X)

    ovo2 = OneVsOneClassifier(MultinomialNB())
    ovo2.fit(X, y)
    pred2 = ovo2.predict(X)

    assert_equal(len(ovo1.estimators_), n_classes * (n_classes - 1) / 2)
    assert_greater(np.mean(y == pred1), 0.65)
    assert_almost_equal(pred1, pred2)

    # Test when mini-batches have binary target classes
    ovo1 = OneVsOneClassifier(MultinomialNB())
    ovo1.partial_fit(X[:60], y[:60], np.unique(y))
    ovo1.partial_fit(X[60:], y[60:])
    pred1 = ovo1.predict(X)

    ovo2 = OneVsOneClassifier(MultinomialNB())
    pred2 = ovo2.fit(X, y).predict(X)

    assert_almost_equal(pred1, pred2)
    assert_equal(len(ovo1.estimators_), len(np.unique(y)))
    assert_greater(np.mean(y == pred1), 0.65)

    ovo = OneVsOneClassifier(MultinomialNB())
    X = np.random.rand(14, 2)
    y = [1, 1, 2, 3, 3, 0, 0, 4, 4, 4, 4, 4, 2, 2]
    ovo.partial_fit(X[:7], y[:7], [0, 1, 2, 3, 4])
    ovo.partial_fit(X[7:], y[7:])
    pred = ovo.predict(X)
    ovo2 = OneVsOneClassifier(MultinomialNB())
    pred2 = ovo2.fit(X, y).predict(X)
    assert_almost_equal(pred, pred2)

    # raises error when mini-batch does not have classes from all_classes
    ovo = OneVsOneClassifier(MultinomialNB())
    error_y = [0, 1, 2, 3, 4, 5, 2]
    message_re = escape("Mini-batch contains {0} while "
                        "it must be subset of {1}".format(np.unique(error_y),
                                                          np.unique(y)))
    assert_raises_regexp(ValueError, message_re, ovo.partial_fit, X[:7],
                         error_y, np.unique(y))

    # test that partial_fit only exists if the underlying estimator has it
    ovr = OneVsOneClassifier(SVC())
    assert_false(hasattr(ovr, "partial_fit"))
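# A minimal usage sketch (an addition, not part of the test suite) of the
# incremental API exercised above: OneVsOneClassifier.partial_fit requires the
# full class list on the first call so it can build all n*(n-1)/2 pairwise
# estimators up front.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.multiclass import OneVsOneClassifier
from sklearn.naive_bayes import MultinomialNB

X, y = load_iris(return_X_y=True)
ovo = OneVsOneClassifier(MultinomialNB())
ovo.partial_fit(X[:75], y[:75], classes=np.unique(y))  # classes only on first call
ovo.partial_fit(X[75:], y[75:])
print(len(ovo.estimators_))  # 3 classes -> 3 pairwise estimators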
def test_ovo_string_y():
    # Test that the OvO classifier doesn't mess up the encoding of string labels
    X = np.eye(4)
    y = np.array(['a', 'b', 'c', 'd'])

    ovo = OneVsOneClassifier(LinearSVC())
    ovo.fit(X, y)
    assert_array_equal(y, ovo.predict(X))
def OneVsOne(inputs_train, inputs_valid, target_train, target_valid):
    name = "Multiclass One Vs One"
    clf = OneVsOneClassifier(LinearSVC(random_state=0))
    clf.fit(inputs_train, np.ravel(target_train))
    prediction = clf.predict(inputs_valid)
    correct = np.count_nonzero(np.ravel(target_valid) == prediction)
    total = target_valid.shape[0]
    correctRate = (float(correct) / total) * 100
    return name, correctRate
def multiclassSVC(classifier, sz=2000):
    mnsize = sz
    df = hw6u.load_mnist_features(mnsize)
    data = utils.pandas_to_data(df)
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train, replace_zeros=False)
    y, X = np.asarray(y), np.asarray(X)
    y_test, X_test = hw4u.split_truth_from_data(kf_test, replace_zeros=False)
    y_test, X_test = np.asarray(y_test), np.asarray(X_test)
    print('Beginning analysis: {}'.format(X.shape))
    # clf = OneVsRestClassifier(classifier, n_jobs=4).fit(X, y)
    clf = OneVsOneClassifier(classifier).fit(X, y)
    # clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=10,
    #                            random_state=0).fit(np.asarray(X), y)
    y_pred = clf.predict(X)
    print('train acc: {} test acc: {}'.format(
        accuracy_score(fix_y(y_pred), fix_y(y)),
        accuracy_score(fix_y(y_test), fix_y(clf.predict(X_test)))))
def test_ovo_decision_function():
    n_samples = iris.data.shape[0]

    ovo_clf = OneVsOneClassifier(LinearSVC(random_state=0))
    # first binary
    ovo_clf.fit(iris.data, iris.target == 0)
    decisions = ovo_clf.decision_function(iris.data)
    assert_equal(decisions.shape, (n_samples,))

    # then multi-class
    ovo_clf.fit(iris.data, iris.target)
    decisions = ovo_clf.decision_function(iris.data)
    assert_equal(decisions.shape, (n_samples, n_classes))
    assert_array_equal(decisions.argmax(axis=1), ovo_clf.predict(iris.data))

    # Compute the votes
    votes = np.zeros((n_samples, n_classes))
    k = 0
    for i in range(n_classes):
        for j in range(i + 1, n_classes):
            pred = ovo_clf.estimators_[k].predict(iris.data)
            votes[pred == 0, i] += 1
            votes[pred == 1, j] += 1
            k += 1

    # Extract votes and verify
    assert_array_equal(votes, np.round(decisions))

    for class_idx in range(n_classes):
        # For each sample and each class, there are only 3 possible vote
        # levels, because there are only 3 distinct class pairs and thus
        # 3 distinct binary classifiers. Therefore, sorting predictions
        # based on votes alone would yield mostly tied predictions:
        assert_true(set(votes[:, class_idx]).issubset(set([0., 1., 2.])))

        # The OvO decision function, on the other hand, is able to resolve
        # most of the ties on this data, as it combines both the vote counts
        # and the aggregated confidence levels of the binary classifiers
        # to compute the aggregate decision function. The iris dataset
        # has 150 samples with a couple of duplicates. The OvO decisions
        # can resolve most of the ties:
        assert_greater(len(np.unique(decisions[:, class_idx])), 146)
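# Aside (an illustrative sketch, not part of the test): np.round(decisions)
# recovers the raw vote counts because scikit-learn's OvO aggregation adds the
# summed pairwise confidences to the integer votes only after squashing them
# into a sub-half-unit interval, so the tie-breaking term can never change a
# rounded vote. The squashing formula below matches recent scikit-learn
# versions; treat it as an assumption on other releases.
import numpy as np

sum_of_confidences = np.array([-50.0, 0.0, 50.0])
tie_breaker = sum_of_confidences / (3 * (np.abs(sum_of_confidences) + 1))
print(tie_breaker)  # always strictly inside (-1/3, 1/3)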
# Predict the probability of a classifier
clf.predict_proba([[5, 1.5]])

# Predict the class of a classifier
clf.predict([[5, 1.5]])

##############################################################################
# Classifier One vs One and One vs Rest
##############################################################################

# Force Scikit-Learn to use One-vs-One
from sklearn.multiclass import OneVsOneClassifier
ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=42))
ovo_clf.fit(X_train, y_train)
ovo_clf.predict([some_digit])

# Force Scikit-Learn to use One-vs-All
from sklearn.multiclass import OneVsRestClassifier
ovr_clf = OneVsRestClassifier(SVC(gamma="auto", random_state=42))
ovr_clf.fit(X_train[:1000], y_train[:1000])
ovr_clf.predict([some_digit])

# If you used a random classifier, you would get 10% accuracy, so this is not
# such a bad score, but you can still do much better -- for example, by simply
# scaling the inputs.

##############################################################################
# SVM
##############################################################################
#!/usr/bin/env python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsOneClassifier
from msmbuilder.io import load_meta, load_trajs
import sys
from sklearn.externals import joblib

depth = 9

meta, all_data = load_trajs('alpha_carbon/')
meta, all_label = load_trajs('macro-mapping/')
all_data_one = np.concatenate(list(all_data.values()))
all_label_one = np.concatenate(list(all_label.values()))

clf = OneVsOneClassifier(
    RandomForestClassifier(n_estimators=100, max_depth=depth, random_state=0))
clf.fit(all_data_one, all_label_one)
print(' Depth %d Train Accu: %.3f' %
      (depth,
       np.sum(clf.predict(all_data_one) == all_label_one) / len(all_label_one)))

# save model
joblib.dump(clf, 'ovo-randomforest/final_es100_' + str(depth) + ".pkl")
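# Hedged follow-up sketch: reloading the persisted one-vs-one forest for later
# prediction. The path mirrors the dump above with depth=9; on modern
# scikit-learn use a plain `import joblib` instead of sklearn.externals.
clf = joblib.load('ovo-randomforest/final_es100_9.pkl')
print(clf.predict(all_data_one[:5]))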
def test_ovo_exceptions():
    ovo = OneVsOneClassifier(LinearSVC(random_state=0))
    with pytest.raises(NotFittedError):
        ovo.predict([])
def baseline(path, outpath, n=3, ft=5, classifier='OneVsRest'):
    start_time = time.time()
    # Reading information about the collection
    infocollection = path + os.sep + 'collection-info.json'
    problems = []
    language = []
    with open(infocollection, 'r') as f:
        for attrib in json.load(f):
            problems.append(attrib['problem-name'])
            language.append(attrib['language'])

    for index, problem in enumerate(problems):
        print(problem)
        # Reading information about the problem
        infoproblem = path + os.sep + problem + os.sep + 'problem-info.json'
        candidates = []
        with open(infoproblem, 'r') as f:
            fj = json.load(f)
            unk_folder = fj['unknown-folder']
            for attrib in fj['candidate-authors']:
                candidates.append(attrib['author-name'])

        # Building training set
        train_docs = []
        for candidate in candidates:
            train_docs.extend(read_files(path + os.sep + problem, candidate))
        train_texts = [text for i, (text, label) in enumerate(train_docs)]
        train_labels = [label for i, (text, label) in enumerate(train_docs)]
        vocabulary = extract_vocabulary(train_docs, n, ft)
        vectorizer = CountVectorizer(analyzer='char', ngram_range=(n, n),
                                     lowercase=False, vocabulary=vocabulary)
        train_data = vectorizer.fit_transform(train_texts)
        train_data = train_data.astype(float)
        for i, v in enumerate(train_texts):
            train_data[i] = train_data[i] / len(train_texts[i])
        print('\t', 'language: ', language[index])
        print('\t', len(candidates), 'candidate authors')
        print('\t', len(train_texts), 'known texts')
        print('\t', 'vocabulary size:', len(vocabulary))

        # Building test set
        test_docs = read_files(path + os.sep + problem, unk_folder)
        test_texts = [text for i, (text, label) in enumerate(test_docs)]
        test_data = vectorizer.transform(test_texts)
        test_data = test_data.astype(float)
        for i, v in enumerate(test_texts):
            test_data[i] = test_data[i] / len(test_texts[i])
        print('\t', len(test_texts), 'unknown texts')

        # Applying SVM
        max_abs_scaler = preprocessing.MaxAbsScaler()
        scaled_train_data = max_abs_scaler.fit_transform(train_data)
        scaled_test_data = max_abs_scaler.transform(test_data)
        if classifier == 'OneVsOne':
            clf = OneVsOneClassifier(LinearSVC(C=1)).fit(scaled_train_data,
                                                         train_labels)
        else:
            clf = OneVsRestClassifier(LinearSVC(C=1)).fit(scaled_train_data,
                                                          train_labels)
        predictions = clf.predict(scaled_test_data)

        # Writing output file
        out_data = []
        unk_filelist = glob.glob(path + os.sep + problem + os.sep +
                                 unk_folder + os.sep + '*.txt')
        pathlen = len(path + os.sep + problem + os.sep + unk_folder + os.sep)
        for i, v in enumerate(predictions):
            out_data.append({'unknown-text': unk_filelist[i][pathlen:],
                             'predicted-author': v})
        with open(outpath + os.sep + 'answers-' + problem + '.json', 'w') as f:
            json.dump(out_data, f, indent=4)
        print('\t', 'answers saved to file', 'answers-' + problem + '.json')
    print('elapsed time:', time.time() - start_time)
class RVC(BaseRVM, ClassifierMixin):
    """Relevance Vector Machine Classification.

    Implementation of Mike Tipping's Relevance Vector Machine for
    classification using the scikit-learn API.
    """

    def __init__(self, n_iter_posterior=50, **kwargs):
        """Copy params to object properties, no validation."""
        self.n_iter_posterior = n_iter_posterior
        super(RVC, self).__init__(**kwargs)

    def get_params(self, deep=True):
        """Return parameters as a dictionary."""
        params = super(RVC, self).get_params(deep=deep)
        params['n_iter_posterior'] = self.n_iter_posterior
        return params

    def _classify(self, m, phi):
        return expit(np.dot(phi, m))

    def _log_posterior(self, m, alpha, phi, t):
        y = self._classify(m, phi)
        log_p = -1 * (np.sum(np.log(y[t == 1]), 0) +
                      np.sum(np.log(1 - y[t == 0]), 0))
        log_p = log_p + 0.5 * np.dot(m.T, np.dot(np.diag(alpha), m))
        jacobian = np.dot(np.diag(alpha), m) - np.dot(phi.T, (t - y))
        return log_p, jacobian

    def _hessian(self, m, alpha, phi, t):
        y = self._classify(m, phi)
        B = np.diag(y * (1 - y))
        return np.diag(alpha) + np.dot(phi.T, np.dot(B, phi))

    def _posterior(self):
        result = minimize(
            fun=self._log_posterior,
            hess=self._hessian,
            x0=self.m_,
            args=(self.alpha_, self.phi, self.t),
            method='Newton-CG',
            jac=True,
            options={'maxiter': self.n_iter_posterior}
        )
        self.m_ = result.x
        self.sigma_ = np.linalg.inv(
            self._hessian(self.m_, self.alpha_, self.phi, self.t))

    def fit(self, X, y):
        """Check target values and fit model."""
        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)
        if n_classes < 2:
            raise ValueError("Need 2 or more classes.")
        elif n_classes == 2:
            self.t = np.zeros(y.shape)
            self.t[y == self.classes_[1]] = 1
            return super(RVC, self).fit(X, self.t)
        else:
            self.multi_ = OneVsOneClassifier(self)
            self.multi_.fit(X, y)
            return self

    def predict_proba(self, X):
        """Return an array of class probabilities."""
        phi = self._apply_kernel(X, self.relevance_)
        y = self._classify(self.m_, phi)
        return np.column_stack((1 - y, y))

    def predict(self, X):
        """Return an array of classes for each input."""
        if len(self.classes_) == 2:
            y = self.predict_proba(X)
            res = np.empty(y.shape[0], dtype=self.classes_.dtype)
            res[y[:, 1] <= 0.5] = self.classes_[0]
            res[y[:, 1] >= 0.5] = self.classes_[1]
            return res
        else:
            return self.multi_.predict(X)
print(">>>> Loading finished") feature_vec = np.zeros((len(data), kmeans.n_clusters)) for i in range(len(data)): mydata = data[i] # mydata = pca.transform(mydata) feature_seq = kmeans.predict(mydata) for j in feature_seq: feature_vec[i][feature_seq[j]] += 1 feature_vec = normalize(feature_vec) train_x, test_x, train_y, test_y = \ train_test_split(feature_vec, all_y, test_size = 1-train_ratio) print(feature_vec.shape) print(">>>> Data prepared") # for alpha_ in [0.1, 0.01, 0.02, 0.03, 0.05, 0.008, 0.009, 0.006, 0.005]: for alpha_ in [0.0001]: clf = OneVsOneClassifier(linear_model.SGDClassifier(alpha = alpha_, n_iter=150000, shuffle=True), n_jobs=4) clf.fit(train_x, train_y) print(" alpha", alpha_) print(" train score", clf.score(train_x, train_y)) print(" test score", clf.score(test_x, test_y)) print(clf) pred_y = clf.predict(test_x) print(test_x[:2,:5]) print(pred_y)
# Algorithms capable of handling multiple classes natively include Random
# Forest and Naive Bayes classifiers; alternatively you can combine multiple
# binary classifiers (e.g. linear models and SVMs):
# - One-versus-All strategy (OvA): train a binary classifier for each class,
#   get the decision score from each one, and choose the highest.
# - One-versus-One strategy (OvO): train a binary classifier for every pair
#   of classes.
# Scikit-Learn detects when you try to use a binary classification algorithm
# for a multiclass classification task, and it automatically runs OvA.
sgd_clf.fit(X_train, y_train)
# sgd_clf.predict([some_digit]) >>> array([5], dtype=uint8)
some_digit_scores = sgd_clf.decision_function([some_digit])  # decision scores for each class

# if you want to use an OvO classifier:
from sklearn.multiclass import OneVsOneClassifier
ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=42))
ovo_clf.fit(X_train, y_train)
ovo_clf.predict([some_digit])  # array([5], dtype=uint8)

# training the Random Forest classifier
forest_clf.fit(X_train, y_train)
forest_clf.predict([some_digit])  # array([5], dtype=uint8)
# to view the list of probabilities for each class
forest_clf.predict_proba([some_digit])
# array([[0.  , 0.  , 0.01, 0.08, 0.  , 0.9 , 0.  , 0.  , 0.  , 0.01]])

# evaluate the accuracy using cross_val_score
cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")
# array([0.8489802 , 0.87129356, 0.86988048])

# you can increase accuracy by scaling the inputs
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")
# array([0.89707059, 0.8960948 , 0.90693604])
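# A small self-contained sketch contrasting the two strategies described
# above (an addition; the digits dataset stands in for MNIST): OvR trains one
# classifier per class, OvO one per pair of classes.
from sklearn.datasets import load_digits
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier

X_digits, y_digits = load_digits(return_X_y=True)
ovr = OneVsRestClassifier(SGDClassifier(random_state=42)).fit(X_digits, y_digits)
ovo = OneVsOneClassifier(SGDClassifier(random_state=42)).fit(X_digits, y_digits)
print(len(ovr.estimators_))  # 10 binary classifiers, one per class
print(len(ovo.estimators_))  # 10 * 9 / 2 == 45, one per pair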
train_counts = vectorizer.fit_transform(train_dataset.data)
test_counts = vectorizer.transform(test_dataset.data)

X_train_tfidf = tfidf_trans.fit_transform(train_counts)
X_test_tfidf = tfidf_trans.transform(test_counts)

svd = TruncatedSVD(n_components=50)
X_train = svd.fit_transform(X_train_tfidf)
y_train = train_dataset.target
X_test = svd.transform(X_test_tfidf)
y_test = test_dataset.target

# One Vs One
svm_ovo = OneVsOneClassifier(SVC(kernel='linear')).fit(X_train, y_train)
ovo_train_pred = svm_ovo.predict(X_train)
ovo_test_pred = svm_ovo.predict(X_test)
print("SVM One Vs One Multiclass Classifier")
print("--- train dataset ---")
print_pred_info(y_train, ovo_train_pred)
print("--- test dataset ---")
print_pred_info(y_test, ovo_test_pred)

# One Vs Rest
svm_ovr = OneVsRestClassifier(SVC(kernel='linear')).fit(X_train, y_train)
ovr_train_pred = svm_ovr.predict(X_train)
ovr_test_pred = svm_ovr.predict(X_test)
print("SVM One Vs Rest Multiclass Classifier")
print("--- train dataset ---")
print_pred_info(y_train, ovr_train_pred)
print("--- test dataset ---")
print_pred_info(y_test, ovr_test_pred)
]
print('OvR Random Forest:')
print(classification_report(y_test, y_predicted_OvR_rdmf,
                            target_names=target_names))
print('')
print('OvR Random Forest confusion matrix:')
print(confusion_matrix(y_test, y_predicted_OvR_rdmf))

# One-versus-one random forest
OvO_forest = OneVsOneClassifier(forest_classification)
OvO_forest.fit(X_train, y_train)
y_predicted_OvO_rdmf = OvO_forest.predict(X_test)

print('')
print('OvO Random Forest classification accuracy score = %f' %
      accuracy_score(y_test, y_predicted_OvO_rdmf))
print('')

target_names = [
    le.inverse_transform([0])[0],
    le.inverse_transform([1])[0],
    le.inverse_transform([2])[0],
    le.inverse_transform([3])[0]
]
print('OvO Random Forest:')
print(classification_report(y_test, y_predicted_OvO_rdmf,
                            target_names=target_names))
def crosstemp_decoding_subspace_xval_ridge(X, y, indtrain, indtest, alpha=1,
                                           indsub=None, mask=None):
    '''Cross-temporal decoding with cross-validation in subspace

    Parameters
    ----------
    X : np.array<trials * bins * neurons>
        The data.
    y : np.array<trials>
        The targets.
    indtrain : np.array
        The indices of the training trials.
    indtest : np.array
        The indices of the testing trials.
    alpha : float
        The L2 "ridge" regularization parameter.
    indsub : np.array
        The indices of the trials used to define the subspace. They must be
        different from the training and testing indices.
    mask : np.array<nbins> of bool
        A mask to select which bins are used to define the subspace.
        E.g. np.array([False, False, True, True, False]): here only bins 2
        and 3 are used to define the subspace.

    Returns
    -------
    correct : np.array<bins * bins> of booleans
        True if the output is correct, False otherwise.
    testout : np.array<bins * bins * test trials>
        The output of the classifier for each pair of train and test bins.
    '''
    assert len(set(indtrain) & set(indtest)) == 0
    if indsub is not None:
        subspace = True
        assert len(set(indtrain) & set(indsub)) == 0
        assert len(set(indtest) & set(indsub)) == 0
    else:
        subspace = False

    nbins = X.shape[1]
    labels = np.unique(y)
    testout = np.empty((len(indtest), nbins, nbins))
    correct = np.empty((nbins, nbins))
    Xtrain, Xtest = X[indtrain], X[indtest]
    ytrain, ytest = y[indtrain], y[indtest]

    # Split subspace and training if subspace indices are provided
    if subspace:
        if mask is None:
            mask = range(nbins)
        ysub = y[indsub]
        Xsub = X[:, mask][indsub].mean(1)  # Averaging over time bins
        Xsub = np.stack([Xsub[ysub == label].mean(0) for label in labels])
        subspace = PCA()
        subspace.fit(Xsub)

    # A decoder is trained on each bin, and each decoder is tested on every bin
    for itrain in range(nbins):
        if subspace:
            Xbintrain = subspace.transform(Xtrain[:, itrain])
        else:
            Xbintrain = Xtrain[:, itrain]
        model = OneVsOneClassifier(
            RidgeClassifier(alpha=alpha, solver='cholesky'))
        model.fit(Xbintrain, ytrain)
        # The test data is reshaped to test all the bins in a single shot
        # (much faster)
        Xtest_ = Xtest.reshape(Xtest.shape[0] * Xtest.shape[1], Xtest.shape[2])
        if subspace:
            Xtest_ = subspace.transform(Xtest_)
        preds = model.predict(Xtest_)
        # The output is reshaped to the original shape of the test data
        preds = preds.reshape(len(indtest), nbins)
        accs = (preds == ytest[:, None]).mean(0)
        testout[:, itrain, :] = preds
        correct[itrain, :] = accs
    return correct, testout
############################## OvO ######################################################
model = SVC(decision_function_shape='ovo')
# fit model
model.fit(X, y)
# make predictions
yhat = model.predict(X)

# define model
model = SVC()
# define ovo strategy
ovo = OneVsOneClassifier(model)
# fit model
ovo.fit(X, y)
# make predictions
y_pred = ovo.predict(X)

#########################################################################################
# Merge classes 1 and 2 into a single class and try the binary algorithm ################
#########################################################################################
training['VAR_CLASS'] = training['VAR_CLASS'].replace(2, 1)
# The problem now becomes binary, so the best-known algorithms can be used
training['VAR_CLASS']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=123)
nr = NearMiss()
X_train, y_train = nr.fit_sample(x_train, y_train)
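# A hedged side-by-side sketch of the two OvO routes used above: SVC's
# built-in decision_function_shape='ovo' exposes one raw score per class pair,
# while the OneVsOneClassifier wrapper aggregates the pairwise votes into one
# score per class. Shown on the 10-class digits data, where the shapes differ.
from sklearn.datasets import load_digits
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import SVC

Xd, yd = load_digits(return_X_y=True)
print(SVC(decision_function_shape='ovo').fit(Xd, yd).decision_function(Xd).shape)  # (1797, 45) pairwise scores
print(OneVsOneClassifier(SVC()).fit(Xd, yd).decision_function(Xd).shape)           # (1797, 10) per-class scores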
def decoding_subspace_xval_ridge(X, y, trainind, testind, alpha=1,
                                 subspace=True, mask=None):
    '''Decoding with cross-validation in subspace

    Parameters
    ----------
    X : list of np.arrays
        [ntrials] np.array<nbins*nneurons>
    y : list of np.arrays
        [ntrials] np.array<nbins>
    trainind : np.array
        The indices of the training trials.
    testind : np.array
        The indices of the testing trials.

    Returns
    -------
    correct : np.array<nbins*nbins> of booleans
        True if the output is correct, False otherwise.
    testout : np.array<nbins*nbins*ntesttrials>
        The output of the classifier for each pair of train and test bins.
    '''
    sub_train_ratio = .5
    nbins = X.shape[1]
    labels = np.unique(y)
    testout = np.empty((len(testind), nbins))
    correct = np.empty(nbins)
    if mask is None:
        mask = range(nbins)

    # Split subspace and training
    if subspace:
        nsubtrials = int(len(trainind) * sub_train_ratio)
        subind, trainind = trainind[:nsubtrials], trainind[nsubtrials:]
        ysub = y[subind]
        Xsub = X[:, mask][subind].mean(1)  # Averaging over time bins
        Xsub = np.stack([Xsub[ysub == label].mean(0) for label in labels])
        subspace = PCA()
        subspace.fit(Xsub)

    Xtrain, Xtest = X[trainind], X[testind]
    ytrain, ytest = y[trainind], y[testind]

    for ibin in range(nbins):
        if subspace:
            Xbintrain = subspace.transform(Xtrain[:, ibin])
        else:
            Xbintrain = Xtrain[:, ibin]
        model = OneVsOneClassifier(
            RidgeClassifier(alpha=alpha, solver='cholesky'))
        model.fit(Xbintrain, ytrain)
        if subspace:
            Xbintest = subspace.transform(Xtest[:, ibin])
        else:
            Xbintest = Xtest[:, ibin]
        out = model.predict(Xbintest)
        testout[:, ibin] = out
        correct[ibin] = np.mean(out == ytest)
    return correct, testout
__author__ = 'BfireLai'
__mtime__ = '2018/7/25'
"""
from sklearn import datasets
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC

# Load the data
iris = datasets.load_iris()
# Get x and y
x, y = iris.data, iris.target
print('number of samples: %d, number of features: %d' % x.shape)
print(y)

# Build the model
clf = OneVsOneClassifier(LinearSVC(random_state=0))
# Train the model
clf.fit(x, y)
# Print the predicted values
print(clf.predict(x))

# Print the model attributes
k = 1
for item in clf.estimators_:
    print("model #%d" % k)
    print(item)
    k += 1
print(clf.classes_)
def main():
    """Main function"""

    def show_img(digit):
        """Plots one digit on the screen"""
        img = digit.reshape(28, 28)
        plt.imshow(img, cmap=matplotlib.cm.binary, interpolation='nearest')
        plt.axis("off")
        plt.show()

    # def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    #     """Plots precision and recall vs threshold"""
    #     plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    #     plt.plot(thresholds, recalls[:-1], "g--", label="Recall")
    #     plt.xlabel("Threshold")
    #     plt.legend(loc="upper left")
    #     plt.ylim([0, 1])
    #     plt.xlim([0, 1])

    # def plot_precision_vs_recall(precisions, recalls):
    #     """Plots precision vs recall"""
    #     plt.plot(recalls, precisions)
    #     plt.ylim([0, 1])
    #     plt.xlim([0, 1])
    #     plt.xlabel("Recall")
    #     plt.ylabel("Precision")

    def plot_roc_curve(fpr, tpr, label=None):
        """Plots ROC curve"""
        plt.plot(fpr, tpr, linewidth=2, label=label)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.axis([0, 1, 0, 1])
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")

    def plot_digits(instances, images_per_row=10, **options):
        """Plots groups of digits"""
        size = 28
        images_per_row = min(len(instances), images_per_row)
        images = [instance.reshape(size, size) for instance in instances]
        n_rows = (len(instances) - 1) // images_per_row + 1
        row_images = []
        n_empty = n_rows * images_per_row - len(instances)
        images.append(np.zeros((size, size * n_empty)))
        for row in range(n_rows):
            rimages = images[row * images_per_row: (row + 1) * images_per_row]
            row_images.append(np.concatenate(rimages, axis=1))
        image = np.concatenate(row_images, axis=0)
        plt.imshow(image, cmap=matplotlib.cm.binary, **options)
        plt.axis("off")

    # fetching and segmenting the dataset
    x_init, y_init = fetch_openml('mnist_784', return_X_y=True)
    x_train = x_init[:60000]
    x_test = x_init[60000:]
    y_train = y_init[:60000]
    # y_test = y_init[60000:]

    # shuffling the inputs
    shuffled_index = np.random.permutation(60000)
    x_train = x_train[shuffled_index]
    y_train = y_train[shuffled_index]

    # testing a binary classifier:
    # convert the categories ('1', '2', ..., '9', '0') to true ('5') or false
    y_train_5 = (y_train == '5')
    # y_test_5 = (y_test == '5')

    # creating and training a classifier
    sgd_clf = SGDClassifier(random_state=42)
    sgd_clf.fit(x_train, y_train_5)

    # scoring the classifier
    print(cross_val_score(sgd_clf, x_train, y_train_5, cv=3,
                          scoring="accuracy"))
    y_pred = cross_val_predict(sgd_clf, x_train, y_train_5)

    # plotting results in a confusion matrix
    print(confusion_matrix(y_train_5, y_pred))

    # getting precision and recall
    precision = precision_score(y_train_5, y_pred)  # how many correctly classified
    recall = recall_score(y_train_5, y_pred)  # how many positives were detected
    f1_ = f1_score(y_train_5, y_pred)
    print(precision, recall, f1_)

    # getting the threshold from the classifier instead of the prediction
    print(sgd_clf.decision_function(x_init[10000].reshape(1, -1)))  # get score for class

    # customizing the threshold to tune recall and precision
    y_scores = cross_val_predict(sgd_clf, x_train, y_train_5,
                                 method="decision_function")
    precisions, recalls, thresholds = precision_recall_curve(y_train_5,
                                                             y_scores)
    print(precisions, recalls, thresholds)
    y_train_5_ht = y_scores > -200000
    print(precision_score(y_train_5, y_train_5_ht),
          recall_score(y_train_5, y_train_5_ht))

    # ROC curve
    fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)
    plot_roc_curve(fpr, tpr)
    print(roc_auc_score(y_train_5, y_scores))

    # Comparison between SGD and RandomForest classifiers using ROC metrics
    forest_clf = RandomForestClassifier(random_state=42)
    y_probas_forest = \
        cross_val_predict(forest_clf, x_train, y_train_5, cv=3,
                          method="predict_proba")
    y_scores_forest = y_probas_forest[:, 1]  # get positive class probability
    fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5,
                                                          y_scores_forest)
    print(fpr_forest, tpr_forest, thresholds_forest)

    # plot comparison between SGD and RandomForests
    plt.plot(fpr, tpr, "b:", label="SGD")
    plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
    plt.legend(loc="lower right")
    plt.show()
    print(roc_auc_score(y_train_5, y_scores),
          roc_auc_score(y_train_5, y_scores_forest))

    # checking recall and scores of the new model
    y_pred_forest = y_scores_forest > 0.5
    precision_forest = precision_score(y_train_5, y_pred_forest)
    recall_forest = recall_score(y_train_5, y_pred_forest)
    f1_forest = f1_score(y_train_5, y_pred_forest)
    print(precision_forest, recall_forest, f1_forest)

    # MULTICLASS classification
    # using regular classifiers (which use OvO or OvR strategies)
    sgd_clf.fit(x_train, y_train)
    some_digit_scores = sgd_clf.decision_function(x_init[1000].reshape(1, -1))  # '0'
    np.argmax(some_digit_scores)  # returns the index of the max element
    print(sgd_clf.classes_)  # compare it to the classes

    # force classifiers to use OvO or OvR
    ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=42))
    ovo_clf.fit(x_train, y_train)
    ovo_clf.predict(x_init[1000].reshape(1, -1))
    len(ovo_clf.estimators_)  # get how many classifiers were trained

    # Using RandomForests, which is natively a multiclass classifier
    forest_clf.fit(x_train, y_train)
    forest_clf.predict(x_init[1000].reshape(1, -1))
    forest_clf.predict_proba(x_init[1000].reshape(1, -1))
    print(forest_clf.classes_)

    # Improving results using a Scaler (why does it work since they are images?)
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(x_train.astype(np.float64))
    cross_val_score(sgd_clf, x_train_scaled, y_train, cv=5, scoring="accuracy")

    # Error analysis
    sgd_clf.fit(x_train_scaled, y_train)
    y_pred = cross_val_predict(sgd_clf, x_train_scaled, y_train, cv=3)
    conf_mx = confusion_matrix(y_train, y_pred)

    # Make the confusion matrix graphical
    plt.matshow(conf_mx, cmap=plt.cm.gray)
    plt.show()

    # Convert from absolute values to percentages
    row_sums = conf_mx.sum(axis=1, keepdims=True)
    norm_conf_mx = conf_mx / row_sums  # review python vector operations

    # Plot only the errors
    np.fill_diagonal(norm_conf_mx, 0)  # numpy has some obscure functions!
    plt.matshow(norm_conf_mx, cmap=plt.cm.gray)

    # Plotting individual errors
    cl_a, cl_b = '3', '5'
    x_aa = x_train[(y_train == cl_a) & (y_pred == cl_a)]
    x_ab = x_train[(y_train == cl_a) & (y_pred == cl_b)]
    x_ba = x_train[(y_train == cl_b) & (y_pred == cl_a)]
    x_bb = x_train[(y_train == cl_b) & (y_pred == cl_b)]
    plt.figure(figsize=(8, 8))
    plt.subplot(221)
    plot_digits(x_aa[:25], images_per_row=5)
    plt.subplot(222)
    plot_digits(x_ab[:25], images_per_row=5)
    plt.subplot(223)
    plot_digits(x_ba[:25], images_per_row=5)
    plt.subplot(224)
    plot_digits(x_bb[:25], images_per_row=5)
    plt.show()

    # Multilabel classification
    y_train_large = (y_train.astype(int) >= 7)
    y_train_odd = (y_train.astype(int) % 2 == 1)
    y_multilabel = np.c_[y_train_large, y_train_odd]

    # Multilabel classifier: KNeighbors
    knn_clf = KNeighborsClassifier()
    knn_clf.fit(x_train, y_multilabel)
    knn_clf.predict(x_init[1000].reshape(1, -1))

    # Computing a metric
    # Note: using a smaller sample due to the time it takes for knn
    y_train_knn_pred = cross_val_predict(knn_clf, x_train[:1000],
                                         y_multilabel[:1000], cv=3)
    f1_score(y_multilabel[:1000], y_train_knn_pred, average='macro')

    # Multioutput multiclass classification
    noise = np.random.randint(0, 100, (len(x_train), 784))
    x_train_mod = x_train + noise
    noise = np.random.randint(0, 100, (len(x_test), 784))
    x_test_mod = x_test + noise
    y_train_mod = x_train
    y_test_mod = x_test
    knn_clf.fit(x_train_mod, y_train_mod)

    # checking the cleaning-up results
    img_id = 0
    clean_digit = knn_clf.predict(x_test_mod[img_id].reshape(1, -1))
    plt.figure(figsize=(8, 8))
    plt.subplot(311)
    show_img(x_test_mod[img_id])
    plt.subplot(312)
    show_img(y_test_mod[img_id])
    plt.subplot(313)
    show_img(clean_digit)
    plt.show()
clf = svc.fit(X_train, y_train)
pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)
print(accuracy_score(pred_train, y_train))
print(accuracy_score(pred_test, y_test))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
f, ax, sc, txts = scatter(digits_proj, target)
ax.contour(xx, yy, Z, c="k", linewidths=1.2)

## Naive Bayes ##
clf = OneVsOneClassifier(GaussianNB())
clf.fit(X_train, y_train)
pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)
print(accuracy_score(pred_train, y_train))
print(accuracy_score(pred_test, y_test))

## QDA ##
# clf = OneVsRestClassifier(QuadraticDiscriminantAnalysis())  # overwritten below
clf = OneVsOneClassifier(QuadraticDiscriminantAnalysis())
clf.fit(X_train, y_train)
pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)
print(accuracy_score(pred_train, y_train))
print(accuracy_score(pred_test, y_test))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    y = []
    for k in keys:
        for x in data[k]:
            X.append(x)
            y.append(k[-3:])  # HACK
            # y.append(k)
    return np.array(X, np.float32), np.array(y, np.int32)


if __name__ == '__main__':
    X, y = load_data(sys.argv[1])
    Xt, yt = load_data(sys.argv[2])

    # === train ===
    clf = OneVsOneClassifier(
        svm.LinearSVC(
            verbose=1,
            max_iter=10000,
            dual=False,
        ),
        n_jobs=5)  # pass n_jobs by keyword; recent sklearn rejects it positionally
    clf.fit(X, y)
    pickle.dump(clf, open('svm.pkl', 'wb'))
    # clf = pickle.load(open('svm.pkl', 'rb'))

    # === test ===
    prediction = clf.predict(Xt)
    correct = prediction == yt
    print("\n========")
    print("Accuracy: {}".format(sum(correct) / len(correct)))
from sklearn.svm import LinearSVC
from sklearn import metrics

data_tr_r = np.loadtxt('multitest_out.csv', delimiter=',')
data_ts_r = np.loadtxt('multitrain_out.csv', delimiter=',')

data_tr = data_tr_r[:, :-1]
data_ts = data_ts_r[:, :-1]
label_tr = data_tr_r[:, -1]
label_ts = data_ts_r[:, -1]

# Learn to predict each class against every other class
clf = OneVsOneClassifier(LinearSVC(random_state=0))
OvsO = clf.fit(data_tr, label_tr)
result = clf.predict(data_ts)
# accuracy = clf.score(data_ts, label_ts)
accuracy = metrics.accuracy_score(result, label_ts)
error_vector = result - label_ts
error = 0

p_data = clf.fit(data_tr, label_tr).decision_function(data_ts)
p_data = p_data[:, 1]

conf_mat = metrics.confusion_matrix(label_ts, result)
precision = metrics.precision_score(label_ts, result, average=None)
recall = metrics.recall_score(label_ts, result, average=None)
print("confusion_matrix:")
print(conf_mat)
print("precision:", precision)
print("recall:", recall)
# In[32]:

some_digit_scores = sgd_clf.decision_function([some_digit])
some_digit_scores

# Note that Scikit-Learn classifiers detect multiclass classification
# problems and apply the OvA strategy by default.
#
# Another approach is to train as many classifiers as there are pairs of
# classes - each of them is responsible for predicting which of its two
# classes is more likely. The target class is selected as the one that has
# won the most such 'duels'. This strategy is called one-versus-one (OvO).

# In[33]:

from sklearn.multiclass import OneVsOneClassifier
ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=42))
ovo_clf.fit(X_train, y_train)
ovo_clf.predict([some_digit])  # Correct prediction!

# Note that with this strategy the number of classifiers grows quadratically
# with the number of classes, so it is suitable for small and medium datasets
# with a moderate number of classes (it is the default for support vector
# machines).
#
# As before, you can try to train a random forest model.

# In[34]:

forest_clf.fit(X_train, y_train)
forest_clf.predict([some_digit])

# In this case, it is more efficient to apply the OvA strategy (the numerical
# results represent probabilities of belonging to a specific class).

# In[35]:

forest_clf.predict_proba([some_digit])
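# A quick sanity check of the quadratic growth mentioned above (a sketch
# added here, not part of the original notebook): with N classes, OvO trains
# N*(N-1)/2 binary classifiers, so the 10 digit classes yield 45 estimators.

n_classes = len(ovo_clf.classes_)
assert len(ovo_clf.estimators_) == n_classes * (n_classes - 1) // 2  # 10*9/2 == 45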
# Make an array of predictions on the test set
pred = svm.predict(X_test)

# Output the hit rate and the confusion matrix for each model
print("SVM: ")
print(svm.score(X_train, y_train))
print(svm.score(X_test, y_test))
# print(confusion_matrix(pred, y_test))

svm2 = OneVsOneClassifier(LinearSVC(C=100.))
svm2.fit(X_train, y_train)

# Make an array of predictions on the test set
pred = svm2.predict(X_test)

# Output the hit rate and the confusion matrix for each model
print("LinearSVC: ")
print(svm2.score(X_train, y_train))
print(svm2.score(X_test, y_test))

from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=2)
neigh.fit(X_train, y_train)
pred = neigh.predict(X_test)
print("knn: ")
print(neigh.score(X_train, y_train))
print(neigh.score(X_test, y_test))
cls = sgd_clf.classes_[i]
print("Index of maximum value in some_digit_scores:", i,
      "\nAll the classified classes:", sgd_classes,
      "\nClass to which it belongs:", cls)

# When a classifier is trained, it stores the list of target classes in its
# classes_ attribute, ordered by value.

# In[35]:

# If you want to force Scikit-Learn to use one-versus-one or one-versus-all,
# you can use the OneVsOneClassifier or OneVsRestClassifier classes.
# For example, this code creates a multiclass classifier using the OvO
# strategy, based on an SGD classifier:
from sklearn.multiclass import OneVsOneClassifier
ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=42))
ovo_clf.fit(X_train.reshape(60000, 784), y_train)
ovo_clf.predict(some_digit_image.reshape(1, 784))
# len(ovo_clf.estimators_) returns the N_classes x (N_classes - 1) / 2
# estimators used for predictions.

# In[36]:

# Training a RandomForestClassifier:
forest_clf.fit(X_train.reshape(60000, 784), y_train)
forest_clf.predict(some_digit_image.reshape(1, 784))

# In[37]:

# This time Scikit-Learn did not have to run OvA or OvO because Random Forest
# classifiers can directly classify instances into multiple classes. You can
# call predict_proba() to get the list of probabilities that the classifier
# assigned to each instance for each class:
forest_clf.predict_proba(some_digit_image.reshape(1, 784))
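# Illustrative sketch (an addition, assuming numpy is imported as np): the
# per-class scores returned by decision_function are ordered like classes_,
# so taking the argmax of the scores recovers the predicted label.
scores = ovo_clf.decision_function(some_digit_image.reshape(1, 784))
print(ovo_clf.classes_[np.argmax(scores)])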
class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
    """Gaussian process classification (GPC) based on Laplace approximation.

    The implementation is based on Algorithm 3.1, 3.2, and 5.1 of
    Gaussian Processes for Machine Learning (GPML) by Rasmussen and Williams.

    Internally, the Laplace approximation is used for approximating the
    non-Gaussian posterior by a Gaussian.

    Currently, the implementation is restricted to using the logistic link
    function. For multi-class classification, several binary one-versus-rest
    classifiers are fitted. Note that this class thus does not implement
    a true multi-class Laplace approximation.

    Parameters
    ----------
    kernel : kernel object
        The kernel specifying the covariance function of the GP. If None is
        passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
        the kernel's hyperparameters are optimized during fitting.

    optimizer : string or callable, optional (default: "fmin_l_bfgs_b")
        Can either be one of the internally supported optimizers for
        optimizing the kernel's parameters, specified by a string, or an
        externally defined optimizer passed as a callable. If a callable is
        passed, it must have the signature::

            def optimizer(obj_func, initial_theta, bounds):
                # * 'obj_func' is the objective function to be maximized,
                #   which takes the hyperparameters theta as parameter and an
                #   optional flag eval_gradient, which determines if the
                #   gradient is returned additionally to the function value
                # * 'initial_theta': the initial value for theta, which can
                #   be used by local optimizers
                # * 'bounds': the bounds on the values of theta
                ....
                # Returned are the best found hyperparameters theta and
                # the corresponding value of the target function.
                return theta_opt, func_min

        Per default, the 'fmin_l_bfgs_b' algorithm from scipy.optimize
        is used. If None is passed, the kernel's parameters are kept fixed.
        Available internal optimizers are::

            'fmin_l_bfgs_b'

    n_restarts_optimizer : int, optional (default: 0)
        The number of restarts of the optimizer for finding the kernel's
        parameters which maximize the log-marginal likelihood. The first run
        of the optimizer is performed from the kernel's initial parameters,
        the remaining ones (if any) from thetas sampled log-uniform randomly
        from the space of allowed theta-values. If greater than 0, all bounds
        must be finite. Note that n_restarts_optimizer=0 implies that one
        run is performed.

    max_iter_predict : int, optional (default: 100)
        The maximum number of iterations in Newton's method for approximating
        the posterior during predict. Smaller values will reduce computation
        time at the cost of worse results.

    warm_start : bool, optional (default: False)
        If warm-starts are enabled, the solution of the last Newton iteration
        on the Laplace approximation of the posterior mode is used as
        initialization for the next call of _posterior_mode(). This can speed
        up convergence when _posterior_mode is called several times on
        similar problems as in hyperparameter optimization.

    copy_X_train : bool, optional (default: True)
        If True, a persistent copy of the training data is stored in the
        object. Otherwise, just a reference to the training data is stored,
        which might cause predictions to change if the data is modified
        externally.

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    multi_class : string, default: "one_vs_rest"
        Specifies how multi-class classification problems are handled.
        Supported are "one_vs_rest" and "one_vs_one". In "one_vs_rest",
        one binary Gaussian process classifier is fitted for each class,
        which is trained to separate this class from the rest. In
        "one_vs_one", one binary Gaussian process classifier is fitted for
        each pair of classes, which is trained to separate these two classes.
        The predictions of these binary predictors are combined into
        multi-class predictions. Note that "one_vs_one" does not support
        predicting probability estimates.

    n_jobs : int, optional, default: 1
        The number of jobs to use for the computation. If -1 all CPUs are
        used. If 1 is given, no parallel computing code is used at all, which
        is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs)
        are used. Thus for n_jobs = -2, all CPUs but one are used.

    Attributes
    ----------
    kernel_ : kernel object
        The kernel used for prediction. In case of binary classification,
        the structure of the kernel is the same as the one passed as
        parameter but with optimized hyperparameters. In case of multi-class
        classification, a CompoundKernel is returned which consists of the
        different kernels used in the one-versus-rest classifiers.

    log_marginal_likelihood_value_ : float
        The log-marginal-likelihood of ``self.kernel_.theta``

    classes_ : array-like, shape = (n_classes,)
        Unique class labels.

    n_classes_ : int
        The number of classes in the training data

    .. versionadded:: 0.18
    """

    def __init__(self, kernel=None, optimizer="fmin_l_bfgs_b",
                 n_restarts_optimizer=0, max_iter_predict=100,
                 warm_start=False, copy_X_train=True, random_state=None,
                 multi_class="one_vs_rest", n_jobs=1):
        self.kernel = kernel
        self.optimizer = optimizer
        self.n_restarts_optimizer = n_restarts_optimizer
        self.max_iter_predict = max_iter_predict
        self.warm_start = warm_start
        self.copy_X_train = copy_X_train
        self.random_state = random_state
        self.multi_class = multi_class
        self.n_jobs = n_jobs

    def fit(self, X, y):
        """Fit Gaussian process classification model

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training data

        y : array-like, shape = (n_samples,)
            Target values, must be binary

        Returns
        -------
        self : returns an instance of self.
        """
        X, y = check_X_y(X, y, multi_output=False)

        self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace(
            self.kernel, self.optimizer, self.n_restarts_optimizer,
            self.max_iter_predict, self.warm_start, self.copy_X_train,
            self.random_state)

        self.classes_ = np.unique(y)
        self.n_classes_ = self.classes_.size
        if self.n_classes_ == 1:
            raise ValueError("GaussianProcessClassifier requires 2 or more "
                             "distinct classes. Only class %s present."
                             % self.classes_[0])
        if self.n_classes_ > 2:
            if self.multi_class == "one_vs_rest":
                self.base_estimator_ = \
                    OneVsRestClassifier(self.base_estimator_,
                                        n_jobs=self.n_jobs)
            elif self.multi_class == "one_vs_one":
                self.base_estimator_ = \
                    OneVsOneClassifier(self.base_estimator_,
                                       n_jobs=self.n_jobs)
            else:
                raise ValueError("Unknown multi-class mode %s"
                                 % self.multi_class)

        self.base_estimator_.fit(X, y)

        if self.n_classes_ > 2:
            self.log_marginal_likelihood_value_ = np.mean(
                [estimator.log_marginal_likelihood()
                 for estimator in self.base_estimator_.estimators_])
        else:
            self.log_marginal_likelihood_value_ = \
                self.base_estimator_.log_marginal_likelihood()

        return self

    def predict(self, X):
        """Perform classification on an array of test vectors X.
        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        Returns
        -------
        C : array, shape = (n_samples,)
            Predicted target values for X, values are from ``classes_``
        """
        check_is_fitted(self, ["classes_", "n_classes_"])
        X = check_array(X)
        return self.base_estimator_.predict(X)

    def predict_proba(self, X):
        """Return probability estimates for the test vector X.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        Returns
        -------
        C : array-like, shape = (n_samples, n_classes)
            Returns the probability of the samples for each class in
            the model. The columns correspond to the classes in sorted
            order, as they appear in the attribute `classes_`.
        """
        check_is_fitted(self, ["classes_", "n_classes_"])
        if self.n_classes_ > 2 and self.multi_class == "one_vs_one":
            raise ValueError("one_vs_one multi-class mode does not support "
                             "predicting probability estimates. Use "
                             "one_vs_rest mode instead.")
        X = check_array(X)
        return self.base_estimator_.predict_proba(X)

    @property
    def kernel_(self):
        if self.n_classes_ == 2:
            return self.base_estimator_.kernel_
        else:
            return CompoundKernel(
                [estimator.kernel_
                 for estimator in self.base_estimator_.estimators_])

    def log_marginal_likelihood(self, theta=None, eval_gradient=False):
        """Returns log-marginal likelihood of theta for training data.

        In the case of multi-class classification, the mean log-marginal
        likelihood of the one-versus-rest classifiers is returned.

        Parameters
        ----------
        theta : array-like, shape = (n_kernel_params,) or None
            Kernel hyperparameters for which the log-marginal likelihood is
            evaluated. In the case of multi-class classification, theta may
            be the hyperparameters of the compound kernel or of an individual
            kernel. In the latter case, all individual kernels get assigned
            the same theta values. If None, the precomputed
            log_marginal_likelihood of ``self.kernel_.theta`` is returned.

        eval_gradient : bool, default: False
            If True, the gradient of the log-marginal likelihood with respect
            to the kernel hyperparameters at position theta is returned
            additionally. Note that gradient computation is not supported
            for non-binary classification. If True, theta must not be None.

        Returns
        -------
        log_likelihood : float
            Log-marginal likelihood of theta for training data.

        log_likelihood_gradient : array, shape = (n_kernel_params,), optional
            Gradient of the log-marginal likelihood with respect to the
            kernel hyperparameters at position theta. Only returned when
            eval_gradient is True.
        """
        check_is_fitted(self, ["classes_", "n_classes_"])

        if theta is None:
            if eval_gradient:
                raise ValueError(
                    "Gradient can only be evaluated for theta!=None")
            return self.log_marginal_likelihood_value_

        theta = np.asarray(theta)
        if self.n_classes_ == 2:
            return self.base_estimator_.log_marginal_likelihood(
                theta, eval_gradient)
        else:
            if eval_gradient:
                raise NotImplementedError(
                    "Gradient of log-marginal-likelihood not implemented "
                    "for multi-class GPC.")
            estimators = self.base_estimator_.estimators_
            n_dims = estimators[0].kernel_.n_dims
            if theta.shape[0] == n_dims:  # use same theta for all sub-kernels
                return np.mean(
                    [estimator.log_marginal_likelihood(theta)
                     for i, estimator in enumerate(estimators)])
            elif theta.shape[0] == n_dims * self.classes_.shape[0]:
                # theta for compound kernel
                return np.mean(
                    [estimator.log_marginal_likelihood(
                        theta[n_dims * i:n_dims * (i + 1)])
                     for i, estimator in enumerate(estimators)])
            else:
                raise ValueError("Shape of theta must be either %d or %d. "
                                 "Obtained theta with shape %d."
                                 % (n_dims, n_dims * self.classes_.shape[0],
                                    theta.shape[0]))
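# A hedged usage sketch of the multi_class switch implemented in the class
# above, run on iris via the standard scikit-learn API (treat the exact
# kernel choice as an illustrative assumption):
from sklearn.datasets import load_iris
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

X_iris, y_iris = load_iris(return_X_y=True)
gpc = GaussianProcessClassifier(kernel=1.0 * RBF(1.0),
                                multi_class="one_vs_one")
gpc.fit(X_iris, y_iris)
print(gpc.predict(X_iris[:5]))
# gpc.predict_proba(X_iris[:5])  # would raise ValueError in one_vs_one mode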
y_test = labelEncoder_y_test.fit_transform(y_test)

# Feature scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# Fitting a One-vs-One SVM to the training set
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC
clf = OneVsOneClassifier(LinearSVC(random_state=0))
clf.fit(X_train, y_train)

# Predicting the result
y_pred = clf.predict(X_test)

# Building the confusion matrix and classification report
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)
print(cr)

# Calculating FP and FPR
TP0 = float(cm[0][0])
TP1 = float(cm[1][1])
TP2 = float(cm[2][2])
FN0 = float(cm[0][1]) + float(cm[0][2])
FN1 = float(cm[1][0]) + float(cm[1][2])
FN2 = float(cm[2][0]) + float(cm[2][1])
FP0 = float(cm[1][0]) + float(cm[2][0])
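# The per-class tallies above can be computed for all classes at once; a
# small vectorized sketch (an addition, reusing `cm` from above):
import numpy as np
TP = np.diag(cm).astype(float)
FN = cm.sum(axis=1) - TP  # row i: true class i predicted as something else
FP = cm.sum(axis=0) - TP  # column i: other classes predicted as class i
TN = cm.sum() - (TP + FN + FP)
FPR = FP / (FP + TN)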
y = training['country_destination']
x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3,
                                                      random_state=None)

# Train classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsOneClassifier
clf = OneVsOneClassifier(RandomForestClassifier(n_estimators=50, n_jobs=5))
clf.fit(x_train, y_train)
# The OvO wrapper has no feature_importances_ of its own (the original call
# would raise AttributeError); inspect the underlying pairwise forests instead
for est in clf.estimators_:
    print(est.feature_importances_)

# Run predictions
from sklearn.metrics import confusion_matrix, accuracy_score
y_preds = clf.predict(x_valid)
print(confusion_matrix(y_valid, y_preds))
print("Accuracy: %f" % (accuracy_score(y_valid, y_preds)))

f = open('randomForest_take2.txt', 'w')
f.write(str(confusion_matrix(y_valid, y_preds)))
f.write("\nAccuracy: %f" % (accuracy_score(y_valid, y_preds)))
f.write("\nOneVsOneClassifier()")
f.write("\nclf = RandomForestClassifier(n_estimators=1000)")

# Now on to the final submission
y_final = pd.DataFrame(clf.predict(testing.iloc[:, 1:]).reshape([62096, ]))
numbahs = testing['id']
df = pd.concat([numbahs, y_final], axis=1)
df.columns = ['id', 'country']
df.to_csv("randomForest_take2.csv", index=False)
print(test_ngrams.shape)

parameters = {
    'C': np.logspace(-10, 3, 20, endpoint=True, base=2.71828).tolist()
}
model = LinearSVC(penalty='l2', max_iter=10000, class_weight='balanced')
clf = GridSearchCV(model, parameters, n_jobs=-1)
# Tune C on the training n-grams; fitting the search on the test set (as the
# original did) leaks test labels into model selection
clf.fit(train_ngrams, train_data_label)
print('Optimal Lambda: ', clf.best_params_)

classifier = OneVsOneClassifier(
    LinearSVC(C=clf.best_params_['C'], penalty='l2', max_iter=10000,
              class_weight='balanced'),
    n_jobs=-1).fit(train_ngrams, train_data_label)
predictedValue = classifier.predict(test_ngrams)
print(classifier.score(train_ngrams, train_data_label))
print(classifier.score(test_ngrams, test_data_label))

parameters = {
    'C': np.logspace(-10, 3, 20, endpoint=True, base=2.71828).tolist(),
    'kernel': ['linear', 'rbf'],
    'decision_function_shape': ['ovo', 'ovr']
}
model = SVC(class_weight='balanced', max_iter=10000)
clf = GridSearchCV(model, parameters, n_jobs=-1)
clf.fit(train_ngrams, train_data_label)
print('Optimal Lambda: ', clf.best_params_)

classifieRBF = SVC(
    kernel=clf.best_params_['kernel'],
print(79 * "_")
print('OvO', cv_scores_ovo.mean())
print('OvA', cv_scores_ova.mean())

plt.figure(figsize=(4, 3))
plt.boxplot([cv_scores_ova, cv_scores_ovo])
plt.xticks([1, 2], ['One vs All', 'One vs One'])
plt.title('Prediction: accuracy score')

### Plot a confusion matrix ###################################################
# Fit on the first 10 sessions and plot a confusion matrix on the
# last 2 sessions
from sklearn.metrics import confusion_matrix

svc_ovo.fit(X[session < 10], y[session < 10])
y_pred_ovo = svc_ovo.predict(X[session >= 10])

plt.matshow(confusion_matrix(y_pred_ovo, y[session >= 10]))
plt.title('Confusion matrix: One vs One')
plt.xticks(np.arange(len(unique_conditions)), unique_conditions)
plt.yticks(np.arange(len(unique_conditions)), unique_conditions)

svc_ova.fit(X[session < 10], y[session < 10])
y_pred_ova = svc_ova.predict(X[session >= 10])

plt.matshow(confusion_matrix(y_pred_ova, y[session >= 10]))
plt.title('Confusion matrix: One vs All')
plt.xticks(np.arange(len(unique_conditions)), unique_conditions)
plt.yticks(np.arange(len(unique_conditions)), unique_conditions)

plt.show()
# Fill missing values
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
vec_f_cat_train = imp.fit_transform(vec_f_cat_train)
vec_f_cat_test = imp.transform(vec_f_cat_test)

# Complete x: stack the numeric and vectorized categorical features
f_train = np.hstack((f_num_train, vec_f_cat_train))
f_test = np.hstack((f_num_test, vec_f_cat_test))

clf = OneVsOneClassifier(LinearSVC(random_state=0))
clf.fit(f_train, t_train)
y_test = clf.predict(f_test)
y_train = clf.predict(f_train)
y_test = le.inverse_transform(y_test)

ofile = open(data_dir + 'output.csv', 'w')
writer = csv.writer(ofile)
writer.writerow(['ISIN', 'Risk_Stripe'])
for i in range(len(x_test)):
    writer.writerow([x_test[i], y_test[i]])
ofile.close()
class OneVSOneSVM():
    """One vs One ensemble classifier with SVC as its core."""

    def __init__(self):
        """Initialization with the following attributes:

        data: data matrix
        score: validation accuracy
        pred_y: prediction
        """
        # self.data = np.genfromtxt("../data/processed_data.csv", delimiter=',')
        self.data = np.genfromtxt("../data/processed_data_window.csv",
                                  delimiter=',')
        # self.data = np.genfromtxt("../data/processed_windowdata.csv", delimiter=',')
        X = self.data[:, :-1]
        Y = self.data[:, -1]
        self.train_x, self.test_x, self.train_y, self.test_y = \
            train_test_split(X, Y, test_size=0.33, random_state=42)
        self.clf = OneVsOneClassifier(SVC(kernel='linear'))
        self.score = 0  # was SCORE; __str__ and run() expect self.score
        self.pred_y = 0

    def __str__(self):
        return "the score for OneVSOneSVM is: " + str(self.score)

    def draw_confusion(self):
        """Print the confusion matrix: the x-axis is the prediction,
        the y-axis is the ground truth."""
        # label = np.arange(11, -1, -1)
        label = np.arange(12)
        cnf_matrix = confusion_matrix(self.test_y, self.pred_y, labels=label)
        np.set_printoptions(precision=2)
        class_names = [
            "walking-forward", "walking-left", "walking-right",
            "walking-upstairs", "walking-downstairs", "running", "jumping",
            "sitting", "standing", "sleeping", "elevator-up", "elevator-down"
        ]
        # print(cnf_matrix)
        self.plot_confusion_matrix(
            cnf_matrix, class_names,
            title='Confusion matrix, without normalization')

    def plot_confusion_matrix(self, cm, classes, normalize=False,
                              title='Confusion matrix', cmap=plt.cm.Blues):
        """Print and plot the confusion matrix.

        Normalization can be applied by setting `normalize=True`.
        """
        np.set_printoptions(precision=2)
        plt.figure(figsize=(8, 8))
        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            print("Normalized confusion matrix")
        else:
            print('Confusion matrix, without normalization')
        print(cm)

        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, np.round(cm[i, j], 2),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

        im = plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        # plt.colorbar()
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=90)
        plt.yticks(tick_marks, classes)
        plt.colorbar(im, fraction=0.046, pad=0.04)
        plt.tight_layout()
        plt.ylabel('True label')
        plt.xlabel('Predicted label')

    def run(self):
        """Fit the model, print the confusion matrix and the score."""
        self.clf.fit(self.train_x, self.train_y)
        self.score = self.clf.score(self.test_x, self.test_y)
        self.pred_y = self.clf.predict(self.test_x)
        self.draw_confusion()
        print("the score for OneVSOneSVM is: " + str(self.score))
class log_kernel_MOM(BaseEstimator):
    '''Logistic Regression Kernel MOM

    Kernel logistic regression MOM risk minimization using IRLS with
    L2 regularization.

    Parameters
    ----------
    K : int, default 10
        Number of blocks for the computation of the MOM. A big value of K
        deals with more outliers, but small values of K are better for
        performance when there are no outliers.

    eta0 : float, default 1
        Step-size parameter; the step size at the i-th iteration is defined
        as 1/(1+eta0*i).

    beta : float, default 1
        L2 regularization parameter.

    epoch : int, default 200
        Number of iterations before the end of the algorithm.

    kernel : {'rbf', 'poly', callable function}, default 'rbf'
        Kernel used in the algorithm. A callable function can be given; it
        should take as input two matrices X1, X2 and return the pairwise
        kernel distance matrix.

    gamma : float, default 1/n_features
        Coefficient used if the kernel is 'rbf', in which case the kernel
        function is exp(-gamma*x^2).

    degree : int, default 3
        Degree of the polynomial if the kernel is 'poly'.

    agg : int, default 1
        Number of runs of the algorithm over which we aggregate. One might
        want to decrease this number if the complexity is a problem.

    verbose : boolean, default True
        Display a message at the end of each run if agg > 1.

    progress : boolean, default False
        Display a progress bar to monitor the algorithm on each run
        (agg > 1 means several progress bars).

    compter : boolean, default False
        Used for outlier detection; if compter=True, the number of times
        each point is used in the algorithm will be recorded in the
        attribute "counts".

    multi : {'ovr', 'ovo'}, default 'ovr'
        Method used to go from binary classification to multiclass
        classification. 'ovr' means "one vs the rest" and 'ovo' means
        "one vs one".

    Attributes
    ----------
    alpha : array-like, length = n_sample
        alpha is updated in the algorithm; it provides the final
        coefficients of the decision function.

    counts : array-like, length = n_sample
        The i-th element records the number of times the i-th element of the
        training dataset X has been used. Only if compter=True.

    Methods
    -------
    fit(X, y) : fit the model
        X : numpy matrix, size = (n_samples, n_features)
        y : array-like, length = n_samples

    predict(X) : predict the class of the points in X
        X : numpy matrix, size = (n_samples, n_features)
        Returns array-like, length = n_samples.

    predict_proba(X) : predict the probability that each point belongs to
        each class.
        X : numpy matrix, size = (n_samples, n_features)
        Returns matrix, size = (n_samples, n_class).
    '''

    def __init__(self, K=10, eta0=1, beta=1, epoch=200, kernel='rbf',
                 gamma=None, degree=3, agg=1, verbose=True, progress=False,
                 compter=False, multi='ovr', augmenter=1, power=2 / 3):
        args, _, _, values = inspect.getargvalues(inspect.currentframe())
        values.pop("self")
        for arg, val in values.items():
            setattr(self, arg, val)
        binary_clf = log_kernel_MOM_binary(K, eta0, beta, epoch, gamma,
                                           degree, agg, verbose, progress,
                                           compter, power)
        if multi == "ovr":
            self.clf = OneVsRestClassifier(binary_clf)
        elif multi == "ovo":
            self.clf = OneVsOneClassifier(binary_clf)
        else:
            raise NameError('Multiclass meta-algorithm not known')

    def fit(self, X, y):
        self.X = X
        perm = np.array([])
        if self.kernel == 'poly':
            kfunc = lambda x, y: polynomial_kernel(
                x, y, degree=self.degree, gamma=self.gamma)
        elif self.kernel == 'rbf':
            kfunc = lambda x, y: rbf_kernel(x, y, self.gamma)
        else:
            kfunc = self.kernel
        Kernel = kfunc(np.array(X), np.array(X))
        for f in range(self.augmenter):
            perm = np.hstack([perm, np.random.permutation(len(X))])
        self.perm = perm.astype(np.int64)
        self.clf.fit(Kernel[self.perm][:, self.perm], y[self.perm])
        return self

    def predict(self, xtest):
        if self.kernel == 'poly':
            kfunc = lambda x, y: polynomial_kernel(
                x, y, degree=self.degree, gamma=self.gamma)
        elif self.kernel == 'rbf':
            kfunc = lambda x, y: rbf_kernel(x, y, self.gamma)
        else:
            kfunc = self.kernel
        KC = kfunc(xtest, self.X[self.perm])
        return self.clf.predict(KC)

    def predict_proba(self, xtest):
        # `self` was missing from this signature in the original, which made
        # the method unusable
        if self.kernel == 'poly':
            kfunc = lambda x, y: polynomial_kernel(
                x, y, degree=self.degree, gamma=self.gamma)
        elif self.kernel == 'rbf':
            kfunc = lambda x, y: rbf_kernel(x, y, self.gamma)
        else:
            kfunc = self.kernel
        KC = kfunc(xtest, self.X[self.perm])
        return self.clf.predict_proba(KC)

    def score(self, X, y):
        return np.mean(self.predict(X) == y)

    def set_params(self, **params):
        self.__init__(**params)
        return self
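# The ovr/ovo dispatch above is generic; a minimal sketch of the same pattern
# with a stock binary estimator (LogisticRegression stands in for
# log_kernel_MOM_binary purely for illustration):
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier

def make_multiclass(binary_clf, multi='ovr'):
    if multi == 'ovr':
        return OneVsRestClassifier(binary_clf)
    elif multi == 'ovo':
        return OneVsOneClassifier(binary_clf)
    raise NameError('Multiclass meta-algorithm not known')

clf_demo = make_multiclass(LogisticRegression(max_iter=1000), multi='ovo')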
class RVC(BaseRVM, ClassifierMixin):
    """Relevance Vector Machine Classification.

    Implementation of Mike Tipping's Relevance Vector Machine for
    classification using the scikit-learn API.
    """

    def __init__(self, n_iter_posterior=50, **kwargs):
        """Copy params to object properties, no validation."""
        self.n_iter_posterior = n_iter_posterior
        super(RVC, self).__init__(**kwargs)

    def get_params(self, deep=True):
        """Return parameters as a dictionary."""
        params = super(RVC, self).get_params(deep=deep)
        params['n_iter_posterior'] = self.n_iter_posterior
        return params

    def _classify(self, m, phi):
        return expit(np.dot(phi, m))

    def _log_posterior(self, m, alpha, phi, t):
        y = self._classify(m, phi)
        # Sum the two class log-likelihood terms separately: the masks
        # t == 1 and t == 0 select arrays of different lengths, so adding
        # them elementwise (as the original did) would not broadcast.
        log_p = -1 * (np.sum(np.log(y[t == 1]), 0) +
                      np.sum(np.log(1 - y[t == 0]), 0))
        log_p = log_p + 0.5 * np.dot(m.T, np.dot(np.diag(alpha), m))
        jacobian = np.dot(np.diag(alpha), m) - np.dot(phi.T, (t - y))
        return log_p, jacobian

    def _hessian(self, m, alpha, phi, t):
        y = self._classify(m, phi)
        B = np.diag(y * (1 - y))
        return np.diag(alpha) + np.dot(phi.T, np.dot(B, phi))

    def _posterior(self):
        result = minimize(
            fun=self._log_posterior,
            hess=self._hessian,
            x0=self.m_,
            args=(self.alpha_, self.phi, self.t),
            method='Newton-CG',
            jac=True,
            options={'maxiter': self.n_iter_posterior}
        )
        self.m_ = result.x
        self.sigma_ = np.linalg.inv(
            self._hessian(self.m_, self.alpha_, self.phi, self.t))

    def fit(self, X, y):
        """Check target values and fit model."""
        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)
        if n_classes < 2:
            raise ValueError("Need 2 or more classes.")
        elif n_classes == 2:
            # Binary problem: encode the targets as 0/1 and fit directly.
            self.t = np.zeros(y.shape)
            self.t[y == self.classes_[1]] = 1
            return super(RVC, self).fit(X, self.t)
        else:
            # Multiclass problem: delegate to a one-vs-one wrapper that
            # clones this estimator for each pair of classes.
            self.multi_ = OneVsOneClassifier(self)
            self.multi_.fit(X, y)
            return self

    def predict_proba(self, X):
        """Return an array of class probabilities."""
        phi = self._apply_kernel(X, self.relevance_)
        y = self._classify(self.m_, phi)
        return np.column_stack((1 - y, y))

    def predict(self, X):
        """Return an array of classes for each input."""
        if len(self.classes_) == 2:
            y = self.predict_proba(X)
            res = np.empty(y.shape[0], dtype=self.classes_.dtype)
            res[y[:, 1] <= 0.5] = self.classes_[0]
            res[y[:, 1] > 0.5] = self.classes_[1]
            return res
        else:
            return self.multi_.predict(X)
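A minimal usage sketch for the RVC class above, assuming BaseRVM supplies the kernel machinery with working defaults; the dataset is illustrative. On a 3-class problem, fit() silently routes through OneVsOneClassifier, so three pairwise RVCs are trained.

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)

rvc = RVC()                          # class defined above
rvc.fit(X_tr, y_tr)                  # 3 classes -> fits a OneVsOneClassifier
print(len(rvc.multi_.estimators_))   # 3 * (3 - 1) / 2 = 3 pairwise models
print(rvc.score(X_te, y_te))         # score() comes from ClassifierMixin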
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
import numpy as np
import datasets

if not datasets.Quizbowl.loaded:
    datasets.loadQuizbowl()

print '\n\nRUNNING ON EASY DATA\n'

# "oaa" = one-against-all, i.e. OneVsRestClassifier;
# "ava" = all-versus-all, i.e. OneVsOneClassifier.
# (The original had the two wrappers swapped relative to their names.)
print 'training oaa'
X = datasets.QuizbowlSmall.X
Y = datasets.QuizbowlSmall.Y
oaa = OneVsRestClassifier(LinearSVC(random_state=0)).fit(X, Y)
print 'predicting oaa'
oaaDevPred = oaa.predict(datasets.QuizbowlSmall.Xde)
print 'error = %g' % np.mean(oaaDevPred != datasets.QuizbowlSmall.Yde)

print 'training ava'
ava = OneVsOneClassifier(LinearSVC(random_state=0)).fit(X, Y)
print 'predicting ava'
avaDevPred = ava.predict(datasets.QuizbowlSmall.Xde)
print 'error = %g' % np.mean(avaDevPred != datasets.QuizbowlSmall.Yde)

print '\n\nRUNNING ON HARD DATA\n'

print 'training oaa'
X = datasets.QuizbowlHardSmall.X
Y = datasets.QuizbowlHardSmall.Y
oaa = OneVsRestClassifier(LinearSVC(random_state=0)).fit(X, Y)
print 'predicting oaa'
train5_fea = list()   # the original used this list without initializing it
train_lab, test_lab = list(), list()
mar = list()
ps = list()
pw = list()
ax = ['spheroidite', 'network', 'pearlite', 'spheroidite+widmanstatten']
for a in ax:
    # 'spheroidite+widmanstatten' only contributes 60 training images;
    # the other classes contribute 100 each.
    if a == 'spheroidite+widmanstatten':
        for d in range(0, 60):
            train5_fea.append(data_features[a][d][5])
            train_lab.append(a)
    else:
        for d in range(0, 100):
            train5_fea.append(data_features[a][d][5])
            train_lab.append(a)
train5_fea = np.array(train5_fea).reshape(len(train5_fea), 512)

# Fit a one-vs-one SVM on the layer-5 features, then use it to label the
# held-out mixed-phase micrographs.
lay5_ovo = OneVsOneClassifier(SVC()).fit(train5_fea, train_lab)

for d in range(0, 36):
    mar.append(data_features_rest['martensite'][d][5])
mar = np.array(mar).reshape(len(mar), 512)
mar_lab = lay5_ovo.predict(mar)

for d in range(0, 107):
    ps.append(data_features_rest['pearlite+spheroidite'][d][5])
ps = np.array(ps).reshape(len(ps), 512)
ps_lab = lay5_ovo.predict(ps)

for d in range(0, 27):
    pw.append(data_features_rest['pearlite+widmanstatten'][d][5])
pw = np.array(pw).reshape(len(pw), 512)
pw_lab = lay5_ovo.predict(pw)
    score.append(multiclassifier.score(X_test, Y_test))
    # svc = svm.SVC(kernel='linear', C=C).fit(X_train, Y_train)
    # rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C).fit(X_train, Y_train)
    # poly_svc = svm.SVC(kernel='poly', degree=3, C=C).fit(X_train, Y_train)

# print score
# plt.figure("Score-order")
# plt.plot(slips, score)
# plt.show()

# Use validation data to test the model.
path = "./"
label_dict = {"test": 0}
X_test, Y_test = allSamples(path, "test", label_dict, order, window, slip)
# print X_test, Y_test
print multiclassifier.predict(X_test)
print multiclassifier.score(X_test, Y_test)

# X_test = dim_reduction_PCA(X_test, 0.99)
# plot_data(X_test, Y_test, "PLOT", mirror=1)

# y = pandas.Series(x)
# plt.figure()
# lag_plot(y, marker='+', color='gray')
# plt.show()
# autor = estimated_autocorrelation(x)
# autor = autor[autor.size/2:]
#
# (digits in the comments below were garbled by a stray find-and-replace
#  of "4" with "order"; reconstructed from the matching datetime)
# timestamp = 1454397880668
# date = datetime.datetime.fromtimestamp(timestamp/1e3)
# datetime.datetime(2016, 2, 2, 8, 24, 40, 668000)
X = X_encoded[:, :-1].astype(int)
y = X_encoded[:, -1].astype(int)

# Create SVM classifier
classifier = OneVsOneClassifier(LinearSVC(random_state=0))

# Train the classifier
classifier.fit(X, y)

# Cross validation: train_test_split and cross_val_score live in
# sklearn.model_selection in current scikit-learn (the snippet originally
# used the removed sklearn.cross_validation module)
from sklearn.model_selection import train_test_split, cross_val_score

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=5)
classifier = OneVsOneClassifier(LinearSVC(random_state=0))
classifier.fit(X_train, y_train)
y_test_pred = classifier.predict(X_test)

# Compute the F1 score of the SVM classifier
f1 = cross_val_score(classifier, X, y, scoring='f1_weighted', cv=3)
print("F1 score: " + str(round(100 * f1.mean(), 2)) + '%')

# Predict output for a test datapoint
input_data = [
    '37', 'Private', '215646', 'HS-grad', '9', 'Never-married',
    'Handlers-cleaners', 'Not-in-family', 'White', 'Male',
    '0', '0', '40', 'United-States'
]
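A hedged sketch of how such a raw datapoint could be pushed through the classifier. It assumes a list `label_encoder` holding one fitted LabelEncoder per categorical column (a helper not shown in the excerpt, so its name and shape are hypothetical here).

# Hypothetical continuation: numeric strings pass through as ints, string
# categories are mapped by the per-column encoders assumed above.
input_data_encoded = [-1] * len(input_data)
count = 0
for i, item in enumerate(input_data):
    if item.isdigit():
        input_data_encoded[i] = int(item)
    else:
        input_data_encoded[i] = int(label_encoder[count].transform([item])[0])
        count += 1

input_data_encoded = np.array(input_data_encoded).reshape(1, -1)
predicted_class = classifier.predict(input_data_encoded)
print(label_encoder[-1].inverse_transform(predicted_class)[0])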
trans = LinearDiscriminantAnalysis(n_components=3)
trans.fit(X, y)
X = trans.transform(X)

# Split up data
x_train, x_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, random_state=None)

# Train classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsOneClassifier
clf = OneVsOneClassifier(RandomForestClassifier(n_estimators=100, n_jobs=5))
clf.fit(x_train, y_train)

# Run predictions
from sklearn.metrics import confusion_matrix, accuracy_score
y_preds = clf.predict(x_valid)
print(confusion_matrix(y_valid, y_preds))
print("Accuracy: %f" % accuracy_score(y_valid, y_preds))

f = open('randomForest_take3.txt', 'w')
f.write(str(confusion_matrix(y_valid, y_preds)))
f.write("\nAccuracy: %f" % accuracy_score(y_valid, y_preds))
f.write("\nQuadraticDiscriminantAnalysis()")
f.write("\nclf = RandomForestClassifier(n_estimators=1000)")
f.close()

# Now on to final submission
x_final = testing.iloc[:, 1:].values
x_final = trans.transform(x_final)
# Wrap the predictions in a DataFrame. (The original overwrote y_final
# with an empty DataFrame on the next line, discarding the predictions.)
y_final = pd.DataFrame(clf.predict(x_final).reshape([62096, ]))
numbahs = testing['id']
df = pd.concat([numbahs, y_final], axis=1)
class EveryWordOneFeature(object):
    def __init__(self, slack=1, gamma=1, kernelType='linear', gram=1):
        self.gram = gram
        self.slack = slack
        self.gamma = gamma
        self.kernelType = kernelType
        self.data = np.ones((1000, 1000))
        self.cityClassifier = {}
        # TODO: Why use OneVsOne rather than OneVsRest? Wouldn't
        # OneVsRest be faster?
        self.countryClassifier = OneVsOneClassifier(
            svm.SVC(kernel=self.kernelType, C=self.slack, gamma=self.gamma,
                    probability=False, cache_size=1000))
        self.bag = None
        self.numberOfFeatures = 0
        # Features and labels
        self.fitting_data = None
        self.predict_data = None
        self.cityPrediction = {}
        self.countryPrediction = None
        self.numberOfCityFeatures = {}

    def fit_cities(self, trainingData, labels, countryCode):
        print "Start fitting cities for country " + str(countryCode)
        # TODO: Why use OneVsOne rather than OneVsRest? Wouldn't
        # OneVsRest be faster?
        self.cityClassifier[countryCode] = OneVsOneClassifier(
            svm.SVC(kernel=self.kernelType, C=self.slack, gamma=self.gamma,
                    probability=False))
        start = time.time()
        self.cityClassifier[countryCode].fit(
            trainingData[:, :self.get_number_of_city_features(countryCode)],
            labels)
        end = time.time()
        print "Finished fitting cities in " + str(end - start) + "s"

    def fit_countries(self):
        print "Start fitting countries"
        start = time.time()
        self.countryClassifier.fit(
            self.fitting_data[:, :self.numberOfFeatures],
            self.fitting_data[:, (self.numberOfFeatures + 1)])
        end = time.time()
        print "Finished fitting countries in " + str(end - start) + "s"

    def preprocess_training_data(self, data):
        startOfPreprocessing = time.time()
        print "Start preprocessing"
        lengthOfTrainingData = self.data.shape[0]
        print "length of trainingData = " + str(lengthOfTrainingData)
        self.bag = BagOfWords(data)
        self.fitting_data = self.bag.get_features_and_labels()
        self.numberOfFeatures = self.fitting_data.shape[1] - 2
        startOfFittingCities = time.time()
        print "Finished preprocessing in " + str(
            startOfFittingCities - startOfPreprocessing) + "s"

    def fit(self, data):
        self.data = data
        self.preprocess_training_data(data)
        self.numberOfFeatures = self.fitting_data.shape[1] - 2
        self.fit_countries()

    def predict_cities(self, data, countryCode):
        print "Start predicting cities"
        start = time.time()
        self.cityPrediction[countryCode] = self.cityClassifier[countryCode].predict(
            data[:, :self.get_number_of_city_features(countryCode)])
        end = time.time()
        print "Finished predicting cities in " + str(end - start) + "s"

    def predict_countries(self):
        start = time.time()
        print "Start predicting countries"
        self.countryPrediction = self.countryClassifier.predict(
            self.predict_data[:, :self.numberOfFeatures])
        end = time.time()
        print "Finished predicting countries in " + str(end - start) + "s"

    def preprocess_predict_data(self, predict):
        self.predict_data = self.bag.get_get_validation_features(predict)

    def get_city_features(self, data, countryCode):
        # Placeholder: city features are not extracted yet.
        return np.zeros((data.shape[0], 3))

    def predict(self, predict):
        self.preprocess_predict_data(predict)
        self.numberOfFeatures = self.predict_data.shape[1]
        self.predict_countries()
        joinedCityPredictions = np.zeros(predict.shape[0])
        # Fit one city classifier per country present in the training data.
        countryCodes = np.unique(self.data[:, 2].astype(int))
        for countryCode in countryCodes:
            countryIndices = np.where(
                self.data[:, 2].astype(int) == countryCode)[0]
            self.fit_cities(
                self.get_city_features(self.data[countryIndices][:, 0],
                                       countryCode),
                self.data[countryIndices][:, 1], countryCode)
        # Predict a city for every sample, routed by its predicted country.
        countryCodes = np.unique(self.countryPrediction)
        for countryCode in countryCodes:
            countryIndices = np.where(self.countryPrediction == countryCode)[0]
            self.predict_cities(
                self.get_city_features(predict[countryIndices], countryCode),
                countryCode)
            joinedCityPredictions[countryIndices] = self.cityPrediction[countryCode]
        prediction = np.vstack((joinedCityPredictions,
                                self.countryPrediction)).T
        return prediction

    def get_number_of_city_features(self, cityCode):
        return 3
ts = np.genfromtxt(ts_path, delimiter=' ')
tr_feat = tr[:, 1:]
ts_feat = ts[:, 1:]
tr_label = tr[:, 0]
ts_label = ts[:, 0]

# use sklearn C-Support Vector Classification

## == one-vs-one == ##
# The multiclass support is handled in a one-vs-one scheme

# train
ovo_clf = OneVsOneClassifier(LinearSVC())
ovo_clf.fit(tr_feat, tr_label)
# predict
ovo_pred = ovo_clf.predict(ts_feat)
ovo_err = 1 - ovo_clf.score(ts_feat, ts_label)

# confusion matrix, e.g.:
# array([[159,   7],
#        [  5, 161]])
ovo_cmat = metrics.confusion_matrix(ts_label, ovo_pred)
pred_total = np.sum(ovo_cmat, axis=1)
# per-class misclassification rate: 1 - correct/total for each row
ovo_mis = 1 - np.diag(ovo_cmat).astype(float) / pred_total

print("one vs. one svm - classification err: %s \n" % (ovo_err))
print("confusion matrix: \n %s" % (ovo_cmat))
print("class misclassification rate : \n %s" % (ovo_mis))

## == one-vs-rest == ##
# The multiclass support is handled in a one-vs-rest scheme

# train
    citation_data = citation_data.loc[citation_data['paper_year'] >= 1996]
    # Shuffle the rows; the result must be assigned back (the original
    # called .iloc without assignment, which discards the permutation).
    citation_data = citation_data.iloc[np.random.permutation(len(citation_data))]
    X = np.array(citation_data[features].values)
    # X = preprocessing.scale(X)
    y = citation_data["paper_cat"].values.tolist()
    return X, y

test_X, test_y = Build_Data_Set()

"""
classifier_f = open("./workspace/rbfsvr.pickle", "rb")
clf = pickle.load(classifier_f)
classifier_f.close()
"""

print("Prediction started")
prediction = clf.predict(test_X)

from sklearn.metrics import confusion_matrix
confusion_matrix(test_y, prediction)

pearson = np.corrcoef(prediction, test_y)
spearman = spearmanr(prediction, test_y)

print("Results : rbf Kernel ")
print(pearson)
print(spearman)
"""
x_train, x_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, random_state=None)

# Train classifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
clf = OneVsOneClassifier(GradientBoostingClassifier(n_estimators=50,
                                                    verbose=100))
clf.fit(x_train, y_train)

# Run predictions
from sklearn.metrics import confusion_matrix, accuracy_score
y_preds = clf.predict(x_valid)
print(confusion_matrix(y_valid, y_preds))
print("Accuracy: %f" % accuracy_score(y_valid, y_preds))

f = open('OneVsOne_gradientBoost_take2.txt', 'w')
f.write(str(confusion_matrix(y_valid, y_preds)))
f.write("\nAccuracy: %f" % accuracy_score(y_valid, y_preds))
f.write("\nOneVsOneClassifier(...)")
f.write("\nclf = GradientBoostingClassifier(n_estimators=50,verbose=100)")
f.close()

# Now on to final submission
X_test = testing.iloc[:, 1:]
y_final = pd.DataFrame(clf.predict(X_test).reshape([62096, ]))
numbahs = testing['id']
df = pd.concat([numbahs, y_final], axis=1)
df.columns = ['id', 'country']
# compare the AUC values of the two models
roc_auc_score(y_train_5, y_scores)
roc_auc_score(y_train_5, y_scores_forest)

## Multiclass classifiers
# Random forests and naive Bayes handle multiclass problems natively,
# while SVMs and linear classifiers are strictly binary classifiers.
# When a binary classifier is used for a multiclass task, scikit-learn
# automatically falls back to OvA (one-vs-rest) mode;
# OvA or OvO can also be forced explicitly.
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier

# OvO + SGDClassifier
ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=42))
ovo_clf.fit(X_train, y_train)
ovo_clf.predict([one_digit])

# Use RandomForest directly
forest_clf.fit(X_train, y_train)
forest_clf.predict([one_digit])
forest_clf.predict_proba([one_digit])  # per-class probabilities for the sample

# Evaluate the multiclass classifier with cross-validation
# (same as for a binary classifier)
cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")

# Cross-validate again after feature scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaler = scaler.fit_transform(X_train.astype(np.float64))
cross_val_score(sgd_clf, X_train_scaler, y_train, cv=3, scoring="accuracy")

# Error analysis
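A small sketch of why OvO is the heavier reduction in the MNIST-style setup above (assuming the fitted ovo_clf from that snippet): OneVsOneClassifier trains one binary model per pair of classes, so its estimators_ list grows quadratically with the number of classes.

# With 10 digit classes, OvO trains 10 * 9 / 2 = 45 pairwise SGD models,
# while an OvR wrapper would train only 10.
n = len(ovo_clf.classes_)
print(len(ovo_clf.estimators_))   # 45
print(n * (n - 1) // 2)           # same value, computed from the class count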
    train_ingredients.append(' '.join(ings))

# construct test_ingredients
lemmatizer = WordNetLemmatizer()   # reuse one lemmatizer rather than
                                   # constructing a new one per word
for entry in test_set:
    ings = [lemmatizer.lemmatize(re.sub('[^A-Za-z]', ' ', w))
            for w in entry['ingredients']]
    test_ingredients.append(' '.join(ings))

# used to encode labels as numbers for use with the classifier
le = LabelEncoder()
# encode cuisines as numbers
train_cuisines = le.fit_transform(train_cuisines)

# used to create the bag-of-ingredients vocabulary and build features
# for each entry
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform(train_ingredients).toarray()
test_features = vectorizer.transform(test_ingredients).toarray()

clf = OneVsOneClassifier(LinearSVC(random_state=0)).fit(train_features,
                                                        train_cuisines)
result = clf.predict(test_features)

output = pd.DataFrame(data={'id': test_ids,
                            'cuisine': le.inverse_transform(result)})
# force explicit ordering of columns
output = output[['id', 'cuisine']]
output.to_csv('ovo.csv', index=False)
class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
    """Gaussian process classification (GPC) based on Laplace approximation.

    The implementation is based on Algorithm 3.1, 3.2, and 5.1 of
    Gaussian Processes for Machine Learning (GPML) by Rasmussen and
    Williams.

    Internally, the Laplace approximation is used for approximating the
    non-Gaussian posterior by a Gaussian.

    Currently, the implementation is restricted to using the logistic link
    function. For multi-class classification, several binary one-versus-rest
    classifiers are fitted. Note that this class thus does not implement
    a true multi-class Laplace approximation.

    Parameters
    ----------
    kernel : kernel object
        The kernel specifying the covariance function of the GP. If None is
        passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
        the kernel's hyperparameters are optimized during fitting.

    optimizer : string or callable, optional (default: "fmin_l_bfgs_b")
        Can either be one of the internally supported optimizers for
        optimizing the kernel's parameters, specified by a string, or an
        externally defined optimizer passed as a callable. If a callable is
        passed, it must have the signature::

            def optimizer(obj_func, initial_theta, bounds):
                # * 'obj_func' is the objective function to be maximized,
                #   which takes the hyperparameters theta as parameter and
                #   an optional flag eval_gradient, which determines if the
                #   gradient is returned additionally to the function value
                # * 'initial_theta': the initial value for theta, which can
                #   be used by local optimizers
                # * 'bounds': the bounds on the values of theta
                ....
                # Returned are the best found hyperparameters theta and
                # the corresponding value of the target function.
                return theta_opt, func_min

        By default, the 'fmin_l_bfgs_b' algorithm from scipy.optimize
        is used. If None is passed, the kernel's parameters are kept fixed.
        Available internal optimizers are::

            'fmin_l_bfgs_b'

    n_restarts_optimizer : int, optional (default: 0)
        The number of restarts of the optimizer for finding the kernel's
        parameters which maximize the log-marginal likelihood. The first run
        of the optimizer is performed from the kernel's initial parameters,
        the remaining ones (if any) from thetas sampled log-uniform randomly
        from the space of allowed theta-values. If greater than 0, all bounds
        must be finite. Note that n_restarts_optimizer=0 implies that one
        run is performed.

    max_iter_predict : int, optional (default: 100)
        The maximum number of iterations in Newton's method for approximating
        the posterior during predict. Smaller values will reduce computation
        time at the cost of worse results.

    warm_start : bool, optional (default: False)
        If warm-starts are enabled, the solution of the last Newton iteration
        on the Laplace approximation of the posterior mode is used as
        initialization for the next call of _posterior_mode(). This can speed
        up convergence when _posterior_mode is called several times on
        similar problems as in hyperparameter optimization.

    copy_X_train : bool, optional (default: True)
        If True, a persistent copy of the training data is stored in the
        object. Otherwise, just a reference to the training data is stored,
        which might cause predictions to change if the data is modified
        externally.

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    multi_class : string, default: "one_vs_rest"
        Specifies how multi-class classification problems are handled.
        Supported are "one_vs_rest" and "one_vs_one". In "one_vs_rest",
        one binary Gaussian process classifier is fitted for each class,
        which is trained to separate this class from the rest. In
        "one_vs_one", one binary Gaussian process classifier is fitted for
        each pair of classes, which is trained to separate these two
        classes. The predictions of these binary predictors are combined
        into multi-class predictions. Note that "one_vs_one" does not
        support predicting probability estimates.

    n_jobs : int, optional, default: 1
        The number of jobs to use for the computation. If -1 all CPUs are
        used. If 1 is given, no parallel computing code is used at all,
        which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but
        one are used.

    Attributes
    ----------
    kernel_ : kernel object
        The kernel used for prediction. In case of binary classification,
        the structure of the kernel is the same as the one passed as
        parameter but with optimized hyperparameters. In case of multi-class
        classification, a CompoundKernel is returned which consists of the
        different kernels used in the one-versus-rest classifiers.

    log_marginal_likelihood_value_ : float
        The log-marginal-likelihood of ``self.kernel_.theta``

    classes_ : array-like, shape = (n_classes,)
        Unique class labels.

    n_classes_ : int
        The number of classes in the training data

    .. versionadded:: 0.18
    """

    def __init__(self, kernel=None, optimizer="fmin_l_bfgs_b",
                 n_restarts_optimizer=0, max_iter_predict=100,
                 warm_start=False, copy_X_train=True, random_state=None,
                 multi_class="one_vs_rest", n_jobs=1):
        self.kernel = kernel
        self.optimizer = optimizer
        self.n_restarts_optimizer = n_restarts_optimizer
        self.max_iter_predict = max_iter_predict
        self.warm_start = warm_start
        self.copy_X_train = copy_X_train
        self.random_state = random_state
        self.multi_class = multi_class
        self.n_jobs = n_jobs

    def fit(self, X, y):
        """Fit Gaussian process classification model

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training data

        y : array-like, shape = (n_samples,)
            Target values, must be binary

        Returns
        -------
        self : returns an instance of self.
        """
        X, y = check_X_y(X, y, multi_output=False)

        self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace(
            self.kernel, self.optimizer, self.n_restarts_optimizer,
            self.max_iter_predict, self.warm_start, self.copy_X_train,
            self.random_state)

        self.classes_ = np.unique(y)
        self.n_classes_ = self.classes_.size
        if self.n_classes_ == 1:
            raise ValueError("GaussianProcessClassifier requires 2 or more "
                             "distinct classes. Only class %s present."
                             % self.classes_[0])
        if self.n_classes_ > 2:
            if self.multi_class == "one_vs_rest":
                self.base_estimator_ = \
                    OneVsRestClassifier(self.base_estimator_,
                                        n_jobs=self.n_jobs)
            elif self.multi_class == "one_vs_one":
                self.base_estimator_ = \
                    OneVsOneClassifier(self.base_estimator_,
                                       n_jobs=self.n_jobs)
            else:
                raise ValueError("Unknown multi-class mode %s"
                                 % self.multi_class)

        self.base_estimator_.fit(X, y)

        if self.n_classes_ > 2:
            self.log_marginal_likelihood_value_ = np.mean(
                [estimator.log_marginal_likelihood()
                 for estimator in self.base_estimator_.estimators_])
        else:
            self.log_marginal_likelihood_value_ = \
                self.base_estimator_.log_marginal_likelihood()

        return self

    def predict(self, X):
        """Perform classification on an array of test vectors X.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        Returns
        -------
        C : array, shape = (n_samples,)
            Predicted target values for X, values are from ``classes_``
        """
        check_is_fitted(self, ["classes_", "n_classes_"])
        X = check_array(X)
        return self.base_estimator_.predict(X)

    def predict_proba(self, X):
        """Return probability estimates for the test vector X.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        Returns
        -------
        C : array-like, shape = (n_samples, n_classes)
            Returns the probability of the samples for each class in
            the model. The columns correspond to the classes in sorted
            order, as they appear in the attribute `classes_`.
        """
        check_is_fitted(self, ["classes_", "n_classes_"])
        if self.n_classes_ > 2 and self.multi_class == "one_vs_one":
            raise ValueError("one_vs_one multi-class mode does not support "
                             "predicting probability estimates. Use "
                             "one_vs_rest mode instead.")
        X = check_array(X)
        return self.base_estimator_.predict_proba(X)

    @property
    def kernel_(self):
        if self.n_classes_ == 2:
            return self.base_estimator_.kernel_
        else:
            return CompoundKernel(
                [estimator.kernel_
                 for estimator in self.base_estimator_.estimators_])

    def log_marginal_likelihood(self, theta=None, eval_gradient=False):
        """Returns log-marginal likelihood of theta for training data.

        In the case of multi-class classification, the mean log-marginal
        likelihood of the one-versus-rest classifiers is returned.

        Parameters
        ----------
        theta : array-like, shape = (n_kernel_params,) or none
            Kernel hyperparameters for which the log-marginal likelihood is
            evaluated. In the case of multi-class classification, theta may
            be the hyperparameters of the compound kernel or of an
            individual kernel. In the latter case, all individual kernels
            get assigned the same theta values. If None, the precomputed
            log_marginal_likelihood of ``self.kernel_.theta`` is returned.

        eval_gradient : bool, default: False
            If True, the gradient of the log-marginal likelihood with
            respect to the kernel hyperparameters at position theta is
            returned additionally. Note that gradient computation is not
            supported for non-binary classification. If True, theta must
            not be None.

        Returns
        -------
        log_likelihood : float
            Log-marginal likelihood of theta for training data.

        log_likelihood_gradient : array, shape = (n_kernel_params,), optional
            Gradient of the log-marginal likelihood with respect to the
            kernel hyperparameters at position theta. Only returned when
            eval_gradient is True.
        """
        check_is_fitted(self, ["classes_", "n_classes_"])

        if theta is None:
            if eval_gradient:
                raise ValueError(
                    "Gradient can only be evaluated for theta!=None")
            return self.log_marginal_likelihood_value_

        theta = np.asarray(theta)
        if self.n_classes_ == 2:
            return self.base_estimator_.log_marginal_likelihood(
                theta, eval_gradient)
        else:
            if eval_gradient:
                raise NotImplementedError(
                    "Gradient of log-marginal-likelihood not implemented for "
                    "multi-class GPC.")
            estimators = self.base_estimator_.estimators_
            n_dims = estimators[0].kernel_.n_dims
            if theta.shape[0] == n_dims:
                # use same theta for all sub-kernels
                return np.mean(
                    [estimator.log_marginal_likelihood(theta)
                     for estimator in estimators])
            elif theta.shape[0] == n_dims * self.classes_.shape[0]:
                # theta for compound kernel
                return np.mean(
                    [estimator.log_marginal_likelihood(
                        theta[n_dims * i:n_dims * (i + 1)])
                     for i, estimator in enumerate(estimators)])
            else:
                raise ValueError("Shape of theta must be either %d or %d. "
                                 "Obtained theta with shape %d."
                                 % (n_dims, n_dims * self.classes_.shape[0],
                                    theta.shape[0]))
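A minimal usage sketch for the class above; the dataset and kernel choice are illustrative. With more than two classes and multi_class="one_vs_one", fit() wraps the binary Laplace-approximation GPC in a OneVsOneClassifier, and predict_proba is then unavailable by design.

from sklearn.datasets import load_iris
from sklearn.gaussian_process.kernels import RBF

X, y = load_iris(return_X_y=True)

gpc = GaussianProcessClassifier(kernel=1.0 * RBF(1.0),
                                multi_class="one_vs_one")
gpc.fit(X, y)                 # 3 classes -> OneVsOneClassifier inside
print(gpc.predict(X[:5]))
# gpc.predict_proba(X[:5])    # would raise ValueError in one_vs_one mode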
""" Created on Sun Jun 4 09:20:28 2017 @author: 凯风 """ from sklearn.datasets import load_iris from sklearn.multiclass import OneVsOneClassifier from sklearn.svm import LinearSVR from sklearn.model_selection import train_test_split iris_data = load_iris() X,Y = iris_data.data,iris_data.target trainX,testX,trainY,testY = train_test_split(X,Y,test_size=0.3) ''' One vs One One vs One 的复杂度相对于rest会高些 一对多的策略,如果用在多类里,其实没什么卵用,因为sk里面的现有分类器多数都实现了对多类的处理 ''' clf = LinearSVR(random_state=0) ovrc = OneVsOneClassifier(clf, n_jobs=1) ovrc.fit(trainX,trainY) ovrc.predict(testX) ''' estimator 评估器 n_jobs CPU的作业数量 '''
gamma_arr = [10**exp for exp in range(-8, 8)]

best_train = 0.0
best_test = 0.0
best_train_c = None
best_test_c = None
best_train_gamma = None
best_test_gamma = None

for c, gamma in itertools.product(c_arr, gamma_arr):
    if not args.hyper:
        # no hyperparameter search: fall back to fixed values
        gamma = 1
        c = 5000
    if args.hyper:
        print("Testing hyperparameters C=" + str(c) + ", gamma=" + str(gamma))

    # note: despite the variable name, this is an RBF-kernel SVC wrapped
    # in a one-vs-one reduction
    svm_model_linear = OneVsOneClassifier(SVC(max_iter=-1, C=c, gamma=gamma,
                                              kernel='rbf',
                                              class_weight='balanced',
                                              cache_size=1000))
    svm_model_linear.fit(X_train, y_train)               # learn
    svm_predictions = svm_model_linear.predict(X_test)   # predict

    # model accuracy on the training and test sets
    accuracy_train = svm_model_linear.score(X_train, y_train)
    if accuracy_train > best_train:
        best_train = accuracy_train
        best_train_c = c
        best_train_gamma = gamma
    accuracy_test = svm_model_linear.score(X_test, y_test)
    if accuracy_test > best_test:
        best_test = accuracy_test
        best_test_c = c
        best_test_gamma = gamma

    print("Training accuracy:", accuracy_train)
    print("Test accuracy:", accuracy_test)
    if not args.hyper:
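The hand-rolled grid loop above can be replaced by scikit-learn's built-in search; a minimal sketch, assuming the same X_train/y_train and an illustrative C grid (the original's c_arr is not shown). The double underscore routes parameters through the OneVsOne wrapper to the inner SVC.

from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import SVC

# Cross-validated alternative to the manual loop; selects on CV folds
# of the training set rather than on the test set.
param_grid = {
    'estimator__C': [10**e for e in range(-2, 5)],        # assumed grid
    'estimator__gamma': [10**e for e in range(-8, 8)],
}
search = GridSearchCV(
    OneVsOneClassifier(SVC(kernel='rbf', class_weight='balanced')),
    param_grid, cv=3)
search.fit(X_train, y_train)
print(search.best_params_, search.best_score_)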
x1_test_array = np.array(X_test.iloc[:, 0])
x2_test_array = np.array(X_test.iloc[:, 1])
y_test_array = np.array(y_test)

## TRAINING
print("\n===== TRAINING =====")
SVM_ovr = OneVsRestClassifier(
    LinearSVC(max_iter=200000, random_state=RANDOM_STATE))
SVM_ovr.fit(X_train, y_train)

SVM_ovo = OneVsOneClassifier(
    LinearSVC(max_iter=200000, random_state=RANDOM_STATE))
SVM_ovo.fit(X_train, y_train)

y_pred_ovr = SVM_ovr.predict(X_test)
y_pred_ovo = SVM_ovo.predict(X_test)
# print("PREDICTIONS: \n", y_pred_ovr)
# print("\nY_TEST:\n", y_test_array)

print("\nOVR Accuracy: {0:.3f}".format(
    np.sum(y_pred_ovr == y_test_array) / y_test.shape[0]))
print("OVO Accuracy: {0:.3f}".format(
    np.sum(y_pred_ovo == y_test_array) / y_test.shape[0]))

## Plot distributions
# Axis limits used per dataset (columns: dataset 1, 2, 3):
xmin = -3    # -2   -3   -3
xmax = 10    # 10   10   10
ymin = -30   # -25  -30  -30
ymax = 45    # 45   60   90
# test_rect = f20(subset='test', categories=cat[4:], shuffle=True, random_state=42)
vector_test = vectorizer.transform(test.data)
tfidf_test = vector_test.toarray()
tfidf_test_reduced = svd.transform(tfidf_test)

svm_test_data = tfidf_test_reduced
svm_test_tag = test.target
# for i in test.target:
#     if i < 4:
#         svm_test_tag.append(-1)
#     else:
#         svm_test_tag.append(1)

svc = SVC(kernel='linear', C=100)
svc_ovoc = OVOC(svc)
svc_ovoc.fit(svm_train_data, svm_train_tag)
svc_ovoc_predict = svc_ovoc.predict(svm_test_data)
# precision, recall, thresholds = precision_recall_curve(svm_test_tag, svc_ovoc_predict)
# BernoulliNB(alpha=1.0, binarize=0.5, class_prior=None, fit_prior=True)

score = svc_ovoc.score(svm_test_data, svm_test_tag)
precision = precision_score(svm_test_tag, svc_ovoc_predict, average='weighted')
recall = recall_score(svm_test_tag, svc_ovoc_predict, average='weighted')

print "1 VS 1 SVC"
print "confusion matrix:", "\n", confusion_matrix(svm_test_tag, svc_ovoc_predict)
print "score=", score
print "precision=", precision
print "recall=", recall
print '\n'

svc = SVC(kernel='rbf', C=100)
svc_ovrc = OVRC(svc)
svc_ovrc.fit(svm_train_data, svm_train_tag)