class ClassifierOvOAsFeatures:
    """
    Transformer that replaces raw features with classifier decision scores.

    A default-configured ``SGDClassifier`` is wrapped in a
    ``OneVsOneClassifier`` and trained on the target classes. The fitted
    model's decision function is then used as a (typically much smaller)
    feature set, e.g. to condense a bag-of-words representation.
    """

    def fit(self, X, y):
        """
        Train the one-vs-one classifier and keep it on ``self.classifier``.

        `X` is expected to be an array-like or a sparse matrix.
        `y` is expected to be an array-like containing the classes to learn.

        Returns `self` so that calls can be chained.
        """
        one_vs_one = OneVsOneClassifier(SGDClassifier(), n_jobs=-1)
        targets = numpy.array(y)
        self.classifier = one_vs_one.fit(X, targets)
        return self

    def transform(self, X, y=None):
        """
        Return the fitted classifier's decision function evaluated on `X`.

        `X` is expected to be an array-like or a sparse matrix.
        `y` is accepted for API compatibility and ignored.

        NOTE(review): the number of output columns is whatever the wrapped
        classifier's ``decision_function`` yields. An earlier version of
        this docstring stated n_classes * (n_classes - 1) / 2 columns;
        confirm against the installed scikit-learn version.
        """
        scores = self.classifier.decision_function(X)
        return scores
def test_ovo_decision_function():
    """Check shapes of the OvO decision function and its relation to votes."""
    n_samples = iris.data.shape[0]
    ovo_clf = OneVsOneClassifier(LinearSVC(random_state=0))

    # Binary problem first: a single score per sample is expected.
    ovo_clf.fit(iris.data, iris.target == 0)
    decisions = ovo_clf.decision_function(iris.data)
    assert_equal(decisions.shape, (n_samples,))

    # Multi-class problem: one score per sample and class, and the highest
    # score must agree with the predicted class.
    ovo_clf.fit(iris.data, iris.target)
    decisions = ovo_clf.decision_function(iris.data)
    assert_equal(decisions.shape, (n_samples, n_classes))
    assert_array_equal(decisions.argmax(axis=1), ovo_clf.predict(iris.data))

    # Recompute the votes by hand; estimators_ is walked in the pair order
    # (0, 1), (0, 2), ..., (1, 2), ...
    class_pairs = [(first, second)
                   for first in range(n_classes)
                   for second in range(first + 1, n_classes)]
    votes = np.zeros((n_samples, n_classes))
    for k, (i, j) in enumerate(class_pairs):
        pred = ovo_clf.estimators_[k].predict(iris.data)
        votes[pred == 0, i] += 1
        votes[pred == 1, j] += 1

    # Rounding the decision values must give back the vote counts.
    assert_array_equal(votes, np.round(decisions))

    possible_vote_levels = {0., 1., 2.}
    for class_idx in range(n_classes):
        # For each sample and each class there are only 3 possible vote
        # levels, because there are only 3 distinct class pairs and thus
        # 3 distinct binary classifiers. Sorting predictions by votes alone
        # would therefore yield mostly tied predictions.
        class_votes = set(votes[:, class_idx])
        assert_true(class_votes.issubset(possible_vote_levels))

        # The OvO decision function, on the other hand, combines the vote
        # counts with the aggregated confidence levels of the binary
        # classifiers, so it can resolve most of the ties. The iris dataset
        # has 150 samples with a couple of duplicates.
        n_distinct = len(np.unique(decisions[:, class_idx]))
        assert_greater(n_distinct, 146)
def test_ovo_ties():
    """Ties in OvO votes are broken by the decision function.

    In particular the prediction must not simply default to the smallest
    label when several classes receive the same number of votes.
    """
    samples = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]])
    labels = np.array([2, 0, 1, 2])

    multi_clf = OneVsOneClassifier(Perceptron())
    multi_clf.fit(samples, labels)
    ovo_prediction = multi_clf.predict(samples)
    ovo_decision = multi_clf.decision_function(samples)

    # Classifiers are in order 0-1, 0-2, 1-2.
    # The decision function is split into the integer vote counts and the
    # normalized sum of confidences; the latter is what disambiguates a tie
    # in votes.
    votes = np.round(ovo_decision)
    normalized_confidences = ovo_decision - votes

    # First point: exactly one vote per class, i.e. a three-way tie.
    first_point_votes = votes[0, :]
    assert_array_equal(first_point_votes, 1)

    # Remaining points: no tie, the prediction is the argmax of the votes.
    remaining_votes = votes[1:]
    assert_array_equal(np.argmax(remaining_votes, axis=1), ovo_prediction[1:])

    # Tied point: the prediction is the class with the highest confidence.
    best_by_confidence = normalized_confidences[0].argmax()
    assert_equal(ovo_prediction[0], best_by_confidence)
def test_ovo_ties():
    """Ties in OvO votes are broken by the decision function.

    In particular the prediction must not simply default to the smallest
    label when several classes receive the same number of votes.
    """
    samples = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]])
    labels = np.array([2, 0, 1, 2])

    base_estimator = Perceptron(shuffle=False, max_iter=4, tol=None)
    multi_clf = OneVsOneClassifier(base_estimator)
    multi_clf.fit(samples, labels)
    ovo_prediction = multi_clf.predict(samples)
    ovo_decision = multi_clf.decision_function(samples)

    # Classifiers are in order 0-1, 0-2, 1-2.
    # The decision function is split into the integer vote counts and the
    # normalized sum of confidences; the latter is what disambiguates a tie
    # in votes.
    votes = np.round(ovo_decision)
    normalized_confidences = ovo_decision - votes

    # First point: exactly one vote per class, i.e. a three-way tie.
    first_point_votes = votes[0, :]
    assert_array_equal(first_point_votes, 1)

    # Remaining points: no tie, the prediction is the argmax of the votes.
    remaining_votes = votes[1:]
    assert_array_equal(np.argmax(remaining_votes, axis=1), ovo_prediction[1:])

    # Tied point: the prediction is the class with the highest confidence.
    best_by_confidence = normalized_confidences[0].argmax()
    assert ovo_prediction[0] == best_by_confidence
def multi_classification_section():
    """Compare OvA, OvO and random-forest classifiers on the digit data.

    Works on module-level names (`X_train`, `y_train`, `some_digit`,
    `some_digit_index`, `random_seed`, `plot_digits`). It prints the
    predictions and scores for one sample, cross-validates the SGD
    classifier with and without feature scaling, and finally plots the
    confusion matrices plus example images for the 3-vs-5 confusions.
    """
    # Author's note: use binary classifiers for multiclass problems.
    # For N classes:
    # * One-vs-All strategy
    #   - Have N binary classifiers.
    #   - Each binary classifier tries to identify one single digit
    #     (e.g. 5) versus the rest.
    #   - Choose the class of the binary classifier with the best score.
    # * One-vs-One strategy
    #   - N*(N-1)/2 classifiers, distinguishing between:
    #       0 vs 1, 0 vs 2, ...
    #       1 vs 2, 1 vs 3, ...
    #       ...
    #   - Choose the class that wins most duels.
    #   - Each binary classifier is trained with less data: only the two
    #     digits of the duel. Good for models that scale badly with data
    #     size.

    # By default, some binary classifiers use OvA when several classes are
    # detected.
    ova_clf = SGDClassifier(random_state=random_seed)
    ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=random_seed))
    forest_clf = RandomForestClassifier(random_state=random_seed)
    for model in (ova_clf, ovo_clf, forest_clf):
        model.fit(X_train, y_train)

    sample = [some_digit]
    print("for image ", some_digit_index, ":")

    print("ova prediction= ", ova_clf.predict(sample))  # 5
    ova_scores = ova_clf.decision_function(sample)
    print("ova scores=", ova_scores)
    print("ova argmax(scores)=", np.argmax(ova_scores))
    print("ova classes=", ova_clf.classes_)

    print("ovo prediction= ", ovo_clf.predict(sample))  # 5
    ovo_scores = ovo_clf.decision_function(sample)
    print("ovo scores=", ovo_scores)
    print("ovo argmax(scores)=", np.argmax(ovo_scores))

    print("forest probs=", forest_clf.predict_proba(sample))
    print("----")

    # Cross validation on the raw features.
    raw_scores = cross_val_score(
        ova_clf, X_train, y_train, cv=3, scoring="accuracy")
    print("ova x-val score=", raw_scores)

    # Author's note: simply applying a standard scaler gives a 5% bonus on
    # accuracy.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
    scaled_scores = cross_val_score(
        ova_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")
    print("ova x-val score (scaled)=", scaled_scores)

    y_train_pred = cross_val_predict(ova_clf, X_train_scaled, y_train, cv=3)

    # Display the confusion matrix.
    conf_mx = confusion_matrix(y_train, y_train_pred)
    plt.matshow(conf_mx, cmap=plt.cm.gray)
    plt.show()

    # Display the confusion matrix for errors only: normalize every row by
    # its total and blank out the diagonal.
    row_sums = conf_mx.sum(axis=1, keepdims=True)
    norm_conf_mx = conf_mx / row_sums
    np.fill_diagonal(norm_conf_mx, 0)
    plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
    plt.show()

    # Display some errors for one pair of classes.
    cl_a, cl_b = 3, 5
    true_a, true_b = (y_train == cl_a), (y_train == cl_b)
    pred_a, pred_b = (y_train_pred == cl_a), (y_train_pred == cl_b)
    X_aa = X_train[true_a & pred_a]
    X_ab = X_train[true_a & pred_b]
    X_ba = X_train[true_b & pred_a]
    X_bb = X_train[true_b & pred_b]

    plt.figure(figsize=(8, 8))
    panels = ((221, X_aa), (222, X_ab), (223, X_ba), (224, X_bb))
    for position, digits in panels:
        plt.subplot(position)
        plot_digits(digits[:25], images_per_row=5)
    plt.show()
#%% sgd10.predict([data]) scores=sgd10.decision_function([data]) np.argmax(scores) sgd10.classes_ #%% # 强制 Scikit-Learn 使用 OvO 策略或者 OvA 策略,你可以使用 OneVsOneClassifier 类 # 或者 OneVsRestClassifier 类。 from sklearn.multiclass import OneVsOneClassifier ovo=OneVsOneClassifier(SGDClassifier(random_state=42)) ovo.fit(X_train,y_train) ovo.predict([data]) #%% ovo.decision_function([data]) #%% from sklearn.ensemble import RandomForestClassifier forest=RandomForestClassifier() forest.fit(X_train,y_train) forest.predict_proba([data]) #%% cross_val_score(sgd10, X_train, y_train, cv=3, scoring="accuracy") #%% # 正则化 from sklearn.preprocessing import StandardScaler scales=StandardScaler() x_train_scaled=scales.fit_transform(X_train) # cross_val_score(sgd10, x_train_scaled, y_train, cv=3, scoring="accuracy")
def _report_metrics(expected, predicted):
    """Call every `evaluate` metric for one (expected, predicted) pair.

    Return values are not captured here; presumably each metric prints its
    own result (confirm in `evaluate`).
    """
    evaluate.accuracy_atleast_one_match(expected, predicted)
    evaluate.accuracy_null_results(predicted)
    evaluate.accuracy_exact_match(expected, predicted)
    evaluate.accuracy_multilabel(expected, predicted)
    evaluate.precision_multilabel(expected, predicted)
    evaluate.recall_multilabel(expected, predicted)
    evaluate.hamming_loss_multilabel(expected, predicted)


def _one_hot_argmax(scores):
    """Return a 0/1 matrix shaped like `scores` with a 1 at each row's argmax."""
    one_hot = np.zeros(scores.shape)
    one_hot[np.arange(scores.shape[0]), scores.argmax(axis=1)] = 1
    return one_hot


def predict(input_size=100000,
            select_transform=1,
            read_database=1,
            one_vs_one=0,
            model="LinearSVC",
            mode="multilable",
            repeat=0,
            k=0.8,
            max_number_of_tags=5,
            max_iter=100000,
            use_cache=0):
    """Train a tag classifier on SVD-reduced features and report its errors.

    The data returned by `stat.get_trainingdata` is split at
    ``int(k * input_size)`` into a training and a test part. The training
    part is decomposed with an SVD (or the factors are loaded from the
    pickle cache), the leading right-singular vectors that carry 90% of the
    squared singular values are kept, and both parts are projected onto
    them. A classifier is fitted and the `evaluate` metrics are run on the
    test part and on the training part.

    Parameters
    ----------
    input_size : int
        Number of samples requested; also used to compute the split point.
    select_transform, read_database, mode, repeat, max_number_of_tags
        Forwarded unchanged to `stat.get_trainingdata`.
    one_vs_one : int
        1 selects a `OneVsOneClassifier` around `LinearSVC`; any other
        value selects a `OneVsRestClassifier` around `model`.
    model : str
        "LinearSVC" or "SVC"; only used when `one_vs_one` is not 1.
    k : float
        Fraction of `input_size` used for training.
    max_iter : int
        Iteration limit of the one-vs-rest base estimator (the one-vs-one
        branch uses a fixed limit of 10000).
    use_cache : int
        1 loads the SVD factors from the pickle files instead of computing
        them.

    Raises
    ------
    ValueError
        If the one-vs-rest branch is selected with an unknown `model`.
    """
    # Fail before the expensive data loading and SVD instead of hitting an
    # undefined classifier much later.
    if one_vs_one != 1 and model not in ("LinearSVC", "SVC"):
        raise ValueError(
            "Unknown model %r; expected 'LinearSVC' or 'SVC'" % (model,))

    to_print = 0
    raw_train_data, raw_train_results = stat.get_trainingdata(
        input_size,
        select_transform=select_transform,
        read_database=read_database,
        to_print=to_print,
        mode=mode,
        repeat=repeat,
        max_number_of_tags=max_number_of_tags)
    t0 = time()

    split_point = int(k * input_size)
    train_data = raw_train_data[0:split_point, :]
    train_results = raw_train_results[0:split_point]
    test_data = raw_train_data[split_point:, :]
    test_results = raw_train_results[split_point:]

    fname_U = "SVD_U.txt"
    fname_V = "SVD_V.txt"
    fname_S = "SVD_S.txt"
    if use_cache == 1:
        # NOTE: pickle.load executes arbitrary code from the file; only use
        # cache files written by this function.
        # U is not needed below, so only V and s are read back.
        with open(fname_V, 'rb') as f:
            V = pickle.load(f)
        with open(fname_S, 'rb') as f:
            s = pickle.load(f)
        print("Using SVD from file")
    else:
        U, s, V = np.linalg.svd(train_data, full_matrices=True)
        # All three factors are still written so the cache files stay
        # complete for any other reader.
        with open(fname_U, 'wb') as f:
            pickle.dump(U, f)
        with open(fname_V, 'wb') as f:
            pickle.dump(V, f)
        with open(fname_S, 'wb') as f:
            pickle.dump(s, f)
        print("Using SVD by calculation")
    print("SVD decomposition done in %fs" % (time() - t0))

    # Smallest number of leading singular values whose squares add up to at
    # least 90% of the total.
    square_sum_s = np.square(s).sum()
    temp_sum = 0
    count = 0
    for singular_value in s:
        temp_sum += singular_value * singular_value
        count += 1
        if temp_sum >= 0.9 * square_sum_s:
            break
    print("count = " + str(count))

    # Keep the first `count` rows of V and project both splits onto them.
    processedV = np.transpose(V[:count])
    train_X = np.dot(train_data, processedV)
    test_X = np.dot(test_data, processedV)

    mlb = MultiLabelBinarizer()
    trainingdata_results = mlb.fit_transform(raw_train_results)
    train_Y = trainingdata_results[0:split_point, :]

    if one_vs_one == 1:
        clf = OneVsOneClassifier(
            svm.LinearSVC(random_state=0, max_iter=10000, verbose=0))
        # This branch previously referenced undefined names `X` and `Y` and
        # never produced a training-set prediction.
        # NOTE(review): train_Y is the binarized label matrix; confirm that
        # OneVsOneClassifier accepts it and that its predictions can be fed
        # to mlb.inverse_transform below.
        clf.fit(train_X, train_Y)
        prediction_Y = clf.predict(test_X)
        prediction_train = clf.predict(train_X)
    else:
        if model == "LinearSVC":
            print("Showing Results for one vs rest multilabel classifier using LinearSVC model")
            clf = OneVsRestClassifier(
                svm.LinearSVC(random_state=0,
                              dual=True,
                              max_iter=max_iter,
                              verbose=0,
                              C=0.001,
                              loss="squared_hinge",
                              multi_class="crammer_singer"))
        else:  # model == "SVC", validated at the top
            print("Showing Results for one vs rest multilabel classifier using SVC model")
            clf = OneVsRestClassifier(
                svm.SVC(C=0.001,
                        kernel='poly',
                        max_iter=max_iter,
                        verbose=0,
                        degree=3))
        clf.fit(train_X, train_Y)
        # Call get_params so the parameters are shown rather than the bound
        # method object.
        print(clf.get_params())

        # Only the single best-scoring label per sample is predicted.
        prediction_Y = _one_hot_argmax(clf.decision_function(test_X))
        prediction_train = _one_hot_argmax(clf.decision_function(train_X))

    print("Testing Error : ")
    _report_metrics(test_results, mlb.inverse_transform(prediction_Y))

    print("Training Error : ")
    _report_metrics(train_results, mlb.inverse_transform(prediction_train))
def multiclass_classification():
    """Walk through OvA, OvO and natively multiclass models on the digits.

    Uses the module-level `X_train`, `y_train`, `some_digit` and
    `plot_digits`. Prints predictions and scores, compares cross-validated
    accuracy before and after standardizing the features, and plots the
    error-only confusion matrix plus examples for the 1-vs-8 confusions.
    """
    sample = [some_digit]

    ## Binary classifiers -- linear classifier, SVM
    # Original note: the default is an SVM and the default strategy is OvA;
    # ten classifiers are trained and the maximum score is selected.
    sgd_clf = SGDClassifier(random_state=42, max_iter=100, tol=1e-3)
    sgd_clf.fit(X_train, y_train)
    print('OvA所有的类别:', sgd_clf.classes_)
    print(sgd_clf.predict(sample), '每个分类的概率值:',
          sgd_clf.decision_function(sample))

    # Force OvO; n*(n-1)/2 classifiers are created in total.
    ovo_base = SGDClassifier(max_iter=100, random_state=42, tol=1e-3)
    ovo_clf = OneVsOneClassifier(ovo_base)
    ovo_clf.fit(X_train, y_train)
    print('OVO所有的类别:', ovo_clf.classes_, '分类器总数:',
          len(ovo_clf.estimators_))
    print(ovo_clf.predict(sample), '每个分类的概率值:',
          ovo_clf.decision_function(sample))

    ## Multiclass classifiers -- random forest, Bayes
    forest_clf = RandomForestClassifier(random_state=42, n_estimators=10)
    forest_clf.fit(X_train, y_train)
    print('随机森林预测:', forest_clf.predict(sample), '每个分类的概率值:',
          forest_clf.predict_proba(sample))

    # A little feature standardization already raises the accuracy.
    accuracy_before = cross_val_score(
        sgd_clf, X_train, y_train, cv=3, scoring="accuracy")
    print('特征处理之前的准确率:', accuracy_before)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
    accuracy_after = cross_val_score(
        sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")
    print('特征处理之后的准确率:', accuracy_after)

    # Multiclass confusion matrix.
    y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
    conf_mx = confusion_matrix(y_train, y_train_pred)
    print(conf_mx)
    # plt.matshow(conf_mx, cmap=plt.cm.gray)
    # plt.show()

    row_sums = conf_mx.sum(axis=1, keepdims=True)
    norm_conf_mx = conf_mx / row_sums
    # Zero the main diagonal so that the misclassifications stand out.
    np.fill_diagonal(norm_conf_mx, 0)
    print(norm_conf_mx)
    plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
    plt.show()

    # Pull out the misclassified samples of one class pair for a closer look.
    cl_a, cl_b = 1, 8
    true_a, true_b = (y_train == cl_a), (y_train == cl_b)
    pred_a, pred_b = (y_train_pred == cl_a), (y_train_pred == cl_b)
    X_aa = X_train[true_a & pred_a]  # TT
    X_ab = X_train[true_a & pred_b]  # TF
    X_ba = X_train[true_b & pred_a]  # FT
    X_bb = X_train[true_b & pred_b]  # FF

    plt.figure(figsize=(8, 8))
    panels = ((221, X_bb), (222, X_ba), (223, X_ab), (224, X_aa))
    for position, digits in panels:
        plt.subplot(position)
        plot_digits(digits[:25], images_per_row=5)
    plt.show()
# Script: fit a one-vs-one logistic regression on the iris data and plot its
# decision function (top panel) and its predictions (bottom panel).
import pandas as pd
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
from sklearn.multiclass import OneVsOneClassifier
from sklearn.linear_model import LogisticRegression
from matplotlib import font_manager, rc

# Use the font found at this Windows path as the default family --
# presumably so that the Korean plot titles render; the path only exists on
# Windows installations that ship this font.
font_name = font_manager.FontProperties(
    fname="c:/Windows/Fonts/malgun.ttf").get_name()
rc('font', family=font_name)
plt.rcParams['axes.unicode_minus'] = False

iris = load_iris()
# One-vs-one: classes are compared pairwise.
model_ovo = OneVsOneClassifier(LogisticRegression(solver='lbfgs')).fit(
    iris.data, iris.target)
print(model_ovo.decision_function(iris.data))  # decision function values

ax1 = plt.subplot(211)  # first slot of a 2x1 grid of subplots
# NOTE(review): in the collapsed source it is ambiguous whether the next
# statement was commented out; it is treated as live code here -- confirm.
pd.DataFrame(model_ovo.decision_function(iris.data)).plot(ax=ax1, legend=True)
plt.title('판별함수')

ax2 = plt.subplot(212)
# Apply the trained model to the actual data to classify it.
pd.DataFrame(model_ovo.predict(iris.data),
             columns=["prediction"]).plot(marker='o', ls='', ax=ax2)
plt.title("클래스 판별")
plt.tight_layout()
plt.show()
# NOTE(review): `normalize` is not defined anywhere in the visible code; this
# statement looks like the tail of a plotting helper whose beginning lies
# outside this excerpt -- confirm its indentation before applying.
plt.savefig("cf" + str(normalize) + ".png")

# Feature pipeline: token counts (terms seen in fewer than 5 documents and
# English stop words dropped), TF-IDF weighting, then whatever reducer
# get_svd() returns.
pipeline1 = Pipeline([
    ('vect', CountVectorizer(min_df=5, stop_words=text.ENGLISH_STOP_WORDS)),
    ('tfidf', TfidfTransformer()),
    ('reduce_dim', get_svd()),
])
train_lsi, test_lsi = fetchLSIRepresentation(pipeline1, twenty_train,
                                             twenty_test)

# One-vs-one wrapper around a linear-kernel SVC.
ovo_svc = OneVsOneClassifier(svm.SVC(kernel='linear', probability=True, C=1000))
ovo_svc.fit(train_lsi, train_target_group)
predicted = ovo_svc.predict(test_lsi)
print_statistics(test_target_group, predicted)
# NOTE(review): despite its name this holds decision_function output, not
# probabilities, and it is only referenced by the commented-out ROC code.
predicted_probs = ovo_svc.decision_function(test_lsi)
#fpr, tpr, _ = roc_curve(test_target_group, predicted_probs)
#plot_roc(fpr, tpr, penalty)

cnf_matrix = smet.confusion_matrix(test_target_group, predicted)
plot_confusion_matrix(cnf_matrix, classes=classCategories,
                      title='Multiclass Confusion matrix')
#plot_confusion_matrix(cnf_matrix, classes=classCategories, normalize=True, title='Confusion matrix with normalization' )
def svmfit(self):
    """Fit one OvO SVM per feature column and print the resulting statistics.

    For every name in `self.header` the first 200 values of that column are
    taken from each of the five recordings, shuffled, and split 70/30 into a
    training and a test part. The recording index (0..4) is the class label.
    An RBF `SVC` wrapped in `OneVsOneClassifier` is fitted, and accuracy,
    RMSE and correlation (from `self.calculate_stats`) are collected in a
    dict keyed by feature name, which is printed at the end.

    Returns nothing.
    """
    data = self.all_features()
    shortfiles = [
        'v.wav.csv', 'Yeye.wav.csv', 'myohmy.wav.csv', 'mymymy.wav.csv',
        'mememe.wav.csv'
    ]
    total_length = 200
    trunc = int(total_length * 0.7)
    ver = total_length - trunc
    num_of_features = 2
    num_of_samples = len(shortfiles)
    np.random.seed(4)

    # Class label of every row: the index of the recording it came from.
    # Float dtype matches the zero-initialised arrays used before.
    y = np.repeat(np.arange(num_of_samples, dtype=float), trunc)
    y_ts = np.repeat(np.arange(num_of_samples, dtype=float), ver)

    # save statistics
    stats = {}
    for feature in self.header:
        X = np.zeros([trunc * num_of_samples, num_of_features])
        X_ts = np.zeros([ver * num_of_samples, num_of_features])

        # Recordings are processed in list order so that the sequence of
        # shuffles (and hence the seeded random state) is unchanged.
        for j, name in enumerate(shortfiles):
            column = np.asarray([data[name][feature][0:total_length]
                                 ]).reshape((total_length, 1))
            # nan_to_num returns a new array; the result must be kept,
            # otherwise NaNs reach the classifier.
            column = np.nan_to_num(column)
            np.random.shuffle(column)

            # NOTE(review): the single column is broadcast into both
            # feature columns, so the two features are identical --
            # confirm this is intended.
            X[j * trunc:(j + 1) * trunc, 0:num_of_features] = \
                column[0:trunc, :]
            X_ts[j * ver:(j + 1) * ver, 0:num_of_features] = \
                column[trunc:total_length, :]

        # n_jobs is passed by keyword: newer scikit-learn releases no
        # longer accept it positionally.
        clf = OneVsOneClassifier(
            SVC(C=1,
                cache_size=400,
                coef0=0.0,
                degree=5,
                gamma='auto',
                kernel='rbf',
                max_iter=-1,
                shrinking=True,
                tol=.01,
                verbose=False),
            n_jobs=-1).fit(X, y)
        pred = clf.predict(X_ts)

        rmse, corr = self.calculate_stats(y_ts, pred)
        accuracy = float(np.mean(y_ts == pred))
        stats[feature] = [accuracy, rmse, corr[0, 1]]

    print(stats)
# -864502.26667054, -245167.9063152 , -149510.01775103, # -233700.77221455]]) #argmax gives max values of scores np.argmax(scores) sgd.classes_ #array(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], dtype='<U1') from sklearn.multiclass import OneVsOneClassifier ovo = OneVsOneClassifier(SGDClassifier(random_state=100)) ovo.fit(data_train, target_train) ovo.predict([some_digit]) len(ovo.estimators_) ovo.decision_function([some_digit]) #array([[ 1.5 , 4.01086892, 0.50210079, 5.22484016, 8.31545536, # 5.11411311, -0.43998285, 5.13308383, 7.3219439 , 8.3175768 ]]) cross_val_score(sgd, data_train, target_train, cv=3, scoring='accuracy') #array([0.86552689, 0.86179309, 0.86117918]) import pandas as pd predict_m = cross_val_predict(sgd, data_train, target_train, cv=3) ps = precision_score(target_train, predict_m, average=None) rs = recall_score(target_train, predict_m, average=None) df = pd.DataFrame({'precision': ps, 'recall': rs})
class SeCoEstimator(BaseEstimator, ClassifierMixin):
    """A classifier using rules learned with the *Separate-and-Conquer* (SeCo)
    algorithm, also known as *Covering* algorithm.

    Wraps `_BaseSeCoEstimator` to handle multi-class problems, selecting a
    multi-class strategy and making sure that `_BaseSeCoEstimator` always sees
    an integer range [0..n_classes_) of class labels, where 0 is the intended
    fallback class; i.e. the biggest class in multi-class problems, or the
    negative class when learning a binary concept.

    The concrete SeCo variant to run is defined by `algorithm_config`.

    Fields
    ------
    algorithm_config : subclass of SeCoAlgorithmConfiguration
        Defines the concrete SeCo algorithm to run, see
        :class:`SeCoAlgorithmConfiguration`.

    Parameters
    ----------
    multi_class : callable or str or None
        Which strategy to use for non-binary problems. Possible values:

        - None: auto-select; use 'direct' if possible
          (`algorithm_config.direct_multiclass_support()` returns True),
          'one_vs_rest' otherwise.
        - A callable: Construct
          `self.base_estimator_ = multi_class(_BaseSeCoEstimator())` and
          delegate to that estimator. Useful if you want to roll a different
          binarization strategy, e.g.

          >>> import sklearn.multiclass, functools
          >>> multi_class=functools.partial(
          ...     sklearn.multiclass.OutputCodeClassifier,
          ...     code_size=0.7, random_state=42)

          If you use this, make sure to pass to `_BaseSeCoEstimator` classes
          `y` from an integer range [0..n_classes_), e.g. using
          `LabelEncoder`. Also be aware of class order influence on
          tie-breaking.
        - 'direct': Directly learn a theory of rules with different heads
          (target classes). Uses :class:`BySizeLabelEncoder` internally.
        - 'one_vs_rest': Use `sklearn.multiclass.OneVsRestClassifier` for
          class binarization and learn binary theories.
        - 'one_vs_one': Use `sklearn.multiclass.OneVsOneClassifier` for class
          binarization and learn binary theories.
        - TODO: multi_class strategy of ripper: OneVsRest, remove C_i after
          learning rules for it

    random_state : None | int | instance of np.random.RandomState
        RNG, may be used by the algorithm. Value passed through
        `sklearn.utils.check_random_state`.

    n_jobs : int, optional
        Passed to `OneVsRestClassifier` or `OneVsOneClassifier` if these are
        used.

    Attributes
    ----------
    base_estimator_ : estimator instance
        The estimator object that all tasks are delegated to. One of
        `sklearn.multiclass.OneVsRestClassifier`,
        `sklearn.multiclass.OneVsOneClassifier` or
        `sklearn_seco.util.TargetTransformingMetaEstimator` if demanded by the
        `multi_class_` strategy, a `_BaseSeCoEstimator` otherwise.

    multi_class_ : callable or str
        The actual strategy used on a non-binary problem. Relevant if
        `multi_class=None` demanded auto-selection.

    classes_ : np.ndarray
        `np.unique(y)`

    See Also
    --------
    `_BaseSeCoEstimator`
    """

    algorithm_config: Type[SeCoAlgorithmConfiguration]

    # TODO: _BaseSeCoEstimator.export_text equivalent inverting binarization & target transformation for display

    def _more_tags(self):
        # tell sklearn >= 0.21 that we can handle categorical data
        return {'X_types': ['2darray', 'categorical'], 'allow_nan': True}

    def __init__(self, multi_class=None, random_state=1, n_jobs=1):
        self.multi_class = multi_class
        self.random_state = random_state
        self.n_jobs = n_jobs

    def fit(self, X, y, **kwargs):
        """Learn SeCo theory/theories on training data `X, y`.

        For possible parameters (`**kwargs`), refer to
        :class:`_BaseSeCoEstimator`.

        Raises
        ------
        ValueError
            If `y` contains a single class only, or if `multi_class` is an
            unknown strategy name on a problem with more than two classes.
        """
        X, y = check_X_y(X, y, force_all_finite='allow-nan')
        self.multi_class_ = self.multi_class
        self.base_estimator_ = _BaseSeCoEstimator(
            self.algorithm_config, random_state=self.random_state, **kwargs)
        # NOTE: if using multiprocessing (e.g. through OvO or OvR), all
        #   sub-estimators share the same random seed/state.
        #   I think this should not harm.

        def wrapper_ordering_classes_by_size(estimator):
            # BySizeLabelEncoder ensures:  first class = default = biggest
            # and that classes form an integer range [0..n_classes_)
            return TargetTransformingMetaEstimator(BySizeLabelEncoder(),
                                                   estimator)

        self.classes_ = np.unique(y)
        n_classes_ = self.classes_.size
        if n_classes_ == 1:
            raise ValueError("SeCoEstimator requires 2 or more distinct "
                             "classes. Only 1 class (%s) present."
                             % self.classes_[0])
        elif n_classes_ == 2:
            # Binary problems never need binarization, whatever
            # `multi_class` says.
            self.base_estimator_ = wrapper_ordering_classes_by_size(
                self.base_estimator_)
        else:  # n_classes_ > 2
            if self.multi_class_ is None:
                # default / auto-selection
                if self.algorithm_config.direct_multiclass_support():
                    self.multi_class_ = "direct"
                else:
                    self.multi_class_ = "one_vs_rest"

            if callable(self.multi_class_):
                self.base_estimator_ = self.multi_class_(self.base_estimator_)
            elif self.multi_class_ == "one_vs_rest":
                self.base_estimator_ = OneVsRestClassifier(
                    self.base_estimator_, n_jobs=self.n_jobs)
            elif self.multi_class_ == "one_vs_one":
                self.base_estimator_ = OneVsOneClassifier(
                    self.base_estimator_, n_jobs=self.n_jobs)
            elif self.multi_class_ == "direct":
                # TODO: if self.multi_class=='direct' (not `None` auto-detect), only assertion prevents binary-only learner to silently learn on multiclass training data
                self.base_estimator_ = wrapper_ordering_classes_by_size(
                    self.base_estimator_)
            else:
                raise ValueError("Unknown multi-class mode %s"
                                 % self.multi_class_)

        # NOTE: param categorical_features is data dependent, but OvR/OvO don't
        #   pass extra parameters through fit(), so it has to be in
        #   `_BaseSeCoEstimator.__init__`.
        self.base_estimator_.fit(X, y)
        return self

    def predict(self, X):
        """Perform classification on an array of test vectors X.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        Returns
        -------
        C : array, shape = (n_samples,)
            Predicted target values for X, values are from ``classes_``
        """
        check_is_fitted(self, ["classes_"])
        X = check_array(X, force_all_finite='allow-nan')
        return self.base_estimator_.predict(X)

    @if_delegate_has_method('base_estimator_')
    def predict_proba(self, X):
        """Delegate to `base_estimator_.predict_proba` (if it has one)."""
        # noinspection PyUnresolvedReferences
        return self.base_estimator_.predict_proba(X)

    @if_delegate_has_method('base_estimator_')
    def decision_function(self, X):
        """Delegate to `base_estimator_.decision_function` (if it has one)."""
        # noinspection PyUnresolvedReferences
        return self.base_estimator_.decision_function(X)

    def get_seco_estimators(self) -> Sequence[_BaseSeCoEstimator]:
        """
        :return: The `_BaseSeCoEstimator` instances that were trained.
          Depending on the multi-class strategy, the class labels they use
          differ in order and value.
          Cannot be used when self.multi_class_ is a callable.
        :raises AssertionError: if `base_estimator_` does not belong to one
          of the built-in strategies (e.g. `multi_class_` is a callable).
        """
        check_is_fitted(self, 'base_estimator_')
        is_binary = len(self.classes_) == 2
        if is_binary or self.multi_class_ == "direct":
            assert isinstance(self.base_estimator_,
                              TargetTransformingMetaEstimator)
            return [self.base_estimator_.estimator]
        elif self.multi_class_ == "one_vs_rest":
            assert isinstance(self.base_estimator_, OneVsRestClassifier)
            return self.base_estimator_.estimators_
        elif self.multi_class_ == "one_vs_one":
            assert isinstance(self.base_estimator_, OneVsOneClassifier)
            return self.base_estimator_.estimators_
        else:
            # Raised explicitly: `assert False` is stripped under
            # `python -O`, which would let this method return None.
            raise AssertionError(
                "invalid state: unknown type of base_estimator_ "
                f"({str(self.base_estimator_)})")
class RobustWeightedClassifier(BaseEstimator, ClassifierMixin):
    """Algorithm for robust classification using reweighting algorithm.

    This model uses iterative reweighting of samples to make a regression or
    classification estimator robust.

    The principle of the algorithm is to use an empirical risk minimization
    principle where the risk is estimated using a robust estimator (for
    example Huber estimator or median-of-means estimator)[1], [3]. The idea
    behind this algorithm was mentioned before in [2]. This idea translates
    into an iterative algorithm where the sample_weight are changed at each
    iteration and depend on the sample. Informally, the outliers should
    have small weight while the inliers should have big weight, where
    outliers are samples with a big loss function.

    This algorithm enjoys a non-zero breakdown-point (it can handle
    arbitrarily bad outliers). When the "mom" weighting scheme is used, k
    outliers can be tolerated. When the "Huber" weighting scheme is used,
    asymptotically the number of outliers has to be less than half the
    sample size.

    Read more in the :ref:`User Guide <robust>`.

    Parameters
    ----------

    weighting : string, default="huber"
        Weighting scheme used to make the estimator robust.
        Can be 'huber' for huber-type weights or 'mom' for median-of-means
        type weights.

    max_iter : int, default=100
        Maximum number of iterations.
        For more information, see the optimization scheme of base_estimator
        and the eta0 and burn_in parameter.

    burn_in : int, default=10
        Number of steps used without changing the learning rate.
        Can be useful to make the weight estimation better at the beginning.

    eta0 : float, default=0.01
        Constant step-size used during the burn_in period. Used only if
        burn_in>0. Can have a big effect on efficiency.

    c : float>0 or None, default=None
        Parameter used for Huber weighting procedure, used only if weighting
        is 'huber'. Measure the robustness of the weighting procedure. A
        small value of c means a more robust estimator.
        Can have a big effect on efficiency.
        If None, c is estimated at each step using half the Inter-quartile
        range, this tends to be conservative (robust).

    k : int < sample_size/2, default=0
        Parameter used for mom weighting procedure, used only if weighting
        is 'mom'. 2k+1 is the number of blocks used for median-of-means
        estimation, higher value of k means a more robust estimator.
        Can have a big effect on efficiency.
        If None, k is estimated using the number of points distant from the
        median of means of more than 2 times a robust estimate of the scale
        (using the inter-quartile range), this tends to be conservative
        (robust).

    loss : string, None or callable, default="log"
        Name of the loss used, must be the same loss as the one optimized in
        base_estimator.
        Classification losses supported : 'log', 'hinge'.
        If 'log', then the base_estimator must support predict_proba.

    sgd_args : dict or None, default=None
        Arguments of the SGDClassifier base estimator. None is treated as an
        empty dict.

    multi_class : string, default="ovr"
        Multi-class scheme. Can be either "ovo" for OneVsOneClassifier or
        "ovr" for OneVsRestClassifier or "binary" for binary classification.

    n_jobs : int, default=1
        Number of jobs used in the multi-class meta-algorithm computation.

    tol : float or None, (default = 1e-3)
        The stopping criterion. If it is not None, training will stop when
        (loss > best_loss - tol) for n_iter_no_change consecutive epochs.

    n_iter_no_change : int, default=10
        Number of iterations with no improvement to wait before early
        stopping.

    random_state : int, RandomState instance or None, optional (default=None)
        The seed of the pseudo random number generator to use when shuffling
        the data. If int, random_state is the seed used by the random number
        generator; If RandomState instance, random_state is the random number
        generator; If None, the random number generator is the RandomState
        instance used by np.random.

    Attributes
    ----------

    classes_ : ndarray of shape (n_classes, )
        A list of class labels known to the classifier.

    coef_ : ndarray of shape (1, n_features) or (n_classes, n_features)
        Coefficient of the features in the decision function.
        Only available if multi_class = "binary"

    intercept_ : ndarray of shape (1,) or (n_classes,)
        Intercept (a.k.a. bias) added to the decision function.
        Only available if multi_class = "binary"

    n_iter_ : int
        Set by ``fit`` to ``max_iter * n_samples``; this is an upper bound
        derived from the configuration, not a measured iteration count.

    base_estimator_ : object,
        The fitted base estimator (the robust SGDClassifier wrapper, itself
        wrapped in the chosen multi-class meta-estimator unless
        multi_class = "binary").

    weights_ : array like, length = n_sample.
        Weight of each sample at the end of the algorithm. Can be used as a
        measure of how much of an outlier a sample is.
        Only available if multi_class = "binary"

    Notes
    -----

    Often, there is a need to use RobustScaler as preprocessing.

    Examples
    --------

    >>> from sklearn_extra.robust import RobustWeightedClassifier
    >>> from sklearn.datasets import make_blobs
    >>> import numpy as np
    >>> rng = np.random.RandomState(42)
    >>> X,y = make_blobs(n_samples=100, centers=np.array([[-1, -1], [1, 1]]),
    ...                  random_state=rng)
    >>> clf=RobustWeightedClassifier()
    >>> _ = clf.fit(X, y)
    >>> score = np.mean(clf.predict(X)==y)

    References
    ----------

    [1] Guillaume Lecué, Matthieu Lerasle and Timothée Mathieu.
        "Robust classification via MOM minimization", Mach Learn 109, (2020).
        https://doi.org/10.1007/s10994-019-05863-6 (2018).
        arXiv:1808.03106

    [2] Christian Brownlees, Emilien Joly and Gábor Lugosi.
        "Empirical risk minimization for heavy-tailed losses", Ann. Statist.
        Volume 43, Number 6 (2015), 2507-2536.

    [3] Stanislav Minsker and Timothée Mathieu.
        "Excess risk bounds in robust empirical risk minimization"
        arXiv preprint (2019). arXiv:1910.07485.

    """

    def __init__(
        self,
        weighting="huber",
        max_iter=100,
        burn_in=10,
        eta0=0.01,
        c=None,
        k=0,
        loss="log",
        sgd_args=None,
        multi_class="ovr",
        n_jobs=1,
        tol=1e-3,
        n_iter_no_change=10,
        random_state=None,
    ):
        self.weighting = weighting
        self.max_iter = max_iter
        self.burn_in = burn_in
        self.eta0 = eta0
        self.c = c
        self.k = k
        self.loss = loss
        self.sgd_args = sgd_args
        self.multi_class = multi_class
        self.n_jobs = n_jobs
        self.tol = tol
        self.n_iter_no_change = n_iter_no_change
        self.random_state = random_state

    def fit(self, X, y):
        """Fit the model to data matrix X and target(s) y.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            The input data.

        y : array-like, shape (n_samples,) or (n_samples, n_outputs)
            The target values (class labels in classification, real numbers
            in regression).

        Returns
        -------
        self : returns an estimator trained with RobustWeightedClassifier.

        Raises
        ------
        ValueError
            If ``multi_class`` is not one of "ovr", "ovo" or "binary".
        """
        if self.sgd_args is None:
            sgd_args = {}
        else:
            sgd_args = self.sgd_args

        # Define the base estimator
        base_robust_estimator_ = _RobustWeightedEstimator(
            SGDClassifier(**sgd_args, loss=self.loss),
            weighting=self.weighting,
            loss=self.loss,
            burn_in=self.burn_in,
            c=self.c,
            k=self.k,
            eta0=self.eta0,
            max_iter=self.max_iter,
            tol=self.tol,
            n_iter_no_change=self.n_iter_no_change,
            random_state=self.random_state,
        )

        if self.multi_class == "ovr":
            self.base_estimator_ = OneVsRestClassifier(
                base_robust_estimator_, n_jobs=self.n_jobs
            )
        elif self.multi_class == "binary":
            self.base_estimator_ = base_robust_estimator_
        elif self.multi_class == "ovo":
            self.base_estimator_ = OneVsOneClassifier(
                base_robust_estimator_, n_jobs=self.n_jobs
            )
        else:
            raise ValueError("No such multiclass method implemented.")

        self.base_estimator_.fit(X, y)

        # Only the un-wrapped (binary) estimator exposes these directly; the
        # OvR/OvO wrappers keep them on their per-class sub-estimators.
        if self.multi_class == "binary":
            self.weights_ = self.base_estimator_.weights_
            self.coef_ = self.base_estimator_.coef_
            self.intercept_ = self.base_estimator_.intercept_

        # len() raises TypeError on scipy sparse matrices, so prefer shape[0]
        # and fall back to len() for plain sequences without a shape.
        n_samples = X.shape[0] if hasattr(X, "shape") else len(X)
        self.n_iter_ = self.max_iter * n_samples
        self.classes_ = self.base_estimator_.classes_
        return self

    def predict(self, X):
        """Predict using the estimator trained with RobustWeightedClassifier.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data.

        Returns
        -------
        y : array-like, shape (n_samples, n_outputs)
            The predicted values.
        """
        check_is_fitted(self, attributes=["base_estimator_"])
        return self.base_estimator_.predict(X)

    def _check_proba(self):
        """Raise AttributeError unless the configured loss is "log"."""
        if self.loss != "log":
            raise AttributeError(
                "Probability estimates are not available for"
                " loss=%r" % self.loss
            )

    @property
    def predict_proba(self):
        """
        Probability estimates when binary classification.

        Exposed as a property so that attribute access itself fails when
        the estimator is not fitted or ``loss`` is not "log"; the returned
        callable takes ``X`` as described below.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Vector to be scored, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        Returns
        -------
        T : array-like of shape (n_samples, n_classes)
            Returns the probability of the sample for each class in the
            model,
        """
        check_is_fitted(self, attributes=["base_estimator_"])
        self._check_proba()
        return self._predict_proba

    def _predict_proba(self, X):
        """Delegate probability estimation to the fitted base estimator."""
        return self.base_estimator_.predict_proba(X)

    @property
    def _estimator_type(self):
        # This class never defines ``base_estimator`` (only the fitted
        # ``base_estimator_``), so delegating to it always raised
        # AttributeError and hid the value inherited from ClassifierMixin.
        return "classifier"

    def score(self, X, y=None):
        """Returns the score on the given data, using
        ``base_estimator_.score``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like of shape (n_samples, n_output) or (n_samples,), optional
            Target relative to X for classification or regression;
            None for unsupervised learning.

        Returns
        -------
        score : float
        """
        check_is_fitted(self, attributes=["base_estimator_"])
        return self.base_estimator_.score(X, y)

    def decision_function(self, X):
        """Return ``base_estimator_.decision_function(X)``.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)

        Returns
        -------
        array
            Decision values per element in X, exactly as produced by the
            fitted base estimator.
        """
        check_is_fitted(self, attributes=["base_estimator_"])
        return self.base_estimator_.decision_function(X)
class LSVMDetector:
    """SVM-based detector that scores genuine strokes against attacker data.

    For each subject, ``evaluate`` trains a classifier on that subject's
    strokes (label 0) versus strokes of all other users (label 1), scores
    held-out genuine strokes and the supplied attacker samples, and passes
    both score lists to ``evaluateFAR`` (defined elsewhere in the project).

    Parameters
    ----------
    subjects : list or scalar
        Either a list of user ids, or a single user id.
    data : DataFrame-like
        Must expose a ``user_id`` column and the ``FEATURE_COLUMNS`` columns.
    attacker_data : sequence or array-like
        When ``subjects`` is a list, ``attacker_data[idx]`` holds the attack
        samples for ``subjects[idx]``; otherwise it is used as a whole.
    """

    # just the training() function changes, rest all remains same.

    # Feature columns read from ``data``. Raw strings keep the literal
    # backslash before '%' without relying on an invalid escape sequence.
    FEATURE_COLUMNS = (
        "stroke duration",
        "start $x$",
        "start $y$",
        "stop $x$",
        "stop $y$",
        "length of trajectory",
        "mid-stroke pressure",
        "mid-stroke area covered",
        r"20\%-perc. pairwise velocity",
        r"50\%-perc. pairwise velocity",
        r"20\%-perc. dev. from end-to-end line",
        r"50\%-perc. dev. from end-to-end line",
        r"80\%-perc. dev. from end-to-end line",
    )

    def __init__(self, subjects, data, attacker_data):
        self.data = data
        self.attacker = attacker_data
        self.u_scores = []
        self.i_scores = []
        self.mean_vector = []
        self.subjects = subjects
        self.fp = []

    def training(self):
        """Fit ``self.clf`` on ``self.train`` (0) vs ``self.train_imposter`` (1)."""
        # NOTE(review): despite the class name, the kernel here is RBF, not
        # linear -- confirm this is intended.
        self.clf = OneVsOneClassifier(SVC(kernel='rbf', gamma='auto'))
        labels = [0] * len(self.train) + [1] * len(self.train_imposter)
        self.clf.fit(pandas.concat([self.train, self.train_imposter]), labels)

    def testing(self):
        """Store decision-function scores for the genuine and attacker sets."""
        self.u_scores = list(self.clf.decision_function(self.test_genuine))
        self.i_scores = list(self.clf.decision_function(self.test_imposter))

    def _evaluate_subject(self, subject, attacker_samples):
        """Train and score for one subject; return ``evaluateFAR``'s result."""
        columns = list(self.FEATURE_COLUMNS)
        is_subject = self.data.user_id == subject

        genuine_user_data = self.data.loc[is_subject, columns]
        imposter_data = self.data.loc[~is_subject, :]

        # First 400 genuine strokes: 200 for training, the next 200 held out.
        genuine_user_data = normalize_df(genuine_user_data[:400])
        self.train = genuine_user_data[:200]
        self.test_genuine = genuine_user_data[200:400]

        # Negative training class: the last 10 strokes of every other user.
        self.train_imposter = normalize_df(
            imposter_data.groupby("user_id").tail(10).loc[:, columns]
        )
        self.test_imposter = attacker_samples

        self.training()
        self.testing()
        return evaluateFAR(self.u_scores, self.i_scores)

    def evaluate(self):
        """Return the mean ``evaluateFAR`` value over all subjects.

        Returns
        -------
        float
            ``np.mean`` of the per-subject values (a single value when
            ``subjects`` is not a list).
        """
        if isinstance(self.subjects, list):
            fpr = [
                self._evaluate_subject(subject, self.attacker[idx])
                for idx, subject in enumerate(self.subjects)
            ]
        else:
            fpr = [self._evaluate_subject(self.subjects, self.attacker)]
        return np.mean(fpr)