Python OneVsOneClassifier.decision_function示例，sklearn.multiclass.OneVsOneClassifier.decision_function Python示例

示例#1

0

显示文件

文件： transformations.py 项目： EspenAlbert/sentimentAnalysisMovieReviews

class ClassifierOvOAsFeatures:
    """
    Dimensionality-reducing transformation built on pairwise classifiers.

    A default-configured ``SGDClassifier`` is wrapped in a
    ``OneVsOneClassifier`` (one binary model per pair of classes) and the
    decision values of the fitted ensemble are used as a compact feature
    representation of the input, e.g. of a wide bag-of-words matrix.
    """

    def fit(self, X, y):
        """
        Train the one-vs-one ensemble on all available cores (``n_jobs=-1``).

        `X` is expected to be an array-like or a sparse matrix.
        `y` is expected to be an array-like containing the classes to learn;
        it is converted to a numpy array before fitting.
        Returns `self` so that calls can be chained.
        """
        targets = numpy.array(y)
        ensemble = OneVsOneClassifier(SGDClassifier(), n_jobs=-1)
        self.classifier = ensemble.fit(X, targets)
        return self

    def transform(self, X, y=None):
        """
        Map `X` to the decision values of the fitted ensemble.

        `X` is expected to be an array-like or a sparse matrix; `y` is
        accepted for API compatibility and ignored.

        NOTE(review): scikit-learn documents the result of
        ``OneVsOneClassifier.decision_function`` as (n_samples, n_classes)
        for multiclass input and (n_samples,) for binary input. The figure
        n_classes * (n_classes - 1) / 2 is the number of pairwise estimators
        that get fitted -- confirm against the installed version.
        """
        fitted = self.classifier
        return fitted.decision_function(X)

示例#2

0

显示文件

文件： test_multiclass.py 项目： AlexisMignon/scikit-learn

def test_ovo_decision_function():
    """Check shape, argmax and vote consistency of the OvO decision function."""
    sample_count = iris.data.shape[0]
    ovo_clf = OneVsOneClassifier(LinearSVC(random_state=0))

    # Binary target: the decision function is one value per sample.
    ovo_clf.fit(iris.data, iris.target == 0)
    binary_decisions = ovo_clf.decision_function(iris.data)
    assert_equal(binary_decisions.shape, (sample_count,))

    # Multi-class target: one column per class, and the arg-max column must
    # agree with predict().
    ovo_clf.fit(iris.data, iris.target)
    decisions = ovo_clf.decision_function(iris.data)
    assert_equal(decisions.shape, (sample_count, n_classes))
    assert_array_equal(decisions.argmax(axis=1), ovo_clf.predict(iris.data))

    # Rebuild the vote table by hand. The pairwise estimators are stored in
    # the order (0, 1), (0, 2), ..., (n - 2, n - 1).
    class_pairs = [(low, high)
                   for low in range(n_classes)
                   for high in range(low + 1, n_classes)]
    votes = np.zeros((sample_count, n_classes))
    for position, (low, high) in enumerate(class_pairs):
        pairwise_pred = ovo_clf.estimators_[position].predict(iris.data)
        votes[pairwise_pred == 0, low] += 1
        votes[pairwise_pred == 1, high] += 1

    # Rounding the decision values must give back exactly the votes.
    assert_array_equal(votes, np.round(decisions))

    allowed_vote_levels = {0., 1., 2.}
    for class_idx in range(n_classes):
        # With three class pairs there are three binary classifiers, so a
        # class can only collect 0, 1 or 2 votes per sample; ranking by votes
        # alone would therefore leave most predictions tied.
        assert_true(set(votes[:, class_idx]) <= allowed_vote_levels)

        # The OvO decision function adds the aggregated confidences of the
        # binary classifiers to the vote counts and so separates most of
        # those ties. Iris has 150 samples including a few duplicates, hence
        # the threshold just below 150.
        distinct_decisions = np.unique(decisions[:, class_idx])
        assert_greater(len(distinct_decisions), 146)

示例#3

0

显示文件

文件： test_multiclass.py 项目： Anuragch/scikit-learn

def test_ovo_ties():
    """Vote ties must be resolved by the decision function, not by label order."""
    samples = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]])
    labels = np.array([2, 0, 1, 2])

    multi_clf = OneVsOneClassifier(Perceptron())
    multi_clf.fit(samples, labels)
    ovo_prediction = multi_clf.predict(samples)
    ovo_decision = multi_clf.decision_function(samples)

    # The pairwise classifiers are ordered 0-1, 0-2, 1-2.
    # The integer part of the decision function is the vote count; what is
    # left after rounding is the normalized sum of confidences, which is the
    # tie-breaker when several classes have the same number of votes.
    votes = np.round(ovo_decision)
    normalized_confidences = ovo_decision - votes

    # First sample: every class receives exactly one vote (a three-way tie).
    assert_array_equal(votes[0, :], 1)
    # Remaining samples: no tie, so the prediction is simply the vote arg-max.
    assert_array_equal(np.argmax(votes[1:], axis=1), ovo_prediction[1:])
    # Tied sample: the class with the highest confidence wins.
    winner = normalized_confidences[0].argmax()
    assert_equal(ovo_prediction[0], winner)

示例#4

0

显示文件

文件： test_multiclass.py 项目： trekwang1/cpx_processor

def test_ovo_ties():
    """Vote ties must be resolved by the decision function, not by label order."""
    samples = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]])
    labels = np.array([2, 0, 1, 2])

    # shuffle=False, a fixed max_iter and tol=None pin down the Perceptron
    # training run.
    base_estimator = Perceptron(shuffle=False, max_iter=4, tol=None)
    multi_clf = OneVsOneClassifier(base_estimator)
    multi_clf.fit(samples, labels)
    ovo_prediction = multi_clf.predict(samples)
    ovo_decision = multi_clf.decision_function(samples)

    # The pairwise classifiers are ordered 0-1, 0-2, 1-2.
    # The integer part of the decision function is the vote count; what is
    # left after rounding is the normalized sum of confidences, which is the
    # tie-breaker when several classes have the same number of votes.
    votes = np.round(ovo_decision)
    normalized_confidences = ovo_decision - votes

    # First sample: every class receives exactly one vote (a three-way tie).
    assert_array_equal(votes[0, :], 1)
    # Remaining samples: no tie, so the prediction is simply the vote arg-max.
    assert_array_equal(np.argmax(votes[1:], axis=1), ovo_prediction[1:])
    # Tied sample: the class with the highest confidence wins.
    winner = normalized_confidences[0].argmax()
    assert ovo_prediction[0] == winner

示例#5

0

显示文件

文件： ch3-mnist.py 项目： btljuice/math

def multi_classification_section():
    """Compare multiclass strategies on the digit data and visualise the errors.

    Reads the module-level ``X_train``, ``y_train``, ``some_digit``,
    ``some_digit_index`` and ``random_seed``; prints predictions and
    cross-validation scores and opens three matplotlib windows. Returns None.
    """
    # Author's note: building a multiclass model out of binary classifiers,
    # for N classes:
    # * One-vs-All (OvA)
    #   - N binary classifiers, each one separating a single digit (e.g. 5)
    #     from all the others.
    #   - The class whose classifier reports the best score is chosen.
    # * One-vs-One (OvO)
    #   - N*(N-1)/2 classifiers, one per pair of classes:
    #     0 vs 1, 0 vs 2, ..., 1 vs 2, 1 vs 3, ...
    #   - The class that wins the most duels is chosen.
    #   - Each classifier is trained only on the two digits of its duel, i.e.
    #     on less data, which suits models that scale badly with data size.

    # Some binary classifiers switch to OvA on their own when they detect
    # more than two classes; SGDClassifier is used that way here.
    ova_clf = SGDClassifier(random_state=random_seed)
    ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=random_seed))
    forest_clf = RandomForestClassifier(random_state=random_seed)
    for estimator in (ova_clf, ovo_clf, forest_clf):
        estimator.fit(X_train, y_train)

    print("for image ", some_digit_index, ":")

    ova_label = ova_clf.predict([some_digit])
    print("ova prediction= ", ova_label)  # 5
    ova_scores = ova_clf.decision_function([some_digit])
    print("ova scores=", ova_scores)
    ova_best = np.argmax(ova_clf.decision_function([some_digit]))
    print("ova argmax(scores)=", ova_best)
    print("ova classes=", ova_clf.classes_)

    ovo_label = ovo_clf.predict([some_digit])
    print("ovo prediction= ", ovo_label)  # 5
    ovo_scores = ovo_clf.decision_function([some_digit])
    print("ovo scores=", ovo_scores)
    ovo_best = np.argmax(ovo_clf.decision_function([some_digit]))
    print("ovo argmax(scores)=", ovo_best)

    print("forest probs=", forest_clf.predict_proba([some_digit]))
    print("----")

    # Cross-validation on the raw features.
    raw_scores = cross_val_score(ova_clf, X_train, y_train, cv=3,
                                 scoring="accuracy")
    print("ova x-val score=", raw_scores)

    # Author's note: simply applying a standard scaler gives roughly a 5%
    # accuracy bonus.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
    scaled_scores = cross_val_score(ova_clf, X_train_scaled, y_train, cv=3,
                                    scoring="accuracy")
    print("ova x-val score (scaled)=", scaled_scores)
    y_train_pred = cross_val_predict(ova_clf, X_train_scaled, y_train, cv=3)

    # Display the confusion matrix.
    conf_mx = confusion_matrix(y_train, y_train_pred)
    plt.matshow(conf_mx, cmap=plt.cm.gray)
    plt.show()

    # Display only the errors: normalise each row by its total and blank the
    # diagonal so that correct predictions do not dominate the picture.
    row_sums = conf_mx.sum(axis=1, keepdims=True)
    norm_conf_mx = conf_mx / row_sums
    np.fill_diagonal(norm_conf_mx, 0)
    plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
    plt.show()

    # Display some of the confused digits for one pair of classes.
    cl_a, cl_b = 3, 5
    is_a, is_b = (y_train == cl_a), (y_train == cl_b)
    pred_a, pred_b = (y_train_pred == cl_a), (y_train_pred == cl_b)
    X_aa = X_train[is_a & pred_a]
    X_ab = X_train[is_a & pred_b]
    X_ba = X_train[is_b & pred_a]
    X_bb = X_train[is_b & pred_b]

    plt.figure(figsize=(8, 8))
    panels = ((221, X_aa), (222, X_ab), (223, X_ba), (224, X_bb))
    for position, digits in panels:
        plt.subplot(position)
        plot_digits(digits[:25], images_per_row=5)
    plt.show()

示例#6

0

显示文件

文件： mnist_code.py 项目： botianzhe/ml

#%%
# Inspect the prediction, the decision scores, the index of the best score
# and the class labels of the previously fitted `sgd10` model.
sgd10.predict([data])
scores = sgd10.decision_function([data])
np.argmax(scores)
sgd10.classes_
#%%
# To force scikit-learn to use the OvO or the OvA strategy, wrap the estimator
# in the OneVsOneClassifier or the OneVsRestClassifier class.
from sklearn.multiclass import OneVsOneClassifier
ovo = OneVsOneClassifier(SGDClassifier(random_state=42))
ovo.fit(X_train, y_train)
ovo.predict([data])

#%%
ovo.decision_function([data])

#%%
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier()
forest.fit(X_train, y_train)
forest.predict_proba([data])
#%%
# This statement used to carry a stray leading space, which is an
# IndentationError when the file is run as a whole.
cross_val_score(sgd10, X_train, y_train, cv=3, scoring="accuracy")

#%%
# Feature scaling (standardisation).
from sklearn.preprocessing import StandardScaler
scales = StandardScaler()
x_train_scaled = scales.fit_transform(X_train)
# cross_val_score(sgd10, x_train_scaled, y_train, cv=3, scoring="accuracy")

示例#7

0

显示文件

def _one_hot_argmax(scores):
    """Return a 0/1 matrix shaped like `scores` with a 1 at each row's arg-max."""
    one_hot = np.zeros(scores.shape)
    one_hot[np.arange(scores.shape[0]), scores.argmax(axis=1)] = 1
    return one_hot


def _report_metrics(title, expected, predicted):
    """Print `title`, then run every metric of the `evaluate` module on the pair."""
    print(title)
    evaluate.accuracy_atleast_one_match(expected, predicted)
    evaluate.accuracy_null_results(predicted)
    evaluate.accuracy_exact_match(expected, predicted)
    evaluate.accuracy_multilabel(expected, predicted)
    evaluate.precision_multilabel(expected, predicted)
    evaluate.recall_multilabel(expected, predicted)
    evaluate.hamming_loss_multilabel(expected, predicted)


def predict(input_size=100000,
            select_transform=1,
            read_database=1,
            one_vs_one=0,
            model="LinearSVC",
            mode="multilable",
            repeat=0,
            k=0.8,
            max_number_of_tags=5,
            max_iter=100000,
            use_cache=0):
    """Train a tag classifier on SVD-reduced features and report its errors.

    The data returned by ``stat.get_trainingdata`` is split into a training
    part (the first ``int(k * input_size)`` rows) and a test part (the rest).
    The training matrix is decomposed with SVD and both parts are projected
    onto the leading right singular vectors that carry at least 90% of the
    squared singular values. A classifier is fitted on the projected training
    data and the metrics of the ``evaluate`` module are run for the test and
    the training predictions.

    Parameters
    ----------
    input_size, select_transform, read_database, mode, repeat,
    max_number_of_tags
        Passed through to ``stat.get_trainingdata``. ``input_size`` is also
        used to compute the train/test split point.
    one_vs_one : int
        1 selects ``OneVsOneClassifier(LinearSVC)``; any other value selects a
        ``OneVsRestClassifier`` around the estimator named by ``model``.
    model : str
        ``"LinearSVC"`` or ``"SVC"``; only used when ``one_vs_one != 1``.
    k : float
        Fraction of ``input_size`` used for training.
    max_iter : int
        Iteration limit of the one-vs-rest base estimators.
    use_cache : int
        1 loads the SVD factors from ``SVD_U.txt`` / ``SVD_V.txt`` /
        ``SVD_S.txt`` instead of recomputing (and rewriting) them.

    Raises
    ------
    ValueError
        If ``one_vs_one != 1`` and ``model`` is not a supported name.
    """
    to_print = 0
    raw_train_data, raw_train_results = stat.get_trainingdata(
        input_size,
        select_transform=select_transform,
        read_database=read_database,
        to_print=to_print,
        mode=mode,
        repeat=repeat,
        max_number_of_tags=max_number_of_tags)
    t0 = time()

    split_point = int(k * input_size)
    train_data = raw_train_data[0:split_point, :]
    train_results = raw_train_results[0:split_point]
    test_data = raw_train_data[split_point:, :]
    test_results = raw_train_results[split_point:]

    fname_U = "SVD_U.txt"
    fname_V = "SVD_V.txt"
    fname_S = "SVD_S.txt"

    if use_cache == 1:
        # NOTE: pickle.load can execute arbitrary code. This is acceptable
        # only because the files are written by the `else` branch below;
        # never point these names at files from an untrusted source.
        # NOTE(review): nothing checks that the cached factors belong to the
        # current `train_data` -- confirm the cache is invalidated whenever
        # the data or `k` changes.
        with open(fname_U, 'rb') as f:
            U = pickle.load(f)
        with open(fname_V, 'rb') as f:
            V = pickle.load(f)
        with open(fname_S, 'rb') as f:
            s = pickle.load(f)
        print("Using SVD from file")
    else:
        U, s, V = np.linalg.svd(train_data, full_matrices=True)
        with open(fname_U, 'wb') as f:
            pickle.dump(U, f)
        with open(fname_V, 'wb') as f:
            pickle.dump(V, f)
        with open(fname_S, 'wb') as f:
            pickle.dump(s, f)
        print("Using SVD by calculation")

    print("SVD decomposition done in %fs" % (time() - t0))

    # Keep the smallest number of leading singular values whose squared sum
    # reaches 90% of the total.
    square_sum_s = np.square(s).sum()
    energy_threshold = 0.9 * square_sum_s
    running_energy = 0
    count = 0
    for singular_value in s:
        running_energy += singular_value * singular_value
        count += 1
        if running_energy >= energy_threshold:
            break

    print("count = " + str(count))
    # Rows of V are the right singular vectors: keep the first `count` of
    # them and project both data sets onto that basis.
    processedV = np.transpose(V[:count])
    train_X = np.dot(train_data, processedV)
    test_X = np.dot(test_data, processedV)

    # Fit the binarizer on all labels so that training and test predictions
    # share the same column layout.
    mlb = MultiLabelBinarizer()
    trainingdata_results = mlb.fit_transform(raw_train_results)
    train_Y = trainingdata_results[0:split_point, :]

    if one_vs_one == 1:
        # This branch used to fit on the undefined names `X` / `Y` and never
        # produced `prediction_train`, so it always failed with a NameError.
        # NOTE(review): OneVsOneClassifier does not accept a multilabel
        # indicator matrix as target, so this branch presumably still needs
        # single-label targets -- confirm before relying on it.
        clf = OneVsOneClassifier(
            svm.LinearSVC(random_state=0, max_iter=10000, verbose=0))
        clf.fit(train_X, train_Y)
        prediction_Y = clf.predict(test_X)
        prediction_train = clf.predict(train_X)
    else:
        if model == "LinearSVC":
            print("Showing Results for one vs rest multilabel classifier using LinearSVC model")
            base_estimator = svm.LinearSVC(random_state=0,
                                           dual=True,
                                           max_iter=max_iter,
                                           verbose=0,
                                           C=0.001,
                                           loss="squared_hinge",
                                           multi_class="crammer_singer")
        elif model == "SVC":
            print("Showing Results for one vs rest multilabel classifier using SVC model")
            base_estimator = svm.SVC(C=0.001,
                                     kernel='poly',
                                     max_iter=max_iter,
                                     verbose=0,
                                     degree=3)
        else:
            # Previously an unknown name left `clf` undefined and surfaced as
            # a NameError further down.
            raise ValueError(
                "Unknown model %r; expected 'LinearSVC' or 'SVC'" % (model,))

        clf = OneVsRestClassifier(base_estimator)
        clf.fit(train_X, train_Y)
        # Call get_params(); printing the bound method showed no parameters.
        print(clf.get_params())

        # Turn the decision scores into exactly one predicted tag per sample:
        # the tag with the highest score.
        prediction_Y = _one_hot_argmax(clf.decision_function(test_X))
        prediction_train = _one_hot_argmax(clf.decision_function(train_X))

    prediction = mlb.inverse_transform(prediction_Y)
    _report_metrics("Testing Error : ", test_results, prediction)

    prediction = mlb.inverse_transform(prediction_train)
    _report_metrics("Training Error : ", train_results, prediction)

示例#8

0

显示文件

文件： 03_classification.py 项目： prucehuang/quickly-start-python

def multiclass_classification():
    """Compare OvA, OvO and random-forest classifiers and plot their errors.

    Reads the module-level ``X_train``, ``y_train`` and ``some_digit``;
    prints predictions, scores and confusion matrices and opens two
    matplotlib windows. Returns None.
    """
    ## Binary classifiers used for multiclass -- linear classifiers, SVM
    # The default is an SVM and the default strategy is OvA: ten classifiers
    # are trained and the class with the highest score is selected.
    sgd_clf = SGDClassifier(random_state=42, max_iter=100, tol=1e-3)
    sgd_clf.fit(X_train, y_train)
    print('OvA所有的类别：', sgd_clf.classes_)
    ova_label = sgd_clf.predict([some_digit])
    ova_scores = sgd_clf.decision_function([some_digit])
    print(ova_label, '每个分类的概率值：', ova_scores)

    # Force OvO, which builds n*(n-1)/2 classifiers in total.
    ovo_base = SGDClassifier(max_iter=100, random_state=42, tol=1e-3)
    ovo_clf = OneVsOneClassifier(ovo_base)
    ovo_clf.fit(X_train, y_train)
    print('OVO所有的类别：', ovo_clf.classes_, '分类器总数：', len(ovo_clf.estimators_))
    ovo_label = ovo_clf.predict([some_digit])
    ovo_scores = ovo_clf.decision_function([some_digit])
    print(ovo_label, '每个分类的概率值：', ovo_scores)

    ## Natively multiclass classifiers -- random forest, naive Bayes
    forest_clf = RandomForestClassifier(random_state=42, n_estimators=10)
    forest_clf.fit(X_train, y_train)
    forest_label = forest_clf.predict([some_digit])
    forest_probs = forest_clf.predict_proba([some_digit])
    print('随机森林预测：', forest_label, '每个分类的概率值：', forest_probs)

    # A little feature standardisation is enough to raise the accuracy.
    accuracy_before = cross_val_score(sgd_clf, X_train, y_train, cv=3,
                                      scoring="accuracy")
    print('特征处理之前的准确率：', accuracy_before)
    X_train_scaled = StandardScaler().fit_transform(X_train.astype(np.float64))
    accuracy_after = cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3,
                                     scoring="accuracy")
    print('特征处理之后的准确率：', accuracy_after)

    # Multiclass confusion matrix.
    y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
    conf_mx = confusion_matrix(y_train, y_train_pred)
    print(conf_mx)
    row_sums = conf_mx.sum(axis=1, keepdims=True)
    norm_conf_mx = conf_mx / row_sums
    # Zero the main diagonal so that the misclassifications stand out.
    np.fill_diagonal(norm_conf_mx, 0)
    print(norm_conf_mx)
    plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
    plt.show()

    # Pull out the samples of one confused class pair for a closer look.
    cl_a, cl_b = 1, 8
    is_a, is_b = (y_train == cl_a), (y_train == cl_b)
    pred_a, pred_b = (y_train_pred == cl_a), (y_train_pred == cl_b)
    X_aa = X_train[is_a & pred_a]  # actual a, predicted a
    X_ab = X_train[is_a & pred_b]  # actual a, predicted b
    X_ba = X_train[is_b & pred_a]  # actual b, predicted a
    X_bb = X_train[is_b & pred_b]  # actual b, predicted b

    plt.figure(figsize=(8, 8))
    panels = ((221, X_bb), (222, X_ba), (223, X_ab), (224, X_aa))
    for position, digits in panels:
        plt.subplot(position)
        plot_digits(digits[:25], images_per_row=5)
    plt.show()

示例#9

0

显示文件

import pandas as pd
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
from sklearn.multiclass import OneVsOneClassifier
from sklearn.linear_model import LogisticRegression

from matplotlib import font_manager, rc

# Use the malgun.ttf font for all plot text (presumably so that the Korean
# titles below render) and keep the minus sign displayable with that font.
malgun_font = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf")
font_name = malgun_font.get_name()
rc('font', family=font_name)
plt.rcParams['axes.unicode_minus'] = False

iris = load_iris()

# Compare the classes one against one.
model_ovo = OneVsOneClassifier(LogisticRegression(solver='lbfgs'))
model_ovo.fit(iris.data, iris.target)
print(model_ovo.decision_function(iris.data))  # decision function

# Upper panel of a 2x1 figure: decision values, one line per column.
ax1 = plt.subplot(211)
decision_frame = pd.DataFrame(model_ovo.decision_function(iris.data))
decision_frame.plot(ax=ax1, legend=True)
plt.title('판별함수')

# Lower panel: the class predicted for every sample of the training data.
ax2 = plt.subplot(212)
prediction_frame = pd.DataFrame(model_ovo.predict(iris.data),
                                columns=["prediction"])
prediction_frame.plot(marker='o', ls='', ax=ax2)
plt.title("클래스 판별")

plt.tight_layout()
plt.show()

示例#10

0

显示文件

    plt.savefig("cf" + str(normalize) + ".png")


# Text-to-feature pipeline: token counts (terms seen in fewer than 5 documents
# and English stop words are dropped), TF-IDF weighting, then the reducer
# returned by get_svd().
pipeline1 = Pipeline([
    ('vect', CountVectorizer(min_df=5, stop_words=text.ENGLISH_STOP_WORDS)),
    ('tfidf', TfidfTransformer()),
    ('reduce_dim', get_svd()),
])

# Reduced representations of the training and the test documents.
train_lsi, test_lsi = fetchLSIRepresentation(pipeline1, twenty_train,
                                             twenty_test)

# One linear SVC per pair of classes.
ovo_svc = OneVsOneClassifier(svm.SVC(kernel='linear', probability=True,
                                     C=1000))

ovo_svc.fit(train_lsi, train_target_group)
predicted = ovo_svc.predict(test_lsi)
print_statistics(test_target_group, predicted)

# NOTE: despite the variable name, decision_function returns decision scores,
# not probabilities.
predicted_probs = ovo_svc.decision_function(test_lsi)

# ROC plotting is disabled:
#fpr, tpr, _ = roc_curve(test_target_group, predicted_probs)

#plot_roc(fpr, tpr, penalty)

# Confusion matrix of the test predictions.
cnf_matrix = smet.confusion_matrix(test_target_group, predicted)
plot_confusion_matrix(cnf_matrix,
                      classes=classCategories,
                      title='Multiclass Confusion matrix')
#plot_confusion_matrix(cnf_matrix, classes=classCategories, normalize=True, title='Confusion matrix with normalization' )

示例#11

0

显示文件

文件： classify_one_feature.py 项目： Aribido-Oluwaseun/voice_research

    def svmfit(self):
        """Train and score a one-vs-one RBF SVM separately for each feature.

        For every feature name in ``self.header`` the first 200 values of
        that feature are taken from each of five recordings, cleaned of
        NaN/inf, shuffled and split 70/30 into training and verification
        parts. The recording index (0..4) is the class label. Accuracy, RMSE
        and correlation of the verification predictions are collected per
        feature and the resulting dictionary is printed. Returns None.
        """
        data = self.all_features()

        shortfiles = [
            'v.wav.csv', 'Yeye.wav.csv', 'myohmy.wav.csv', 'mymymy.wav.csv',
            'mememe.wav.csv'
        ]
        total_length = 200
        trunc = int(total_length * 0.7)
        ver = total_length - trunc

        num_of_features = 2
        num_of_samples = len(shortfiles)
        np.random.seed(4)

        # Class label of every row: the index of the recording, repeated once
        # per sample taken from it (kept as float, like the zero-initialised
        # label arrays used before).
        class_ids = np.arange(num_of_samples, dtype=float)

        # save statistics
        stats = {}
        for feature in self.header:
            train_parts = []
            test_parts = []
            for filename in shortfiles:
                column = np.asarray(
                    [data[filename][feature][0:total_length]]).reshape(
                        (total_length, 1))
                # nan_to_num returns a cleaned copy; the result used to be
                # thrown away, so NaN/inf values reached the classifier.
                column = np.nan_to_num(column)
                np.random.shuffle(column)

                # Train / verification split of the shuffled values.
                train_parts.append(column[0:trunc, :])
                test_parts.append(column[trunc:total_length, :])

            # The single feature column is broadcast into both columns of
            # the design matrices.
            X = np.zeros([trunc * num_of_samples, num_of_features])
            X_ts = np.zeros([ver * num_of_samples, num_of_features])
            X[:, :] = np.vstack(train_parts)
            X_ts[:, :] = np.vstack(test_parts)

            y = np.repeat(class_ids, trunc)
            y_ts = np.repeat(class_ids, ver)

            # n_jobs is passed by keyword: it is keyword-only in current
            # scikit-learn releases.
            clf = OneVsOneClassifier(
                SVC(C=1,
                    cache_size=400,
                    coef0=0.0,
                    degree=5,
                    gamma='auto',
                    kernel='rbf',
                    max_iter=-1,
                    shrinking=True,
                    tol=.01,
                    verbose=False),
                n_jobs=-1).fit(X, y)
            pred = clf.predict(X_ts)
            # Kept for interactive inspection, e.g. dec_func.plot().
            dec_func = pd.DataFrame(clf.decision_function(X_ts))
            rmse, corr = self.calculate_stats(y_ts, pred)
            accuracy = float(np.sum(y_ts == pred)) / len(y_ts)
            stats[feature] = [accuracy, rmse, corr[0, 1]]

        print(stats)

示例#12

0

显示文件

#       -864502.26667054, -245167.9063152 , -149510.01775103,
#       -233700.77221455]])

# argmax gives the index of the highest score (not the score itself)
np.argmax(scores)

# class labels known to the fitted model; recorded output:
sgd.classes_
#array(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], dtype='<U1')

from sklearn.multiclass import OneVsOneClassifier

# Explicit one-vs-one wrapper around an SGD classifier; inspect its
# prediction, the number of fitted pairwise estimators and its decision
# values for one sample.
ovo = OneVsOneClassifier(SGDClassifier(random_state=100))
ovo.fit(data_train, target_train)
ovo.predict([some_digit])
len(ovo.estimators_)
ovo.decision_function([some_digit])
#array([[ 1.5       ,  4.01086892,  0.50210079,  5.22484016,  8.31545536,
#         5.11411311, -0.43998285,  5.13308383,  7.3219439 ,  8.3175768 ]])

# 3-fold cross-validated accuracy of the plain SGD model; recorded output:
cross_val_score(sgd, data_train, target_train, cv=3, scoring='accuracy')
#array([0.86552689, 0.86179309, 0.86117918])

import pandas as pd

# Out-of-fold predictions for every training sample.
predict_m = cross_val_predict(sgd, data_train, target_train, cv=3)

# average=None returns one precision / recall value per class.
ps = precision_score(target_train, predict_m, average=None)

rs = recall_score(target_train, predict_m, average=None)

# One row per class with its precision and recall.
df = pd.DataFrame({'precision': ps, 'recall': rs})

示例#13

0

显示文件

文件： abstract.py 项目： azrdev/sklearn-seco

class SeCoEstimator(BaseEstimator, ClassifierMixin):
    """A classifier using rules learned with the *Separate-and-Conquer* (SeCo)
    algorithm, also known as *Covering* algorithm.

    Wraps `_BaseSeCoEstimator` to handle multi-class problems, selecting a
    multi-class strategy and making sure that `_BaseSeCoEstimator` always sees
    an integer range [0..n_classes_) of class labels, where 0 is the intended
    fallback class; i.e. the biggest class in multi-class problems, or the
    negative class when learning a binary concept.

    The concrete SeCo variant to run is defined by `algorithm_config`.

    Fields
    ------
    algorithm_config : subclass of SeCoAlgorithmConfiguration
        Defines the concrete SeCo algorithm to run, see
        :class:`SeCoAlgorithmConfiguration`.

    Parameters
    ----------
    multi_class : callable or str or None
        Which strategy to use for non-binary problems. Possible values:

        - None: auto-select; use 'direct' if possible
          (`algorithm_config.direct_multiclass_support()` returns True),
          'one_vs_rest' otherwise.
        - A callable: Construct
          `self.base_estimator_ = multi_class(_BaseSeCoEstimator())` and
          delegate to that estimator. Useful if you want to roll a different
          binarization strategy, e.g.

          >>> import sklearn.multiclass, functools
          >>> multi_class=functools.partial(
          ...     sklearn.multiclass.OutputCodeClassifier,
          ...     code_size=0.7, random_state=42)

          If you use this, make sure to pass to `_BaseSeCoEstimator` classes `y`
          from an integer range [0..n_classes_), e.g. using `LabelEncoder`.
          Also be aware of class order influence on tie-breaking.
        - 'direct': Directly learn a theory of rules with different heads
          (target classes). Uses :class:`BySizeLabelEncoder` internally.
        - 'one_vs_rest': Use `sklearn.multiclass.OneVsRestClassifier` for class
          binarization and learn binary theories.
        - 'one_vs_one': Use `sklearn.multiclass.OneVsOneClassifier` for class
          binarization and learn binary theories.
        - TODO: multi_class strategy of ripper: OneVsRest, remove C_i after learning rules for it

    random_state : None | int | instance of np.random.RandomState
        RNG, may be used by the algorithm. Value passed through
        `sklearn.utils.check_random_state`.

    n_jobs : int, optional
        Passed to `OneVsRestClassifier` or `OneVsOneClassifier` if these are
        used.

    Attributes
    ----------
    base_estimator_ : estimator instance
        The estimator object that all tasks are delegated to. One of
        `sklearn.multiclass.OneVsRestClassifier`,
        `sklearn.multiclass.OneVsOneClassifier` or
        `sklearn_seco.util.TargetTransformingMetaEstimator` if demanded by the
        `multi_class_` strategy, a `_BaseSeCoEstimator` otherwise.

    multi_class_ : callable or str
        The actual strategy used on a non-binary problem. Relevant if
        `multi_class=None` demanded auto-selection.

    classes_ : np.ndarray
        `np.unique(y)`

    See Also
    --------
    `_BaseSeCoEstimator`
    """

    algorithm_config: Type[SeCoAlgorithmConfiguration]

    # TODO: _BaseSeCoEstimator.export_text equivalent inverting binarization & target transformation for display

    def _more_tags(self):
        """Return the estimator tags that differ from the sklearn defaults."""
        # tell sklearn >= 0.21 that we can handle categorical data
        return {'X_types': ['2darray', 'categorical'], 'allow_nan': True}

    def __init__(self, multi_class=None, random_state=1, n_jobs=1):
        # Only store the parameters here; all derived state is set in `fit`.
        self.multi_class = multi_class
        self.random_state = random_state
        self.n_jobs = n_jobs

    def fit(self, X, y, **kwargs):
        """Learn SeCo theory/theories on training data `X, y`.

        For possible parameters (`**kwargs`), refer to
        :class:`_BaseSeCoEstimator`.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If `y` contains a single class only, or if there are more than
            two classes and `multi_class` is neither None, a callable nor one
            of the known strategy names.
        """
        X, y = check_X_y(X, y, force_all_finite='allow-nan')
        self.multi_class_ = self.multi_class
        self.base_estimator_ = _BaseSeCoEstimator(
            self.algorithm_config, random_state=self.random_state, **kwargs)

        # NOTE: if using multiprocessing (e.g. through OvO or OvR), all
        #   sub-estimators share the same random seed/state.
        #   I think this should not harm.

        def wrapper_ordering_classes_by_size(estimator):
            # BySizeLabelEncoder ensures:  first class = default = biggest
            # and that classes form an integer range [0..n_classes_)
            return TargetTransformingMetaEstimator(BySizeLabelEncoder(),
                                                   estimator)

        self.classes_ = np.unique(y)
        n_classes_ = self.classes_.size
        if n_classes_ == 1:
            raise ValueError("SeCoEstimator requires 2 or more distinct "
                             "classes. Only 1 class (%s) present." %
                             self.classes_[0])
        elif n_classes_ == 2:
            # Binary problem: `multi_class` is not consulted at all.
            self.base_estimator_ = wrapper_ordering_classes_by_size(
                self.base_estimator_)
        else:  # n_classes_ > 2
            if self.multi_class_ is None:
                # default / auto-selection
                if self.algorithm_config.direct_multiclass_support():
                    self.multi_class_ = "direct"
                else:
                    self.multi_class_ = "one_vs_rest"

            if callable(self.multi_class_):
                self.base_estimator_ = self.multi_class_(self.base_estimator_)
            elif self.multi_class_ == "one_vs_rest":
                self.base_estimator_ = OneVsRestClassifier(
                    self.base_estimator_, n_jobs=self.n_jobs)
            elif self.multi_class_ == "one_vs_one":
                self.base_estimator_ = OneVsOneClassifier(self.base_estimator_,
                                                          n_jobs=self.n_jobs)
            elif self.multi_class_ == "direct":
                # TODO: if self.multi_class=='direct' (not `None` auto-detect), only assertion prevents binary-only learner to silently learn on multiclass training data
                self.base_estimator_ = wrapper_ordering_classes_by_size(
                    self.base_estimator_)
            else:
                raise ValueError("Unknown multi-class mode %s" %
                                 self.multi_class_)

        # NOTE: param categorical_features is data dependent, but OvR/OvO don't
        #   pass extra parameters through fit(), so it has to be in
        #   `_BaseSeCoEstimator.__init__`.
        self.base_estimator_.fit(X, y)
        return self

    def predict(self, X):
        """Perform classification on an array of test vectors X.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        Returns
        -------
        C : array, shape = (n_samples,)
            Predicted target values for X, values are from ``classes_``
        """
        check_is_fitted(self, ["classes_"])
        X = check_array(X, force_all_finite='allow-nan')
        return self.base_estimator_.predict(X)

    @if_delegate_has_method('base_estimator_')
    def predict_proba(self, X):
        """Delegate to `base_estimator_.predict_proba`.

        Only available if `base_estimator_` provides that method.
        """
        # noinspection PyUnresolvedReferences
        return self.base_estimator_.predict_proba(X)

    @if_delegate_has_method('base_estimator_')
    def decision_function(self, X):
        """Delegate to `base_estimator_.decision_function`.

        Only available if `base_estimator_` provides that method.
        """
        # noinspection PyUnresolvedReferences
        return self.base_estimator_.decision_function(X)

    def get_seco_estimators(self) -> Sequence[_BaseSeCoEstimator]:
        """
        :return: The `_BaseSeCoEstimator` instances that were trained.
            Depending on the multi-class strategy, the class labels they use
            differ in order and value.
            Cannot be used when self.multi_class_ is a callable.
        :raises AssertionError: if `multi_class_` is a callable or otherwise
            not one of the built-in strategies.
        """
        check_is_fitted(self, 'base_estimator_')
        is_binary = len(self.classes_) == 2
        if is_binary or self.multi_class_ == "direct":
            assert isinstance(self.base_estimator_,
                              TargetTransformingMetaEstimator)
            return [self.base_estimator_.estimator]
        elif self.multi_class_ == "one_vs_rest":
            assert isinstance(self.base_estimator_, OneVsRestClassifier)
            return self.base_estimator_.estimators_
        elif self.multi_class_ == "one_vs_one":
            assert isinstance(self.base_estimator_, OneVsOneClassifier)
            return self.base_estimator_.estimators_
        else:
            # Raised explicitly instead of `assert False`: assert statements
            # are stripped under `python -O`, which made this method silently
            # return None for unsupported strategies.
            raise AssertionError(
                "invalid state: unknown type of base_estimator_ "
                f"({str(self.base_estimator_)})")

示例#14

0

显示文件

class RobustWeightedClassifier(BaseEstimator, ClassifierMixin):
    """Algorithm for robust classification using reweighting algorithm.

    This model uses iterative reweighting of samples to make a regression or
    classification estimator robust.

    The principle of the algorithm is to use an empirical risk minimization
    principle where the risk is estimated using a robust estimator (for example
    Huber estimator or median-of-means estimator)[1], [3]. The idea behind this
    algorithm was mentioned before in [2].
    This idea translates into an iterative algorithm where the sample_weight
    are changed at each iteration and are dependent on the sample. Informally
    the outliers should have small weight while the inliers should have big
    weight, where outliers are samples with a big loss function.

    This algorithm enjoys a non-zero breakdown-point (it can handle arbitrarily
    bad outliers). When the "mom" weighting scheme is used, k outliers can be
    tolerated. When the "Huber" weighting scheme is used, asymptotically the
    number of outliers has to be less than half the sample size.

    Read more in the :ref:`User Guide <robust>`.

    Parameters
    ----------

    weighting : string, default="huber"
        Weighting scheme used to make the estimator robust.
        Can be 'huber' for huber-type weights or  'mom' for median-of-means
        type weights.

    max_iter : int, default=100
        Maximum number of iterations.
        For more information, see the optimization scheme of base_estimator
        and the eta0 and burn_in parameter.

    burn_in : int, default=10
        Number of steps used without changing the learning rate.
        Can be useful to make the weight estimation better at the beginning.

    eta0 : float, default=0.01
        Constant step-size used during the burn_in period. Used only if
        burn_in>0. Can have a big effect on efficiency.

    c : float>0 or None, default=None
        Parameter used for Huber weighting procedure, used only if weightings
        is 'huber'. Measure the robustness of the weighting procedure. A small
        value of c means a more robust estimator.
        Can have a big effect on efficiency.
        If None, c is estimated at each step using half the Inter-quartile
        range, this tends to be conservative (robust).

    k : int < sample_size/2, default=0
        Parameter used for mom weighting procedure, used only if weightings
        is 'mom'. 2k+1 is the number of blocks used for median-of-means
        estimation, higher value of k means a more robust estimator.
        Can have a big effect on efficiency.
        If None, k is estimated using the number of points distant from the
        median of means of more than 2 times a robust estimate of the scale
        (using the inter-quartile range), this tends to be conservative
        (robust).

    loss : string, None or callable, default="log"
        Name of the loss used, must be the same loss as the one optimized in
        base_estimator.
        Classification losses supported : 'log', 'hinge'.
        If 'log', then the base_estimator must support predict_proba.
        Regression losses supported : 'squared_loss'.

    sgd_args : dict or None, default=None
        arguments of the SGDClassifier base estimator. None is treated as an
        empty dict.

    multi_class : string, default="ovr"
        multi-class scheme. Can be either "ovo" for OneVsOneClassifier or "ovr"
        for OneVsRestClassifier or "binary" for binary classification.

    n_jobs : int, default=1
        number of jobs used in the multi-class meta-algorithm computation.

    tol : float or None, (default = 1e-3)
        The stopping criterion. If it is not None, training will stop when
        (loss > best_loss - tol) for n_iter_no_change consecutive epochs.

    n_iter_no_change : int, default=10
        Number of iterations with no improvement to wait before early stopping.

    random_state : int, RandomState instance or None, optional (default=None)
        The seed of the pseudo random number generator to use when shuffling
        the data. If int, random_state is the seed used by the random number
        generator; If RandomState instance, random_state is the random number
        generator; If None, the random number generator is the RandomState
        instance used by np.random.



    Attributes
    ----------

    classes_ : ndarray of shape (n_classes, )
        A list of class labels known to the classifier.

    coef_ : ndarray of shape (1, n_features) or (n_classes, n_features)
        Coefficient of the features in the decision function. Only available if
        multi_class = "binary"

    intercept_ : ndarray of shape (1,) or (n_classes,)
        Intercept (a.k.a. bias) added to the decision function.
        Only available if multi_class = "binary"

    n_iter_ : int
        Set by ``fit`` to ``max_iter`` multiplied by the number of training
        samples.

    base_estimator_ : object,
        The fitted base estimator: the robust SGDClassifier wrapper itself
        when multi_class = "binary", otherwise the OneVsRestClassifier or
        OneVsOneClassifier built around it.

    weights_ : array like, length = n_sample.
        Weight of each sample at the end of the algorithm. Can be used as a
        measure of how much of an outlier a sample is. Only available if
        multi_class = "binary"


    Notes
    -----

    Often, there is a need to use RobustScaler as preprocessing.

    Examples
    --------

    >>> from sklearn_extra.robust import RobustWeightedClassifier
    >>> from sklearn.datasets import make_blobs
    >>> import numpy as np
    >>> rng = np.random.RandomState(42)
    >>> X,y = make_blobs(n_samples=100, centers=np.array([[-1, -1], [1, 1]]),
    ...                  random_state=rng)
    >>> clf=RobustWeightedClassifier()
    >>> _ = clf.fit(X, y)
    >>> score = np.mean(clf.predict(X)==y)

    References
    ----------

    [1] Guillaume Lecué, Matthieu Lerasle and Timothée Mathieu.
        "Robust classification via MOM minimization", Mach Learn 109, (2020).
        https://doi.org/10.1007/s10994-019-05863-6 (2018).
        arXiv:1808.03106

    [2] Christian Brownlees, Emilien Joly and Gábor Lugosi.
        "Empirical risk minimization for heavy-tailed losses", Ann. Statist.
        Volume 43, Number 6 (2015), 2507-2536.

    [3] Stanislav Minsker and Timothée Mathieu.
        "Excess risk bounds in robust empirical risk minimization"
        arXiv preprint (2019). arXiv:1910.07485.

    """

    def __init__(
        self,
        weighting="huber",
        max_iter=100,
        burn_in=10,
        eta0=0.01,
        c=None,
        k=0,
        loss="log",
        sgd_args=None,
        multi_class="ovr",
        n_jobs=1,
        tol=1e-3,
        n_iter_no_change=10,
        random_state=None,
    ):
        self.weighting = weighting
        self.max_iter = max_iter
        self.burn_in = burn_in
        self.eta0 = eta0
        self.c = c
        self.k = k
        self.loss = loss
        self.sgd_args = sgd_args
        self.multi_class = multi_class
        self.n_jobs = n_jobs
        self.tol = tol
        self.n_iter_no_change = n_iter_no_change
        self.random_state = random_state

    def fit(self, X, y):
        """Fit the model to data matrix X and target(s) y.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            The input data.

        y : array-like, shape (n_samples,) or (n_samples, n_outputs)
            The target values (class labels in classification, real numbers in
            regression).

        Returns
        -------
        self : returns an estimator trained with RobustWeightedClassifier.

        Raises
        ------
        ValueError
            If ``multi_class`` is not one of "ovr", "ovo" or "binary".
        """

        if self.sgd_args is None:
            sgd_args = {}
        else:
            sgd_args = self.sgd_args

        # Define the base estimator
        base_robust_estimator_ = _RobustWeightedEstimator(
            SGDClassifier(**sgd_args, loss=self.loss),
            weighting=self.weighting,
            loss=self.loss,
            burn_in=self.burn_in,
            c=self.c,
            k=self.k,
            eta0=self.eta0,
            max_iter=self.max_iter,
            tol=self.tol,
            n_iter_no_change=self.n_iter_no_change,
            random_state=self.random_state,
        )

        if self.multi_class == "ovr":
            self.base_estimator_ = OneVsRestClassifier(
                base_robust_estimator_, n_jobs=self.n_jobs
            )
        elif self.multi_class == "binary":
            self.base_estimator_ = base_robust_estimator_
        elif self.multi_class == "ovo":
            self.base_estimator_ = OneVsOneClassifier(
                base_robust_estimator_, n_jobs=self.n_jobs
            )
        else:
            raise ValueError("No such multiclass method implemented.")

        self.base_estimator_.fit(X, y)
        if self.multi_class == "binary":
            # These attributes only exist on the bare robust estimator, not
            # on the one-vs-rest / one-vs-one wrappers.
            self.weights_ = self.base_estimator_.weights_
            self.coef_ = self.base_estimator_.coef_
            self.intercept_ = self.base_estimator_.intercept_
        # len() raises on scipy sparse matrices, so read the sample count from
        # ``shape`` when available and fall back to len() for plain sequences.
        n_samples = X.shape[0] if hasattr(X, "shape") else len(X)
        self.n_iter_ = self.max_iter * n_samples
        self.classes_ = self.base_estimator_.classes_
        return self

    def predict(self, X):
        """Predict using the estimator trained with RobustWeightedClassifier.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data.

        Returns
        -------
        y : array-like, shape (n_samples, n_outputs)
            The predicted values.
        """

        check_is_fitted(self, attributes=["base_estimator_"])
        return self.base_estimator_.predict(X)

    def _check_proba(self):
        """Raise AttributeError unless ``loss`` is "log"."""
        if self.loss != "log":
            raise AttributeError(
                "Probability estimates are not available for"
                " loss=%r" % self.loss
            )

    @property
    def predict_proba(self):
        """
        Probability estimates when binary classification.

        Exposed as a property that returns the bound ``_predict_proba``
        method, so that accessing ``predict_proba`` raises ``AttributeError``
        when ``loss`` is not "log".

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Vector to be scored, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        Returns
        -------
        T : array-like of shape (n_samples, n_classes)
            Returns the probability of the sample for each class in the model,
        """
        check_is_fitted(self, attributes=["base_estimator_"])
        self._check_proba()
        return self._predict_proba

    def _predict_proba(self, X):
        """Return ``base_estimator_.predict_proba(X)``."""
        return self.base_estimator_.predict_proba(X)

    @property
    def _estimator_type(self):
        # The previous implementation read ``self.base_estimator``, an
        # attribute this class never sets, so every access raised
        # AttributeError. This class is always a classifier.
        return "classifier"

    def score(self, X, y=None):
        """Returns the score on the given data, using
        ``base_estimator_.score``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like of shape (n_samples, n_output) or (n_samples,), optional
            Target relative to X for classification or regression;
            None for unsupervised learning.

        Returns
        -------
        score : float
        """
        check_is_fitted(self, attributes=["base_estimator_"])
        return self.base_estimator_.score(X, y)

    def decision_function(self, X):
        """Predict using the linear model

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)

        Returns
        -------
        array, shape (n_samples,)
           Predicted target values per element in X.
        """
        check_is_fitted(self, attributes=["base_estimator_"])
        return self.base_estimator_.decision_function(X)

示例#15

0

显示文件

class LSVMDetector:
    """Genuine-vs-imposter detector built on a one-vs-one RBF-kernel SVC.

    For each subject, ``evaluate`` trains on that subject's strokes (label 0)
    against strokes from all other users (label 1), scores held-out genuine
    strokes and the supplied attacker samples, and averages the values
    returned by ``evaluateFAR``.
    """
    # just the training() function changes, rest all remains same.

    # Feature columns selected from ``data`` for training and testing.
    # Raw strings keep the literal backslash in the "\%" column names without
    # relying on an invalid escape sequence.
    _FEATURE_COLUMNS = [
        "stroke duration", 'start $x$', 'start $y$', 'stop $x$', 'stop $y$',
        'length of trajectory', 'mid-stroke pressure',
        'mid-stroke area covered',
        r'20\%-perc. pairwise velocity', r'50\%-perc. pairwise velocity',
        r'20\%-perc. dev. from end-to-end line',
        r'50\%-perc. dev. from end-to-end line',
        r'80\%-perc. dev. from end-to-end line',
    ]

    def __init__(self, subjects, data, attacker_data):
        """Store the inputs and reset the score buffers.

        Parameters
        ----------
        subjects : list or scalar
            Either a list of ``user_id`` values or a single ``user_id``.
        data : DataFrame-like
            Must expose a ``user_id`` column and the feature columns listed in
            ``_FEATURE_COLUMNS`` (accessed through ``.loc``).
        attacker_data : object
            Attacker samples. Indexed per subject position when ``subjects``
            is a list, otherwise used as a whole.
        """
        self.data = data
        self.attacker = attacker_data
        self.u_scores = []
        self.i_scores = []
        self.mean_vector = []
        self.subjects = subjects
        self.fp = []

    def training(self):
        """Fit the classifier on ``self.train`` (0) vs ``self.train_imposter`` (1)."""
        self.clf = OneVsOneClassifier(SVC(kernel='rbf', gamma='auto'))
        labels = [0] * len(self.train) + [1] * len(self.train_imposter)
        self.clf.fit(pandas.concat([self.train, self.train_imposter]), labels)

    def testing(self):
        """Store decision-function scores for genuine and imposter test sets as lists."""
        self.u_scores = list(self.clf.decision_function(self.test_genuine))
        self.i_scores = list(self.clf.decision_function(self.test_imposter))

    def _evaluate_subject(self, subject, attacker_samples):
        """Train and test for one subject; return ``evaluateFAR`` of the scores."""
        features = self._FEATURE_COLUMNS
        genuine_user_data = self.data.loc[self.data.user_id == subject,
                                          features]
        imposter_data = self.data.loc[self.data.user_id != subject, :]
        genuine_user_data = normalize_df(genuine_user_data[:400])

        # First 200 genuine rows train the model, the next 200 test it.
        self.train = genuine_user_data[:200]
        self.test_genuine = genuine_user_data[200:400]

        # The last 10 rows of every other user form the imposter training set.
        self.train_imposter = normalize_df(
            imposter_data.groupby("user_id").tail(10).loc[:, features]
        )
        self.test_imposter = attacker_samples

        self.training()
        self.testing()
        return evaluateFAR(self.u_scores, self.i_scores)

    def evaluate(self):
        """Return the mean ``evaluateFAR`` value over all subjects.

        When ``self.subjects`` is a list, subject number ``idx`` is tested
        against ``self.attacker[idx]``; otherwise the single subject is tested
        against ``self.attacker`` as a whole.
        """
        if isinstance(self.subjects, list):
            fpr = [
                self._evaluate_subject(subject, self.attacker[idx])
                for idx, subject in enumerate(self.subjects)
            ]
        else:
            fpr = [self._evaluate_subject(self.subjects, self.attacker)]

        return np.mean(fpr)