示例#1
0
def plot_raw_data(df, NUM_SAMPLES_PER_CLASS):
    """Save one line plot per user showing that user's first raw samples.

    The last column of ``df`` is used as the user label.  For every id
    returned by ``create_userids(df)`` the matching rows are selected, the
    label column is dropped and the first ``NUM_SAMPLES_PER_CLASS`` rows are
    each drawn as a separate curve on a freshly cleared figure.  The figure
    is saved as ``<userid>.png`` under ``st.OUTPUT_FIGURES``.

    Args:
        df: DataFrame whose last column holds the user id.
        NUM_SAMPLES_PER_CLASS: Upper bound on the number of rows plotted
            for each user.
    """
    for userid in create_userids(df):
        print(userid)
        belongs_to_user = df.iloc[:, -1].isin([userid])
        features = df.loc[belongs_to_user]
        # Strip the label column so only the signal values get plotted.
        features = features.drop(columns=features.columns[-1])
        samples = features.values[:NUM_SAMPLES_PER_CLASS]

        plt.clf()
        plt.xlabel('Time')
        plt.title("User " + str(userid))
        # One curve per selected row.
        for sample in samples:
            plt.plot(sample)

        output_file = str(userid) + '.png'
        print(output_file)
        plt.savefig(st.OUTPUT_FIGURES + "/" + output_file)
def normalize_users_columns(df, norm_type):
    """Normalise the feature columns separately for every user.

    The last column of ``df`` is used as the user label.  For each id
    returned by ``create_userids(df)`` the user's rows are extracted and the
    feature columns (everything except the last column) are rescaled with
    statistics computed from that user's rows only:

    * ``st.NormalizationType.MINMAX`` -> ``MinMaxScaler().fit_transform``
    * ``st.NormalizationType.ZSCORE`` -> ``preprocessing.scale``
    * any other value                 -> features are left unchanged

    Args:
        df: DataFrame whose last column holds the user id.
        norm_type: Member of ``st.NormalizationType`` selecting the scaling.

    Returns:
        A new DataFrame with the per-user blocks stacked in the order of
        ``create_userids(df)``; feature columns are renumbered from 0 and
        the labels are stored in a column named ``'user'``.

    Raises:
        IndexError: If ``create_userids(df)`` yields no ids (same exception
            type as before, now with an explicit message).
    """
    print(df.shape)
    userids = create_userids(df)
    if len(userids) == 0:
        raise IndexError("normalize_users_columns: no user ids found in df")

    scaler = MinMaxScaler()
    X_parts = []
    y_parts = []
    for position, userid in enumerate(userids):
        user_array = df.loc[df.iloc[:, -1].isin([userid])].values
        user_X = user_array[:, :-1]
        user_y = user_array[:, -1]

        if position == 0:
            # Progress/debug output for the first user only; str() keeps
            # this working when the ids are not strings.
            print(str(userid) + ": " + str(user_X.shape))

        # The scaler is re-fitted for every user, so each user's block is
        # normalised with that user's own min/max (or mean/std).
        if norm_type == st.NormalizationType.MINMAX:
            user_X = scaler.fit_transform(user_X)
        if norm_type == st.NormalizationType.ZSCORE:
            user_X = preprocessing.scale(user_X)

        X_parts.append(user_X)
        y_parts.append(user_y)

    # Stack once at the end instead of growing the arrays inside the loop.
    result = pd.DataFrame(np.vstack(X_parts))
    result['user'] = np.concatenate(y_parts)
    return result
def plot_user_dx_dy_histo(df):
    """Save dx and dy value histograms for every user.

    The last column of ``df`` is used as the user label.  After dropping it,
    columns 0..127 are taken as the dx block and columns 128..255 as the dy
    block.  All values of each block are flattened row by row into one list
    and drawn with ``sns.distplot`` (32 bins, normalised).  The figures are
    saved as ``<userid>_dx.png`` and ``<userid>_dy.png`` under
    ``st.OUTPUT_FIGURES``.

    Args:
        df: DataFrame whose last column holds the user id and which has at
            least 256 feature columns before it.
    """
    set_style()
    for userid in create_userids(df):
        print(userid)
        rows_of_user = df.loc[df.iloc[:, -1].isin([userid])]
        # Remove the label column; only the signal columns remain.
        signals = rows_of_user.drop(columns=rows_of_user.columns[-1])

        dx_block = signals[signals.columns[range(0, 128)]]
        dy_block = signals[signals.columns[range(128, 256)]]

        # dx histogram
        plt.clf()
        dx_values = dx_block.values.ravel().tolist()
        sns.distplot(dx_values, norm_hist=True, color='green', bins=32)
        plt.xlabel('Bins')
        plt.ylabel('Density')
        plt.title(' dx histogram ')
        dx_file = str(userid) + '_dx.png'
        print(dx_file)
        plt.savefig(st.OUTPUT_FIGURES + "/" + dx_file)

        # dy histogram
        plt.clf()
        dy_values = dy_block.values.ravel().tolist()
        dy_axes = sns.distplot(dy_values, norm_hist=True, color='red', bins=32)
        print(dy_axes)
        plt.xlabel('Bins')
        plt.ylabel('Density')
        plt.title(' dy histogram ')
        dy_file = str(userid) + '_dy.png'
        plt.savefig(st.OUTPUT_FIGURES + "/" + dy_file)
示例#4
0
def evaluate_authentication(df, verbose=False):
    """Evaluate one One-Class SVM per user on a single dataset.

    The last column of ``df`` is used as the user label.  For each id from
    ``create_userids(df)`` the user's rows are split in row order: the first
    ``int(n * 0.66)`` rows train a ``OneClassSVM(gamma='scale')``, the
    remaining rows are the positive test set, and all rows of every other
    user form the negative test set.

    Scores are smoothed in place with a forward moving average over
    ``AGGREGATE_BLOCK_NUM`` consecutive scores; the last
    ``AGGREGATE_BLOCK_NUM - 1`` entries keep their raw value.  Because the
    smoothing is in place, the scores collected for the global evaluation
    are the smoothed ones as well.

    Args:
        df: DataFrame whose last column holds the user id.
        verbose: When equal to ``True``, print per-user AUC/EER and the
            global AUC/EER computed over all users' scores.

    Returns:
        Tuple ``(auc_list, eer_list)`` with one entry per user.
    """

    def smooth_in_place(scores):
        # Slot k only reads slots k .. k+AGGREGATE_BLOCK_NUM-1, none of which
        # has been overwritten yet, so the in-place update is a true forward
        # window mean.
        for start in range(len(scores) - AGGREGATE_BLOCK_NUM + 1):
            window = scores[start:start + AGGREGATE_BLOCK_NUM]
            scores[start] = np.average(window, axis=0)
        return scores

    print(df.shape)
    auc_list = []
    eer_list = []
    global_positive_scores = []
    global_negative_scores = []

    for userid in create_userids(df):
        is_user = df.iloc[:, -1].isin([userid])

        genuine = df.loc[is_user]
        genuine = genuine.drop(columns=genuine.columns[-1]).values
        # Row-order split: first 66% for training, the rest for testing.
        split_at = int(genuine.shape[0] * 0.66)
        user_train = genuine[:split_at]
        user_test = genuine[split_at:]

        impostors = df.loc[~is_user]
        impostors = impostors.drop(columns=impostors.columns[-1]).values

        clf = OneClassSVM(gamma='scale')
        clf.fit(user_train)

        positive_scores = clf.score_samples(user_test)
        negative_scores = clf.score_samples(impostors)
        smooth_in_place(positive_scores)
        smooth_in_place(negative_scores)

        auc, eer = compute_AUC_EER(positive_scores, negative_scores)

        global_positive_scores.extend(positive_scores)
        global_negative_scores.extend(negative_scores)

        if verbose == True:
            print(", ".join([str(userid), str(auc), str(eer)]))

        auc_list.append(auc)
        eer_list.append(eer)

    auc_summary = (np.mean(auc_list), np.std(auc_list))
    eer_summary = (np.mean(eer_list), np.std(eer_list))
    print('AUC  mean : %7.4f, std: %7.4f' % auc_summary)
    print('EER  mean:  %7.4f, std: %7.4f' % eer_summary)

    if verbose == True:
        global_auc, global_eer = compute_AUC_EER(global_positive_scores,
                                                 global_negative_scores)
        print("Global AUC: " + str(global_auc))
        print("Global EER: " + str(global_eer))
    return auc_list, eer_list
示例#5
0
def evaluate_authentication_cross_day(df1, df2, verbose=False):
    """Evaluate per-user One-Class SVMs trained on one session, tested on another.

    The last column of both frames is used as the user label.  For each id
    from ``create_userids(df1)`` a ``OneClassSVM(gamma='scale')`` is fitted
    on all of that user's rows in ``df1``.  The user's rows in ``df2`` are
    the positive test set and every other user's rows in ``df2`` the
    negative test set.

    Scores are smoothed in place with a forward moving average over
    ``AGGREGATE_BLOCK_NUM`` consecutive scores (the trailing
    ``AGGREGATE_BLOCK_NUM - 1`` entries stay raw); the smoothed arrays are
    also what is accumulated for the global evaluation.

    Args:
        df1: Session-1 DataFrame (training), user id in the last column.
        df2: Session-2 DataFrame (testing), user id in the last column.
        verbose: When equal to ``True``, print per-user and global AUC/EER.

    Returns:
        Tuple ``(auc_list, eer_list)`` with one entry per user.
    """

    def smooth_in_place(scores):
        # Forward window mean written back into the same array.
        for start in range(len(scores) - AGGREGATE_BLOCK_NUM + 1):
            window = scores[start:start + AGGREGATE_BLOCK_NUM]
            scores[start] = np.average(window, axis=0)
        return scores

    print("Session 1 shape: " + str(df1.shape))
    print("Session 2 shape: " + str(df2.shape))

    global_positive_scores = []
    global_negative_scores = []
    auc_list = []
    eer_list = []

    for userid in create_userids(df1):
        enrolment = df1.loc[df1.iloc[:, -1].isin([userid])]
        in_session2 = df2.iloc[:, -1].isin([userid])
        genuine = df2.loc[in_session2]

        # Training data: the user's session-1 rows without the label column.
        enrolment_array = enrolment.drop(columns=enrolment.columns[-1]).values
        # Positive test data: the same user's session-2 rows.
        genuine_array = genuine.drop(columns=genuine.columns[-1]).values
        # Negative test data: everybody else's session-2 rows.
        impostors = df2.loc[~in_session2]
        impostor_array = impostors.drop(columns=impostors.columns[-1]).values

        clf = OneClassSVM(gamma='scale')
        clf.fit(enrolment_array)

        positive_scores = clf.score_samples(genuine_array)
        negative_scores = clf.score_samples(impostor_array)
        smooth_in_place(positive_scores)
        smooth_in_place(negative_scores)

        auc, eer = compute_AUC_EER(positive_scores, negative_scores)

        global_positive_scores.extend(positive_scores)
        global_negative_scores.extend(negative_scores)

        if verbose == True:
            print(str(userid) + ": " + str(auc) + ", " + str(eer))
        auc_list.append(auc)
        eer_list.append(eer)

    auc_summary = (np.mean(auc_list), np.std(auc_list))
    eer_summary = (np.mean(eer_list), np.std(eer_list))
    print('AUC  mean : %7.4f, std: %7.4f' % auc_summary)
    print('EER  mean:  %7.4f, std: %7.4f' % eer_summary)

    if verbose == True:
        global_auc, global_eer = compute_AUC_EER(global_positive_scores,
                                                 global_negative_scores)
        print("Global AUC: " + str(global_auc))
        print("Global EER: " + str(global_eer))
    return auc_list, eer_list
示例#6
0
def evaluate_authentication_train_test(df_train,
                                       df_test,
                                       data_type,
                                       num_blocks,
                                       representation_type,
                                       verbose=False,
                                       roc_data=False,
                                       roc_data_filename=TEMP_NAME):
    """Evaluate per-user One-Class SVMs with separate train and test frames.

    The last column of both frames is used as the user label.  For each id
    from ``create_userids(df_train)`` a ``OneClassSVM(gamma='scale')`` is
    fitted on that user's rows of ``df_train``; the user's rows in
    ``df_test`` are scored as positives and all other users' rows in
    ``df_test`` as negatives.

    Scores are smoothed in place with a forward moving average over
    ``num_blocks`` consecutive scores (the trailing ``num_blocks - 1``
    entries stay raw).  The smoothed arrays are what is optionally passed
    through ``score_normalization`` (when ``SCORE_NORMALIZATION`` is set)
    and accumulated for the global AUC/EER.

    Args:
        df_train: Training DataFrame, user id in the last column.
        df_test: Test DataFrame, user id in the last column.
        data_type: Object with a ``.value`` attribute used in the score-plot
            filename and in the final log lines.
        num_blocks: Window length of the score aggregation.
        representation_type: Object with a ``.value`` attribute used in the
            score-plot filename.
        verbose: Print per-user AUC/EER when true.
        roc_data: When true, write the global ROC curve (FPR/TPR columns) to
            ``roc_data_filename`` as CSV.
        roc_data_filename: Destination of the ROC CSV.

    Returns:
        Tuple ``(auc_list, eer_list)`` with one entry per user.
    """

    def aggregate_in_place(scores):
        # Slot k only reads slots k .. k+num_blocks-1, which are still raw at
        # that point, so updating in place yields a forward window mean.
        for start in range(len(scores) - num_blocks + 1):
            scores[start] = np.average(scores[start:start + num_blocks],
                                       axis=0)
        return scores

    print("Training: " + str(df_train.shape))
    print("Testing: " + str(df_test.shape))
    auc_list = []
    eer_list = []
    global_positive_scores = []
    global_negative_scores = []

    for userid in create_userids(df_train):
        # Training rows of the current user, label column removed.
        user_train_data = df_train.loc[df_train.iloc[:, -1].isin([userid])]
        user_train_data = user_train_data.drop(user_train_data.columns[-1],
                                               axis=1)

        is_user_in_test = df_test.iloc[:, -1].isin([userid])
        user_test_data = df_test.loc[is_user_in_test]
        user_test_data = user_test_data.drop(user_test_data.columns[-1],
                                             axis=1)

        other_users_data = df_test.loc[~is_user_in_test]
        other_users_data = other_users_data.drop(other_users_data.columns[-1],
                                                 axis=1)

        clf = OneClassSVM(gamma='scale')
        clf.fit(user_train_data)

        positive_scores = aggregate_in_place(
            clf.score_samples(user_test_data))
        negative_scores = aggregate_in_place(
            clf.score_samples(other_users_data))

        auc, eer, _, _ = compute_AUC_EER(positive_scores, negative_scores)

        if SCORE_NORMALIZATION == True:
            positive_scores, negative_scores = score_normalization(
                positive_scores, negative_scores)

        global_positive_scores.extend(positive_scores)
        global_negative_scores.extend(negative_scores)

        if verbose:
            print(str(userid) + ", " + str(auc) + ", " + str(eer))

        auc_list.append(auc)
        eer_list.append(eer)

    print("\nNumber of blocks: ", num_blocks)
    print('AUC  mean : %7.4f, std: %7.4f' %
          (np.mean(auc_list), np.std(auc_list)))
    print('EER  mean:  %7.4f, std: %7.4f' %
          (np.mean(eer_list), np.std(eer_list)))

    print("#positives: " + str(len(global_positive_scores)))
    print("#negatives: " + str(len(global_negative_scores)))

    global_auc, global_eer, fpr, tpr = compute_AUC_EER(global_positive_scores,
                                                       global_negative_scores)

    filename = 'output_png/scores_' + str(data_type.value) + '_' + str(
        representation_type.value)
    if SCORES == True:
        plot_scores(global_positive_scores,
                    global_negative_scores,
                    filename,
                    title='Scores distribution')

    if roc_data:
        # Local name chosen so the builtin ``dict`` is not shadowed.
        roc_table = pd.DataFrame({'FPR': fpr, 'TPR': tpr})
        roc_table.to_csv(roc_data_filename, index=False)

    print(data_type.value + " Global AUC: " + str(global_auc))
    print(data_type.value + " Global EER: " + str(global_eer))
    return auc_list, eer_list
示例#7
0
def evaluate_dataset(current_dataset, dataset_amount, num_actions,
                     num_training_actions):
    """Train, evaluate and pickle a RandomForest for user id 1.

    The data comes from ``get_user_data`` applied to two hard-coded log
    files.  The last column of the resulting table is used as the class
    label: rows labelled 1 are the positive class, and the number of samples
    per class is the smaller of the counts of rows labelled 1 and 2.

    For the (single) user id in ``userids``:
      1. positives are drawn with ``np.random.choice`` and relabelled 1,
         negatives come from ``select_negatives_from_other_users`` and are
         relabelled 0;
      2. the data is split either randomly or order-preserving, depending on
         ``CURRENT_SPLIT_TYPE``;
      3. a ``RandomForestClassifier`` is fitted, 25-fold cross-validated on
         the training part and scored on the validation part;
      4. the fitted model is pickled to ``outmodel.pkl`` in the current
         working directory;
      5. ROC data from ``evaluate_sequence_of_samples`` is used to print the
         AUC and the threshold at the equal-error point.

    NOTE(review): ``dataset_amount`` and ``num_training_actions`` are not
    used by the active code (only by a commented-out line), and the result
    of ``create_userids(current_dataset)`` is immediately replaced by
    ``[1]``.  The function prints its results and returns ``None``.

    Args:
        current_dataset: Passed to ``create_userids`` (result is discarded).
        dataset_amount: Unused here.
        num_actions: Forwarded to ``evaluate_sequence_of_samples``.
        num_training_actions: Unused here.
    """
    # Original dataset lookup, disabled in favour of the hard-coded logs:
    #filename = FEAT_DIR + '/' + datasetname(current_dataset, dataset_amount, num_training_actions)
    filename1 = "/home/bwbwchen/temp/mouse_dynamics_balabit_chaoshen_dfl/measurements/lee_log"
    filename2 = "/home/bwbwchen/temp/mouse_dynamics_balabit_chaoshen_dfl/measurements/liu_log"
    # The bare string below is an inert statement holding an alternative
    # pair of input files; it has no effect at runtime.
    """
    filename1 = "/home/bwbwchen/temp/mouse_dynamics_balabit_chaoshen_dfl/measurements/mouse_log"
    filename2 = "/home/bwbwchen/temp/mouse_dynamics_balabit_chaoshen_dfl/measurements/liu_log"
    """

    #print(filename1)
    #print(filename2)
    #dataset = pd.read_csv(filename)
    dataset = get_user_data(filename1, filename2)
    #print(dataset.shape)

    # DataFrame
    df = pd.DataFrame(dataset)

    num_features = int(dataset.shape[1])
    #print("Num features: ", num_features)
    array = dataset.values

    # NOTE(review): X and y are reassigned per user inside the loop; these
    # initial values are never read.
    X = array[:, 0:num_features - 1]
    y = array[:, num_features - 1]

    # The computed id list is overridden: only user id 1 is evaluated.
    userids = create_userids(current_dataset)
    userids = [1]

    #print(userids)

    # Train user-specific classifiers and evaluate them
    items = userids

    # Per-user ROC data, keyed by user id.
    fpr = {}
    tpr = {}
    roc_auc = {}

    # Balance the classes: use as many samples as the smaller of the two
    # label groups (last column == 1 vs. last column == 2) provides.
    correct = df.loc[df.iloc[:, -1].isin([1])]
    wrong = df.loc[df.iloc[:, -1].isin([2])]
    numSamples = min(correct.shape[0], wrong.shape[0])

    for i in userids:
        # print("Training classifier for the user "+str(i))
        # Select all positive samples that belong to current user
        user_positive_data = df.loc[df.iloc[:, -1].isin([i])]

        # Draw numSamples row positions at random from the user's rows.
        # This consumes global NumPy RNG state, so the statement order
        # around it matters for reproducibility.
        user_positive_data = user_positive_data.iloc[np.random.choice(
            user_positive_data.shape[0], numSamples)]
        #numSamples = user_positive_data.shape[0]
        # Work on a copy so relabelling does not touch the source frame.
        array_positive = copy.deepcopy(user_positive_data.values)
        array_positive[:, -1] = 1

        # negative data for the current user
        user_neagtive_data = select_negatives_from_other_users(
            dataset, i, numSamples)
        array_negative = copy.deepcopy(user_neagtive_data.values)
        array_negative[:, -1] = 0

        # concatenate negative and positive data
        dataset_user = pd.concat(
            [pd.DataFrame(array_positive),
             pd.DataFrame(array_negative)]).values
        X = dataset_user[:, 0:-1]
        y = dataset_user[:, -1]

        if CURRENT_SPLIT_TYPE == SPLIT_TYPE.RANDOM:
            X_train, X_validation, y_train, y_validation = model_selection.train_test_split(
                X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)
            print("random split")
        else:
            X_train, X_validation, y_train, y_validation = keeporder_split(
                X, y, test_size=TEST_SIZE)
            # order-preserving split (debug print disabled)

        model = RandomForestClassifier(random_state=RANDOM_STATE)
        model.fit(X_train, y_train)

        # scoring = ['accuracy', 'roc_auc' ]
        # scores = cross_validate(model, X_train, y_train, scoring=scoring, cv = 10, return_train_score = False)
        # 25-fold cross-validation on the training part only.
        scores = cross_validate(model,
                                X_train,
                                y_train,
                                cv=25,
                                return_train_score=False)
        cv_accuracy = scores['test_score']
        print("CV Accuracy: %0.2f (+/- %0.2f)" %
              (cv_accuracy.mean(), cv_accuracy.std() * 2))

        print("validation shape ", X_validation.shape)
        y_predicted = model.predict(X_validation)
        test_accuracy = accuracy_score(y_validation, y_predicted)
        print("Test Accuracy: %0.2f, y_predicted[0]" % test_accuracy,
              y_predicted[0])

        # save model (overwrites outmodel.pkl in the working directory)
        with open('outmodel.pkl', 'wb') as f:
            pickle.dump(model, f)

        fpr[i], tpr[i], thr = evaluate_sequence_of_samples(
            model, X_validation, y_validation, num_actions)

        # -1 is reported when no equal-error point could be computed.
        threshold = -1
        try:
            # Root of 1 - x - TPR(x): the FPR at which FPR == 1 - TPR.
            eer = brentq(lambda x: 1. - x - interp1d(fpr[i], tpr[i])(x), 0.,
                         1.)
            threshold = interp1d(fpr[i], thr)(eer)
        except (ZeroDivisionError, ValueError):
            # NOTE(review): the message is also printed for ValueError.
            print("Division by zero")

        roc_auc[i] = auc(fpr[i], tpr[i])
        print(
            str(i) + ": " + str(roc_auc[i]) + " threshold: " + str(threshold))
示例#8
0
def train_model(df,
                model_name="foo.h5",
                fcn_filters=128,
                representation_learning=False):
    """Train an FCN classifier that predicts the user id of each sample.

    The last column of ``df`` is the class label; the remaining columns are
    reshaped to ``(-1, stt.FEATURES, stt.DIMENSIONS)``.  Labels are one-hot
    encoded.

    * ``representation_learning`` false: the data is split 60/20/20 into
      train/validation/test and the test accuracy is printed at the end.
    * ``representation_learning`` true: the data is split 80/20 into
      train/validation only; no test evaluation is done.

    Args:
        df: DataFrame with the feature columns followed by the user id.
        model_name: File name of the saved model under
            ``stt.TRAINED_MODELS_PATH``.  The sentinel default ``"foo.h5"``
            is replaced by ``stt.MODEL_NAME``.
        fcn_filters: Forwarded to ``build_fcn``.
        representation_learning: Selects the split scheme described above.

    Returns:
        The trained model returned by ``build_fcn`` after fitting.
    """
    nbclasses = len(create_userids(df))
    print('number of classes: ' + str(nbclasses))

    array = df.values
    X = array[:, :-1]
    y = array[:, -1]

    enc = OneHotEncoder()
    enc.fit(y.reshape(-1, 1))
    y = enc.transform(y.reshape(-1, 1)).toarray()
    X = X.reshape(-1, stt.FEATURES, stt.DIMENSIONS)

    with_test_split = not representation_learning
    if with_test_split:
        # 20% test, then 25% of the remaining 80% for validation -> 60/20/20.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=stt.RANDOM_STATE)
        X_train, X_val, y_train, y_val = train_test_split(
            X_train, y_train, test_size=0.25, random_state=stt.RANDOM_STATE)
    else:
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=stt.RANDOM_STATE)

    print("Train, validation (and test shapes): ")
    print(X_train.shape)
    print(X_val.shape)
    if with_test_split:
        print(X_test.shape)

    # A tenth of the training set, capped at stt.BATCH_SIZE; never below 1 so
    # very small training sets do not produce an invalid batch size of 0.
    mini_batch_size = max(
        1, int(min(X_train.shape[0] / 10, stt.BATCH_SIZE)))
    if model_name == "foo.h5":
        model_name = stt.MODEL_NAME
    filepath = stt.TRAINED_MODELS_PATH + "/" + model_name

    print(filepath)
    cb, model = build_fcn((stt.FEATURES, stt.DIMENSIONS), nbclasses, filepath,
                          fcn_filters)

    X_train = np.asarray(X_train).astype(np.float32)
    X_val = np.asarray(X_val).astype(np.float32)

    # convert to tensorflow dataset
    train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val))

    SHUFFLE_BUFFER_SIZE = 100
    train_ds = train_ds.shuffle(SHUFFLE_BUFFER_SIZE).batch(mini_batch_size)
    val_ds = val_ds.batch(mini_batch_size)

    start_time = time.time()
    hist = model.fit(train_ds,
                     epochs=stt.EPOCHS,
                     verbose=True,
                     validation_data=val_ds,
                     callbacks=cb)
    # Stop the clock right after fitting so plotting is not counted as
    # training time.
    duration = time.time() - start_time

    # plot training curve
    plot_training(hist, model_name, metrics='loss')
    plot_training(hist, model_name, metrics='accuracy')

    print("Training duration: " + str(duration / 60))

    if with_test_split:
        X_test = np.asarray(X_test).astype(np.float32)
        y_true = np.argmax(y_test, axis=1)
        y_pred = np.argmax(model.predict(X_test), axis=1)
        accuracy = metrics.accuracy_score(y_true, y_pred)
        print("Test accuracy: " + str(accuracy))
    return model
示例#9
0
def train_model(df, model_name="foo.h5"):
    """Train the classifier selected by ``stt.MODEL_TYPE`` on user ids.

    The last column of ``df`` is the class label; the remaining columns are
    reshaped to ``(-1, stt.FEATURES, stt.DIMENSIONS)``.  Labels are one-hot
    encoded and the data is split 60/20/20 into train/validation/test.  The
    training history is written to ``histories/history.csv`` and the test
    accuracy is printed.

    Args:
        df: DataFrame with the feature columns followed by the user id.
        model_name: File name of the saved model under
            ``stt.TRAINED_MODELS_PATH``.  The sentinel default ``"foo.h5"``
            is replaced by ``stt.MODEL_NAME``.

    Returns:
        The trained model.

    Raises:
        ValueError: If ``stt.MODEL_TYPE`` is none of the supported
            ``stt.ModelType`` members (previously this surfaced later as an
            unbound-variable error).
    """
    import os  # local import: only needed for creating the history folder

    nbclasses = len(create_userids(df))
    print(nbclasses)

    array = df.values
    X = array[:, :-1]
    y = array[:, -1]

    enc = OneHotEncoder()
    enc.fit(y.reshape(-1, 1))
    y = enc.transform(y.reshape(-1, 1)).toarray()
    X = X.reshape(-1, stt.FEATURES, stt.DIMENSIONS)

    # 20% test, then 25% of the remaining 80% for validation -> 60/20/20.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=stt.RANDOM_STATE)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.25, random_state=stt.RANDOM_STATE)

    print(X_train.shape)
    print(X_test.shape)
    print(X_val.shape)

    # A tenth of the training set, capped at stt.BATCH_SIZE; never below 1 so
    # very small training sets do not produce an invalid batch size of 0.
    mini_batch_size = max(
        1, int(min(X_train.shape[0] / 10, stt.BATCH_SIZE)))
    if model_name == "foo.h5":
        model_name = stt.MODEL_NAME
    filepath = stt.TRAINED_MODELS_PATH + "/" + model_name

    input_shape = (stt.FEATURES, stt.DIMENSIONS)
    # Explicit chain (rather than a lookup table) so that only the selected
    # builder name has to be resolvable at call time.
    if stt.MODEL_TYPE == stt.ModelType.FCN:
        cb, model = build_fcn(input_shape, nbclasses, filepath)
    elif stt.MODEL_TYPE == stt.ModelType.RESNET:
        cb, model = build_resnet(input_shape, nbclasses, filepath)
    elif stt.MODEL_TYPE == stt.ModelType.MLP:
        cb, model = build_mlp(input_shape, nbclasses, filepath)
    elif stt.MODEL_TYPE == stt.ModelType.MCDCNN:
        cb, model = build_mcdcnn(input_shape, nbclasses, filepath)
    elif stt.MODEL_TYPE == stt.ModelType.TLENET:
        cb, model = build_tlenet(input_shape, nbclasses, filepath)
    elif stt.MODEL_TYPE == stt.ModelType.CNN:
        cb, model = build_cnn(input_shape, nbclasses, filepath)
    else:
        raise ValueError("Unsupported stt.MODEL_TYPE: " + str(stt.MODEL_TYPE))

    X_train = np.asarray(X_train).astype(np.float32)
    X_val = np.asarray(X_val).astype(np.float32)

    # convert to tensorflow dataset
    train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val))

    SHUFFLE_BUFFER_SIZE = 100
    train_ds = train_ds.shuffle(SHUFFLE_BUFFER_SIZE).batch(mini_batch_size)
    val_ds = val_ds.batch(mini_batch_size)

    start_time = time.time()
    hist = model.fit(train_ds,
                     epochs=stt.EPOCHS,
                     verbose=True,
                     validation_data=val_ds,
                     callbacks=cb)
    # Stop the clock right after fitting so file output is not counted as
    # training time.
    duration = time.time() - start_time

    # save history to csv (create the folder first so a finished training
    # run is not lost to a missing directory)
    hist_df = pd.DataFrame(hist.history)
    hist_csv_file = 'histories/history.csv'
    os.makedirs(os.path.dirname(hist_csv_file), exist_ok=True)
    with open(hist_csv_file, mode='w') as f:
        hist_df.to_csv(f)
    print("Training duration: " + str(duration / 60))

    # EVALUATION
    X_test = np.asarray(X_test).astype(np.float32)
    y_true = np.argmax(y_test, axis=1)
    y_pred = np.argmax(model.predict(X_test), axis=1)
    accuracy = metrics.accuracy_score(y_true, y_pred)
    print(accuracy)
    return model
def evaluate_dataset(current_dataset, dataset_amount, num_actions, num_training_actions):
    """Train and evaluate one RandomForest per user on a feature CSV.

    The CSV is located via ``FEAT_DIR`` and ``datasetname(...)``; its last
    column is used as the user label.  For every id from
    ``create_userids(current_dataset)``:

      1. the user's rows are relabelled 1, and an equally sized set from
         ``select_negatives_from_other_users`` is relabelled 0;
      2. the data is split randomly or order-preserving, depending on
         ``CURRENT_SPLIT_TYPE``;
      3. a ``RandomForestClassifier`` is fitted, 10-fold cross-validated on
         the training part and scored on the validation part;
      4. ROC data from ``evaluate_sequence_of_samples`` yields the AUC and
         the threshold at the equal-error point (``-1`` when that point
         cannot be computed).

    Finally all per-user ROC curves are handed to ``plotROCs``.  Results are
    printed; nothing is returned.

    Args:
        current_dataset: Dataset selector for ``datasetname`` and
            ``create_userids``.
        dataset_amount: Forwarded to ``datasetname``.
        num_actions: Forwarded to ``evaluate_sequence_of_samples``.
        num_training_actions: Forwarded to ``datasetname``.
    """
    filename = FEAT_DIR + '/' + datasetname(current_dataset, dataset_amount, num_training_actions)
    print(filename)

    dataset = pd.read_csv(filename)
    print(dataset.shape)
    df = pd.DataFrame(dataset)
    print("Num features: ", int(dataset.shape[1]))

    userids = create_userids(current_dataset)
    print(userids)

    # Per-user ROC data, keyed by user id.
    fpr, tpr, roc_auc = {}, {}, {}

    for user in userids:
        # Positive class: every row of the current user, relabelled 1 on a
        # copy so the source frame stays untouched.
        positives = df.loc[df.iloc[:, -1].isin([user])]
        sample_count = positives.shape[0]
        positive_rows = copy.deepcopy(positives.values)
        positive_rows[:, -1] = 1

        # Negative class: as many rows from the other users, relabelled 0.
        negatives = select_negatives_from_other_users(dataset, user, sample_count)
        negative_rows = copy.deepcopy(negatives.values)
        negative_rows[:, -1] = 0

        combined = pd.concat([pd.DataFrame(positive_rows), pd.DataFrame(negative_rows)]).values
        X = combined[:, :-1]
        y = combined[:, -1]

        if CURRENT_SPLIT_TYPE == SPLIT_TYPE.RANDOM:
            split = model_selection.train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)
        else:
            split = keeporder_split(X, y, test_size=TEST_SIZE)
        X_train, X_validation, y_train, y_validation = split

        model = RandomForestClassifier(random_state=RANDOM_STATE)
        model.fit(X_train, y_train)

        cv_result = cross_validate(model, X_train, y_train, cv=10, return_train_score=False)
        cv_accuracy = cv_result['test_score']
        print("CV Accuracy: %0.2f (+/- %0.2f)" % (cv_accuracy.mean(), cv_accuracy.std() * 2))

        test_accuracy = accuracy_score(y_validation, model.predict(X_validation))
        print("Test Accuracy: %0.2f" % test_accuracy)

        fpr[user], tpr[user], thr = evaluate_sequence_of_samples(model, X_validation, y_validation, num_actions)

        threshold = -1
        try:
            # Root of 1 - x - TPR(x): the FPR at which FPR == 1 - TPR.
            eer = brentq(lambda x: 1. - x - interp1d(fpr[user], tpr[user])(x), 0., 1.)
            threshold = interp1d(fpr[user], thr)(eer)
        except (ZeroDivisionError, ValueError):
            print("Division by zero")

        roc_auc[user] = auc(fpr[user], tpr[user])
        print(str(user) + ": " + str(roc_auc[user]) + " threshold: " + str(threshold))

    plotROCs(fpr, tpr, roc_auc, userids)
def evaluate_authentication( df, data_type, representation_type, verbose = False, roc_data = False, roc_data_filename = TEMP_NAME):
    """Evaluate one One-Class SVM per user on a single dataset.

    The last column of ``df`` is used as the user label.  For each id from
    ``create_userids(df)`` the user's rows are split in row order: the first
    ``int(n * 0.66) + 1`` rows train a ``OneClassSVM(gamma='scale')``, the
    remaining rows are the positive test set, and all rows of every other
    user form the negative test set.

    Scores are smoothed in place with a forward moving average over
    ``AGGREGATE_BLOCK_NUM`` consecutive scores (the trailing
    ``AGGREGATE_BLOCK_NUM - 1`` entries stay raw).  The smoothed arrays are
    what is optionally passed through ``score_normalization`` (when
    ``SCORE_NORMALIZATION`` is set) and accumulated for the global AUC/EER.

    Args:
        df: DataFrame whose last column holds the user id.
        data_type: Object with a ``.value`` attribute used in the score-plot
            filename.
        representation_type: Object with a ``.value`` attribute used in the
            score-plot filename.
        verbose: Print per-user sample counts and AUC/EER when true.
        roc_data: When true, write the global ROC curve to
            ``roc_data_filename`` and the per-user AUC/EER lists to a sibling
            file whose base name is prefixed with ``auc_eer_``.
        roc_data_filename: Destination of the ROC CSV ('/'-separated path).

    Returns:
        Tuple ``(auc_list, eer_list)`` with one entry per user.
    """

    def aggregate_in_place(scores):
        # Slot k only reads slots k .. k+AGGREGATE_BLOCK_NUM-1, which are
        # still raw at that point, so this is a forward window mean.
        for start in range(len(scores) - AGGREGATE_BLOCK_NUM + 1):
            scores[start] = np.average(scores[start : start + AGGREGATE_BLOCK_NUM], axis=0)
        return scores

    print(df.shape)
    auc_list = []
    eer_list = []
    global_positive_scores = []
    global_negative_scores = []

    for userid in create_userids(df):
        is_user = df.iloc[:, -1].isin([userid])

        user_data = df.loc[is_user]
        user_array = user_data.drop(user_data.columns[-1], axis=1).values

        num_samples = user_array.shape[0]
        train_samples = int(num_samples * 0.66) + 1
        test_samples = num_samples - train_samples
        if verbose:
            print(str(userid)+". #train_samples: "+str(train_samples)+"\t#test_samples: "+ str(test_samples))
        user_train = user_array[:train_samples]
        user_test = user_array[train_samples:]

        other_users_data = df.loc[~is_user]
        other_users_array = other_users_data.drop(other_users_data.columns[-1], axis=1).values

        clf = OneClassSVM(gamma='scale')
        clf.fit(user_train)

        positive_scores = aggregate_in_place(clf.score_samples(user_test))
        negative_scores = aggregate_in_place(clf.score_samples(other_users_array))

        auc, eer, _, _ = compute_AUC_EER(positive_scores, negative_scores)

        if SCORE_NORMALIZATION == True:
            positive_scores, negative_scores = score_normalization(positive_scores, negative_scores)
        global_positive_scores.extend(positive_scores)
        global_negative_scores.extend(negative_scores)

        if verbose:
            print(str(userid)+", "+ str(auc)+", "+str(eer)+"\n" )

        auc_list.append(auc)
        eer_list.append(eer)

    print('AUC  mean : %7.4f, std: %7.4f' % ( np.mean(auc_list), np.std(auc_list)) )
    print('EER  mean:  %7.4f, std: %7.4f' % ( np.mean(eer_list), np.std(eer_list)) )

    print("#positives: "+str(len(global_positive_scores)))
    print("#negatives: "+str(len(global_negative_scores)))

    global_auc, global_eer, fpr, tpr = compute_AUC_EER(global_positive_scores, global_negative_scores)

    filename = 'output_png/scores_'+ str(data_type.value)+ '_' + str(representation_type.value)
    if SCORES == True:
        plot_scores(global_positive_scores, global_negative_scores, filename, title='Scores distribution')

    if roc_data:
        # Local names chosen so the builtin ``dict`` is not shadowed and the
        # ``df`` argument is not rebound.
        roc_table = pd.DataFrame({'FPR': fpr, 'TPR': tpr})
        roc_table.to_csv(roc_data_filename, index=False)

        # Prefix only the base name.  Splitting at the LAST '/' keeps the
        # result identical for "dir/file.csv" and also works for bare file
        # names and for paths with several directory levels.
        directory, separator, basename = roc_data_filename.rpartition('/')
        auc_eer_data_filename = directory + separator + 'auc_eer_' + basename
        auc_eer_table = pd.DataFrame({'AUC': auc_list, 'EER': eer_list})
        auc_eer_table.to_csv(auc_eer_data_filename, index=False)

    print("Global AUC: "+str(global_auc))
    print("Global EER: "+str(global_eer))
    return auc_list, eer_list
def _sliding_block_average(scores, block_size):
    """Return a copy of ``scores`` smoothed with a forward sliding-window mean.

    Entry ``k`` of the result is the average of ``scores[k:k + block_size]``
    for every ``k`` that still has a full window.  The trailing
    ``block_size - 1`` entries have no full window and are copied through
    unchanged; if ``block_size`` exceeds ``len(scores)`` nothing is averaged.
    The input array is not modified.
    """
    # NOTE(review): the unsmoothed tail is kept on purpose so that the number
    # of scores (and therefore previously reported results) stays the same --
    # confirm whether mixing raw and averaged scores is really intended.
    smoothed = np.array(scores, copy=True)
    for start in range(len(scores) - block_size + 1):
        smoothed[start] = np.average(scores[start:start + block_size], axis=0)
    return smoothed


def evaluate_authentication_cross_day(df1, df2, data_type, representation_type, verbose=False, roc_data=False, roc_data_filename=TEMP_NAME):
    """Train on session 1 and test on session 2, one OneClassSVM per user.

    For every user id returned by ``create_userids(df1)`` a
    ``OneClassSVM(gamma='scale')`` is fitted on that user's rows of ``df1``.
    The user's own rows of ``df2`` yield the positive scores and all remaining
    rows of ``df2`` yield the negative scores.  Both score vectors are
    smoothed with a sliding window of ``AGGREGATE_BLOCK_NUM`` samples before
    the per-user AUC/EER is computed, and the same smoothed scores (optionally
    passed through ``score_normalization`` when ``SCORE_NORMALIZATION`` is
    set) are pooled for the global AUC/EER.

    Parameters:
        df1: session-1 DataFrame; the last column is matched against the user
            ids, every other column is used as a feature.
        df2: session-2 DataFrame with the same layout.
        data_type: object whose ``.value`` is used in the score-plot filename.
        representation_type: object whose ``.value`` is used in the score-plot
            filename.
        verbose: when truthy, print the AUC and EER of each user.
        roc_data: when truthy, write the global FPR/TPR values to
            ``roc_data_filename`` as CSV.
        roc_data_filename: destination of the ROC CSV file.

    Returns:
        ``(auc_list, eer_list)`` -- the per-user AUC and EER values in the
        order of the user ids.

    Side effects: prints summary statistics; when the module flag ``SCORES``
    is set, calls ``plot_scores`` with the pooled scores.
    """
    print("Session 1 shape: "+str(df1.shape))
    print("Session 2 shape: "+str(df2.shape))

    def feature_matrix(frame):
        # Drop the trailing user-id column and keep the raw feature values.
        return frame.drop(frame.columns[-1], axis=1).values

    global_positive_scores = list()
    global_negative_scores = list()
    auc_list = list()
    eer_list = list()
    for userid in create_userids(df1):
        owns_session1_row = df1.iloc[:, -1].isin([userid])
        owns_session2_row = df2.iloc[:, -1].isin([userid])

        train_array = feature_matrix(df1.loc[owns_session1_row])
        # positive test data: the same user on the other session
        genuine_array = feature_matrix(df2.loc[owns_session2_row])
        # negative test data: everybody else on the other session
        impostor_array = feature_matrix(df2.loc[~owns_session2_row])

        clf = OneClassSVM(gamma='scale')
        clf.fit(train_array)

        raw_positive_scores = clf.score_samples(genuine_array)
        raw_negative_scores = clf.score_samples(impostor_array)

        # The aggregated scores deliberately replace the raw ones: they feed
        # both the per-user metrics and the pooled (global) score lists.
        positive_scores = _sliding_block_average(raw_positive_scores, AGGREGATE_BLOCK_NUM)
        negative_scores = _sliding_block_average(raw_negative_scores, AGGREGATE_BLOCK_NUM)

        # compute_AUC_EER returns four values; only the first two are needed
        # per user (the last two are used as FPR/TPR for the global curve).
        auc, eer, _, _ = compute_AUC_EER(positive_scores, negative_scores)

        if SCORE_NORMALIZATION:
            positive_scores, negative_scores = score_normalization(positive_scores, negative_scores)

        global_positive_scores.extend(positive_scores)
        global_negative_scores.extend(negative_scores)

        if verbose:
            print(str(userid)+": "+ str(auc)+", "+str(eer) )
        auc_list.append(auc)
        eer_list.append(eer)
    print('AUC  mean : %7.4f, std: %7.4f' % ( np.mean(auc_list), np.std(auc_list)) )
    print('EER  mean:  %7.4f, std: %7.4f' % ( np.mean(eer_list), np.std(eer_list)) )

    global_auc, global_eer, fpr, tpr = compute_AUC_EER(global_positive_scores, global_negative_scores)

    filename = 'output_png/scores_'+ str(data_type.value)+ '_' + str(representation_type.value)
    if SCORES:
        plot_scores(global_positive_scores, global_negative_scores, filename, title='Scores distribution')

    if roc_data:
        roc_table = pd.DataFrame({'FPR': fpr, 'TPR': tpr})
        roc_table.to_csv(roc_data_filename, index=False)

    print("Global AUC: "+str(global_auc))
    print("Global EER: "+str(global_eer))
    return auc_list, eer_list
def evaluate_authentication_skilledforgeries( df_genuine, df_forgery, data_type, representation_type, verbose = False, roc_data = False, roc_data_filename = TEMP_NAME):
    print("Genuine shape: "+str(df_genuine.shape))
    print("Forgery shape: "+str(df_forgery.shape))
    print(df_forgery.shape)
    userids = create_userids( df_genuine )
    NUM_USERS = len(userids)
    
    global_positive_scores = list()
    global_negative_scores = list()
    auc_list = list()
    eer_list = list()
    for i in range(0,NUM_USERS):
        userid = userids[i]
        user_genuine_data = df_genuine.loc[df_genuine.iloc[:, -1].isin([userid])]
        user_forgery_data = df_forgery.loc[df_forgery.iloc[:, -1].isin([userid])]
      
        user_genuine_data = user_genuine_data.drop(user_genuine_data.columns[-1], axis=1)
        user_genuine_array = user_genuine_data.values
 
        num_samples = user_genuine_array.shape[0]
        train_samples = (int)(num_samples * 0.66)
        test_samples = num_samples - train_samples
        # MCYT
        # train_samples = 15
        # test_samples = 10

        user_genuine_train = user_genuine_array[0:train_samples,:]
        user_genuine_test = user_genuine_array[train_samples:num_samples,:]
     
        user_forgery_data =  user_forgery_data.drop(user_forgery_data.columns[-1], axis=1) 
        user_forgery_array = user_forgery_data.values

        clf = OneClassSVM(gamma='scale')
        clf.fit(user_genuine_train)
 
        positive_scores = clf.score_samples(user_genuine_test)
        negative_scores =  clf.score_samples(user_forgery_array)   
        auc, eer,_,_ = compute_AUC_EER(positive_scores, negative_scores )
 
        if SCORE_NORMALIZATION == True:
            positive_scores, negative_scores = score_normalization(positive_scores, negative_scores)
        global_positive_scores.extend(positive_scores)
        global_negative_scores.extend(negative_scores)
        
        if  verbose == True:
            print(str(userid)+": "+ str(auc)+", "+str(eer) )
        auc_list.append(auc)
        eer_list.append(eer)
    print('AUC  mean : %7.4f, std: %7.4f' % ( np.mean(auc_list), np.std(auc_list)) )
    print('EER  mean:  %7.4f, std: %7.4f' % ( np.mean(eer_list), np.std(eer_list)) )
  

    global_auc, global_eer, fpr, tpr = compute_AUC_EER(global_positive_scores, global_negative_scores)
    
    filename = 'output_png/scores_'+ str(data_type.value)+ '_' + str(representation_type.value)
    if SCORES == True:
        # ****************************************************************************************
        plot_scores(global_positive_scores, global_negative_scores, filename, title='Scores distribution')
        # ****************************************************************************************

    if( roc_data == True ):
        dict = {'FPR': fpr, 'TPR': tpr}
        df = pd.DataFrame(dict) 
        df.to_csv(roc_data_filename, index=False)

    print("Global AUC: "+str(global_auc))
    print("Global EER: "+str(global_eer))