Example #1
lCorr = 60 * 24 * 7 // MPS  # use one week of data to compute the Pearson correlation
R = 800

nNearPart = 2
mNear = np.identity(nR)
mNear = np.concatenate((mNear, (dists > 0) & (dists <= R)))
sMNear = np.repeat(mNear.sum(axis=1), nS * t_delta)
mNearTile = np.tile(mNear, (1, nS * t_delta))

score_ind = np.zeros((nT, nR * nS))
score_r = np.zeros((nT, nR)) + 100
score_int = np.zeros((nT, nR)) + 100
anomalies = np.zeros((nT, nR))
dVector = (t_delta - 1 + nNearPart) * nS

model_r = OneClassSVM(nu=0.1)
model_int = OneClassSVM(nu=0.1)
train_r = np.zeros((0, nS))
train_int = np.zeros((0, dVector))
tsTrain = 60 * 24 * 7 // MPS
nTrain = tsTrain * nR

detect_st = (datetime(2014, 11, 27) -
             stDT).days * 24 * 60 // MPS  # detect anomalies on 2014-11-27
ed = (datetime(2014, 11, 28) - stDT).days * 24 * 60 // MPS
st = max(detect_st - tsTrain, lCorr)

trained = False
p1 = np.einsum('ij,ik->kj', data[(st - lCorr):st, :], data[(st - lCorr):st, :])
for ts in range(st, ed):
    print('\r' + str(ts), end='')
Example #2
                                                                y,
                                                                test_size=0.3)
    y_train = []
    y_test = []

    for i in y_train_pre:
        if i == aa:
            y_train.append(1)
        # else:
        #     y_train.append(-1)

    for i in y_test_pre:
        if i == aa:
            y_test.append(1)
        else:
            y_test.append(-1)

    y_train = np.array(y_train)
    y_test = np.array(y_test)

    OCSVM = OneClassSVM(gamma=1, kernel='rbf')

    OCSVM.fit(X_train, y_train)

    ans = y_test - OCSVM.predict(X_test)

    i = 0
    for a in ans:
        if a == 0:
            i = i + 1
    print(aa, ":   ", i / len(ans))
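
The zero-difference count above is just the number of predictions matching the +/-1 test labels, so the printed ratio is plain accuracy. A minimal equivalent sketch, assuming the same X_train, X_test, y_test and label value aa as in the snippet:

from sklearn.metrics import accuracy_score
from sklearn.svm import OneClassSVM

ocsvm = OneClassSVM(gamma=1, kernel='rbf')
ocsvm.fit(X_train)  # the target vector is ignored by one-class SVMs
print(aa, ":   ", accuracy_score(y_test, ocsvm.predict(X_test)))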
Example #3
def main():

    X = read_data()

    Y = read_labels()

    isf = IsolationForest(**ISF_HYPER_PARAMS)
    lof = LocalOutlierFactor(**LOF_HYPER_PARAMS)
    svm = OneClassSVM(**SVM_HYPER_PARAMS)
    cov = EllipticEnvelope(**COV_HYPER_PARAMS)
    kmn = KMeans(**KMN_HYPER_PARAMS)

    preds_isf = []
    preds_lof = []
    preds_svm = []
    preds_cov = []
    preds_kmn = []

    preds = []

    for user in range(0, num_of_users):
        X_all = X[user]
        X_labeled = X[user][0:num_of_genuine_segments]
        X_unlabeled = X[user][num_of_genuine_segments:]

        count_vect = CountVectorizer()
        tfidf_transformer = TfidfTransformer(use_idf=False)

        X_all_counts = count_vect.fit_transform(X_all)
        X_labeled_counts = count_vect.transform(X_labeled)
        X_unlabeled_counts = count_vect.transform(X_unlabeled)

        X_all_tfidf = tfidf_transformer.fit_transform(X_all_counts)
        X_labeled_tfidf = tfidf_transformer.transform(X_labeled_counts)
        X_unlabeled_tfidf = tfidf_transformer.transform(X_unlabeled_counts)

        isf.fit(X_all_tfidf)
        lof.fit(X_all_tfidf)
        svm.fit(X_all_tfidf)
        cov.fit(X_all_tfidf.toarray())
        kmn.fit(X_all_tfidf)

        pred_isf = isf.predict(X_unlabeled_tfidf)
        pred_lof = lof.predict(X_unlabeled_tfidf)
        pred_svm = svm.predict(X_unlabeled_tfidf)
        pred_cov = cov.predict(X_unlabeled_tfidf.toarray())
        pred_kmn = predict_by_euclidian_distance(X_unlabeled_tfidf, kmn)

        pred_isf = [1 if p == -1 else 0 for p in pred_isf]
        pred_lof = [1 if p == -1 else 0 for p in pred_lof]
        pred_svm = [1 if p == -1 else 0 for p in pred_svm]
        pred_cov = [1 if p == -1 else 0 for p in pred_cov]

        preds_lof.append(pred_lof)
        preds_isf.append(pred_isf)
        preds_svm.append(pred_svm)
        preds_cov.append(pred_cov)
        preds_kmn.append(pred_kmn)

        pred_sum = np.array(pred_lof) + np.array(pred_isf) + np.array(
            pred_svm) + np.array(pred_kmn) + np.array(pred_cov)

        majority = [1 if i > 2 else 0 for i in pred_sum]
        preds.append(majority)

    print("LOF:")
    evaluate_model(preds_lof, Y)
    print("ISF:")
    evaluate_model(preds_isf, Y)
    print("SVM:")
    evaluate_model(preds_svm, Y)
    print("COV:")
    evaluate_model(preds_cov, Y)
    print("KMN:")
    evaluate_model(preds_kmn, Y)

    print("TOTAL:")
    evaluate_model(preds, Y)

    WriteOutput(preds)
Example #4
slc = np.r_[0:128]  # indices of the first 128 columns
trainX[slc] = trainX[slc].astype(np.float64)
testX[slc] = testX[slc].astype(np.float64)


trainy=trainy.to_frame()
testy=testy.to_frame()

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.svm import OneClassSVM
import pandas as pd

# define outlier detection model
model = OneClassSVM(gamma='scale', nu=0.01)
# fit on majority class
trainX = trainX[trainy==1]

model.fit(trainX)
# detect outliers in the test set
yhat = model.predict(testX)
# mark inliers 1, outliers -1
testy[testy == 1] = 1
testy[testy == 0] = -1

# calculate score
evaluate_results(testy, yhat)
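
evaluate_results is not defined in this snippet; since f1_score is imported above but never used, a plausible stand-in (an assumption, not the original helper) scores the outlier class directly:

import numpy as np

def evaluate_results(y_true, y_pred):
    # hypothetical helper: the original evaluate_results is not shown
    # scores the outlier class (-1) with the f1_score imported above
    y_true = np.asarray(y_true).ravel()
    print('F1 (outlier class): %.3f' % f1_score(y_true, y_pred, pos_label=-1))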
Example #5
plt.axvline(1993, label='Steroid Era Start - 1993', color='green')
plt.axvline(2004, label='Steroid Era End - 2003', color='green')
ax.yaxis.set_major_formatter(mtick.StrMethodFormatter('{x:1.3f}'))
ax.tick_params(axis='both', which='major', labelsize=12)
plt.legend(loc='upper left')
plt.show()
#
# One Class SVM (support vector method) for anomaly detection
#
dfx = dfplot[['OPS', 'yearID']]
data = dfplot[['OPS']]
scaler = StandardScaler()
np_scaled = scaler.fit_transform(data)
data = pd.DataFrame(np_scaled)
# train oneclassSVM
model = OneClassSVM(nu=0.15, kernel="rbf", gamma=0.01)
model.fit(data)
dfx['anomaly3'] = pd.Series(model.predict(data))

# visualization of One Class SVM anomaly detection
fig, ax = plt.subplots(figsize=(15, 7))
a = dfx.loc[dfx['anomaly3'] == -1, ['yearID', 'OPS']]  #anomaly
ax.set_title('OPS Trend\nOne Class SVM Anomaly Detection\n',
             weight='bold',
             size=14)
ax.set_xlabel("Year", labelpad=10, size=14)
ax.set_ylabel("OPS", labelpad=10, size=14)
ax.plot(dfx['yearID'],
        dfx['OPS'],
        marker='.',
        linestyle='none',
Example #6
    def __init__(self, kernel="rbf"):
        self._model = OneClassSVM(gamma='scale', kernel=kernel)
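
Only the constructor of the wrapper is shown above. A minimal sketch of how such a wrapper is typically completed; the class name and the fit/predict methods below are assumptions, not the original code:

from sklearn.svm import OneClassSVM

class OneClassWrapper:
    def __init__(self, kernel="rbf"):
        self._model = OneClassSVM(gamma='scale', kernel=kernel)

    def fit(self, X):
        self._model.fit(X)
        return self

    def predict(self, X):
        # +1 for inliers, -1 for outliers
        return self._model.predict(X)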
Example #7
def generate_figures_and_xls(outdir, cols_starts, region2data, ext, xls,
                             group2pos, feature_names, samples):
    """Generate figures and tables"""
    all_freqs = []
    # concatenate all pos and samples into one dataframe
    dframes = []
    for ri, (ref, pos) in enumerate(
            region2data.keys()):  #regions): #[3]#; print(ref, pos, mt)
        mer, calls = region2data[(ref, pos)]
        for c, s in zip(calls, samples):
            df = pd.DataFrame(c, columns=feature_names)
            df["Strain"] = s
            df["chr_pos"] = "%s:%s" % (ref, pos)
            dframes.append(df)
    # read all tsv files
    df = pd.concat(dframes).dropna().reset_index()
    chr_pos, strains = df["chr_pos"].unique(), df["Strain"].unique()

    # compare individual methods
    for clf, method in (
        (iso_new.iForest(ntrees=100, random_state=0), "GMM+eIF"),
        (GaussianMixture(random_state=0, n_components=2), "GMM"),
        (AgglomerativeClustering(n_clusters=2), "AggClust"),
        (KMeans(n_clusters=2), "KMeans"),
        (OneClassSVM(), "OCSVM"),
        (IsolationForest(random_state=0), "IF"),
        (iso_new.iForest(ntrees=100, random_state=0), "eIF"),
        (KNeighborsClassifier(), "KNN"),
        (RandomForestClassifier(), "RF"),
    ):
        fname = method
        print(fname)
        outfn = os.path.join(outdir, "%s.%s" % (fname, ext))
        results = []
        for i, cols_start in enumerate(cols_starts, 1):
            # narrow down the features to only signal intensity & trace
            cols = list(
                filter(lambda n: n.startswith(cols_start), feature_names))
            cols  #, "DT"
            # compare all samples to 0%
            s0 = samples[0]
            for s in samples[3:]:
                with np.errstate(under='ignore'):
                    if "+" in method:
                        clf2_name = method.split("+")[-1]
                        results += get_mod_freq_two_step(df,
                                                         cols,
                                                         chr_pos, [s0, s],
                                                         "_".join(cols_start),
                                                         OFFSET=0.5,
                                                         clf2_name=clf2_name,
                                                         clf2=clf)
                    elif method in ("KNN", "RF"):
                        results += get_mod_freq_clf_train_test(
                            df, cols, chr_pos, [s0, s], samples[1:3], clf,
                            "_".join(cols_start))
                    else:
                        results += get_mod_freq_clf(df, cols, chr_pos, [s0, s],
                                                    clf, "_".join(cols_start))

        # and store mod_freq predicted by various methods
        freqs = pd.DataFrame(results,
                             columns=[
                                 "chr_pos", "features", "mod_freq wt",
                                 "mod_freq strain", "strain"
                             ])
        freqs["diff"] = freqs.max(axis=1) - freqs.min(axis=1)
        freqs
        for name, pos in group2pos.items(
        ):  #(("negative", negatives), ("pU", pU_pos), ("Nm", Nm_pos)):
            freqs.loc[freqs["chr_pos"].isin(pos), "group"] = name
        #freqs.to_csv(outfn, sep="\t"); freqs.head()
        freqs.to_excel(xls, fname, index=False)
        # plot differences between methods
        for group, pos in group2pos.items():
            freqs.loc[freqs["chr_pos"].isin(pos), "modification"] = group
        #g = sns.catplot(x="strain", y="diff", hue="features", col="modification", data=freqs, kind="box")#, palette="Blues")
        g = sns.catplot(x="strain",
                        y="diff",
                        hue="features",
                        col="modification",
                        data=freqs,
                        kind="point",
                        ci=None)  #, palette="Blues")
        fig = g.fig
        fig.suptitle(method)
        for ax in fig.axes:
            ax.set_xlabel("Expected mod_freq")
            ax.set_ylabel(
                "Observed mod_freq [absolute difference between wt & mt]")
            ax.set_ylim(0, 1)
        fig.savefig(outfn)
        plt.close()  # clear axis
        freqs["name"] = fname
        all_freqs.append(freqs)
    return all_freqs
Example #8
def main(argv):
    config = read_parser(argv, Inputs, InputsOpt_Defaults)

    if config['mode'] == 'test':

        print('test')

    elif config['mode'] == 'learn_svm':
        #+++Load data
        if config['path'] == None:
            root = Tk()
            root.withdraw()
            root.update()
            filepath = filedialog.askopenfilename()
            root.destroy()
            filename = os.path.basename(filepath)
        else:
            filepath = config['path']
            filename = os.path.basename(filepath)

        #+++Construct features matrix and label vector
        myDF = pd.read_csv(filepath)
        mydict = myDF.to_dict(orient='list')
        n = len(mydict['Label'])
        y = np.zeros(n)
        y = [
            int(y[i]) if mydict['Label'][i] == 'Concrete' else 1
            for i in range(n)
        ]
        X = []
        Features = [
            'Area_under_curve', 'Crest-Factor', 'Energy', 'Kurtosis',
            'Peak_amplitude', 'RMS', 'Ring_down', 'Signal_strength', 'Skewnes',
            'StDev', 'Variance'
        ]
        k = 0
        for key in mydict.keys():
            if key in Features:
                X.append(mydict[key])
            k += 1
        X = np.array(X)
        X = np.transpose(X)

        #+++Train/Test Split
        if config['stratify'] == True:
            X_train, X_test, y_train, y_test = train_test_split(
                X,
                y,
                test_size=config['test_size'],
                random_state=config['rs'],
                stratify=y)
        else:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=config['test_size'], random_state=config['rs'])

        #+++Scaler
        if config['scaler'] == True:
            print('With standard scaler')
            scaler = StandardScaler()
            scaler.fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)

        #+++PCA
        if config['pca'] != None:
            pca = PCA(n_components=config['pca'])
            pca.fit(X_train)
            print('PCA results: ', pca.explained_variance_ratio_)
            X_train = pca.transform(X_train)
            X_test = pca.transform(X_test)

        #+++Hyperparameters for Grid search
        Penalizations = [0.1, 1.0, 10.]
        Kernels = ['linear', 'rbf', 'poly']

        results = {}
        results['penal'] = []
        results['kernel'] = []

        results['accu_cv'] = []
        results['accu_te'] = []
        results['recall_cv'] = []
        results['recall_te'] = []
        results['preci_cv'] = []
        results['preci_te'] = []
        results['f1_cv'] = []
        results['f1_te'] = []

        count = 0
        for kernel_ in Kernels:
            for penal_ in Penalizations:
                print('+++++++Case = ', count)
                clf = SVC(kernel=kernel_,
                          C=penal_,
                          gamma='auto',
                          verbose=False,
                          max_iter=100000,
                          random_state=config['rs'])

                scores = cross_validate(clf,
                                        X_train,
                                        y_train,
                                        cv=config['cv'],
                                        scoring=('accuracy', 'recall',
                                                 'precision', 'f1'))

                clf.fit(X_train, y_train)
                Pred_y_test = clf.predict(X_test)

                score_test_accu = accuracy_score(y_test, Pred_y_test)
                score_test_recall = recall_score(y_test, Pred_y_test)
                score_test_preci = precision_score(y_test, Pred_y_test)
                score_test_f1 = f1_score(y_test, Pred_y_test)

                results['penal'].append(penal_)
                results['kernel'].append(kernel_)

                results['accu_cv'].append(scores['test_accuracy'].mean())
                results['recall_cv'].append(scores['test_recall'].mean())
                results['preci_cv'].append(scores['test_precision'].mean())
                results['f1_cv'].append(scores['test_f1'].mean())

                results['accu_te'].append(score_test_accu)
                results['recall_te'].append(score_test_recall)
                results['preci_te'].append(score_test_preci)
                results['f1_te'].append(score_test_f1)

                count += 1

        #+++Save results
        config['features'] = Features
        config['filename'] = filename

        name = datetime.now().strftime("%Y%m%d_%H%M%S")
        print(name)
        save_pickle('config_' + name + '.pkl', config)

        DataFr = pd.DataFrame(data=results, index=None)
        with pd.ExcelWriter('results_' + name + '.xlsx') as writer:
            DataFr.to_excel(writer, sheet_name='SVM_Learn')
        print('Result OK')

    elif config['mode'] == 'learn_oneclass':
        #+++Load data
        if config['path'] == None:
            root = Tk()
            root.withdraw()
            root.update()
            filepath = filedialog.askopenfilename()
            root.destroy()
            filename = os.path.basename(filepath)
        else:
            filepath = config['path']
            filename = os.path.basename(filepath)

        #+++Construct features matrix and label vector
        myDF = pd.read_csv(filepath)
        mydict = myDF.to_dict(orient='list')
        n = len(mydict['Label'])
        y = np.ones(n)
        y = [
            int(y[i]) if mydict['Label'][i] == 'Concrete' else -1
            for i in range(n)
        ]

        X = []
        Features = [
            'Area_under_curve', 'Crest-Factor', 'Energy', 'Kurtosis',
            'Peak_amplitude', 'RMS', 'Ring_down', 'Signal_strength', 'Skewnes',
            'StDev', 'Variance'
        ]
        k = 0
        for key in mydict.keys():
            if key in Features:
                X.append(mydict[key])
            k += 1
        X = np.array(X)
        X = np.transpose(X)

        #+++Train/Test Split
        if config['stratify'] == True:
            X_train, X_test, y_train, y_test = train_test_split(
                X,
                y,
                test_size=config['test_size'],
                random_state=config['rs'],
                stratify=y)
        else:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=config['test_size'], random_state=config['rs'])

        #+++Scaler
        if config['scaler'] == True:
            print('With standard scaler')
            scaler = StandardScaler()
            scaler.fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)

        #+++PCA
        if config['pca'] != None:
            pca = PCA(n_components=config['pca'])
            pca.fit(X_train)
            print('PCA results: ', pca.explained_variance_ratio_)
            X_train = pca.transform(X_train)
            X_test = pca.transform(X_test)

        #+++Hyperparameters for Grid search
        Nus = [0.1, 0.5, 0.9]
        Kernels = ['linear', 'rbf', 'poly']

        results = {}
        results['nu'] = []
        results['kernel'] = []

        results['accu_cv'] = []
        results['accu_te'] = []
        results['recall_cv'] = []
        results['recall_te'] = []
        results['preci_cv'] = []
        results['preci_te'] = []
        results['f1_cv'] = []
        results['f1_te'] = []
        results['Baccu_cv'] = []
        results['Baccu_te'] = []
        results['Bpreci_cv'] = []
        results['Bpreci_te'] = []

        count = 0
        for kernel_ in Kernels:
            for nu_ in Nus:
                print('+++++++Case = ', count)
                clf = OneClassSVM(kernel=kernel_,
                                  nu=nu_,
                                  gamma='auto',
                                  verbose=False,
                                  max_iter=100000)

                # scores = cross_validate(clf, X_train, y_train, cv=config['cv'], scoring=('accuracy', 'recall', 'precision', 'f1'))
                scores = cross_validate(
                    clf,
                    X_train,
                    y_train,
                    cv=config['cv'],
                    scoring=('accuracy', 'recall', 'precision', 'f1',
                             'balanced_accuracy', 'average_precision'))

                clf.fit(X_train, y_train)
                Pred_y_test = clf.predict(X_test)

                score_test_accu = accuracy_score(y_test, Pred_y_test)
                score_test_recall = recall_score(y_test, Pred_y_test)
                score_test_preci = precision_score(y_test, Pred_y_test)
                score_test_f1 = f1_score(y_test, Pred_y_test)
                score_test_Baccu = balanced_accuracy_score(y_test, Pred_y_test)
                score_test_Bpreci = average_precision_score(
                    y_test, Pred_y_test)

                # results['penal'].append(penal_)
                results['nu'].append(nu_)
                results['kernel'].append(kernel_)

                results['accu_cv'].append(scores['test_accuracy'].mean())
                results['recall_cv'].append(scores['test_recall'].mean())
                results['preci_cv'].append(scores['test_precision'].mean())
                results['f1_cv'].append(scores['test_f1'].mean())
                results['Baccu_cv'].append(
                    scores['test_balanced_accuracy'].mean())
                results['Bpreci_cv'].append(
                    scores['test_average_precision'].mean())

                results['accu_te'].append(score_test_accu)
                results['recall_te'].append(score_test_recall)
                results['preci_te'].append(score_test_preci)
                results['f1_te'].append(score_test_f1)
                results['Baccu_te'].append(score_test_Baccu)
                results['Bpreci_te'].append(score_test_Bpreci)

                count += 1

        #+++Save results
        config['features'] = Features
        config['filename'] = filename

        name = datetime.now().strftime("%Y%m%d_%H%M%S")
        print(name)
        save_pickle('config_' + name + '.pkl', config)

        DataFr = pd.DataFrame(data=results, index=None)
        with pd.ExcelWriter('results_' + name + '.xlsx') as writer:
            DataFr.to_excel(writer, sheet_name='SVM_Learn')
        print('Result OK')

    else:
        print('error mode')

    return
Example #9
score = 'neg_mean_absolute_error'

gscv = GridSearchCV(pipe, param_grid, cv=cv, scoring=score)
gscv.fit(X_train, y_train)
print_gscv_score(gscv)

y_pred = gscv.predict(X_train)
print('train data: ', end="")
print_score(y_train, y_pred)
# visualize
fig = yyplot(y_train, y_pred)

#%%
# Novelty detection by One Class SVM with optimized hyperparameter
clf = OneClassSVM(nu=0.003,
                  kernel=gscv.best_params_['model__kernel'],
                  gamma=gscv.best_params_['model__gamma'])
clf.fit(X_train)

y_pred = gscv.predict(X_test)  # predicted y
reliability = clf.predict(X_test)  # outliers = -1

data = []
output = 'test2.csv'
for i in range(len(X_test)):
    satom1 = periodic_table.get_el_sp(int(X_test[i][0]))
    satom2 = periodic_table.get_el_sp(int(X_test[i][1]))
    natom1 = int(X_test[i][2])
    natom2 = int(X_test[i][3])
    str_mat = str(satom1) + str(natom1) + str(satom2) + str(natom2)
    formula = Composition(str_mat).reduced_formula
Example #10
def evaluate_authentication( df, data_type, representation_type, verbose = False, roc_data = False, roc_data_filename = TEMP_NAME):
    print(df.shape)
    userids = create_userids( df )
    NUM_USERS = len(userids)
    auc_list = list()
    eer_list = list()
    global_positive_scores = list()
    global_negative_scores = list()
    for i in range(0,NUM_USERS):
        userid = userids[i]
        user_train_data = df.loc[ df.iloc[:, -1].isin([userid]) ]
        # Select data for training
        user_train_data = user_train_data.drop(user_train_data.columns[-1], axis=1)
        user_array = user_train_data.values
 
        num_samples = user_array.shape[0]
        train_samples = (int)(num_samples * 0.66) + 1
        test_samples = num_samples - train_samples
        if (verbose == True):
            print(str(userid)+". #train_samples: "+str(train_samples)+"\t#test_samples: "+ str(test_samples))
        user_train = user_array[0:train_samples,:]
        user_test = user_array[train_samples:num_samples,:]
     
        other_users_data = df.loc[~df.iloc[:, -1].isin([userid])]
        other_users_data = other_users_data.drop(other_users_data.columns[-1], axis=1)
        other_users_array = other_users_data.values   
        
        clf = OneClassSVM(gamma='scale')
        clf.fit(user_train)
 
        positive_scores = clf.score_samples(user_test)
        negative_scores =  clf.score_samples(other_users_array)   
        
        # Aggregating positive scores
        y_pred_positive = positive_scores
        for i in range(len(positive_scores) - AGGREGATE_BLOCK_NUM + 1):
            y_pred_positive[i] = np.average(y_pred_positive[i : i + AGGREGATE_BLOCK_NUM], axis=0)

        # Aggregating negative scores
        y_pred_negative = negative_scores
        for i in range(len(negative_scores) - AGGREGATE_BLOCK_NUM + 1):
            y_pred_negative[i] = np.average(y_pred_negative[i : i + AGGREGATE_BLOCK_NUM], axis=0)

        auc, eer,_,_ = compute_AUC_EER(y_pred_positive, y_pred_negative)
        
        if SCORE_NORMALIZATION == True:
            positive_scores, negative_scores = score_normalization(positive_scores, negative_scores)
        global_positive_scores.extend(positive_scores)
        global_negative_scores.extend(negative_scores)

        if  verbose == True:
            print(str(userid)+", "+ str(auc)+", "+str(eer)+"\n" )
         
        auc_list.append(auc)
        eer_list.append(eer) 
    print('AUC  mean : %7.4f, std: %7.4f' % ( np.mean(auc_list), np.std(auc_list)) )
    print('EER  mean:  %7.4f, std: %7.4f' % ( np.mean(eer_list), np.std(eer_list)) )
    
    print("#positives: "+str(len(global_positive_scores)))
    print("#negatives: "+str(len(global_negative_scores)))

    global_auc, global_eer, fpr, tpr = compute_AUC_EER(global_positive_scores, global_negative_scores)
    
    filename = 'output_png/scores_'+ str(data_type.value)+ '_' + str(representation_type.value) 
    if SCORES == True:
        # ****************************************************************************************
        plot_scores(global_positive_scores, global_negative_scores, filename, title='Scores distribution')
        # ****************************************************************************************

    if( roc_data == True ):
        dict = {'FPR': fpr, 'TPR': tpr}
        df = pd.DataFrame(dict) 
        df.to_csv(roc_data_filename, index=False)

        words = roc_data_filename.split('/')
        auc_eer_data_filename = words[0] +'/auc_eer_' + words[ 1 ]
        dict = {'AUC': auc_list, 'EER': eer_list}
        df = pd.DataFrame(dict) 
        df.to_csv(auc_eer_data_filename, index=False)
        
    print("Global AUC: "+str(global_auc))
    print("Global EER: "+str(global_eer))
    return auc_list, eer_list
Example #11
def model_ocsvm(train_x, test_x):
    model = OneClassSVM(gamma='auto', kernel='linear')
    model.fit(train_x)
    pred = model.predict(test_x)
    return model, pred
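
A short usage sketch for model_ocsvm above with synthetic data; the arrays, shapes and values are illustrative only:

import numpy as np

rng = np.random.RandomState(0)
train_x = rng.normal(size=(200, 4))                      # "normal" observations
test_x = np.vstack([rng.normal(size=(20, 4)),
                    rng.normal(loc=6.0, size=(5, 4))])   # last rows lie far away

model, pred = model_ocsvm(train_x, test_x)
print(pred)  # +1 for inliers, -1 for suspected outliers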
Example #12
def evaluate_authentication_cross_day( df1, df2, data_type, representation_type, verbose = False, roc_data = False, roc_data_filename = TEMP_NAME ):
    print("Session 1 shape: "+str(df1.shape))
    print("Session 2 shape: "+str(df2.shape))
        
    userids = create_userids( df1 )
    NUM_USERS = len(userids)
    
    global_positive_scores = list()
    global_negative_scores = list()
    auc_list = list()
    eer_list = list()
    for i in range(0,NUM_USERS):
        userid = userids[i]

        user_session1_data = df1.loc[df1.iloc[:, -1].isin([userid])]
        user_session2_data = df2.loc[df2.iloc[:, -1].isin([userid])]
      
        user_session1_data = user_session1_data.drop(user_session1_data.columns[-1], axis=1)
        user_session1_array = user_session1_data.values
 
        # positive test data
        user_session2_data =  user_session2_data.drop(user_session2_data.columns[-1], axis=1) 
        user_session2_array = user_session2_data.values

        # negative test data
        other_users_session2_data = df2.loc[~df2.iloc[:, -1].isin([userid])]
        other_users_session2_data = other_users_session2_data.drop(other_users_session2_data.columns[-1], axis=1)
        other_users_session2_array = other_users_session2_data.values   
        
        clf = OneClassSVM(gamma='scale')
        clf.fit(user_session1_array)
 
        positive_scores = clf.score_samples(user_session2_array)
        negative_scores =  clf.score_samples(other_users_session2_array)   

        # Aggregating positive scores
        y_pred_positive = positive_scores
        for i in range(len(positive_scores) - AGGREGATE_BLOCK_NUM + 1):
            y_pred_positive[i] = np.average(y_pred_positive[i : i + AGGREGATE_BLOCK_NUM], axis=0)

        # Aggregating negative scores
        y_pred_negative = negative_scores
        for i in range(len(negative_scores) - AGGREGATE_BLOCK_NUM + 1):
            y_pred_negative[i] = np.average(y_pred_negative[i : i + AGGREGATE_BLOCK_NUM], axis=0)

        auc, eer, _, _ = compute_AUC_EER(y_pred_positive, y_pred_negative)

        
        # auc, eer = compute_AUC_EER(positive_scores, negative_scores )
        if SCORE_NORMALIZATION == True:
            positive_scores, negative_scores = score_normalization(positive_scores, negative_scores)

        global_positive_scores.extend(positive_scores)
        global_negative_scores.extend(negative_scores)

        

        if verbose == True:
            print(str(userid)+": "+ str(auc)+", "+str(eer) )
        auc_list.append(auc)
        eer_list.append(eer)
    print('AUC  mean : %7.4f, std: %7.4f' % ( np.mean(auc_list), np.std(auc_list)) )
    print('EER  mean:  %7.4f, std: %7.4f' % ( np.mean(eer_list), np.std(eer_list)) )

    
    global_auc, global_eer, fpr, tpr = compute_AUC_EER(global_positive_scores, global_negative_scores)
    
    filename = 'output_png/scores_'+ str(data_type.value)+ '_' + str(representation_type.value) 
    if SCORES == True:
        # ****************************************************************************************
        plot_scores(global_positive_scores, global_negative_scores, filename, title='Scores distribution')
        # ****************************************************************************************

    if( roc_data == True ):
        dict = {'FPR': fpr, 'TPR': tpr}
        df = pd.DataFrame(dict) 
        df.to_csv(roc_data_filename, index=False)

    print("Global AUC: "+str(global_auc))
    print("Global EER: "+str(global_eer))
    return auc_list, eer_list
Example #13
def evaluate_authentication_skilledforgeries( df_genuine, df_forgery, data_type, representation_type, verbose = False, roc_data = False, roc_data_filename = TEMP_NAME):
    print("Genuine shape: "+str(df_genuine.shape))
    print("Forgery shape: "+str(df_forgery.shape))
    print(df_forgery.shape)
    userids = create_userids( df_genuine )
    NUM_USERS = len(userids)
    
    global_positive_scores = list()
    global_negative_scores = list()
    auc_list = list()
    eer_list = list()
    for i in range(0,NUM_USERS):
        userid = userids[i]
        user_genuine_data = df_genuine.loc[df_genuine.iloc[:, -1].isin([userid])]
        user_forgery_data = df_forgery.loc[df_forgery.iloc[:, -1].isin([userid])]
      
        user_genuine_data = user_genuine_data.drop(user_genuine_data.columns[-1], axis=1)
        user_genuine_array = user_genuine_data.values
 
        num_samples = user_genuine_array.shape[0]
        train_samples = (int)(num_samples * 0.66)
        test_samples = num_samples - train_samples
        # MCYT
        # train_samples = 15
        # test_samples = 10

        user_genuine_train = user_genuine_array[0:train_samples,:]
        user_genuine_test = user_genuine_array[train_samples:num_samples,:]
     
        user_forgery_data =  user_forgery_data.drop(user_forgery_data.columns[-1], axis=1) 
        user_forgery_array = user_forgery_data.values

        clf = OneClassSVM(gamma='scale')
        clf.fit(user_genuine_train)
 
        positive_scores = clf.score_samples(user_genuine_test)
        negative_scores =  clf.score_samples(user_forgery_array)   
        auc, eer,_,_ = compute_AUC_EER(positive_scores, negative_scores )
 
        if SCORE_NORMALIZATION == True:
            positive_scores, negative_scores = score_normalization(positive_scores, negative_scores)
        global_positive_scores.extend(positive_scores)
        global_negative_scores.extend(negative_scores)
        
        if  verbose == True:
            print(str(userid)+": "+ str(auc)+", "+str(eer) )
        auc_list.append(auc)
        eer_list.append(eer)
    print('AUC  mean : %7.4f, std: %7.4f' % ( np.mean(auc_list), np.std(auc_list)) )
    print('EER  mean:  %7.4f, std: %7.4f' % ( np.mean(eer_list), np.std(eer_list)) )
  

    global_auc, global_eer, fpr, tpr = compute_AUC_EER(global_positive_scores, global_negative_scores)
    
    filename = 'output_png/scores_'+ str(data_type.value)+ '_' + str(representation_type.value)
    if SCORES == True:
        # ****************************************************************************************
        plot_scores(global_positive_scores, global_negative_scores, filename, title='Scores distribution')
        # ****************************************************************************************

    if( roc_data == True ):
        dict = {'FPR': fpr, 'TPR': tpr}
        df = pd.DataFrame(dict) 
        df.to_csv(roc_data_filename, index=False)

    print("Global AUC: "+str(global_auc))
    print("Global EER: "+str(global_eer))
Example #14
def outlierDetection(Xtrain, Xtest):
    outlierDetector = OneClassSVM(kernel='rbf', gamma = 0.1, nu = 0.001)
    outlierDetector.fit(Xtrain)
    return outlierDetector.predict(Xtest)
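
An illustrative call to outlierDetection above with toy data; the values are chosen only to show the interface:

import numpy as np

rng = np.random.RandomState(42)
Xtrain = rng.normal(size=(500, 3))
Xtest = np.vstack([rng.normal(size=(10, 3)), [[8.0, 8.0, 8.0]]])
print(outlierDetection(Xtrain, Xtest))  # -1 marks suspected outliers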
Example #15
    'n_jobs': 20,
    'trainfrac': 0.5,
    'sim_reps': 2,
    'score_thresh': 0.5
})

X, Y, reg_ind, anom_ind = dl.loaddata()

iforest = IsolationForest(n_estimators=params['n_estimators'],
                          max_samples=params['max_samples'],
                          max_features=params['max_features'],
                          n_jobs=params['n_jobs'],
                          behaviour='new',
                          contamination=0.001)

ocsvm = OneClassSVM(gamma='scale', nu=0.05)


def traintest(model, trainX, trainY, testX, testY):
    all_metrics = dict({'f1': [], 'precision': [], 'recall': [], 'mcc': []})
    model.fit(trainX)
    train_scores = model.score_samples(trainX)
    thresh = np.percentile(train_scores, 5)
    scores = model.score_samples(testX)
    predY = np.where(scores <= thresh, ANOM_LAB, REG_LAB)
    prec, recall, f1, _ = precision_recall_fscore_support(testY,
                                                          predY,
                                                          average="binary",
                                                          pos_label=ANOM_LAB)
    all_metrics['f1'] = f1
    all_metrics['precision'] = prec
Example #16
from data_parcing import data_parcing
from SVM import *

dp = data_parcing('dataset')
temp, hum, gas = dp.test_data_convert_format()

temp= np.array(temp)
data_temp_x= temp[:, 1].reshape(-1, 1)
data_temp_y= temp[:, 2]

data_temp_x, data_temp_y = data_split(temp)

train_temp_x, test_temp_x, train_temp_y, test_temp_y= train_test_split(data_temp_x, data_temp_y, test_size=0.3,
                                                                       random_state=123, shuffle=True)

model_temp= OneClassSVM(gamma='auto', kernel="linear")
model_temp.fit(train_temp_x)
pred_temp_y=model_temp.predict(test_temp_x)

cnt=0
print("len:", len(pred_temp_y))
for i in range(len(pred_temp_y)):
    if pred_temp_y[i]==-1:
        pred_temp_y[i]=0
        cnt+=1
print("cnt:", cnt)

test_temp_y = np.array(list(map(int, test_temp_y)))
acc= accuracy_score(test_temp_y, pred_temp_y)

# test_temp_y=np.array(test_temp_y)
Example #17
from os.path import dirname

from matplotlib.pyplot import plot, show, title
from numpy import array, genfromtxt
from sklearn.svm import OneClassSVM

if __name__ == "__main__":
    DATA_SET_1 = array(
        genfromtxt(dirname(__file__) + "\\" + "women.csv", delimiter=","))

    NU_VAL = 1 / 25
    GAMMA_VAL = 1 / 3500000000

    SVM_MODEL = OneClassSVM(nu=NU_VAL, gamma=GAMMA_VAL)

    SVM_MODEL.fit(DATA_SET_1)
    DATA_SET_1_PRED = SVM_MODEL.predict(DATA_SET_1)

    NORMAL = DATA_SET_1[DATA_SET_1_PRED == 1]
    ABNORMAL = DATA_SET_1[DATA_SET_1_PRED == -1]

    plot(NORMAL[:, 0], NORMAL[:, 1], "bx")
    plot(ABNORMAL[:, 0], ABNORMAL[:, 1], "ro")
    title("gamma = " + str(SVM_MODEL.gamma) + ", nu = " + str(SVM_MODEL.nu))
    show()
Example #18
LightGBM = LGBMClassifier(n_estimators=115,
                          num_leaves=65,
                          max_depth=15,
                          min_child_samples=40,
                          learning_rate=0.1,
                          boosting_type='gbdt',
                          objective='binary',
                          random_state=42,
                          n_jobs=-1,
                          silent=True)
Naive_Bayes = GaussianNB(var_smoothing=1e0)
One_Class_SVM = OneClassSVM(kernel='rbf',
                            degree=3,
                            gamma='scale',
                            coef0=0.0,
                            tol=0.001,
                            nu=0.05,
                            shrinking=True,
                            cache_size=200,
                            verbose=False,
                            max_iter=-1)
Isolation_Forest = IsolationForest(n_estimators=100,
                                   max_samples='auto',
                                   contamination='auto',
                                   max_features=1.0,
                                   bootstrap=True,
                                   n_jobs=-1,
                                   random_state=42)
Auto_Enc_LogReg = Log_Reg
Auto_Enc_LightGBM = LightGBM

score_met = 'average_precision'
Example #19
    def training(self):
        self.clf = OneClassSVM(kernel='rbf', gamma=26)
        self.clf.fit(self.train)
Example #20
plt.figure(figsize=(12, 10))  # set the figure size to 12 x 10
p=sns.heatmap(data.corr(), annot=True,cmap ='RdYlGn')

X=data.iloc[:,:-1].values
Y=data.iloc[:,-1].values


####### standard scale ########
sc=StandardScaler()
X=sc.fit_transform(X)

######### Outlier ##########

outliers_fraction = 0.01
outlier_model = OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma=0.01)
outlier_model.fit(X)
out = outlier_model.predict(X)


df = pd.DataFrame({'out_prediction':out})
df=df[df['out_prediction']==1]

b=set(df.index.values.tolist())
a=[]
for i in range(0,len(X)):
    if i in b:
        a.append(X[i])
X=np.array(a)   
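# Note (not in the original): the index-collection loop above keeps exactly the rows
# predicted as inliers, so it is equivalent to a single boolean mask over X, e.g.
#     X = X[out == 1]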

c=[]
Example #21
def base_experiment(config, ntrials=1, seed=123456789):
    """
	Run a single experiment, locally.
		
	@param config: The configuration parameters to use for the SP.
	
	@param ntrials: The number of times to repeat the experiment.
	
	@param seed: The random seed to use.
	
	@return: A tuple containing the percentage errors for the SP's training
	and testing results and the SVM's training and testing results,
	respectively.
	"""

    # Base parameters
    ntrain, ntest = 800, 200
    clf_th = 0.5

    # Seed numpy
    np.random.seed(seed)

    # Get the data
    (tr_x, tr_y), (te_x, te_y) = load_mnist()
    tr_x_0 = np.random.permutation(tr_x[tr_y == 0])
    x_tr = tr_x_0[:ntrain]
    x_te = tr_x_0[ntrain:ntrain + ntest]
    outliers = [
        np.random.permutation(tr_x[tr_y == i])[:ntest] for i in xrange(1, 10)
    ]

    # Metrics
    metrics = SPMetrics()

    # Get the metrics for the datasets
    u_x_tr = metrics.compute_uniqueness(x_tr)
    o_x_tr = metrics.compute_overlap(x_tr)
    c_x_tr = 1 - metrics.compute_distance(x_tr)
    u_x_te = metrics.compute_uniqueness(x_te)
    o_x_te = metrics.compute_overlap(x_te)
    c_x_te = 1 - metrics.compute_distance(x_te)
    u_y_te, o_y_te, c_y_te = [], [], []
    for outlier in outliers:
        u_y_te.append(metrics.compute_uniqueness(outlier))
        o_y_te.append(metrics.compute_overlap(outlier))
        c_y_te.append(1 - metrics.compute_distance(outlier))

    # Initialize the overall results
    sp_x_results = np.zeros(ntrials)
    sp_y_results = [np.zeros(ntrials) for _ in xrange(9)]
    svm_x_results = np.zeros(ntrials)
    svm_y_results = [np.zeros(ntrials) for _ in xrange(9)]

    # Iterate across the trials:
    for nt in xrange(ntrials):
        # Make a new seed
        seed2 = np.random.randint(1000000)
        config['seed'] = seed2

        # Create the SP
        sp = SPRegion(**config)

        # Fit the SP
        sp.fit(x_tr)

        # Get the SP's output
        sp_x_tr = sp.predict(x_tr)
        sp_x_te = sp.predict(x_te)
        sp_y_te = [sp.predict(outlier) for outlier in outliers]

        # Get the metrics for the SP's results
        u_sp_x_tr = metrics.compute_uniqueness(sp_x_tr)
        o_sp_x_tr = metrics.compute_overlap(sp_x_tr)
        c_sp_x_tr = 1 - metrics.compute_distance(sp_x_tr)
        u_sp_x_te = metrics.compute_uniqueness(sp_x_te)
        o_sp_x_te = metrics.compute_overlap(sp_x_te)
        c_sp_x_te = 1 - metrics.compute_distance(sp_x_te)
        u_sp_y_te, o_sp_y_te, c_sp_y_te = [], [], []
        for y in sp_y_te:
            u_sp_y_te.append(metrics.compute_uniqueness(y))
            o_sp_y_te.append(metrics.compute_overlap(y))
            c_sp_y_te.append(1 - metrics.compute_distance(y))

        # Log all of the metrics
        sp._log_stats('Input Base Class Train Uniqueness', u_x_tr)
        sp._log_stats('Input Base Class Train Overlap', o_x_tr)
        sp._log_stats('Input Base Class Train Correlation', c_x_tr)
        sp._log_stats('Input Base Class Test Uniqueness', u_x_te)
        sp._log_stats('Input Base Class Test Overlap', o_x_te)
        sp._log_stats('Input Base Class Test Correlation', c_x_te)
        sp._log_stats('SP Base Class Train Uniqueness', u_sp_x_tr)
        sp._log_stats('SP Base Class Train Overlap', o_sp_x_tr)
        sp._log_stats('SP Base Class Train Correlation', c_sp_x_tr)
        sp._log_stats('SP Base Class Test Uniqueness', u_sp_x_te)
        sp._log_stats('SP Base Class Test Overlap', o_sp_x_te)
        sp._log_stats('SP Base Class Test Correlation', c_sp_x_te)
        for i, (a, b, c, d, e, f) in enumerate(
                zip(u_y_te, o_y_te, c_y_te, u_sp_y_te, o_sp_y_te, c_sp_y_te),
                1):
            sp._log_stats('Input Novelty Class {0} Uniqueness'.format(i), a)
            sp._log_stats('Input Novelty Class {0} Overlap'.format(i), b)
            sp._log_stats('Input Novelty Class {0} Correlation'.format(i), c)
            sp._log_stats('SP Novelty Class {0} Uniqueness'.format(i), d)
            sp._log_stats('SP Novelty Class {0} Overlap'.format(i), e)
            sp._log_stats('SP Novelty Class {0} Correlation'.format(i), f)

        # Get average representation of the base class
        sp_base_result = np.mean(sp_x_tr, 0)
        sp_base_result[sp_base_result >= 0.5] = 1
        sp_base_result[sp_base_result < 1] = 0

        # Averaged results for each metric type
        u_sp_base_to_x_te = 0.
        o_sp_base_to_x_te = 0.
        c_sp_base_to_x_te = 0.
        u_sp, o_sp, c_sp = np.zeros(9), np.zeros(9), np.zeros(9)
        for i, x in enumerate(sp_x_te):
            xt = np.vstack((sp_base_result, x))
            u_sp_base_to_x_te += metrics.compute_uniqueness(xt)
            o_sp_base_to_x_te += metrics.compute_overlap(xt)
            c_sp_base_to_x_te += 1 - metrics.compute_distance(xt)

            for j, yi in enumerate(sp_y_te):
                yt = np.vstack((sp_base_result, yi[i]))
                u_sp[j] += metrics.compute_uniqueness(yt)
                o_sp[j] += metrics.compute_overlap(yt)
                c_sp[j] += 1 - metrics.compute_distance(yt)
        u_sp_base_to_x_te /= ntest
        o_sp_base_to_x_te /= ntest
        c_sp_base_to_x_te /= ntest
        for i in xrange(9):
            u_sp[i] /= ntest
            o_sp[i] /= ntest
            c_sp[i] /= ntest

        # Log the results
        sp._log_stats('Base Train to Base Test Uniqueness', u_sp_base_to_x_te)
        sp._log_stats('Base Train to Base Test Overlap', o_sp_base_to_x_te)
        sp._log_stats('Base Train to Base Test Correlation', c_sp_base_to_x_te)
        for i, j in enumerate(xrange(1, 10)):
            sp._log_stats('Base Train to Novelty {0} Uniqueness'.format(j),
                          u_sp[i])
            sp._log_stats('Base Train to Novelty {0} Overlap'.format(j),
                          o_sp[i])
            sp._log_stats('Base Train to Novelty {0} Correlation'.format(j),
                          c_sp[i])

        # Create an SVM
        clf = OneClassSVM(kernel='linear', nu=0.1, random_state=seed2)

        # Evaluate the SVM's performance
        clf.fit(x_tr)
        svm_x_te = len(np.where(clf.predict(x_te) == 1)[0]) / float(ntest) * 100
        svm_y_te = np.array([
            len(np.where(clf.predict(outlier) == -1)[0]) / float(ntest) * 100
            for outlier in outliers
        ])

        # Perform classification using overlap as the feature
        # -- The overlap must be above 50%
        clf_x_te = 0.
        clf_y_te = np.zeros(9)
        for i, x in enumerate(sp_x_te):
            xt = np.vstack((sp_base_result, x))
            xo = metrics.compute_overlap(xt)
            if xo >= clf_th: clf_x_te += 1

            for j, yi in enumerate(sp_y_te):
                yt = np.vstack((sp_base_result, yi[i]))
                yo = metrics.compute_overlap(yt)
                if yo < clf_th: clf_y_te[j] += 1
        clf_x_te = (clf_x_te / ntest) * 100
        clf_y_te = (clf_y_te / ntest) * 100

        # Store the results as errors
        sp_x_results[nt] = 100 - clf_x_te
        sp_y_results[nt] = 100 - clf_y_te
        svm_x_results[nt] = 100 - svm_x_te
        svm_y_results[nt] = 100 - svm_y_te

        # Log the results
        sp._log_stats('SP % Correct Base Class', clf_x_te)
        sp._log_stats('SVM % Correct Base Class', svm_x_te)
        for i, j in enumerate(xrange(1, 10)):
            sp._log_stats('SP % Correct Novelty Class {0}'.format(j),
                          clf_y_te[i])
            sp._log_stats('SVM % Correct Novelty Class {0}'.format(j),
                          svm_y_te[i])
        sp._log_stats('SP % Mean Correct Novelty Class', np.mean(clf_y_te))
        sp._log_stats('SVM % Mean Correct Novelty Class', np.mean(svm_y_te))
        sp._log_stats('SP % Adjusted Score',
                      (np.mean(clf_y_te) * clf_x_te) / 100)
        sp._log_stats('SVM % Adjusted Score',
                      (np.mean(svm_y_te) * svm_x_te) / 100)

    return sp_x_results, sp_y_results, svm_x_results, svm_y_results
Example #22
    def test_one_class_svm(self):
        model = OneClassSVM()
        dump_one_class_classification(model, folder=self.folder)
Example #23
File: osvm.py Project: alisonmoura/tcc
def run(data_class, out_class=[], printer=Printer()):
    has_out = np.any(out_class)
    start = time.time()

    print('Target data shape: (%d,%d)' %
          (data_class.shape[0], data_class.shape[1]))
    X = np.delete(data_class, -1, axis=1)
    y = data_class[:, -1]
    # print(y)

    if (has_out):
        print('Has out class: Yes')
        print('Out data shape: (%d,%d)' %
              (out_class.shape[0], out_class.shape[1]))
        X_out = np.delete(out_class, -1, axis=1)
        y_out = out_class[:, -1]
        # print(y_out)

    print(data_class.shape, X.shape, y.shape)

    clf = OneClassSVM(gamma='scale', nu=0.01)
    kf = KFold(n_splits=5)
    kf.get_n_splits(X)

    param_dist = {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'gamma': ['scale', 'auto'],
        'nu': stats.uniform(.0, .05),
        'shrinking': [True, False]
    }

    n_inter = 20
    clf = RandomizedSearchCV(clf,
                             param_distributions=param_dist,
                             n_iter=n_inter,
                             cv=5,
                             scoring="accuracy")

    f1_scores = []
    precision_scores = []
    recall_scores = []
    accuracy_scores = []

    run_time_start = time.time()

    print(kf)
    for train_index, test_index in kf.split(X):
        round_time_start = time.time()

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        if (has_out):
            X_test = np.concatenate((X_test, X_out))
            y_test = np.concatenate((y_test, y_out))

        clf = clf.fit(X_train, y_train)
        y_pred_test = clf.predict(X_test)

        # print(y_test)
        # print(y_pred_test)

        round_time_end = time.time()
        n_error_test = (y_pred_test != y_test).sum()
        f1_test_score = f1_score(y_test, y_pred_test, pos_label=-1)
        precision_test_score = precision_score(y_test, y_pred_test)
        recall_test_score = recall_score(y_test, y_pred_test)
        accuracy_test_score = accuracy_score(y_test, y_pred_test)

        printer.print_write("\n=============ITERATION SCORES=============\n")

        printer.print_write(
            tabulate([
                ['Metric', 'Value'],
                ['Test error:', '{:d}'.format(n_error_test)],
                ['Test F1 Score:', '%.3f' % f1_test_score],
                ['Test Precision Score:',
                 '%.3f' % precision_test_score],
                ['Test Recall Score:',
                 '%.3f' % recall_test_score],
                ['Test Accuracy Score:',
                 '%.3f' % accuracy_test_score],
                [
                    'Iteration time:',
                    '%.2f seconds' % (round_time_end - round_time_start)
                ],
            ],
                     headers="firstrow"))

        # printer.print_write("Test error: {:d}".format(n_error_test))
        # printer.print_write('Test F1 Score: %.3f' % f1_test_score)
        # printer.print_write('Test Precision Score: %.3f' % precision_test_score)
        # printer.print_write('Test Recall Score: %.3f' % recall_test_score)
        # printer.print_write('Test Accuracy Score: %.3f' % accuracy_test_score)
        # printer.print_write("Iteration time: %.2f seconds" % (round_time_end - round_time_start))

        f1_scores.append(f1_test_score)
        precision_scores.append(precision_test_score)
        recall_scores.append(recall_test_score)
        accuracy_scores.append(accuracy_test_score)

    run_time_end = time.time()
    f1_scores = np.array(f1_scores)
    precision_scores = np.array(precision_scores)
    recall_scores = np.array(recall_scores)

    printer.print_write("\n=============FINAL SCORES=============\n")
    printer.print_write("F1 Score Final: %f" %
                        (f1_scores.sum() / f1_scores.size))
    printer.print_write("Precision Score Final: %f" %
                        (precision_scores.sum() / precision_scores.size))
    printer.print_write("Recall Score Final: %f" %
                        (recall_scores.sum() / recall_scores.size))
    printer.print_write("Accuracy Score Final: %f" %
                        (accuracy_scores.sum() / accuracy_scores.size))
    printer.print_write("Final time: %.2f seconds" %
                        (run_time_end - run_time_start))

    end = time.time()
    printer.print_write("\n=============TIME=============\n")
    printer.print_write("It took: %.2f seconds" % (end - start))
Example #24
    df = pd.read_csv('data/datalab_persona_cont.csv')

    X_outliers = df[df['FKSmoker'] == 0]

    X_outliers.drop(['FKSmoker'], inplace=True, axis=1)

    X = df[df['FKSmoker'] == 1]

    X.drop(['FKSmoker'], inplace=True, axis=1)

    X_train = X.sample(frac=0.9)

    X_test = X.drop(df.index[list(X_train.index)])

    # fit the model
    clf = OneClassSVM(gamma=0.3, kernel='rbf')
    clf.fit(X_train)
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    y_pred_outliers = clf.predict(X_outliers)

    outliers = pd.Series(y_pred_test, name='verdict')

    print(len(outliers[outliers == 1]))
    print(len(y_pred_test))

    outliers = pd.Series(y_pred_outliers, name='verdict')

    print(len(outliers[outliers == -1]))
    print(len(y_pred_outliers))
Example #25
def test_explain_one_class_svm_unsupported():
    X = np.array([[0,0], [0, 1], [5, 3], [93, 94], [90, 91]])
    clf = OneClassSVM().fit(X)
    expl = explain_weights(clf)
    assert 'supported' in expl.error
Example #26
print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
y_true, y_pred = y_test, gscv.predict(X_test)
print(classification_report(y_true, y_pred))
print(confusion_matrix(y_test, y_pred))
print()

#%%

# Novelty detection by One Class SVM with optimized hyperparameter
clf = OneClassSVM(nu=0.10,
                  kernel=gscv.best_params_['kernel'],
                  gamma=gscv.best_params_['gamma'])
clf.fit(X_train)

y_pred = gscv.predict(X_test)  # prediction
reliability = clf.predict(X_test)  # outliers = -1
results = np.c_[y_pred, y_test, reliability]
#print('y_predicted, y_true, outliers = -1')
#print(y_tot)
#print()

#%%
df = pd.DataFrame(results, columns=list('ABC'))
df_in_ = df[df.C == 1]
df_out = df[df.C == -1]
print('Inlier  sample, number of good/bad predictions: {} {}'.format(
Example #27
from sklearn import preprocessing
from sklearn.svm import OneClassSVM

from src.filereader.FileReader import FileReader

FILE_PATH = "../../resource/PostureEntry.csv"
N_VALIDATIONS = 4

samples, labels, label_names = FileReader.read(FILE_PATH)

samples = preprocessing.scale(samples)

labels = labels.ravel()

trainsamples = samples[(labels == 0) | (labels == 2) | (labels == 4), :]
testsamples = samples[(labels == 1) | (labels == 3) | (labels == 5), :]

labels[(labels == 0) | (labels == 2) | (labels == 4)] = 0
labels[(labels != 0)] = 1

clf = OneClassSVM()
clf.fit(trainsamples)

y_train = clf.predict(trainsamples)
y_test = clf.predict(testsamples)

error_train = y_train[y_train == -1].size / y_train.size
error_test = y_test[y_test == -1].size / y_test.size

print("Train error " + str(error_train))
print("Test error " + str(error_test))
Example #28
data_train = pickle.load(concating_data_train)
concating_data_train.close()

concating_data_test = open(
    "/home/alperen/Desktop/Thesis_Application/data_test", 'rb')
data_test = pickle.load(concating_data_test)
concating_data_test.close()

pickle_test_label = open("/home/alperen/Desktop/Thesis_Application/test_label",
                         'rb')
test_label = pickle.load(pickle_test_label)
pickle_test_label.close()

# train and predict phase

o_svm = OneClassSVM()  #kernel='rbf', gamma=0.001, nu=0.01
o_svm.fit(data_train)
anomaly_detect = o_svm.predict(data_test)

pickle_predicted_data = open(
    "/home/alperen/Desktop/Thesis_Application/predicted_data", 'wb')
predicted_data = pickle.dump(anomaly_detect, pickle_predicted_data)
pickle_predicted_data.close()

# pickle_predicted_data = open("/home/alperen/Desktop/Thesis_Application/predicted_data", 'rb')
# anomaly_detect = pickle.load(pickle_predicted_data)
# pickle_predicted_data.close()

unique, counts = np.unique(anomaly_detect, return_counts=True)
print(np.asarray((unique, counts)).T)
Example #29
def oneclass_svm(dataset, kernel, nu):
    svm = OneClassSVM(kernel=kernel, nu=nu).fit(dataset)
    return svm
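
An illustrative use of oneclass_svm above; the toy data and the kernel/nu values are arbitrary:

import numpy as np

dataset = np.random.RandomState(1).normal(size=(100, 2))
svm = oneclass_svm(dataset, kernel='rbf', nu=0.05)
print(svm.predict(dataset[:5]))  # +1 inlier / -1 outlier for the first rows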
Example #30
                                            "true").replace(False, "false")
    store_csv(pandas.concat([decisionFunction, outlier], axis=1),
              name + ".csv")


if "Housing" in datasets:
    build_iforest_housing(IsolationForest(random_state=13),
                          "IsolationForestHousing")


def build_ocsvm_housing(svm, name):
    mapper = DataFrameMapper([(housing_X.columns.values, ContinuousDomain())])
    pipeline = Pipeline([("mapper", mapper), ("scaler", MaxAbsScaler()),
                         ("estimator", svm)])
    pipeline.fit(housing_X)
    pipeline = make_pmml_pipeline(pipeline, housing_X.columns.values)
    store_pkl(pipeline, name + ".pkl")
    decisionFunction = DataFrame(pipeline.decision_function(housing_X),
                                 columns=["decisionFunction"])
    outlier = DataFrame(pipeline.predict(housing_X) <= 0,
                        columns=["outlier"
                                 ]).replace(True,
                                            "true").replace(False, "false")
    store_csv(pandas.concat([decisionFunction, outlier], axis=1),
              name + ".csv")


if "Housing" in datasets:
    build_ocsvm_housing(OneClassSVM(nu=0.10, random_state=13),
                        "OneClassSVMHousing")