Example #1
def classifiers(data, n_PCA, save_dir):
    import warnings
    warnings.filterwarnings("ignore")
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn import svm
    from sklearn.preprocessing import StandardScaler
    from sklearn.cluster import DBSCAN
    from sklearn.decomposition import PCA
    from sklearn.neighbors import NearestNeighbors
    import seaborn as sns
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.model_selection import ShuffleSplit
    from sklearn.neural_network import MLPClassifier
    from sklearn.naive_bayes import GaussianNB

    dataset_path = "/home/det_tesi/sgarofalo/GridSearchGNB/train_dataset/dataset.csv"
    dataset = pd.read_csv(dataset_path)
    colors = ['r', 'b']
    classes = [0, 1]  # 0 = Not RTP and 1 = RTP
    columns = dataset.columns[:-1]
    cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
    X = dataset.iloc[:, :-1]
    y = dataset.iloc[:, -1].map({'RTP': 1, 'Not RTP': 0}).to_numpy()
    X = StandardScaler().fit_transform(X)
    print("train_dataset shape: " + str(dataset.shape))
    print("train_dataset classes distribution: %.2f%% RTP" %
          (100 * len(y[y == 1]) / len(y)))

    # #############################################################################
    # Correlation Matrix
    # #############################################################################
    df_corr = pd.DataFrame(data=np.column_stack((X, y)),
                           columns=dataset.columns)
    corr = df_corr.corr()
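    # Mask the upper triangle: the correlation matrix is symmetric, so each
    # coefficient only needs to appear once in the heatmap.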
    mask = np.zeros_like(corr)
    mask[np.triu_indices_from(mask)] = True
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    plt.figure(figsize=(10, 10))
    ax = sns.heatmap(corr,
                     xticklabels=corr.columns.values,
                     yticklabels=corr.columns.values,
                     annot=True,
                     cmap="YlGnBu",
                     mask=mask)
    t = "Correlation matrix"
    save_photo(save_dir, t, str(n_PCA) + " PCA")

    # #############################################################################
    # Plot Data Using PCA
    # #############################################################################
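    # Note: the plotting below uses PC[:, 0:2], so it assumes n_PCA >= 2.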
    myModel = PCA(n_PCA)
    PC = myModel.fit_transform(X)
    principalDf = pd.DataFrame(
        data=np.column_stack((PC[:, 0:2], y)),
        columns=['principal component 1', 'principal component 2', 'label'])
    fig = plt.figure(figsize=(13, 13))
    #plt.scatter(principalDf.iloc[:, 0], principalDf.iloc[:, 1])
    for target, color in zip(classes, colors):
        indicesToKeep = principalDf['label'] == target
        plt.scatter(principalDf.loc[indicesToKeep, 'principal component 1'],
                    principalDf.loc[indicesToKeep, 'principal component 2'],
                    c=color,
                    s=40)
    plt.xlabel('Principal Component 1', fontsize=15)
    plt.ylabel('Principal Component 2', fontsize=15, labelpad=-10)
    plt.legend(classes)
    plt.grid()
    t = "Data Plotting using PCA"
    plt.title(t, fontsize=10)
    save_photo(save_dir, t, str(n_PCA) + " PCA")
    '''
    # #############################################################################
    # Feature characterization
    # #############################################################################
    labels = ['Not RTP', 'RTP']
    for feature in columns:
        for label in labels:
            if 'interarrival' in feature: color='r'
            elif 'len_udp' in feature: color = '#815EA4'
            elif "interlength" in feature: color = '#1E8449'
            elif "rtp_inter" in feature: color = 'c'
            elif "kbps" in feature: color = '#A82828'
            elif "num_packets" in feature: color = '#6C3483'

            plt.figure(figsize=(13,8))
            plt.grid()
            dataset[dataset.label == label][feature].hist(bins=50, density=True, color=color)
            t = feature + ' hist ' + label
            plt.title(t, fontsize=20)
            plt.tight_layout()
            save_photo(save_dir, t, str(n_PCA)+" PCA")

            plt.figure(figsize=(13,8))
            xplot, yplot = ecdf(dataset[dataset.label == label][feature])
            plt.plot(xplot, yplot, lw=3, color=color)
            plt.grid()
            t = feature + ' CDF ' + label
            plt.title(t, fontsize=20)
            plt.tight_layout()
            save_photo(save_dir, t, str(n_PCA)+" PCA")
    
    # #############################################################################
    # SVM
    # #############################################################################
    X_train, X_test, y_train, y_test = train_test_split(PC, y, test_size=0.3, random_state=1)
    model = svm.SVC()
    C = [0.01, 0.1, 1, 10, 100, 1000]
    kernel = ['rbf']
    gamma = [0.001, 0.01, 0.1, 1, 10, 100]
    params = {'C': C, 'kernel': kernel, 'gamma': gamma}
    SVM = GridSearchCV(model, params, cv=5, n_jobs=-1)
    SVM.fit(X_train, y_train)
    SVM = SVM.best_estimator_
    t = "SVM Learning Curve"
    plot_learning_curve(SVM, t, X, y, ylim=(0.0, 1.10), cv=cv, n_jobs=-1)
    save_photo(save_dir, t, str(n_PCA)+" PCA")
    '''
    # #############################################################################
    # Gaussian Naive Bayes
    # #############################################################################
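    # GaussianNB models each feature as a per-class independent Gaussian; the
    # decorrelated PCA components are a reasonable fit for that assumption.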
    GNB = GaussianNB()
    GNB.fit(PC, y)
    '''
    # #############################################################################
    # MultiLayerPerceptron
    # #############################################################################
    classifier = "MLP"
    model = MLPClassifier()
    hidden_layer_sizes = [(50,50,50), (50,100,50), (100,)]
    max_iter = [200, 1000, 5000, 10000]
    activation = ['tanh', 'relu']
    alpha = [0.0001, 0.05]
    solver = ['sgd', 'adam']
    params = {'hidden_layer_sizes': hidden_layer_sizes, 'max_iter': max_iter,
              'activation': activation, 'alpha': alpha, 'solver': solver}
    MLP = GridSearchCV(model, params, cv=5, n_jobs=-1)
    MLP.fit(X_train, y_train)
    MLP = MLP.best_estimator_
    
    # #############################################################################
    # Random Forest
    # #############################################################################
    model = RandomForestClassifier()
    max_leaf = [5, 10]
    min_samples = [1, 3]
    min_samples_split = [2, 16, 32]
    max_depth = [None, 8, 32]
    max_features = [3, 5, 7]
    n_estimators = [200, 500, 1000, 2000]
    params = {
        # 'max_features': max_features,
        'n_estimators': n_estimators
        # 'max_depth': max_depth,
        # 'min_samples_split': min_samples_split,
        # 'max_leaf_nodes': max_leaf,
        # 'min_samples_leaf': min_samples
    }
    RF = GridSearchCV(model, params, cv=10, n_jobs=-1)
    RF.fit(X_train, y_train)
    RF = RF.best_estimator_
    
    feature_imp = pd.Series(RF.feature_importances_, index=columns)
    fig = plt.figure(figsize = (13,13))
    sns.barplot(x=feature_imp, y=feature_imp.index)
    # Add labels to your graph
    plt.xlabel('Feature Importance Score')
    plt.ylabel('Features')
    plt.legend()
    t = "Important Features"
    plt.title(t, fontsize = 10)
    save_photo(save_dir, t, str(n_PCA)+" PCA")
    
    t = "RF Learning Curve"
    plot_learning_curve(RF, t, X, y, ylim=(0.0, 1.10), cv=cv, n_jobs=-1)
    save_photo(save_dir, t, str(n_PCA)+" PCA")
    
    # #############################################################################
    # KNN
    # #############################################################################
    model = KNeighborsClassifier()
    metric = ["manhattan", "euclidean", "chebyshev"]
    weights = ['uniform', 'distance']
    params = {"metric": metric, 'weights': weights, 'n_neighbors': range(1, 20)}
    KNN = GridSearchCV(model, params, cv=10, n_jobs=-1)
    KNN.fit(X_train, y_train)
    KNN = KNN.best_estimator_
    
    t = "KNN Learning Curve"
    plot_learning_curve(KNN, t, X, y, ylim=(0.0, 1.10), cv=cv, n_jobs=-1)
    save_photo(save_dir, t, str(n_PCA)+" PCA")
    '''
    # #############################################################################
    # Test Set
    # #############################################################################
    dataset = pd.read_csv(
        "/home/det_tesi/sgarofalo/GridSearchGNB/test_dataset/dataset.csv")
    X = dataset.iloc[:, :-1]
    y = dataset.iloc[:, -1].map({'RTP': 1, 'Not RTP': 0}).to_numpy()
    print("test_dataset shape: " + str(dataset.shape))
    print("test_dataset classes distribution: %.2f%% RTP" %
          (100 * len(y[y == 1]) / len(y)))
    X = StandardScaler().fit_transform(X)

    myModel = PCA(n_PCA)
    PC = myModel.fit_transform(X)
    X = PC
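    # NOTE: the scaler and PCA are refitted on the test data, so this projection
    # is not guaranteed to match the training-time one; reusing the transformers
    # fitted on the training set would be more consistent.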

    accuracy = GNB.score(X, y)
    RTP_accuracy = GNB.score(X[y == 1], y[y == 1])
    not_RTP_accuracy = GNB.score(X[y == 0], y[y == 0])
    t = 'GNB accuracy and recall'
    plt.figure(figsize=(16, 9))
    plt.title(t, fontsize=16)
    hist_data = {
        "Accuracy": [accuracy],
        "RTP Accuracy": [RTP_accuracy],
        "Not RTP Accuracy": [not_RTP_accuracy]
    }
    sns.barplot(data=pd.DataFrame(data=hist_data))
    plt.tight_layout()
    plt.grid()
    save_photo(save_dir, t, str(n_PCA) + " PCA")
    data[n_PCA] = accuracy
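All four examples call a save_photo helper that is not shown. Below is a minimal sketch of what it might look like, assuming it simply writes the currently active matplotlib figure into a subfolder named after its third argument; the signature and behavior are assumptions inferred from the call sites, not the original implementation:

import os
import matplotlib.pyplot as plt

def save_photo(save_dir, title, subdir=""):
    # Hypothetical helper: saves <save_dir>/<subdir>/<title>.png from the
    # currently active figure, then closes it.
    out_dir = os.path.join(save_dir, subdir)
    os.makedirs(out_dir, exist_ok=True)
    plt.savefig(os.path.join(out_dir, title + ".png"), bbox_inches="tight")
    plt.close()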
Example #2
def classifiers(dataset_path, data, seconds, save_dir):
    import warnings
    warnings.filterwarnings("ignore")
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn import svm
    from sklearn.preprocessing import StandardScaler
    from sklearn.cluster import DBSCAN
    from sklearn.decomposition import PCA
    from sklearn.neighbors import NearestNeighbors
    import seaborn as sns
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.model_selection import ShuffleSplit

    dataset = pd.read_csv(dataset_path)
    colors = ['r', 'b']
    classes = [0, 1]  # 0 = Not RTP and 1 = RTP
    columns = dataset.columns[:-1]
    cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
    X = dataset.iloc[:, :-1]
    y = dataset.iloc[:, -1].map({'RTP': 1, 'Not RTP': 0}).to_numpy()
    X = StandardScaler().fit_transform(X)
    print("dataset shape: " + str(dataset.shape))
    print("dataset classes distribution: %.2f%% RTP" %
          (100 * len(y[y == 1]) / len(y)))
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=1)
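    # A stratified split (stratify=y) would preserve the RTP / Not RTP ratio in
    # both partitions; the plain split here may skew it slightly.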

    # #############################################################################
    # Correlation Matrix
    # #############################################################################
    df_corr = pd.DataFrame(data=np.column_stack((X, y)),
                           columns=dataset.columns)
    corr = df_corr.corr()
    mask = np.zeros_like(corr)
    mask[np.triu_indices_from(mask)] = True
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    plt.figure(figsize=(10, 10))
    ax = sns.heatmap(corr,
                     xticklabels=corr.columns.values,
                     yticklabels=corr.columns.values,
                     annot=True,
                     cmap="YlGnBu",
                     mask=mask)
    t = "Correlation matrix"
    save_photo(save_dir, t, str(seconds) + "s")

    # #############################################################################
    # Plot Data Using PCA
    # #############################################################################
    myModel = PCA(2)
    PC = myModel.fit_transform(X)
    principalDf = pd.DataFrame(
        data=np.column_stack((PC[:, 0:2], y)),
        columns=['principal component 1', 'principal component 2', 'label'])
    fig = plt.figure(figsize=(13, 13))
    #plt.scatter(principalDf.iloc[:, 0], principalDf.iloc[:, 1])
    for target, color in zip(classes, colors):
        indicesToKeep = principalDf['label'] == target
        plt.scatter(principalDf.loc[indicesToKeep, 'principal component 1'],
                    principalDf.loc[indicesToKeep, 'principal component 2'],
                    c=color,
                    s=40)
    plt.xlabel('Principal Component 1', fontsize=15)
    plt.ylabel('Principal Component 2', fontsize=15, labelpad=-10)
    plt.legend(classes)
    plt.grid()
    t = "Data Plotting using PCA"
    plt.title(t, fontsize=10)
    save_photo(save_dir, t, str(seconds) + "s")

    # #############################################################################
    # Feature characterization
    # #############################################################################
    labels = ['Not RTP', 'RTP']
    for feature in columns:
        for label in labels:
            if 'interarrival' in feature: color = 'r'
            elif 'len_udp' in feature: color = '#815EA4'
            elif "interlength" in feature: color = '#1E8449'
            elif "rtp_inter" in feature: color = 'c'
            elif "kbps" in feature: color = '#A82828'
            elif "num_packets" in feature: color = '#6C3483'

            plt.figure(figsize=(13, 8))
            plt.grid()
            dataset[dataset.label == label][feature].hist(bins=50,
                                                          density=True,
                                                          color=color)
            t = feature + ' hist ' + label
            plt.title(t, fontsize=20)
            plt.tight_layout()
            save_photo(save_dir, t, str(seconds) + "s")

            plt.figure(figsize=(13, 8))
            xplot, yplot = ecdf(dataset[dataset.label == label][feature])
            plt.plot(xplot, yplot, lw=3, color=color)
            plt.grid()
            t = feature + ' CDF ' + label
            plt.title(t, fontsize=20)
            plt.tight_layout()
            save_photo(save_dir, t, str(seconds) + "s")

    # #############################################################################
    # SVM
    # #############################################################################
    model = svm.SVC()
    C = [0.01, 0.1, 1, 10, 100, 1000]
    kernel = ['rbf']
    gamma = [0.001, 0.01, 0.1, 1, 10, 100]
    params = {'C': C, 'kernel': kernel, 'gamma': gamma}
    SVM = GridSearchCV(model, params, cv=10, n_jobs=-1)
    SVM.fit(X_train, y_train)
    t = "SVM Learning Curve"
    plot_learning_curve(SVM.best_estimator_,
                        t,
                        X_train,
                        y_train,
                        ylim=(0.0, 1.10),
                        cv=cv,
                        n_jobs=-1)
    save_photo(save_dir, t, str(seconds) + "s")

    # #############################################################################
    # Random Forest
    # #############################################################################
    model = RandomForestClassifier()
    max_features = [3, 5, 7]
    n_estimators = [100, 200, 500, 1000, 2000]
    param_grid = {'max_features': max_features, 'n_estimators': n_estimators}
    RF = GridSearchCV(model, param_grid, cv=10, n_jobs=-1)
    RF.fit(X_train, y_train)

    feature_imp = pd.Series(RF.best_estimator_.feature_importances_,
                            index=columns)
    fig = plt.figure(figsize=(13, 13))
    sns.barplot(x=feature_imp, y=feature_imp.index)
    plt.xlabel('Feature Importance Score')
    plt.ylabel('Features')
    t = "Important Features"
    plt.title(t, fontsize=10)
    save_photo(save_dir, t, str(seconds) + "s")
    t = "RF Learning Curve"
    plot_learning_curve(RF.best_estimator_,
                        t,
                        X_train,
                        y_train,
                        ylim=(0.0, 1.10),
                        cv=cv,
                        n_jobs=-1)
    save_photo(save_dir, t, str(seconds) + "s")

    # #############################################################################
    # KNN
    # #############################################################################
    model = KNeighborsClassifier()
    metric = ["manhattan", "euclidean", "chebyshev"]
    weights = ['uniform', 'distance']
    params = {
        "metric": metric,
        'weights': weights,
        'n_neighbors': range(1, 40)
    }
    KNN = GridSearchCV(model, params, cv=10, n_jobs=-1)
    KNN.fit(X_train, y_train)
    t = "KNN Learning Curve"
    plot_learning_curve(KNN.best_estimator_,
                        t,
                        X_train,
                        y_train,
                        ylim=(0.0, 1.10),
                        cv=cv,
                        n_jobs=-1)
    save_photo(save_dir, t, str(seconds) + "s")

    # #############################################################################
    # Evaluation on the held-out test split (contains both classes)
    # #############################################################################
    X = X_test
    y = y_test
    data[seconds] = {}
    for clf, name in zip([SVM, RF, KNN], ['SVM', 'RF', 'KNN']):
        accuracy = clf.best_estimator_.score(X, y)
        RTP_accuracy = clf.best_estimator_.score(X[y == 1], y[y == 1])
        not_RTP_accuracy = clf.best_estimator_.score(X[y == 0], y[y == 0])

        t = name + ' accuracy and recall'
        plt.figure(figsize=(16, 9))
        plt.title(t, fontsize=16)
        hist_data = {
            "Accuracy": [accuracy],
            "RTP Accuracy": [RTP_accuracy],
            "Not RTP Accuracy": [not_RTP_accuracy]
        }
        sns.barplot(data=pd.DataFrame(data=hist_data))
        plt.tight_layout()
        plt.grid()
        save_photo(save_dir, t, str(seconds) + "s")
        data[seconds][name] = accuracy
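Example #2 also depends on two helpers, ecdf and plot_learning_curve, that are not defined in the snippet. Below are minimal sketches under signatures inferred from the call sites; both are assumptions, not the original implementations:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

def ecdf(values):
    # Empirical CDF: sorted sample values against cumulative probability.
    x = np.sort(np.asarray(values))
    y = np.arange(1, len(x) + 1) / len(x)
    return x, y

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=None):
    # Compact variant of the scikit-learn learning-curve recipe.
    plt.figure(figsize=(13, 8))
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs,
        train_sizes=np.linspace(0.1, 1.0, 5))
    plt.plot(sizes, train_scores.mean(axis=1), 'o-', label="Training score")
    plt.plot(sizes, test_scores.mean(axis=1), 'o-', label="Cross-validation score")
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.legend(loc="best")
    plt.grid()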
Example #3
for seconds in range(1, 11):
    seconds_samples = str(seconds) + "s"
    pm = pcap_manager(seconds_samples)
    print("Building datasets with seconds_samples = " + seconds_samples)
    pm.merge_pcap(train_dir)
    pm.merge_pcap(test_dir)
    classifiers(data, seconds, save_dir)
    print()

columns = ["SVM", "RF", "KNN", "GNB"]
rows = []
for i in data:
    rows.append({
        "SVM": data[i]["SVM"],
        "RF": data[i]["RF"],
        "KNN": data[i]["KNN"],
        "GNB": data[i]["GNB"]
    })
df = pd.DataFrame(rows, columns=columns)
df.index += 1
plt.figure(figsize=(20, 16))
plt.plot(df)
plt.xlabel("Seconds")
plt.ylabel("Accuracy")
plt.legend(columns, fontsize=16)
plt.tight_layout()
t = "Window size analysis"
plt.title(t, fontsize=16)
save_photo(save_dir, t)
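Since data maps each window size to a dict of per-classifier accuracies, the summary table can also be built in one step. An equivalent sketch using pandas directly, which keeps the real window sizes as the index instead of renumbering:

# data = {1: {'SVM': 0.95, 'RF': 0.96, ...}, 2: {...}, ...}
df = pd.DataFrame.from_dict(data, orient='index')[columns].sort_index()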
Example #4
def classifiers(data, seconds, save_dir):
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn import svm
    from sklearn.preprocessing import StandardScaler
    from sklearn.cluster import DBSCAN
    from sklearn.decomposition import PCA
    from sklearn.neighbors import NearestNeighbors
    import seaborn as sns
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.neighbors import KNeighborsClassifier
    
    dataset = pd.read_csv("/home/det_tesi/sgarofalo/OneClassClassifier analysis/train_dataset/dataset.csv")
    colors = ['r', 'b']
    classes_str = ["Non RTP", "RTP"]
    classes = [-1, 1] # -1 = Not RTP and 1 = RTP
    # dataset = dataset[dataset.label == 'RTP']  # one-class setting: train on RTP only
    X = dataset.iloc[:, :-1]
    y = dataset.iloc[:, -1].map({'RTP': 1, 'Not RTP': -1}).to_numpy()
    X = StandardScaler().fit_transform(X)
    print("Classifing with " + str(seconds) +" window size..")
    print("Training set classes distribution: %.2f%% RTP" % (100*len(y[y == 1])/len(y)))

    # #############################################################################
    # Correlation Matrix
    # #############################################################################
    df_corr = pd.DataFrame(data=np.column_stack((X, y)),
                           columns=dataset.columns)
    corr = df_corr.corr()
    mask = np.zeros_like(corr)
    mask[np.triu_indices_from(mask)] = True
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    plt.figure(figsize=(10, 10))
    ax = sns.heatmap(corr,
                     xticklabels=corr.columns.values,
                     yticklabels=corr.columns.values,
                     annot=True,
                     cmap="YlGnBu",
                     mask=mask)
    t = "Correlation matrix"
    save_photo(save_dir, t, str(seconds)+"s")
    
    # #############################################################################
    # Plot Data Using PCA
    # #############################################################################
    myModel = PCA(2)
    PC = myModel.fit_transform(X)
    # print ("%.2f%% variance ratio with 2 PC" % (100*sum(myModel.explained_variance_ratio_)))
    principalDf = pd.DataFrame(
        data=np.column_stack((PC[:, 0:2], y)),
        columns=['principal component 1', 'principal component 2', 'label'])
    fig = plt.figure(figsize=(13, 13))
    # plt.scatter(principalDf.iloc[:, 0], principalDf.iloc[:, 1])
    for target, color in zip(classes, colors):
        indicesToKeep = principalDf['label'] == target
        plt.scatter(principalDf.loc[indicesToKeep, 'principal component 1'],
                    principalDf.loc[indicesToKeep, 'principal component 2'],
                    c=color,
                    s=40)
    plt.xlabel('Principal Component 1', fontsize=15)
    plt.ylabel('Principal Component 2', fontsize=15, labelpad=-10)
    plt.legend(classes_str)
    plt.grid()
    t = "Data Plotting using PCA"
    plt.title(t, fontsize=10)
    save_photo(save_dir, t, str(seconds) + "s")
    
    # #############################################################################
    # One-class SVM
    # #############################################################################
    #SVM = svm.OneClassSVM(gamma = 'scale')
    #SVM.fit(X)    
    model = svm.OneClassSVM()
    nu = [0.1, 0.3, 0.5, 0.7, 0.9]
    kernel = ['rbf', 'poly']
    gamma = [0.001, 0.01, 0.1, 1, 10]
    params = {'kernel': kernel, 'gamma': gamma, 'nu': nu}
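    # OneClassSVM ignores y during fitting; GridSearchCV uses the labels only to
    # score each candidate on its validation folds (recall on the RTP class).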
    SVM = GridSearchCV(model, params, cv=10, n_jobs=-1, scoring='recall')
    SVM.fit(X, y)
    print('best score: %f' % (SVM.best_score_))
    SVM = SVM.best_estimator_
    
    # #############################################################################
    # Test Set
    # #############################################################################
    print()
    dataset = pd.read_csv("/home/det_tesi/sgarofalo/OneClassClassifier analysis/test_dataset/dataset.csv")
    X = dataset.iloc[:, :-1]
    y = dataset.iloc[:, -1].map({'RTP': 1, 'Not RTP': -1}).to_numpy()
    X = StandardScaler().fit_transform(X)
    print("test_dataset shape: " + str(dataset.shape))
    from sklearn.metrics import accuracy_score
    SVM_accuracy = accuracy_score(y, SVM.predict(X))
    SVM_RTP_accuracy = accuracy_score(y[y == 1], SVM.predict(X[y == 1]))
    SVM_non_RTP_accuracy = accuracy_score(y[y == -1], SVM.predict(X[y == -1]))
    print("SVM accuracy => %.2f%%" % (100*SVM_accuracy))
    print("SVM accuracy on RTP => %.2f%%" % (100*SVM_RTP_accuracy))
    print("SVM accuracy on Non-RTP => %.2f%%" % (100*SVM_non_RTP_accuracy))
    print()
    
    t = 'Accuracy and Recall'
    plt.figure(figsize=(16, 9))
    plt.title(t, fontsize=16)
    hist_data = {"Accuracy": [SVM_accuracy], 
                 "RTP Accuracy": [SVM_RTP_accuracy],
                 "Not RTP Accuracy": [SVM_non_RTP_accuracy]}
    sns.barplot(data=pd.DataFrame(data=hist_data))
    plt.tight_layout()
    plt.grid()
    save_photo(save_dir, t, str(seconds)+"s")
    
    data[seconds] = {}
    data[seconds]["SVM"] = SVM_accuracy