Example No. 1
def cardio_re():
    epss = [3, 4, 5]
    for e in epss:

        mat = loadmat('../data/cardio.mat')
        X_car = mat['X']
        y = pd.DataFrame(mat['y'])
        y = y[0].to_numpy()
        X_car = pd.DataFrame(X_car)
        print(X_car.head(12))
        print(y)
        # clusterlmat = kmeansm(X_car, 2, 176, 10, 0.05, 21)
        clusterlmat = dbscan.dbscanp(X_car, 21, eps=e, minpts=10, factor=1)
        # print(clusterlmat[0][13])
        y_t = clusterlmat[0][21].to_numpy()
        print(y_t)

        # Map DBSCAN noise points (label -1) to outlier flag 1; clustered points to 0.
        index = 0
        for i in y_t.copy():
            if i == -1:
                y_t[index] = 1
            else:
                y_t[index] = 0
            index += 1
        print(y_t)

        print(f1_score(y, y_t, average='weighted'))
        print(assess.falsealarmrate(y, [0], y_t, 1))
        print(adjusted_rand_score(y, y_t))
        print(jaccard_score(y, y_t))
        print(e)
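
The element-wise loop above, repeated throughout these examples, maps DBSCAN labels to a binary outlier flag. A minimal vectorised alternative, assuming y_t is a NumPy integer array of cluster labels:

import numpy as np

# Example DBSCAN output: -1 marks noise, non-negative integers mark clusters.
y_t = np.array([0, 0, -1, 1, -1, 0])

# Noise points (label -1) become 1 (outlier); clustered points become 0 (normal).
y_t = np.where(y_t == -1, 1, 0)
print(y_t)  # [0 0 1 0 1 0]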
Example No. 2
def pima_re():
    # (minpts, factor, eps) combinations to sweep.
    epss = [(45, 0.2, 0.9), (35, 0.35, 2), (25, 0.2, 0.0005)]
    for t in epss:

        data = pd.read_csv('../data/pimaindiansdiabetes.csv', header=None)
        print(data.head(12))
        y = data[8].to_numpy()
        print(y)
        # clusterlmat = kmeansm(X_car, 2, 176, 10, 0.05, 21)
        clusterlmat = dbscan.dbscanp(
            data,
            8,
            eps=t[2],
            minpts=t[0],
            factor=t[1],
            initialization=dbscan.Initialization.UNIFORM,
            plot=True)
        # print(clusterlmat[0][13])
        y_t = clusterlmat[0][9].to_numpy()
        # print(y_t)

        index = 0
        for i in y_t.copy():
            if i == -1:
                y_t[index] = 1
            else:
                y_t[index] = 0
            index += 1
        print(y_t)

        print(f1_score(y, y_t, average='weighted'))
        print(assess.falsealarmrate(y, [0], y_t, 1))
        print(adjusted_rand_score(y, y_t))
        print(jaccard_score(y, y_t))
        print(t[2])
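
The hand-written epss tuples above sweep (minpts, factor, eps) combinations. A cartesian grid can also be built with itertools.product; a minimal sketch with hypothetical value ranges:

from itertools import product

# Hypothetical value ranges; substitute the values under study.
minpts_values = [25, 35, 45]
factor_values = [0.2, 0.35]
eps_values = [0.0005, 0.9, 2]

for minpts, factor, eps in product(minpts_values, factor_values, eps_values):
    print(minpts, factor, eps)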
Example No. 3
def pima_pre_norm_and_pca_re_dbm():
    # (minpts, factor, eps) settings; only the first list is swept in this run.
    epss = [(260, 0.5, 0.5), (260, 0.5, 0.4), (260, 0.5, 0.35),
            (260, 0.5, 0.3), (260, 0.5, 0.2), (260, 0.5, 0.1)]
    epss2 = [(268, 0.5, 0.5), (268, 0.5, 0.4), (268, 0.5, 0.35),
             (268, 0.5, 0.3), (268, 0.5, 0.2), (268, 0.5, 0.1)]
    epss3 = [(280, 0.5, 0.5), (280, 0.5, 0.4), (280, 0.5, 0.35),
             (280, 0.5, 0.3), (280, 0.5, 0.2), (280, 0.5, 0.1)]
    for t in epss:

        data = pd.read_csv('../data/pimaindiansdiabetes.csv', header=None)
        # print(data.head(12))
        y = data[8].to_numpy()
        print(y)
        scaler = MinMaxScaler()
        arr_scaled = scaler.fit_transform(data)
        data2 = pd.DataFrame(arr_scaled)
        pca = PCA(n_components=7)
        principalcomponents = pca.fit_transform(data2.iloc[:, 0:8])
        principledf = pd.DataFrame(data=principalcomponents)
        # print(principledf.head(12))

        # clusterlmat = kmeansm(X_car, 2, 176, 10, 0.05, 21)
        clusterlmat = dbscanppm.dbscanmp(
            principledf,
            7,
            eps=t[2],
            minpts=t[0],
            factor=t[1],
            threshold=0.06,
            initialization=dbscan.Initialization.NONE,
            plot=False)
        # print(clusterlmat[0][13])
        y_t = clusterlmat[0][7].to_numpy()
        # print(y_t)

        index = 0
        for i in y_t.copy():
            if i == -1:
                y_t[index] = 1
            else:
                y_t[index] = 0
            index += 1
        print("cluster labels:", y_t)

        print("eps: ", t[2])
        f_sc = f1_score(y, y_t, average='weighted')
        fa = assess.falsealarmrate(y, [0], y_t, 1)
        ard = adjusted_rand_score(y, y_t)
        js = jaccard_score(y, y_t)

        print(f_sc)
        print(fa)
        print(ard)
        print(js)
        print(t[2])

        rr = [t[0], t[2], t[1], f_sc, fa, ard, js, dbscan.Initialization.NONE]
        with open('../data/pima_pca/dbscanm.pima.pca.result.csv', 'a') as fd:
            writer = csv.writer(fd)
            writer.writerow(rr)
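
The normalisation-plus-PCA preprocessing above can also be written as a scikit-learn Pipeline. A minimal sketch, assuming the same file layout (features in columns 0-7, class label in column 8) and scaling only the feature columns:

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

data = pd.read_csv('../data/pimaindiansdiabetes.csv', header=None)
X = data.iloc[:, 0:8]  # feature columns only

# Scale each feature to [0, 1], then project onto 7 principal components.
preprocess = Pipeline([
    ('scale', MinMaxScaler()),
    ('pca', PCA(n_components=7)),
])
principledf = pd.DataFrame(preprocess.fit_transform(X))
print(principledf.shape)  # (n_samples, 7)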
Example No. 4
def glass_re():
    epss = [0.4, 0.9, 1, 1.2, 2]
    for e in epss:

        mat = loadmat('../data/glass.mat')
        X_car = mat['X']
        y = pd.DataFrame(mat['y'])
        y = y[0].to_numpy()
        X_car = pd.DataFrame(X_car)
        print(X_car.head(12))
        print(y)
        # clusterlmat = kmeansm(X_car, 2, 176, 10, 0.05, 21)
        clusterlmat = dbscan.dbscanp(X_car,
                                     21,
                                     eps=e,
                                     minpts=8,
                                     factor=0.5,
                                     initialization=dbscan.Initialization.KCENTRE)
        # print(clusterlmat[0][13])
        y_t = clusterlmat[0][9].to_numpy()
        # print(y_t)

        index = 0
        for i in y_t.copy():
            if i == -1:
                y_t[index] = 1
            else:
                y_t[index] = 0
            index += 1
        print(y_t)

        print(f1_score(y, y_t, average='weighted'))
        print(assess.falsealarmrate(y, [0], y_t, 1))
        print(adjusted_rand_score(y, y_t))
        print(jaccard_score(y, y_t))
        print(e)
Example No. 5
def main():
    mat = loadmat('../data/cardio.mat')
    print(mat)
    X_car = mat['X']

    y_car = mat['y']
    y_car = pd.DataFrame(y_car)
    # print(list(y_car[0]))
    y_car.hist()
    plt.title("Cardiotocography Data Class Distribution")
    plt.show()
    data = pd.DataFrame(X_car)

    # Classes 2, 3, 6 and 7 have the lowest frequencies, so we treat them as the outliers.

    clustering = DBSCAN(eps=45, min_samples=15).fit(data)
    print(clustering.labels_)
    print(type(clustering.labels_))
    print(np.array(y_car[0]))
    # print(type(clustering.labels_))

    copy_clusterings = clustering.labels_.copy()
    index = 0
    for i in copy_clusterings:
        if i == -1:
            copy_clusterings[index] = 1
        else:
            copy_clusterings[index] = 0
        index += 1

    print('false alarm rate: ',
          metrics.falsealarmrate(np.array(y_car[0]), [1], copy_clusterings, 1))
    print(jaccard_score(np.array(y_car[0]), copy_clusterings, average=None))
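
The false-alarm rate here comes from a project helper (metrics.falsealarmrate / assess.falsealarmrate). For binary 0/1 labels, an equivalent false-positive rate can be derived from a confusion matrix; a minimal sketch, not the project's own implementation:

import numpy as np
from sklearn.metrics import confusion_matrix

def false_alarm_rate(y_true, y_pred):
    """Fraction of normal points (0) that were flagged as outliers (1)."""
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return fp / (fp + tn)

print(false_alarm_rate(np.array([0, 0, 1, 0]), np.array([0, 1, 1, 0])))  # 1/3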
Example No. 6
def pima_pre_norm_and_pca_kmeans_re():
    # Parameter tuples passed to km.kmeansm; only epss3 is swept in this run.
    epss = [(1, 250, 50, 0.0001), (1, 268, 50, 0.0001), (1, 275, 50, 0.0001)]
    epss2 = [(2, 250, 50, 0.0001), (2, 268, 50, 0.0001), (2, 275, 50, 0.0001)]
    epss3 = [(3, 250, 50, 0.0001), (3, 268, 50, 0.0001), (3, 275, 50, 0.0001)]
    for t in epss3:

        data = pd.read_csv('../data/pimaindiansdiabetes.csv', header=None)
        # print(data.head(12))
        y = data[8].to_numpy()
        print(y)
        scaler = MinMaxScaler()
        arr_scaled = scaler.fit_transform(data)
        data2 = pd.DataFrame(arr_scaled)
        pca = PCA(n_components=7)
        principalcomponents = pca.fit_transform(data2.iloc[:, 0:8])
        principledf = pd.DataFrame(data=principalcomponents)
        # print(principledf.head(12))

        # clusterlmat = kmeansm(X_car, 2, 176, 10, 0.05, 21)
        clusterlmat = km.kmeansm(principledf, t[0], t[1], t[2], t[3], 7)
        # print(clusterlmat[0][13])
        y_t = clusterlmat
        # print(y_t)

        index = 0
        for i in y_t.copy():
            if i == -1:
                y_t[index] = 1
            else:
                y_t[index] = 0
            index += 1
        print("cluster labels:", y_t)

        print("eps: ", t[2])
        f_sc = f1_score(y, y_t, average='weighted')
        fa = assess.falsealarmrate(y, [0], y_t, 1)
        ard = adjusted_rand_score(y, y_t)
        js = jaccard_score(y, y_t)

        print(f_sc)
        print(fa)
        print(ard)
        print(js)
        print(t[2])

        rr = [t[0], t[2], t[1], f_sc, fa, ard, js, "KMEANS--"]
        with open('../data/pima_pca/kmeans.pima.pca.result.csv', 'a') as fd:
            writer = csv.writer(fd)
            writer.writerow(rr)
Example No. 7
def main():
    data = pd.read_csv('data/satellite-unsupervised-ad.csv', header=None)
    print(data.head())
    y = data[36]
    y.hist()
    plt.show()
    clustering = DBSCAN(eps=42, min_samples=7).fit(data.iloc[:, 0:35])
    print()
    print(clustering.labels_)
    print(type(clustering.labels_))
    for index, row in data.iterrows():
        print(row[36] + " " + str(clustering.labels_[index]))

    print(
        'false alarm rate: ',
        metrics.falsealarmrate(data.iloc[:, 36].values, 'o',
                               clustering.labels_, -1))
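
The satellite labels in column 36 are strings, with 'o' used for outlier rows in the call above. A minimal sketch for turning both the ground truth and the DBSCAN output into comparable 0/1 arrays, assuming that convention:

import numpy as np

def to_binary_labels(true_col, dbscan_labels, outlier_value='o'):
    """Return 0/1 ground-truth and prediction arrays for outlier evaluation."""
    y_true = (np.asarray(true_col) == outlier_value).astype(int)
    y_pred = (np.asarray(dbscan_labels) == -1).astype(int)  # -1 = DBSCAN noise
    return y_true, y_pred

print(to_binary_labels(['n', 'o', 'n'], [0, -1, 1]))  # (array([0, 1, 0]), array([0, 1, 0]))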
Example No. 8
def pima_pre_norm_re():
    epss = [(45, 0.02, 0.9), (35, 0.0001, 2), (25, 0.2, 0.00005)]
    for t in epss:

        data = pd.read_csv('../data/pimaindiansdiabetes.csv', header=None)
        print(data.head(12))
        y = data[8].to_numpy()
        print(y)
        scaler = MinMaxScaler()
        arr_scaled = scaler.fit_transform(data)
        data2 = pd.DataFrame(arr_scaled)
        # pca = PCA(n_components=7)
        # principalcomponents = pca.fit(data2.iloc[:, 0:8])
        # principledf = pd.DataFrame(principalcomponents)

        # clusterlmat = kmeansm(X_car, 2, 176, 10, 0.05, 21)
        clusterlmat = dbscan.dbscanp(data2,
                                     8,
                                     eps=t[2],
                                     minpts=t[0],
                                     factor=t[1],
                                     initialization=dbscan.Initialization.NONE,
                                     plot=False)
        # print(clusterlmat[0][13])
        y_t = clusterlmat[0][9].to_numpy()
        # print(y_t)

        index = 0
        for i in y_t.copy():
            if i == -1:
                y_t[index] = 1
            else:
                y_t[index] = 0
            index += 1
        print(y_t)

        print(f1_score(y, y_t, average='weighted'))
        print(assess.falsealarmrate(y, [0], y_t, 1))
        print(adjusted_rand_score(y, y_t))
        print(jaccard_score(y, y_t))
        print(t[2])
Example No. 9
def main():

    ## Pima Data set
    datapima = pd.read_csv("data/pimaindiansdiabetes.csv", header=None)
    col = [
        'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'
    ]

    datapima.columns = col
    X = datapima.iloc[:, :-1]
    y = datapima.iloc[:, -1]
    y.hist()
    plt.title(" Pima Indians diabetes Data Class Distribution")
    plt.show()
    clusteringpima = DBSCAN(eps=0.9, min_samples=6).fit(X)
    print(clusteringpima.labels_)
    print(type(clusteringpima.labels_))
    # print(type(clustering.labels_))
    for index, row in datapima.iterrows():
        print(str(row['class']) + ":" + str(clusteringpima.labels_[index]))

    print('false alarm rate: ',
          metrics.falsealarmrate(y, [1], clusteringpima.labels_, -1))
    pima = clusteringpima.labels_.copy()
    preds = [1 if i == -1 else 0 for i in pima]
    print(jaccard_score(y, preds))
    print(preds)

    ## Cardio Data Set
    # mat = scipy.io.loadmat('data/cardio.mat')
    # X_car = mat['X']
    # y_car=mat['y']
    # y_car= pd.DataFrame(y_car)
    # print(list(y_car[0]))
    # y_car.hist()
    # plt.title("Cardiotocography Data Class Distribution")
    # plt.show()
    # X_car = pd.DataFrame(X_car)
    # clusteringcar = DBSCAN(eps=1.5, min_samples=10).fit(X_car)
    # print(clusteringcar.labels_)
    # print('false alarm rate: ', metrics.falsealarmrate(y_car[0], [1], clusteringcar.labels_, -1))
    # car= clusteringcar.labels_
    # car = car.copy()
    # predcar = [1 if i==-1 else 0 for i in car]
    # print(jaccard_score(y_car[0], predcar))
    # print(predcar)

    ## Wine Data Set
    # win = scipy.io.loadmat('data/wine.mat')
    # print(win)
    # X_win = win['X']
    # y_win=win['y']
    # print(y_win)
    # X_win = pd.DataFrame(X_win)
    # y_win= pd.DataFrame(y_win)
    # y_win.hist()
    # plt.title("Wine Data Class Distribution")
    # plt.show()
    # clusteringwin = DBSCAN(eps=1.5, min_samples=5).fit(X_win)
    # print(clusteringwin.labels_)
    # # print('false alarm rate: ', metrics.falsealarmrate(data.iloc[:, 36].values, 'o', clustering.labels_, -1))
    # win= clusteringwin.labels_
    # win = win.copy()
    # predwin = [ 1 if i==-1 else 0 for i in win]
    # print(predwin)

    ## Glass Data Set
    glass = scipy.io.loadmat('data/glass.mat')
    print(glass)
    X_gla = glass['X']
    y_gla = glass['y']
    X_gla = pd.DataFrame(X_gla)
    y_gla = pd.DataFrame(y_gla)
    y_gla.hist()
    plt.title("Glass Data Class Distribution")
    plt.show()
    clusteringgla = DBSCAN(eps=0.9, min_samples=10, n_jobs=-1).fit(X_gla)
    print(clusteringgla.labels_)
    print('false alarm rate: ',
          metrics.falsealarmrate(y_gla[0], [1], clusteringgla.labels_, -1))
    gla = clusteringgla.labels_.copy()
    predgla = [1 if i == -1 else 0 for i in gla]
    print(jaccard_score(y_gla[0], predgla))
    print(predgla)
Example No. 10
def shuttle_re_db_ann_uniform():
    # (eps, minpts, factor) combinations to sweep.
    epss = [(4.5, 10, 0.1), (4.8, 10, 0.1), (5, 10, 0.1), (5.3, 10, 0.1),
            (5.5, 10, 0.1), (5.8, 10, 0.1), (6, 10, 0.1), (6.8, 10, 0.1),
            (7, 10, 0.1), (9, 10, 0.1), (10, 10, 0.1), (28, 10, 0.1),
            (28.5, 10, 0.1)]

    # epss = [(6.8, 10, 0.1)]
    for t in epss:
        data = pd.read_csv('../data/shuttle-unsupervised-trn.csv', header=None)
        # datafiltered = data[data[9] != 4]
        # datafiltered = pd.DataFrame(datafiltered)
        # print(datafiltered)
        y = data[9].to_numpy()
        print(y)
        # datafiltered[9].hist()
        # plt.title("shuttle data set histogram")
        # plt.show()
        # clusterlmat = kmeansm(X_car, 2, 176, 10, 0.05, 21)
        clusterlmat = dbann.dbscanann(
            data,
            9,
            eps=t[0],
            minpts=t[1],
            factor=t[2],
            initialization=dbann.Initialization.UNIFORM,
            plot=False)
        # print(clusterlmat[0][13])
        y_t = clusterlmat[0][10].to_numpy()
        identifiedNoisepoints = np.count_nonzero(y_t == -1)
        print("count of noise points", identifiedNoisepoints)
        # print(y_t)

        # Drop rows whose true class is 4, keeping predictions aligned via the same mask.
        a = y != 4
        y = y[a]
        y_t = y_t[a]

        index = 0
        for i in y_t.copy():
            if i == -1:
                y_t[index] = 1
            else:
                y_t[index] = 0
            index += 1
            # print(y_t)
        # Binarise the ground truth: shuttle classes 2, 3, 5, 6 and 7 are outliers (1), class 1 is normal (0).
        index = 0
        for i in y.copy():
            if i in (2, 3, 5, 6, 7):
                y[index] = 1
            else:
                y[index] = 0
            index += 1

        f1_scored = f1_score(y, y_t, average='weighted')
        falarm = assess.falsealarmrate(y, [0], y_t, 1)
        arand = adjusted_rand_score(y, y_t)
        jacc = jaccard_score(y, y_t)

        rr = [
            t[1], t[0], t[2], f1_scored, falarm, arand, jacc,
            identifiedNoisepoints
        ]
        with open('../data/ann/dbscan.dbann.uniform.shuttle.result.csv',
                  'a') as fd:
            writer = csv.writer(fd)
            writer.writerow(rr)
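
The filtering and the two label-mapping loops above can be replaced with vectorised NumPy. A minimal sketch, assuming y and y_t are NumPy arrays and the same class conventions (class 4 dropped, classes 2, 3, 5, 6 and 7 treated as outliers, DBSCAN noise = -1):

import numpy as np

# Toy stand-ins for the arrays produced above.
y = np.array([1, 4, 2, 1, 5])      # true shuttle classes
y_t = np.array([0, 0, -1, 1, -1])  # cluster labels, -1 = noise

mask = y != 4                       # drop class-4 rows, keeping both arrays aligned
y, y_t = y[mask], y_t[mask]

y = np.isin(y, [2, 3, 5, 6, 7]).astype(int)  # 1 = true outlier class
y_t = (y_t == -1).astype(int)                # 1 = predicted outlier (noise)
print(y, y_t)  # [0 1 0 1] [0 1 0 1]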
Example No. 11
def shuttle_re_kmeans():
    par = [
        (1, 2500),
        (1, 2644),
        (1, 2700),
        (2, 2500),
        (2, 2644),
        (2, 2700),
    ]

    for p in par:
        data = pd.read_csv('../data/shuttle-unsupervised-trn.csv', header=None)
        # datafiltered = data[data[9] != 4]
        # datafiltered = pd.DataFrame(datafiltered)
        # print(datafiltered)
        y = data[9].to_numpy()
        print(y)
        # datafiltered[9].hist()
        # plt.title("shuttle data set histogram")
        # plt.show()
        # clusterlmat = kmeansm(X_car, 2, 176, 10, 0.05, 21)
        clusterlmat = km.kmeansm(data, p[0], p[1], 50, 0.001, 9)
        # print(clusterlmat[0][13])
        y_t = clusterlmat
        identifiedNoisepoints = np.count_nonzero(y_t == -1)
        print("count of noise points", identifiedNoisepoints)
        # print(y_t)

        a = y != 4
        y = y[y != 4]
        y_t = y_t[a]

        index = 0
        for i in y_t.copy():
            if i == -1:
                y_t[index] = 1
            else:
                y_t[index] = 0
            index += 1
            # print(y_t)

        index = 0
        for i in y.copy():
            if i == 2 or i == 3 or i == 5 or i == 6 or i == 7:
                y[index] = 1
            else:
                y[index] = 0
            index += 1
        print("Actual number of outlier: ", np.count_nonzero(y == 1))
        f1_scored = f1_score(y, y_t, average='weighted')
        falarm = assess.falsealarmrate(y, [0], y_t, 1)
        arand = adjusted_rand_score(y, y_t)
        jacc = jaccard_score(y, y_t)

        rr = [
            p[0], p[1], 50, f1_scored, falarm, arand, jacc,
            identifiedNoisepoints
        ]
        with open('../data/ann/kmeans.shuttle.result.csv', 'a') as fd:
            writer = csv.writer(fd)
            writer.writerow(rr)
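
Each sweep appends its result row with csv.writer on a file opened in append mode. Opening the file with newline='' lets the csv module control line endings and avoids blank rows on some platforms; a minimal sketch of that pattern (hypothetical path and values):

import csv

def append_result_row(path, row):
    # newline='' is recommended by the csv docs so the writer controls line endings.
    with open(path, 'a', newline='') as fd:
        csv.writer(fd).writerow(row)

append_result_row('results.csv', [10, 4.5, 0.1, 0.93])  # hypothetical values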