    def set_labels(self, data, min_values, max_values, learning_data, fuzzy):
        # Start from the fuzzy-clustering labels, then relabel each point by
        # the cluster bounding box (per-cluster min/max) that contains it
        data['labels'] = pd.Series(fuzzy.labels_)
        for i in range(0, len(data.values)):
            for j in range(len(max_values)):
                x, y, z = data.values[i]
                max1, max2 = max_values[j]
                min1, min2 = min_values[j]

                if min1 < x < max1 and min2 < y < max2:
                    data.at[i, 'labels'] = j
                if x > max1 and y > max2:
                    data.at[i, 'labels'] = np.max(data['labels'])

        score = ss(data[['param_1', 'param_2']], labels=data['labels'])
        return score
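# A minimal import header for the snippet below (an assumption; the original
# omits its imports). The short aliases mirror the metric names used here.
import sys
import pickle
from time import time
import numpy as np
import scipy.sparse
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (adjusted_rand_score as ari,
                             normalized_mutual_info_score as nmi,
                             silhouette_score as ss,
                             calinski_harabasz_score as vrc,
                             davies_bouldin_score as dbs)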
def run_trial(X, labels, k):
    errors = '"'  # accumulate error messages inside a CSV-quoted field

    # Run k-means and time the fit
    start = time()
    db = KMeans(n_clusters=k)  # n_jobs was removed from KMeans in scikit-learn 1.0
    pred_labels = db.fit_predict(X)
    elapsed = time() - start

    try:
        ari_score = ari(pred_labels, labels)
    except Exception as e:
        errors += str(e) + '; '
        ari_score = np.nan
    try:
        nmi_score = nmi(pred_labels, labels, average_method='arithmetic')
    except Exception as e:
        errors += str(e) + '; '
        nmi_score = np.nan
    try:
        ss_score = ss(X, pred_labels)
    except Exception as e:
        errors += str(e) + '; '
        ss_score = np.nan
    try:
        vrc_score = vrc(X, pred_labels)
    except Exception as e:
        errors += str(e) + '; '
        vrc_score = np.nan
    try:
        dbs_score = dbs(X, pred_labels)
    except Exception as e:
        errors += str(e) + '; '
        dbs_score = np.nan

    errors += '"'

    return [
        k, elapsed, ari_score, nmi_score, ss_score, vrc_score, dbs_score,
        errors
    ]
# Module-level driver: load the pickled frame, drop excluded cell types,
# densify sparse rows, and score the true labels (EXCLUDED_TYPES is defined
# elsewhere in the original script)
with open(sys.argv[1], 'rb') as fh:
    x = pickle.load(fh)
    to_remove = []
    for i in range(len(x.cell_type)):
        if x.cell_type[i] in EXCLUDED_TYPES:
            to_remove.append(i)
    x.drop(index=to_remove, inplace=True)
    X = x.drop('cell_type', axis=1).values
    if scipy.sparse.issparse(X[0][0]):
        X = np.array(np.concatenate([i[0].todense() for i in X]))

labels = LabelEncoder().fit_transform(x.cell_type)

    try:
        ss_euc = str(ss(X, labels, metric='euclidean'))
    except Exception as e:
        print(e)
        ss_euc = str(np.nan)

    try:
        ss_seu = str(
            ss(X,
               labels,
               metric='seuclidean',
               V=np.var(X, axis=0, ddof=1, dtype=np.double)))
    except Exception as e:
        print(e)
        ss_seu = str(np.nan)

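# --- Next snippet: k-means elbow/silhouette sweep on a two-parameter frame ---
# Assumed imports and aliases for this snippet (not in the original):
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans as km
from sklearn.metrics import silhouette_score as ss
# `data` is assumed to be a DataFrame with columns 'param_1' and 'param_2'.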
data_pr = data[["param_1", "param_2"]]

plt.scatter(data_pr.param_1, data_pr.param_2)

# DataFrame.as_matrix() was removed from pandas; use to_numpy()
kmeans = km(init='k-means++', n_clusters=3, random_state=0).fit(data_pr.to_numpy())
# data_pr['labels'] =pd.Series(kmeans.labels_)
# data_pr.plot.scatter(x='b',y='c',c='labels', colormap='viridis')
data_pr1 = data_pr.copy()
scores = []

for k in range(2, 10):
    kmeans = km(init='k-means++', n_clusters=k, random_state=0).fit(data_pr.to_numpy())
    data_pr1['labels'] = pd.Series(kmeans.labels_)
    print(len(kmeans.labels_))
    # The frame's columns are 'param_1'/'param_2', not 'b'/'c'
    data_pr1.plot.scatter(x='param_1', y='param_2', c='labels', colormap='viridis')
    scores.append(ss(data_pr1[['param_1', 'param_2']], labels=data_pr1['labels']))

print(data_pr1)
n = [i for i in range(2, 10)]
plt.figure()
plt.plot(n, scores)
plt.show()

# converting txt to csv

# param1 = list()
# param2 = list()
# depth = list()
# all_data = list()
#
# filepath = "files\\sdmt3.txt"
Example #5
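# Fragment of a larger k-sweep (truncated in the original): km and gmm are
# assumed to be pre-built sklearn KMeans and GaussianMixture estimators, and
# SSE, ll, SS, acc, adjMI are assumed dicts keyed by k; ss = silhouette_score,
# ami = adjusted_mutual_info_score, cluster_acc = a project-local helper.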
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    km.fit(faultsX)
    gmm.fit(faultsX)

    #Faults dataset
    #Visual Measurements
    #Sum of Squared Errors for K-means
    SSE[k]['Faults'] = km.score(faultsX)

    #Log-Likelihood for GMM
    ll[k]['Faults'] = gmm.score(faultsX)

    #Silhouette Score: the best value is 1 and the worst value is -1.
    #Silhouette analysis studies the separation distance between the
    #resulting clusters.
    SS[k]['Faults']['Kmeans'] = ss(faultsX, km.predict(faultsX))
    SS[k]['Faults']['GMM'] = ss(faultsX, gmm.predict(faultsX))
    #Cluster Accuracy
    acc[k]['Faults']['Kmeans'] = cluster_acc(faultsY, km.predict(faultsX))
    acc[k]['Faults']['GMM'] = cluster_acc(faultsY, gmm.predict(faultsX))

    #Adjusted Mutual Information
    adjMI[k]['Faults']['Kmeans'] = ami(faultsY, km.predict(faultsX))
    adjMI[k]['Faults']['GMM'] = ami(faultsY, gmm.predict(faultsX))

    #Breast Cancer dataset
    km.fit(bcX)
    gmm.fit(bcX)
    SSE[k]['BreastC'] = km.score(bcX)
    ll[k]['BreastC'] = gmm.score(bcX)
    SS[k]['BreastC']['Kmeans'] = ss(bcX, km.predict(bcX))
Example #6
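# Assumed imports for this example (the original omits them):
# kmeans = sklearn.cluster.KMeans, GMM = sklearn.mixture.GaussianMixture,
# ami = adjusted_mutual_info_score, ss = silhouette_score,
# cs = completeness_score, hs = homogeneity_score, clock = time.perf_counter,
# and cluster_acc is a project-local clustering-accuracy helper.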
def run_clustering(out, cancer_x, cancer_y, housing_x, housing_y):
    SSE = defaultdict(dict)
    ll = defaultdict(dict)
    acc = defaultdict(lambda: defaultdict(dict))
    adjMI = defaultdict(lambda: defaultdict(dict))
    km = kmeans(random_state=5)
    gmm = GMM(random_state=5)

    silhouette = defaultdict(lambda: defaultdict(dict))
    completeness = defaultdict(lambda: defaultdict(dict))
    homogeniety = defaultdict(lambda: defaultdict(dict))

    st = clock()
    for k in range(2, 20, 1):
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)
        km.fit(cancer_x)
        gmm.fit(cancer_x)

        SSE[k]['cancer'] = km.score(cancer_x)
        ll[k]['cancer'] = gmm.score(cancer_x)

        acc[k]['cancer']['Kmeans'] = cluster_acc(cancer_y,
                                                 km.predict(cancer_x))
        acc[k]['cancer']['GMM'] = cluster_acc(cancer_y, gmm.predict(cancer_x))

        adjMI[k]['cancer']['Kmeans'] = ami(cancer_y, km.predict(cancer_x))
        adjMI[k]['cancer']['GMM'] = ami(cancer_y, gmm.predict(cancer_x))

        silhouette[k]['cancer']['Kmeans Silhouette'] = ss(
            cancer_x, km.predict(cancer_x))
        silhouette[k]['cancer']['GMM Silhouette'] = ss(cancer_x,
                                                       gmm.predict(cancer_x))

        completeness[k]['cancer']['Kmeans Completeness'] = cs(
            cancer_y, km.predict(cancer_x))
        completeness[k]['cancer']['GMM Completeness'] = cs(
            cancer_y, gmm.predict(cancer_x))

        homogeniety[k]['cancer']['Kmeans Homogeniety'] = hs(
            cancer_y, km.predict(cancer_x))
        homogeniety[k]['cancer']['GMM Homogeniety'] = hs(
            cancer_y, gmm.predict(cancer_x))

        km.fit(housing_x)
        gmm.fit(housing_x)
        SSE[k]['housing'] = km.score(housing_x)
        ll[k]['housing'] = gmm.score(housing_x)

        acc[k]['housing']['Kmeans'] = cluster_acc(housing_y,
                                                  km.predict(housing_x))
        acc[k]['housing']['GMM'] = cluster_acc(housing_y,
                                               gmm.predict(housing_x))

        adjMI[k]['housing']['Kmeans'] = ami(housing_y, km.predict(housing_x))
        adjMI[k]['housing']['GMM'] = ami(housing_y, gmm.predict(housing_x))

        silhouette[k]['housing']['Kmeans Silhouette'] = ss(
            housing_x, km.predict(housing_x))
        silhouette[k]['housing']['GMM Silhouette'] = ss(
            housing_x, gmm.predict(housing_x))

        completeness[k]['housing']['Kmeans Completeness'] = cs(
            housing_y, km.predict(housing_x))
        completeness[k]['housing']['GMM Completeness'] = cs(
            housing_y, gmm.predict(housing_x))

        homogeniety[k]['housing']['Kmeans Homogeniety'] = hs(
            housing_y, km.predict(housing_x))
        homogeniety[k]['housing']['GMM Homogeniety'] = hs(
            housing_y, gmm.predict(housing_x))

        print(k, clock() - st)
    SSE = (-pd.DataFrame(SSE)).T
    SSE.rename(columns=lambda x: x + ' SSE (left)', inplace=True)
    ll = pd.DataFrame(ll).T
    ll.rename(columns=lambda x: x + ' log-likelihood', inplace=True)
    # pd.Panel and DataFrame.ix were removed from pandas; flatten each nested
    # dict into one DataFrame per dataset instead (rows = k, columns = method)
    def to_frame(nested, dataset):
        return pd.DataFrame({k: nested[k][dataset] for k in nested}).T

    SSE.to_csv(out + 'SSE.csv')
    ll.to_csv(out + 'loglikelihood.csv')
    to_frame(acc, 'housing').to_csv(out + 'Housing acc.csv')
    to_frame(acc, 'cancer').to_csv(out + 'Perm acc.csv')

    to_frame(adjMI, 'housing').to_csv(out + 'Housing adjMI.csv')
    to_frame(adjMI, 'cancer').to_csv(out + 'Perm adjMI.csv')

    to_frame(silhouette, 'cancer').to_csv(out + 'Perm silhouette.csv')
    to_frame(completeness, 'cancer').to_csv(out + 'Perm completeness.csv')
    to_frame(homogeniety, 'cancer').to_csv(out + 'Perm homogeniety.csv')

    to_frame(silhouette, 'housing').to_csv(out + 'housing silhouette.csv')
    to_frame(completeness, 'housing').to_csv(out + 'housing completeness.csv')
    to_frame(homogeniety, 'housing').to_csv(out + 'housing homogeniety.csv')
Example #7
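# DBSCAN variant of run_trial: assumed context matches the first snippet
# (time, np, ari/nmi/ss/vrc/dbs from sklearn.metrics, plus
# sklearn.cluster.DBSCAN); it additionally reports noise-free ("nn_") scores
# computed after dropping points DBSCAN marked as noise.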
def run_trial(X, labels, eps, minPts, metric, V):
    errors = '"'

    # Run our dbscan
    start = time()
    if metric == 'seuclidean':
        db = DBSCAN(eps,
                    minPts,
                    metric=metric,
                    metric_params={'V': V},
                    n_jobs=6)
    else:
        db = DBSCAN(eps, minPts, metric=metric, n_jobs=6)
    pred_labels = db.fit_predict(X)
    elapsed = time() - start
    perc_noise = np.sum(pred_labels == -1) / len(pred_labels)
    # Cluster labels run 0..n-1 (noise is -1), so the cluster count is max + 1
    n_clust = pred_labels.max() + 1

    # Remove noisy points
    clean_idx = np.where(pred_labels != -1)
    nn_preds = pred_labels[clean_idx]
    nn_labels = labels[clean_idx]
    nn_X = X[clean_idx]

    try:
        ari_score = ari(pred_labels, labels)
    except Exception as e:
        errors += str(e) + '; '
        ari_score = np.nan
    try:
        nmi_score = nmi(pred_labels, labels, average_method='arithmetic')
    except Exception as e:
        errors += str(e) + '; '
        nmi_score = np.nan
    try:
        if metric == 'seuclidean':
            ss_score = ss(X, pred_labels, metric=metric, V=V)
        else:
            ss_score = ss(X, pred_labels, metric=metric)
    except Exception as e:
        errors += str(e) + '; '
        ss_score = np.nan
    try:
        vrc_score = vrc(X, pred_labels)
    except Exception as e:
        errors += str(e) + '; '
        vrc_score = np.nan
    try:
        dbs_score = dbs(X, pred_labels)
    except Exception as e:
        errors += str(e) + '; '
        dbs_score = np.nan

    try:
        nn_ari_score = ari(nn_preds, nn_labels)
    except Exception as e:
        errors += str(e) + '; '
        nn_ari_score = np.nan
    try:
        nn_nmi_score = nmi(nn_preds, nn_labels, average_method='arithmetic')
    except Exception as e:
        errors += str(e) + '; '
        nn_nmi_score = np.nan
    try:
        if metric == 'seuclidean':
            nn_ss_score = ss(nn_X, nn_preds, metric=metric, V=V)
        else:
            nn_ss_score = ss(nn_X, nn_preds, metric=metric)
    except Exception as e:
        errors += str(e) + '; '
        nn_ss_score = np.nan
    try:
        nn_vrc_score = vrc(nn_X, nn_preds)
    except Exception as e:
        errors += str(e) + '; '
        nn_vrc_score = np.nan
    try:
        nn_dbs_score = dbs(nn_X, nn_preds)
    except Exception as e:
        errors += str(e) + '; '
        nn_dbs_score = np.nan

    errors += '"'

    return [
        metric, eps, minPts, n_clust, perc_noise, elapsed, ari_score,
        nn_ari_score, nmi_score, nn_nmi_score, ss_score, nn_ss_score,
        vrc_score, nn_vrc_score, dbs_score, nn_dbs_score, errors
    ]
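# Methods from a clustering helper class (assumed context: pd = pandas,
# ss = silhouette_score, FuzzyKMeans is an external fuzzy k-means estimator,
# and DataHandler/data_handler_1 load the two-parameter measurement files).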
    def cluster_quality(self, data, labels):
        # Attach the labels and compute the silhouette score of the clustering
        data['labels'] = pd.Series(labels)
        score = ss(data[['param_1', 'param_2']], labels=data['labels'])
        return score
    def sklearn(self):
        all_data = list()

        for x in range(0, len(data_handler_1.param1)):
            all_data.append(
                list([data_handler_1.param1[x], data_handler_1.param2[x]]))

        data = np.array(all_data)
        datapd = pd.DataFrame(data)
        scores = []

        # for k in range(2, 10):
        k = 4
        fuzzy_kmeans = FuzzyKMeans(k=k, m=4, max_iter=300)
        fuzzy_kmeans.fit(datapd)
        datapd['labels'] = pd.Series(fuzzy_kmeans.labels_)
        score = ss(datapd[[0, 1]], labels=datapd['labels'])
        scores.append(score)
        #
        # datapd.plot.scatter(x=0, y=1, c='labels', colormap='viridis')
        # plt.xlabel("Param 1")
        # plt.ylabel("Param2")
        # plt.title(f'K = {k}, Silhouette score = {score}')

        for center in fuzzy_kmeans.cluster_centers_:
            plt.plot(center[0], center[1], 'ro')

        # Group the points by their fuzzy-cluster label (group_keys expects a
        # bool, so it is not passed here)
        group_data = datapd.groupby(pd.Series(fuzzy_kmeans.labels_))
        # print(datapd)
        # print(group_data[1].get_group(1))

        # n = [i for i in range(2, 10)]
        # plt.figure()
        # plt.plot(n, scores)
        # plt.xlabel("K")
        # plt.ylabel("Silhouette score")
        # plt.show()

        second_data = DataHandler()
        second_data.open_file("files\\sdmt3.txt")
        second = list()

        # Build the second dataset from the newly opened file
        for x in range(0, len(second_data.param1)):
            second.append(
                list([second_data.param1[x], second_data.param2[x]]))

        sec_data = np.array(second)
        sec_datapd = pd.DataFrame(sec_data)
        scores = []
        # print(second_data.param1)
        max_values = []
        min_values = []
        for group_label in datapd['labels'].unique():
            max_param1 = np.max(group_data[0].get_group(group_label))
            max_param2 = np.max(group_data[1].get_group(group_label))
            min_param1 = np.min(group_data[0].get_group(group_label))
            min_param2 = np.min(group_data[1].get_group(group_label))
            tuple_max = (max_param1, max_param2)
            tuple_min = (min_param1, min_param2)
            max_values.append(tuple_max)
            min_values.append(tuple_min)
        # print(max_values)
        # print(min_values)
        # print(sec_datapd)

        sec_datapd['labels'] = pd.Series(fuzzy_kmeans.labels_)

        # DataFrame.set_value was removed from pandas; use .at
        sec_datapd.at[2, 'labels'] = 50

        print(sec_datapd)
        for i in range(0, 40):
            for j in range(len(max_values)):
                x, y, z = sec_datapd.values[i]
                max1, max2 = max_values[j]
                min1, min2 = min_values[j]
                # print(x, y)
                if min1 < x < max1 and min2 < y < max2:
                    sec_datapd.at[i, 'labels'] = j

        sec_score = ss(sec_datapd[[0, 1]], labels=sec_datapd['labels'])

        print(sec_datapd)
        sec_datapd.plot.scatter(x=0, y=1, c='labels', colormap='viridis')
        plt.xlabel("Param 1")
        plt.ylabel("Param2")
        plt.title(f'K = {k}, Silhouette score = {sec_score}')
        plt.show()
### Please run the entire block of code below at once
## Plotting the elbow and silhouette plots
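# Assumed imports for this block (not shown in the original); `r` is assumed
# to be the (n_samples, n_features) data matrix prepared earlier:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples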
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(16, 6)

clusters = np.arange(2, 11, 1)
dists = []
means = []
stdevs = []

for l in clusters:
    km = KMeans(n_clusters=l)
    labels = km.fit_predict(r)

    # silhouette_score returns only the mean; per-sample values (needed for
    # the standard deviation) come from silhouette_samples
    ss_vals = silhouette_samples(r, labels, metric="euclidean")
    ss_avg = np.mean(ss_vals)
    ss_std = np.std(ss_vals)

    dists.append(km.inertia_)
    means.append(ss_avg)
    stdevs.append(ss_std)

ax1.plot(clusters, dists, c="black", linestyle=":", marker="+", markersize=10)

ax1.set_xlabel("k")
ax1.set_ylabel("Distortion (Within-Cluster SSE)")
ax1.tick_params(axis='y', which='both', left=False, labelleft=False)
ax1.set_title("Elbow Method")

ax2.scatter(clusters, means, c="black")