예제 #1
0
def do_stuff(dataset=None, metric=True, drtype="mds", components=2):
    """Cluster ``dataset`` hierarchically and optionally embed it in low dimension.

    Args:
        dataset: Array-like converted with ``np.array`` and handed unchanged
            to the clustering and dimensionality-reduction estimators.
        metric: Forwarded to ``manifold.MDS(metric=...)``; only used when
            ``drtype == "mds"``.
        drtype: ``"mds"``, ``"pca"``, ``"tsne"``, or a falsy value to skip the
            embedding step entirely.
        components: Number of MDS dimensions.
            NOTE(review): the "pca" and "tsne" branches always use 2
            components, yet ``drInfo["components"]`` reports this value.

    Returns:
        A dict with three keys:

        * ``"clustering"``: ``{"tree": children_ of the 1-cluster fit,
          "labels": {k: labels for k clusters, k = 10..2}}``.
        * ``"embedding"``: the embedding as nested lists, or ``None`` when
          ``drtype`` is falsy.
        * ``"drInfo"``: type/metric/components plus ``stress1``,
          ``rawStress`` and ``medianDistance`` (these three stay ``False``
          for non-MDS embeddings), or ``None`` when ``drtype`` is falsy.

    Raises:
        ValueError: If ``drtype`` is truthy but not a supported type.
    """
    import shutil  # local import: only needed to remove the cache directory

    # Fail fast instead of hitting an unbound ``mds_result`` after all the
    # clustering work has already been done.
    if drtype and drtype not in ("mds", "pca", "tsne"):
        raise ValueError(
            "unsupported drtype %r; expected 'mds', 'pca', 'tsne' or a falsy value"
            % (drtype,)
        )

    data_for_mds = np.array(dataset)

    mds_result = None
    if drtype == "mds":
        mds = manifold.MDS(n_components=components, n_init=10, max_iter=3000,
                           dissimilarity="euclidean", n_jobs=1, metric=metric)
        mds_result = mds.fit(data_for_mds)
    elif drtype == "pca":
        pca = PCA(n_components=2)
        # NOTE(review): the model is fitted on the pairwise-distance matrix
        # but applied to the raw data, which appears to require as many
        # columns as rows -- confirm the intended input before changing.
        mds_result = pca.fit(euclidean_distances(data_for_mds)).transform(data_for_mds)
    elif drtype == "tsne":
        model = manifold.TSNE(n_components=2, random_state=0,
                              learning_rate=1000, early_exaggeration=10.0)
        mds_result = model.fit_transform(data_for_mds)

    # All fits share one cache directory so the estimator's ``memory`` option
    # can reuse work between them; the directory is removed afterwards
    # instead of leaving one temporary directory behind per fit.
    cache_dir = mkdtemp()
    try:
        clusterings = {}
        for i in range(10, 1, -1):
            clustering = ac(n_clusters=i, memory=cache_dir)
            clusterings[i] = clustering.fit(data_for_mds).labels_.tolist()

        # The single-cluster fit supplies the merge tree for the output.
        clustering = ac(n_clusters=1, memory=cache_dir)
        clustering.fit(data_for_mds)
    finally:
        shutil.rmtree(cache_dir, ignore_errors=True)

    output = {
        "drInfo": None,
        "embedding": None,
        "clustering": {
            "tree": clustering.children_.tolist(),
            "labels": clusterings
        }
    }
    if drtype:
        median_distance = False
        stress1 = False
        raw_stress = False
        if drtype == "mds":
            raw_stress = mds_result.stress_
            # Normalise the raw stress by the sum of squared pairwise input
            # distances, counting each pair once (upper triangle).
            disparities = euclidean_distances(data_for_mds)
            disparityHalfMatrix = np.triu(disparities)
            sumSquaredDisparities = np.sum(np.square(disparityHalfMatrix))
            stress1 = math.sqrt(mds_result.stress_ / sumSquaredDisparities)
            median_distance = np.median(euclidean_distances(mds_result.embedding_))
            embedding = mds_result.embedding_.tolist()
            # Single-argument call form prints the same under Python 2 and 3.
            print(mds_result.stress_)
        else:
            embedding = mds_result.tolist()
        output["drInfo"] = {
            "type": drtype,
            "metric": metric,
            "components": components,
            "stress1": stress1,
            "rawStress": raw_stress,
            "medianDistance": median_distance
        }
        output["embedding"] = embedding

    return output
예제 #2
0
def do_stuff(dataset=None, metric=True, drtype="mds", components=2):
    """Cluster ``dataset`` hierarchically and optionally embed it in low dimension.

    Args:
        dataset: Array-like converted with ``np.array`` and handed unchanged
            to the clustering and dimensionality-reduction estimators.
        metric: Forwarded to ``manifold.MDS(metric=...)``; only used when
            ``drtype == "mds"``.
        drtype: ``"mds"``, ``"pca"``, ``"tsne"``, or a falsy value to skip the
            embedding step entirely.
        components: Number of MDS dimensions.
            NOTE(review): the "pca" and "tsne" branches always use 2
            components, yet ``drInfo["components"]`` reports this value.

    Returns:
        A dict with three keys:

        * ``"clustering"``: ``{"tree": children_ of the 1-cluster fit,
          "labels": {k: labels for k clusters, k = 10..2}}``.
        * ``"embedding"``: the embedding as nested lists, or ``None`` when
          ``drtype`` is falsy.
        * ``"drInfo"``: type/metric/components plus ``stress1``,
          ``rawStress`` and ``medianDistance`` (these three stay ``False``
          for non-MDS embeddings), or ``None`` when ``drtype`` is falsy.

    Raises:
        ValueError: If ``drtype`` is truthy but not a supported type.
    """
    import shutil  # local import: only needed to remove the cache directory

    # Fail fast instead of hitting an unbound ``mds_result`` after all the
    # clustering work has already been done.
    if drtype and drtype not in ("mds", "pca", "tsne"):
        raise ValueError(
            "unsupported drtype %r; expected 'mds', 'pca', 'tsne' or a falsy value"
            % (drtype,)
        )

    data_for_mds = np.array(dataset)

    mds_result = None
    if drtype == "mds":
        mds = manifold.MDS(n_components=components, n_init=10, max_iter=3000,
                           dissimilarity="euclidean", n_jobs=1, metric=metric)
        mds_result = mds.fit(data_for_mds)
    elif drtype == "pca":
        pca = PCA(n_components=2)
        # NOTE(review): the model is fitted on the pairwise-distance matrix
        # but applied to the raw data, which appears to require as many
        # columns as rows -- confirm the intended input before changing.
        mds_result = pca.fit(euclidean_distances(data_for_mds)).transform(data_for_mds)
    elif drtype == "tsne":
        model = manifold.TSNE(n_components=2, random_state=0,
                              learning_rate=1000, early_exaggeration=10.0)
        mds_result = model.fit_transform(data_for_mds)

    # All fits share one cache directory so the estimator's ``memory`` option
    # can reuse work between them; the directory is removed afterwards
    # instead of leaving one temporary directory behind per fit.
    cache_dir = mkdtemp()
    try:
        clusterings = {}
        for i in range(10, 1, -1):
            clustering = ac(n_clusters=i, memory=cache_dir)
            clusterings[i] = clustering.fit(data_for_mds).labels_.tolist()

        # The single-cluster fit supplies the merge tree for the output.
        clustering = ac(n_clusters=1, memory=cache_dir)
        clustering.fit(data_for_mds)
    finally:
        shutil.rmtree(cache_dir, ignore_errors=True)

    output = {
        "drInfo": None,
        "embedding": None,
        "clustering": {
            "tree": clustering.children_.tolist(),
            "labels": clusterings
        }
    }
    if drtype:
        median_distance = False
        stress1 = False
        raw_stress = False
        if drtype == "mds":
            raw_stress = mds_result.stress_
            # Normalise the raw stress by the sum of squared pairwise input
            # distances, counting each pair once (upper triangle).
            disparities = euclidean_distances(data_for_mds)
            disparityHalfMatrix = np.triu(disparities)
            sumSquaredDisparities = np.sum(np.square(disparityHalfMatrix))
            stress1 = math.sqrt(mds_result.stress_ / sumSquaredDisparities)
            median_distance = np.median(euclidean_distances(mds_result.embedding_))
            embedding = mds_result.embedding_.tolist()
            print(mds_result.stress_)
        else:
            embedding = mds_result.tolist()
        output["drInfo"] = {
            "type": drtype,
            "metric": metric,
            "components": components,
            "stress1": stress1,
            "rawStress": raw_stress,
            "medianDistance": median_distance
        }
        output["embedding"] = embedding

    return output
예제 #3
0
파일: hac.py 프로젝트: nserr/SENG474
def average_linkage(dataset):
    """Run average-linkage agglomerative clustering on a named CSV dataset.

    Reads ``data/<dataset>.csv``, fits the clusterer with a distance
    threshold of 1, saves a truncated dendrogram image, and then delegates
    to the 2-D or 3-D cluster plot helper.

    Args:
        dataset: Dataset name without extension. ``'dataset1'`` is treated as
            2-D (first two columns); any other name as 3-D (first three).
    """
    frame = pd.read_csv("data/" + dataset + ".csv")

    # Only 'dataset1' is two-dimensional; everything else uses three columns.
    is_2d = dataset == 'dataset1'
    d = '2D' if is_2d else '3D'
    columns = [0, 1] if is_2d else [0, 1, 2]
    x = frame.iloc[:, columns].values

    # n_clusters=None lets the distance threshold decide where merging stops.
    model = ac(n_clusters=None, distance_threshold=1, linkage='average')
    model.fit(x)

    dendrogram(create_dendogram(model), truncate_mode='lastp')
    plt.title("Dendogram for Average Linkage HAC with " + dataset)
    plt.savefig("averageHAC" + d + "_dendogram")
    plt.clf()

    if is_2d:
        generate_2D_plot(x, 3, 'average')
    else:
        generate_3D_plot(x, 26, 'average')
예제 #4
0
파일: hac.py 프로젝트: nserr/SENG474
def generate_2D_plot(x, clusters, linkage):
    """Cluster 2-D points agglomeratively and save a coloured scatter plot.

    Args:
        x: Array indexed as ``x[:, 0]`` and ``x[:, 1]`` for the two plot axes.
        clusters: Number of clusters requested from the clusterer.
        linkage: Linkage criterion name; also used in the plot title and as
            the prefix of the output filename.

    The figure is written to ``<linkage>HAC2D_cluster`` with ``plt.savefig``
    and the current figure is cleared afterwards.
    """
    # Euclidean distance is the estimator's default, so it is left implicit:
    # the ``affinity`` keyword was deprecated in scikit-learn 1.2 and removed
    # in 1.4, while omitting it behaves the same on every version.
    hac = ac(n_clusters=clusters, linkage=linkage)
    labels = hac.fit_predict(x)

    plt.title("Cluster Map for " + linkage.capitalize() + " Linkage HAC with dataset1")
    plt.scatter(x[:, 0], x[:, 1], c=labels)
    filename = linkage + "HAC2D_cluster"
    plt.savefig(filename)
    plt.clf()
예제 #5
0
파일: hac.py 프로젝트: nserr/SENG474
def generate_3D_plot(x, clusters, linkage):
    """Cluster 3-D points agglomeratively and save a coloured 3-D scatter plot.

    Args:
        x: Array indexed as ``x[:, 0]``, ``x[:, 1]`` and ``x[:, 2]`` for the
            three plot axes.
        clusters: Number of clusters requested from the clusterer.
        linkage: Linkage criterion name; also used in the plot title and as
            the prefix of the output filename.

    The figure is written to ``<linkage>HAC3D_cluster`` and then closed.
    """
    # Euclidean distance is the estimator's default, so it is left implicit:
    # the ``affinity`` keyword was deprecated in scikit-learn 1.2 and removed
    # in 1.4, while omitting it behaves the same on every version.
    hac = ac(n_clusters=clusters, linkage=linkage)
    labels = hac.fit_predict(x)

    fig = plt.figure()
    # Create the 3-D axes through the figure: constructing ``Axes3D(fig)``
    # directly no longer attaches the axes to the figure in current
    # Matplotlib, which would save an empty image. The '3d' projection is
    # registered by the file's existing mplot3d import on old versions.
    ax = fig.add_subplot(111, projection='3d')

    ax.set_title("Cluster Map for " + linkage.capitalize() + " Linkage HAC with dataset2")
    ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=labels)
    filename = linkage + "HAC3D_cluster"
    fig.savefig(filename)
    # Close rather than clear: this function opens a new figure on every
    # call, so clearing alone would leave one open figure behind each time.
    plt.close(fig)
예제 #6
0
# Fit k-means once. ``fit_predict`` fits the model and returns the labels, so
# the centres printed below come from the same fit as the plotted assignment
# (a separate ``fit`` followed by ``fit_predict`` would fit twice).
# Assumes ``kmeans`` is a scikit-learn style estimator -- defined earlier.
y_kmeans = kmeans.fit_predict(points)
print(kmeans.cluster_centers_)

f1 = plt.figure()
plt.title('K-means clustering')
plt.scatter(points[y_kmeans == 0, 0], points[y_kmeans == 0, 1], c='red')
plt.scatter(points[y_kmeans == 1, 0], points[y_kmeans == 1, 1], c='blue')
plt.scatter(points[y_kmeans == 2, 0], points[y_kmeans == 2, 1], c='black')
plt.scatter(points[y_kmeans == 3, 0], points[y_kmeans == 3, 1], c='cyan')
plt.show()

# Agglomerative (Ward) clustering into two groups. Euclidean distance is the
# estimator's default, so the ``affinity`` keyword (deprecated in
# scikit-learn 1.2, removed in 1.4) is omitted.
hc = ac(n_clusters=2, linkage='ward')
y_hc = hc.fit_predict(points)
f2 = plt.figure()

plt.scatter(points[y_hc == 0, 0], points[y_hc == 0, 1], c='red')
plt.scatter(points[y_hc == 1, 0], points[y_hc == 1, 1], c='blue')
# With n_clusters=2 the next two selections are empty; they only matter if
# the cluster count above is raised.
plt.scatter(points[y_hc == 2, 0], points[y_hc == 2, 1], c='black')
plt.scatter(points[y_hc == 3, 0], points[y_hc == 3, 1], c='cyan')
plt.title('Heirarchical Clustering')
plt.show()

# Birch clustering; ``fit_predict`` already fits the model, so no separate
# ``fit`` call is needed.
bir = Birch(n_clusters=2, threshold=0.8, branching_factor=200)
y_bir = bir.fit_predict(points)
 def run(self, clusters=3):
     """Cluster ``self.X`` into ``clusters`` groups and store the labels in ``self.y``."""
     model = ac(n_clusters=clusters)
     self.y = model.fit_predict(self.X)
# Each message is pre-formatted into a single argument so the output is the
# same under the Python 2 print statement and the Python 3 print function.
print("Raw dataset:\n%s" % (dataset.head(),))
x = dataset.iloc[:, [3, 4]].values  # columns at positions 3 and 4 (the 4th and 5th)
print("Independent variables:\n%s" % (x,))

## Using a dendrogram to find the optimal number of clusters
dendrogram = sch.dendrogram(sch.linkage(
    x, method='ward'))  # the Ward method minimizes the variance within each cluster
plt.title('Dendrogram')
plt.xlabel('Customers')
plt.ylabel('Euclidean distances')
plt.show()
# Pick the longest vertical stretch that crosses no horizontal merge line:
# the author reads the optimal number of clusters off the plot as 5.

## Fitting hierarchical clustering to the mall dataset
# Use the same linkage as the one used to build the dendrogram. Euclidean
# distance is the estimator's default, so the ``affinity`` keyword (deprecated
# in scikit-learn 1.2, removed in 1.4) is omitted.
hc = ac(n_clusters=5, linkage='ward')
# y_hc holds one cluster label per row of x; join it with x to analyse the
# behaviour of each cluster.
y_hc = hc.fit_predict(x)
print("Clusters:\n%s" % (y_hc,))

##Visualising the clusters (Only for 2d clustering i.e. 2 columns of interest)
plt.scatter(x[y_hc == 0, 0],
            x[y_hc == 0, 1],
            s=100,
            c='red',
            label='Cluster 1')
plt.scatter(x[y_hc == 1, 0],
            x[y_hc == 1, 1],
            s=100,
            c='blue',