Example #1
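The snippets below are shown without their imports; `agg` appears throughout as an alias for scikit-learn's AgglomerativeClustering. A minimal preamble (an assumption, not part of the original source) that resolves the shared names:

from sklearn.cluster import AgglomerativeClustering as agg
from sklearn.metrics import silhouette_score

Per-snippet names (pandas as pd, matplotlib.pyplot as plt, SciPy's dendrogram, PCA, DBSCAN, ray, and so on) are likewise assumed to be imported in their original modules.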
def train(data):
    # HAC.preprocess is assumed to return (features, labels); only the
    # features are used for unsupervised clustering.
    X, _ = HAC.preprocess(data)
    test_data = X
    time_constraint = data.time_constraint

    ### Optimize over the number of clusters (and, for larger time budgets,
    ### over the distance metric as well): a higher time_constraint buys a
    ### wider hyperparameter search.
    search_space = {
        1: ([2, 4, 8], ['euclidean']),
        2: ([2, 3, 4, 5, 6, 7, 8, 9, 10], ['euclidean']),
        3: ([2, 4, 8], ['euclidean', 'manhattan']),
        4: ([2, 3, 4, 5, 6, 7, 8], ['euclidean', 'manhattan']),
        5: ([2, 3, 4, 5, 6, 7, 8], ['euclidean', 'manhattan', 'l1', 'l2', 'cosine']),
    }
    cluster_counts, metrics_to_try = search_space[time_constraint]

    best_score = -1.1  # silhouette scores lie in [-1, 1]
    best_results = None
    for number in cluster_counts:
        for func in metrics_to_try:
            results = agg(n_clusters=number, affinity=func).fit_predict(test_data)
            score = silhouette_score(test_data, results)  # compute once, not twice
            if score > best_score:
                best_score = score
                best_results = results

    # Return the best labelling found, not the labels from the last fit.
    return test_data, None, best_results, None, None
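`HAC.preprocess` is never defined in this snippet. A hypothetical stand-in, assuming it just standardizes a feature matrix held on the data object and returns an (X, y) pair; the attribute name `data.X` is an illustration, not taken from the source:

import numpy as np
from sklearn.preprocessing import StandardScaler

class HAC:
    @staticmethod
    def preprocess(data):
        # Hypothetical: scale the feature matrix and pass labels through.
        # The real project-specific preprocessing is not shown in the source.
        X = StandardScaler().fit_transform(np.asarray(data.X, dtype=float))
        return X, getattr(data, 'y', None)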
Example #2
def single_link_cluster():

    #data = 'dataset1.csv'
    data = 'dataset2.csv'

    # Read in the data
    df = pd.read_csv(data)

    #x_val = df.iloc[:, [0, 1]].values
    x_val = df.iloc[:, [0, 1, 2]].values

    hac = agg(n_clusters=None, distance_threshold=1, linkage='single')
    hac.fit(x_val)

    # Formulate matrix of linkages for dendrogram plotting
    linkages = create_dendrogram(hac)
    dendrogram(linkages, truncate_mode='lastp')

    plt.title("Dendrogram for Single Linkage HAC on " + data)
    #plt.savefig("singleHAC2D_dendrogram.png")
    plt.savefig("singleHAC3D_dendrogram.png")
    plt.clf()

    #single_cluster2D(x_val)
    single_cluster3D(x_val)
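`create_dendrogram` is not shown. A sketch assuming it follows the standard scikit-learn recipe for converting a fitted model into a SciPy linkage matrix, counting the samples under each merge (this requires `distance_threshold` to be set, as it is above, so that `distances_` exists):

import numpy as np

def create_dendrogram(model):
    # Linkage matrix rows: [child_a, child_b, merge_distance, sample_count].
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # original sample (leaf)
            else:
                current_count += counts[child_idx - n_samples]  # earlier merge
        counts[i] = current_count
    return np.column_stack(
        [model.children_, model.distances_, counts]).astype(float)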
Example #3
def radviz_sort_features(matrix, reduce=4):
    # Cluster the columns (features) hierarchically, then reorder them by
    # the order in which they appear in the merge tree, so that similar
    # features end up adjacent (useful for RadViz-style plots).
    Agg = agg(n_clusters=None, distance_threshold=0)
    Agg.fit(matrix.T)
    # children_ contains each leaf index exactly once, so filtering for
    # indices below n_features yields a permutation of the features.
    sorted_ft = [a for a in Agg.children_.flatten() if a < matrix.shape[1]]
    # Optionally thin the list, keeping one feature per group of `reduce`
    # consecutive features.
    if reduce:
        sorted_ft = [g[0] for g in grouper(sorted_ft, reduce)]
    return matrix[:, sorted_ft]
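`grouper` is not defined here either; it most likely follows the classic itertools recipe for collecting an iterable into fixed-length chunks (an assumption). Since padding only affects the tail of the last chunk, taking `g[0]` above always yields a real feature index:

from itertools import zip_longest

def grouper(iterable, n, fillvalue=None):
    # Collect data into fixed-length chunks:
    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)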
Example #4
def single_cluster2D(x_val):

    hac = agg(n_clusters=28, affinity='euclidean', linkage='single')
    hac.fit_predict(x_val)

    plt.title("Cluster Map for Single Linkage HAC on dataset1.csv")
    plt.scatter(x_val[:, 0], x_val[:, 1], c=hac.labels_)
    plt.savefig("singleHAC2D_cluster.png")
    plt.clf()
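`single_cluster3D`, called at the end of Example #2, is not shown. A sketch assuming it mirrors single_cluster2D above and average_cluster3D in Example #5; the cluster count of 28 is carried over from the 2-D variant and is an assumption:

def single_cluster3D(x_val):
    # Hypothetical 3-D analogue of single_cluster2D, following the pattern
    # of average_cluster3D in Example #5.
    hac = agg(n_clusters=28, affinity='euclidean', linkage='single')
    hac.fit_predict(x_val)

    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')
    ax.set_title("Cluster Map for Single Linkage HAC on dataset2.csv")
    ax.scatter(x_val[:, 0], x_val[:, 1], x_val[:, 2], c=hac.labels_)
    plt.savefig("singleHAC3D_cluster.png")
    plt.clf()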
Example #5
def average_cluster3D(x_val):

    hac = agg(n_clusters=26, affinity='euclidean', linkage='average')
    hac.fit_predict(x_val)

    fig = plt.figure()

    # Axes3D(fig) no longer attaches itself to the figure in recent
    # matplotlib; add_subplot(projection='3d') is the supported route.
    a = fig.add_subplot(projection='3d')
    a.set_title("Cluster Map for Average Linkage HAC on dataset2.csv")
    a.scatter(x_val[:, 0], x_val[:, 1], x_val[:, 2], c=hac.labels_)

    plt.savefig("averageHAC3D_cluster.png")
Example #6
@ray.remote
def make_tree_parallel_agg(data: np.ndarray, names: np.ndarray) -> Tree:
    """
    Build one bootstrap tree: rows of the data matrix are sampled with
    replacement, clustered hierarchically, and converted to a Tree.
    (Declared as a ray remote task so Example #8 can call .remote() on it.)
    :param data: feature matrix, one row per named item
    :param names: item names aligned with the rows of data
    :return: bootstrap replicate tree
    """
    # Re-seed per task so parallel workers draw different bootstrap samples.
    np.random.seed(randint(0, 1000000))
    selected_ids = np.random.choice(np.arange(names.shape[0]),
                                    size=names.shape[0],
                                    replace=True)
    hc = agg()
    hc.fit(data[selected_ids])
    return Tree.from_sklearn(hc, names=names[selected_ids])
Example #7
def cluster(M, num_cluster, acc_list, dist_func, smooth_alpha):
    """
    @param M: the matrix of data points to be clustered; each column is a market
    @param num_cluster: number of clusters, user-defined
    @param acc_list: accuracies, with the ith entry being the model accuracy for
                     the ith market's model; this sets each cluster's weightage.
                     Clusters with a higher average accuracy get a higher weightage.
    @param dist_func: func(x, y) returning the distance between x and y
    @param smooth_alpha: smoothing window applied to each market to make the
                         clustering easier; e.g. smooth_alpha=5 uses weekly
                         average prices instead of daily prices directly
    """
    M = normalize(smooth(M, smooth_alpha))
    D = compute_dist_matrix(M, dist_func)
    clusters = agg(n_clusters=num_cluster,
                   affinity='precomputed',
                   linkage='average').fit_predict(D)
    # Weight each cluster by the mean accuracy of its members, normalized
    # so the weights sum to one.
    acc_list = np.asarray(acc_list)  # allow boolean-mask indexing below
    weightage = np.ones(num_cluster)
    for i in range(num_cluster):
        weightage[i] = np.mean(acc_list[clusters == i])
    weightage = weightage / np.sum(weightage)
    return clusters, weightage
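`smooth`, `normalize`, and `compute_dist_matrix` are project helpers that are not shown. A plausible sketch of the last one, assuming it builds the symmetric pairwise distance matrix over columns (markets) that the precomputed-affinity clustering above expects:

import numpy as np

def compute_dist_matrix(M, dist_func):
    # Hypothetical: pairwise distances between the columns of M, suitable
    # for AgglomerativeClustering(affinity='precomputed').
    n = M.shape[1]
    D = np.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            D[i, j] = D[j, i] = dist_func(M[:, i], M[:, j])
    return D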
Example #8
def prepare_bootstrap_trees_agg(
        data_array: np.ndarray,
        names: Optional[List[str]] = None,
        iteration: int = 10,
        n_threads: int = 1,
        linkage: str = "average") -> Tuple[Tree, List[Tree]]:
    if names is None:
        names = [str(x) for x in range(data_array.shape[0])]
    hc = agg(linkage=linkage)
    hc.fit(data_array)
    tree: Tree = Tree.from_sklearn(hc, names)
    names = np.array(names)
    ray.init(num_cpus=n_threads)
    names_ray = ray.put(names)
    data_ray = ray.put(data_array)
    other_trees: List[Tree] = ray.get([
        make_tree_parallel_agg.remote(data_ray, names_ray)
        for _ in range(iteration)
    ])
    ray.shutdown()
    return tree.root, other_trees


def purity_score(y_true, y_pred):
    # Purity: assign each predicted cluster to its majority true class and
    # measure the fraction of correctly assigned points.
    contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
    return np.sum(np.amax(contingency_matrix,
                          axis=0)) / np.sum(contingency_matrix)


df = pd.read_csv(r"D:\sem3\ds3\data_science_3\lab11\inLab\Iris.csv")
x = list(df["Species"])
df1 = df.iloc[:, 1:5]

# Fit a full PCA (presumably to inspect explained variance; unused below),
# then project onto the first two components for plotting and clustering.
pca = PCA(n_components=4).fit(df1)
reduced_data = PCA(n_components=2).fit_transform(df1)

X, Y = zip(*reduced_data)
plt.scatter(X, Y)  # preview of the PCA projection before clustering

agg_clustering = agg(n_clusters=3).fit(reduced_data)
plt.scatter(X, Y, c=agg_clustering.labels_)
plt.title("Agglomerative_clustering_model")
plt.show()
print("Agglomerative_clustering_model purity score is ",
      purity_score(x, agg_clustering.labels_))

print("DBSCAN_clustering_model " + "#" * 40)
EPS = [0.05, 0.5, 0.95]
MIN_SAMPLES = [1, 5, 10, 20]
for eps_ in EPS:
    for min_ in MIN_SAMPLES:
        dbscan_clustering = DBSCAN(eps=eps_,
                                   min_samples=min_).fit(reduced_data)
        # The original snippet is truncated here; reporting purity for each
        # setting (an assumed completion) mirrors the agglomerative
        # evaluation above.
        print("eps =", eps_, "min_samples =", min_, "purity score:",
              purity_score(x, dbscan_clustering.labels_))