Пример #1
0
def clusterize_aggroCluster(videos_hash, nb_clusters):
    """Cluster videos with Ward-linkage agglomerative clustering and score it.

    Args:
        videos_hash: Input forwarded unchanged to ``createInput``.
        nb_clusters: Number of clusters to form.

    Returns:
        Tuple ``(ri, videos_indice, aggroCluster, listCluster)``: the score
        returned by ``rand_index``, the index structure returned by
        ``createInput``, the fitted estimator, and the cluster list built by
        ``createClusterList``.
    """
    videos_indice, X = createInput(videos_hash)
    # Ward linkage only accepts the Euclidean metric, which is already the
    # default, so it is not passed explicitly (the `affinity` keyword is
    # deprecated in newer scikit-learn releases).
    aggroCluster = AgglomerativeClustering(n_clusters=nb_clusters, linkage='ward').fit(X)
    # Reuse the labels computed by the fit above instead of fitting the
    # whole model a second time through fit_predict().
    listCluster = createClusterList(aggroCluster.labels_, videos_indice)
    ri = rand_index(listCluster)
    print(ri)
    return ri, videos_indice, aggroCluster, listCluster
Пример #2
0
def clusterize_kmeans(videos_hash, nb_clusters):
    """Cluster videos with k-means, print the score and return the results.

    Args:
        videos_hash: Input forwarded unchanged to ``createInput``.
        nb_clusters: Number of clusters to form.

    Returns:
        Tuple ``(ri, videos_indice, kmeans, listCluster)``: the score returned
        by ``rand_index``, the index structure returned by ``createInput``,
        the fitted estimator, and the cluster list built by
        ``createClusterList``.
    """
    videos_indice, X = createInput(videos_hash)

    # Fixed seed so repeated runs give the same clustering.
    estimator = KMeans(
        n_clusters=nb_clusters,
        random_state=0,
        n_init=5,
        max_iter=300,
    )
    kmeans = estimator.fit(X)

    assignments = kmeans.predict(X)
    listCluster = createClusterList(assignments, videos_indice)

    ri = rand_index(listCluster)
    print(ri)
    return ri, videos_indice, kmeans, listCluster
Пример #3
0
def main():
    """Group observation names by class from the input lines and score them.

    Lines are read through ``fileinput.input()``. Every non-blank line must
    split (on single spaces, then on dots) into exactly three parts:
    ``<class> <observation>.<extension>``. Blank lines are skipped.

    Raises:
        ValueError: If a non-blank line does not split into exactly three
            parts.
    """
    classes = {}
    for line in fileinput.input():
        # Drop the trailing newline so blank lines can be skipped instead of
        # crashing the unpacking below.
        line = line.strip()
        if not line:
            continue
        class_name, observation_name, _extension = [
            p for s in line.split(' ') for p in s.split('.')
        ]
        classes.setdefault(class_name, set()).add(observation_name)

    for class_name, members in classes.items():
        print(class_name, members)

    # One set of observation names per class.
    rand_index(list(classes.values()))
Пример #4
0
def comparison(labels, names, n_clusters=970):
    """Group names by cluster label, print the groups and score them.

    Args:
        labels: Sequence of integer cluster indices, one per entry of
            ``names``; each must be a valid index into a list of
            ``n_clusters`` sets.
        names: Sequence of names, parallel to ``labels``.
        n_clusters: Number of sets to allocate. Defaults to 970, the value
            that used to be hard-coded.

    Returns:
        Whatever ``rand_index`` returns for the list of name sets.

    Raises:
        IndexError: If a label is outside the range of allocated sets.
    """
    # Build the sets exactly once; every (label, name) pair is used rather
    # than a fixed count of 9700 entries.
    sets = [set() for _ in range(n_clusters)]
    for label, name in zip(labels, names):
        sets[label].add(name)
    print(sets)
    return rand_index(sets)
Пример #5
0
def main(n_clusters, do_weight, cluster):
    """Build video representations across MPI ranks, then cluster on rank 0.

    Rank 0 loads the file names, deals them round-robin into one chunk per
    rank and scatters the chunks. Every rank computes the representation of
    its own files; the results are gathered on rank 0, which clusters them,
    scores the clustering with ``rand_index`` and prints the score and the
    elapsed time. Relies on the module-level names ``comm``, ``rank`` and
    ``size``.

    Args:
        n_clusters: Number of clusters; also forwarded to ``load_filenames``
            and ``rand_index``.
        do_weight: Forwarded to ``generate_video_representation``.
        cluster: Clustering method, one of ``'kmeans'``, ``'gmm'``, ``'ac'``.

    Raises:
        ValueError: If ``cluster`` is not one of the supported methods.
    """
    # Fail before any work is done instead of hitting a NameError on
    # `clusters` after all the hashing has already been paid for.
    if cluster not in ('kmeans', 'gmm', 'ac'):
        raise ValueError("unknown clustering method: %r" % (cluster,))

    # divide video file paths into chunks to be sent to processes
    if rank == 0:
        start_time = time.time()
        video_files = load_filenames(n_clusters=n_clusters)
        video_names = [path_to_name(path) for path in video_files]
        chunks = [[] for _ in range(size)]
        for i, chunk in enumerate(zip(video_files, video_names)):
            chunks[i % size].append(chunk)
    else:
        chunks = None
    comm.Barrier()

    # scatter data to each process, handle hashing in each process
    video_files_and_names = comm.scatter(chunks, root=0)
    # Build the records one by one so that an empty chunk (more ranks than
    # videos) simply contributes an empty list.
    local_results = []
    for path, name in video_files_and_names:
        video, weight = generate_video_representation(path, do_weight)
        local_results.append((video, weight, name))

    # gather the result from each process
    data = comm.gather(local_results, root=0)

    # only do clustering in process 0, which gathered all data
    if rank != 0:
        return

    videos, weights, video_names = zip(
        *[record for part in data for record in part])
    videos = np.asarray(list(videos))

    # cluster the videos
    if cluster == 'kmeans':
        clusters = cluster_videos_kmeans(videos, weights, video_names,
                                         n_clusters)
    elif cluster == 'gmm':
        clusters = cluster_videos_gmm(videos,
                                      weights,
                                      video_names,
                                      n_clusters,
                                      cov_type="diag")
    else:  # 'ac' -- validated at the top of the function
        clusters = cluster_videos_ac(videos, weights, video_names,
                                     n_clusters)
    time_end = time.time() - start_time

    # score the clustering method
    score = rand_index(clusters, n_clusters)

    # Single pre-formatted string: prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print("Scores:  %s \nExecution time: %s" % (np.round(score, 2), time_end))
Пример #6
0
def main(n_clusters, do_weight, cluster):
    """Build video representations, cluster them and print the score.

    Args:
        n_clusters: Number of clusters; also forwarded to ``load_filenames``
            and ``rand_index``.
        do_weight: Forwarded to ``generate_video_representation``.
        cluster: Clustering method, one of ``'kmeans'``, ``'gmm'``, ``'ac'``.

    Raises:
        ValueError: If ``cluster`` is not one of the supported methods.
    """
    # Fail before any work is done instead of hitting a NameError on
    # `clusters` once all representations have been computed.
    if cluster not in ('kmeans', 'gmm', 'ac'):
        raise ValueError("unknown clustering method: %r" % (cluster,))

    video_files = load_filenames(n_clusters=n_clusters)
    # A real list (not a one-shot map iterator) so the names can be handed
    # to the clustering helpers on Python 3 as well.
    video_names = [path_to_name(path) for path in video_files]
    videos, weights = zip(*[
        generate_video_representation(path, do_weight)
        for path in video_files
    ])
    videos = np.asarray(list(videos))

    # cluster the videos
    if cluster == 'kmeans':
        clusters = cluster_videos_kmeans(videos, weights, video_names,
                                         n_clusters)
    elif cluster == 'gmm':
        clusters = cluster_videos_gmm(videos, weights, video_names, n_clusters)
    else:  # 'ac' -- validated at the top of the function
        clusters = cluster_videos_ac(videos, weights, video_names, n_clusters)

    # score the clustering method
    score = rand_index(clusters, n_clusters)

    print(score)
Пример #7
0
def pred_to_clusters(list_filenames, pred, n_clusters):
    """Turn per-file cluster predictions into name sets and score them.

    Args:
        list_filenames: File paths; each is converted with ``path_to_name``.
        pred: Cluster index for each file, in the same order as
            ``list_filenames``.
        n_clusters: Number of sets to allocate; also forwarded to
            ``adjusted_rand_index.rand_index``.

    Returns:
        The value returned by ``adjusted_rand_index.rand_index``.
    """
    names = list(map(path_to_name, list_filenames))

    # One (initially empty) set of names per cluster index.
    clusters = []
    for _ in range(n_clusters):
        clusters.append(set())

    for position, cluster_id in enumerate(pred):
        member = names[position]
        clusters[cluster_id].add(member)

    score = adjusted_rand_index.rand_index(clusters, n_clusters)
    return score
Пример #8
0
def check_res(result_sets):
    """Print the score ``adjusted_rand_index.rand_index`` gives ``result_sets``.

    Args:
        result_sets: Clustering result forwarded unchanged to
            ``adjusted_rand_index.rand_index``.
    """
    # Imported lazily, only when a result actually has to be checked.
    import adjusted_rand_index

    # Delegate the scoring to the provided script and show its result.
    score = adjusted_rand_index.rand_index(result_sets)
    print(score)
Пример #9
0
	# Persist the features and the names with pickle. The files are opened
	# in binary mode, so despite the .txt extension they are not plain text.
	with open("features.txt", "wb") as fp:   # pickled features
		pickle.dump(features, fp)
	with open("names.txt", "wb") as fb:   # pickled names
		pickle.dump(names, fb)
	#=========================================

	#================CLUSTERING =================
	# One label per entry of `names` is expected back (the labels are used
	# below to reorder and split `names`).
	labels = cluster_videos(features,names)
	#============================================
	
	# Timestamp taken right after clustering; t0 and t1 come from earlier in
	# this function (outside this excerpt) -- presumably the start of
	# processing and the start of clustering, judging by the report below.
	t2 = datetime.now()

	#===========CHECK ADJ. RAND INDEX =====================

	# Sort the video names by their cluster label, then cut the sorted list
	# wherever the label value increases, giving one group per distinct label.
	sidx = np.argsort(labels)
	# Positions where the sorted label changes; +1 turns each into the start
	# index of the next group.
	split_idx = np.flatnonzero(np.diff(np.take(labels,sidx))>0)+1
	out = np.split(np.take(names,sidx,axis=0), split_idx)
	# One set of names per cluster.
	clusters = list(set(L) for L in out)

	# Score the clustering with ARI.rand_index (the reference clustering is
	# presumably loaded inside that helper -- confirm).
	score = ARI.rand_index(clusters)

	#=============PRINT THE TIMING AND SCORE REPORT================
	print('Computational time')
	print('Processing: {}\nClustering: {}\nOverall: {}\n'.format(t1-t0,t2-t1,t2-t0))
	print('Adj. Rand Index: {:.5f}'.format(score))
	#=======================================================


Пример #10
0
print("Using " + str(full_data_feat.shape[1]) + " features.")

# Number of clusters requested from the model and allocated for scoring.
N_CLUSTERS = 970

# Compute the pairwise Hamming distance matrix (count of differing features
# per pair of rows) -- a distance, not a similarity, despite the variable name.
# Broadcasting materialises an N x N x F boolean array before the sum, so
# memory use grows quadratically with the number of rows.
print("Computing distances...")
sim_matrix = (full_data_feat[:, None, :] != full_data_feat).sum(2)

# Bottom up hierarchical clustering with complete/maximal cluster distance
print("Clustering data...")
# NOTE: newer scikit-learn releases rename `affinity` to `metric`; switch the
# keyword when upgrading.
model = AgglomerativeClustering(n_clusters=N_CLUSTERS,
                                linkage="complete",
                                affinity="precomputed")
# Input the precomputed distance matrix and fit the data to the model
model.fit(sim_matrix)

# Print snippets of the assigned clusters
print(model.labels_)

# For each video get the cluster id and store the name of the video
# (without its extension) in that cluster's set. Starting from empty sets
# keeps the list homogeneous instead of mixing sets with a -1 sentinel.
clusters = [set() for _ in range(N_CLUSTERS)]
for i, name in enumerate(full_data_name):
    cluster_idx = model.labels_[i]
    clusters[cluster_idx].add(name.split('.')[0])

# Compute adjusted rand index and report the time used
print("Final rand index = ", rand_index(clusters))
print("Query took %0.2f seconds" % (time.time() - t0))