Example #1
import numpy as np

# `Clustering` (with its `Clustering.load` helper) is assumed to be
# provided elsewhere in the project; only numpy is imported here.


def do_compute(reference_txt, pre_clustering_txt, groundtruth_npy):

    # load reference clusters
    reference = Clustering.load(reference_txt)

    # load hypothesis clusters
    hypothesis = Clustering.load(pre_clustering_txt)

    # number of hypothesis clusters
    nPreClusters = len(hypothesis.clusters)
    preClusters = sorted(hypothesis.clusters)

    # groundtruth[i, j] contains
    # 1 if all elements of hypothesis clusters i and j belong to the same reference cluster
    # 0 if they belong to two different reference clusters
    # -1 if either cluster i or j is not pure (i.e. spans several reference clusters)
    groundtruth = np.empty((nPreClusters, nPreClusters), dtype=int)

    # clustersRef[c] contains reference cluster for pure hypothesis cluster c
    # in case c is not pure, clustersRef[c] is None
    clustersRef = {}
    for c in preClusters:
        r = {reference[i] for i in hypothesis.clusters[c]}
        if len(r) == 1:
            clustersRef[c] = r.pop()
        else:
            clustersRef[c] = None

    # NOTE: cluster labels are used directly as matrix indices below,
    # which assumes they are exactly the integers 0 .. nPreClusters - 1
    for k, ci in enumerate(preClusters):
        if clustersRef[ci] is None:
            groundtruth[ci, :] = -1
            groundtruth[:, ci] = -1
            continue
        for cj in preClusters[k:]:
            if clustersRef[cj] is not None:
                groundtruth[ci, cj] = clustersRef[ci] == clustersRef[cj]
                groundtruth[cj, ci] = groundtruth[ci, cj]

    # save groundtruth matrix
    np.save(groundtruth_npy, groundtruth)
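
For context, here is a minimal invocation sketch; the three paths are hypothetical placeholders, and the two text files must be in whatever format Clustering.load expects:

# hypothetical paths -- adjust to your own data layout
do_compute('reference.txt',       # reference clustering (text format)
           'pre_clustering.txt',  # hypothesis clustering (text format)
           'groundtruth.npy')     # output file for the (n, n) matrix

# the saved matrix can be reloaded later with:
groundtruth = np.load('groundtruth.npy')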
Example #2
import numpy as np

# As in Example #1, `Clustering` is assumed to be provided elsewhere.


def do_it(image_txt, features_npy, clustering_txt, output_npy):

    # load image list
    with open(image_txt, 'r') as f:
        images = [int(line.strip()) for line in f]
        image2index = {image: index for index, image in enumerate(images)}

    # load hypothesis clusters
    clustering = Clustering.load(clustering_txt)
    clusters = sorted(clustering.clusters)

    # load features
    features = np.load(features_npy)

    # L2 normalization (for later dot product)
    features = (features.T / np.sqrt(np.sum((features**2), axis=1))).T

    # find centroid image for every cluster
    centroid = {}
    for cluster in clusters:

        # list of images in current cluster
        _images = clustering.clusters[cluster]

        # corresponding indices in features matrix
        _indices = [image2index[image] for image in _images]

        # compute distance matrix between
        # all images of current cluster
        _features = features[_indices, :]
        _distance = 1. - np.dot(_features, _features.T)

        # find centroid image
        i = np.argmin(np.sum(_distance, axis=0))
        centroid[cluster] = _images[i]

        print('image %s is centroid of cluster %s' % (centroid[cluster],
                                                      cluster))

    # centroid indices in features matrix
    _indices = [image2index[centroid[cluster]] for cluster in clusters]

    # compute distance matrix between all centroids
    _features = features[_indices, :]
    _distance = 1. - np.dot(_features, _features.T)

    # save distance matrix
    with open(output_npy, 'wb') as f:
        np.save(f, _distance)
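
The L2 normalization in do_it exists because, for unit-length vectors, cosine distance reduces to one minus the dot product. A minimal self-contained sketch of that identity, using random data with hypothetical shapes:

import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((5, 16))  # 5 hypothetical feature vectors

# L2-normalize each row, as do_it() does before taking dot products
Xn = X / np.linalg.norm(X, axis=1, keepdims=True)

# cosine distance via dot product on the normalized vectors
d_dot = 1. - np.dot(Xn, Xn.T)

# the same distances from the definition of cosine similarity
norms = np.linalg.norm(X, axis=1)
d_cos = 1. - np.dot(X, X.T) / np.outer(norms, norms)

assert np.allclose(d_dot, d_cos)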
Example #3
# fileID, fileREF, fileOUT, readjson, clusterUser, clusterDate, Clustering,
# homogeneity and completeness are assumed to be defined earlier in the script.

def print_result_file(cluster, filename):
    # write one "photo<TAB>cluster" line per photo
    with open(filename, "w") as f:
        for k, v in cluster.items():
            for photo in v:
                f.write("%d\t%d\n" % (photo, k))


print("Loading json into memory...")
dictionary = readjson(
    "/vol/corpora4/mediaeval/2014/SED_2014_Dev_Metadata.json")
print("...Done!")

clusterU = clusterUser(dictionary, fileID)
clusterD = clusterDate(dictionary, fileID, clusterU)

print_result_file(clusterD, fileOUT)

reference = Clustering.load(fileREF)
hypothesis = Clustering.load(fileOUT)

images = []
for c in clusterD.values():
    images.extend(c)

h = homogeneity(reference, hypothesis, images)
print(h)
c = completeness(reference, hypothesis, images)
print(c)
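
The homogeneity and completeness calls above appear to be project-local helpers; as a cross-check, scikit-learn ships the same standard clustering metrics, operating on two parallel label lists. A minimal sketch with hypothetical label values:

from sklearn.metrics import homogeneity_score, completeness_score

# one reference label and one hypothesis label per image, in the same order
labels_true = [0, 0, 1, 1, 2]  # hypothetical reference cluster ids
labels_pred = [0, 0, 1, 2, 2]  # hypothetical hypothesis cluster ids

print(homogeneity_score(labels_true, labels_pred))   # 1.0 iff every predicted cluster is pure
print(completeness_score(labels_true, labels_pred))  # 1.0 iff every true cluster stays together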