Exemplo n.º 1
0
def main():
    """Cluster the images in ``IMAGE_PATH`` and copy or move them into folders.

    Images are fingerprinted and clustered with similarity index
    ``SIMILARITY``. Every cluster gets its own zero-padded, numbered
    sub-directory below ``<IMAGE_PATH>/<icio.ic_base_dir>/clusters``; images
    that belong to no cluster are placed directly in that directory. A
    pre-existing clusters directory is deleted first.

    ``ACTION`` selects how files are transferred: ``'copy'`` or ``'move'``.
    Any other value transfers nothing (the directories are still created).
    """
    clusters_path = os.path.join(IMAGE_PATH, icio.ic_base_dir, 'clusters')

    # The bottleneck is calc.fingerprints() called in this function, all other
    # operations are very fast. get_image_data() writes fingerprints to disk and
    # loads them again instead of re-calculating them.
    print('\nFingerprinting images...\n')
    images, fingerprints, _ = icio.get_image_data(IMAGE_PATH)
    print('\nImage fingerprinting done.\n')

    # Run clustering on the fingerprints. Select clusters with similarity
    # index SIMILARITY.
    print('\nClustering images...\n')
    clusters = calc.cluster(fingerprints, sim=SIMILARITY)
    print('\nClustering done.\n')

    # Flatten the cluster mapping into a simple list of clusters, each cluster
    # being a collection of images.
    simple_clusters = [
        cluster
        for cluster_list in clusters.values()
        for cluster in cluster_list
    ]

    # Every image that appears in no cluster is "unclustered".
    clustered_images = {
        image for cluster in simple_clusters for image in cluster
    }
    unclustered_images = [
        image for image in images.keys() if image not in clustered_images
    ]

    if ACTION == 'copy':
        print('\nCopying images to clusters...\n')
        transfer = shutil.copy
    elif ACTION == 'move':
        print('\nMoving images to clusters...\n')
        transfer = shutil.move
    else:
        # Unknown action: keep the historical behaviour of transferring nothing.
        transfer = None

    # Remove existing clusters (if present)
    if os.path.exists(clusters_path):
        shutil.rmtree(clusters_path)

    # Always create the clusters directory itself. Without it, a run that
    # yields no clusters would make shutil.copy()/shutil.move() treat
    # clusters_path as a *file name*, so unclustered images would overwrite
    # each other instead of being collected in a directory.
    os.makedirs(clusters_path, exist_ok=True)

    # Transfer images into zero-padded, numbered cluster folders
    cluster_dir_length = len(str(len(simple_clusters)))
    for index, cluster in enumerate(simple_clusters):
        cluster_name = str(index).zfill(cluster_dir_length)
        cluster_dir = os.path.join(clusters_path, cluster_name)

        os.makedirs(cluster_dir)
        if transfer is not None:
            for image in cluster:
                transfer(os.path.abspath(image), cluster_dir)

    # Transfer unclustered images too, directly into the clusters directory
    if transfer is not None:
        for image in unclustered_images:
            transfer(os.path.abspath(image), clusters_path)

    print('\nAll done!')
    print('Clustered images can be found in ' + clusters_path + '\n')
Exemplo n.º 2
0
def test_cluster():
    """Cluster the test image directory via the API and compare to the reference.

    Runs image loading, fingerprinting, PCA and clustering, then checks that
    the resulting clusters match ``ctx.clusters``.
    """
    with ImagedirCtx() as ctx:
        image_arrays = ic.image_arrays(ctx.imagedir, size=(224, 224))
        nn_model = ic.get_model()
        reduced = ic.pca(ic.fingerprints(image_arrays, nn_model),
                         n_components=0.95)
        found = ic.cluster(reduced, sim=0.5)

        # Same cluster-size keys in result and reference.
        assert set(found.keys()) == set(ctx.clusters.keys())

        # Pairwise comparison of clusters, ignoring the order of images
        # inside a cluster.
        for nimg in ctx.clusters.keys():
            pairs = zip(found[nimg], ctx.clusters[nimg])
            for got, want in pairs:
                msg = f"ref_clus: {want}, val_clus: {got}"
                assert set(want) == set(got), msg
Exemplo n.º 3
0
def search_cluster(keyword):
    """Crawl images for `keyword`, then cluster and plot them.

    The crawler searches for `keyword` and downloads images. The images are
    then read from ``downloads/<keyword>``, fingerprinted with the NN model,
    PCA-compressed and clustered; links to the clustered images are created
    below ``downloads/<keyword>/imagecluster/clusters`` and the clusters are
    plotted.

    Parameters
    ----------
    keyword : str
        search term; also used as the name of the download sub-directory

    Raises
    ------
    SystemExit
        with exit status 1 if searching or downloading fails or is
        interrupted; the crawler is stopped first
    """
    directory = 'downloads/%s' % keyword

    print('Starting crawler')

    searcher = crawler()
    try:
        print('Searching for %s' % keyword)
        searcher.search(keyword)
        print('Downloading')
        searcher.download(32)
    except (Exception, KeyboardInterrupt) as err:
        # Stop the crawler, then report the failure instead of silently
        # exiting with a "success" status.
        searcher.stop()
        print('Crawling for %s failed: %r' % (keyword, err), file=sys.stderr)
        sys.exit(1)

    images = icio.read_images(directory, size=(224, 224))

    # Create Keras NN model.
    model = calc.get_model()

    # Feed images through the model and extract fingerprints (feature vectors).
    print('Feeding images to the neural network to extract features')
    fingerprints = calc.fingerprints(images, model)

    # Run a PCA on the fingerprints to compress the dimensions. Use a
    # cumulative explained variance ratio of 0.95.
    fingerprints = calc.pca(fingerprints, n_components=0.95)

    # Run clustering on the fingerprints. Select clusters with similarity index
    # sim=0.5.
    clusters = calc.cluster(fingerprints, sim=0.5)

    # Create dirs with links to images. Dirs represent the clusters the images
    # belong to.
    postproc.make_links(clusters, directory + '/imagecluster/clusters')

    # Plot images arranged in clusters (the figure is not saved here).
    postproc.plot_clusters(clusters, images)
Exemplo n.º 4
0
def test_low_level_api_and_clustering():
    """Exercise the low level API step by step and verify the clustering.

    Calls the individual functions (read images, build model, fingerprints,
    PCA, cluster) and checks fingerprint type and length, the set of
    fingerprinted files and the resulting clusters against the reference
    data of the test image directory.
    """
    with ImagedirCtx() as ctx:
        images = icio.read_images(ctx.imagedir, size=(224, 224))
        nn_model = ic.get_model()
        fingerprints = ic.fingerprints(images, nn_model)

        # Each raw fingerprint must be a numpy vector of length 4096.
        for fingerprint in fingerprints.values():
            assert isinstance(fingerprint, np.ndarray)
            assert len(fingerprint) == 4096, len(fingerprint)

        fingerprints = ic.pca(fingerprints, n_components=0.95)
        found = ic.cluster(fingerprints, sim=0.5)

        assert set(found.keys()) == set(ctx.clusters.keys())

        # One fingerprint per image file, keyed by file name.
        assert len(fingerprints.keys()) == len(ctx.image_fns)
        assert set(fingerprints.keys()) == set(ctx.image_fns)

        # Pairwise comparison of clusters, ignoring the order of images
        # inside a cluster.
        for nimg in ctx.clusters.keys():
            pairs = zip(found[nimg], ctx.clusters[nimg])
            for got, want in pairs:
                msg = f"ref_clus: {want}, val_clus: {got}"
                assert set(want) == set(got), msg
Exemplo n.º 5
0
# Build the Keras NN model used to extract features.
model = calc.get_model()

# Run the images through the model to get fingerprints (feature vectors) and
# optionally compress their dimensions with a PCA, keeping a cumulative
# explained variance ratio of 0.95.
fingerprints = calc.pca(calc.fingerprints(images, model), n_components=0.95)

# Image timestamps are needed to calculate the time distance, which can be
# used in clustering.
timestamps = icio.read_timestamps('pics/')

# Cluster the fingerprints with similarity index sim=0.5, mixing 80% content
# distance with 20% timestamp distance (alpha=0.2).
cluster_options = dict(sim=0.5, timestamps=timestamps, alpha=0.2)
clusters = calc.cluster(fingerprints, **cluster_options)

# Each cluster is represented by a directory holding links to its images.
postproc.make_links(clusters, 'pics/imagecluster/clusters')

# Plot the images arranged in clusters, save the figure and show it.
fig, ax = postproc.plot_clusters(clusters, images)
fig.savefig('foo.png')
postproc.plt.show()
Exemplo n.º 6
0
from imagecluster import calc as ic
from imagecluster import postproc as pp

# Load all images into an in-memory database so they can be fed to the NN
# model quickly.
ias = ic.image_arrays('pics/', size=(224,224))

# Build the Keras NN model used to extract features.
model = ic.get_model()

# Run the images through the model to get fingerprints (feature vectors) and
# optionally compress their dimensions with a PCA, keeping a cumulative
# explained variance ratio of 0.95.
fps = ic.pca(ic.fingerprints(ias, model), n_components=0.95)

# Cluster the fingerprints, selecting clusters with similarity index sim=0.5.
clusters = ic.cluster(fps, sim=0.5)

# Each cluster is represented by a directory holding links to its images.
links_dir = 'pics/imagecluster/clusters'
pp.make_links(clusters, links_dir)

# Show the images arranged in clusters.
pp.visualize(clusters, ias)
Exemplo n.º 7
0
#!/usr/bin/python3

from matplotlib import pyplot as plt
import numpy as np
from scipy.cluster.hierarchy import dendrogram

from imagecluster import calc as ic
from imagecluster import io as icio

# Read images, compute fingerprints and cluster them, also requesting the
# extra clustering output.
images = icio.read_images('pics/', size=(224, 224))
model = ic.get_model()
fingerprints = ic.fingerprints(images, model)
clusters, extra = ic.cluster(fingerprints, sim=0.5, extra_out=True)

# Draw the dendrogram of the linkage matrix Z.
fig, ax = plt.subplots()
linkage_matrix = extra['Z']
dendrogram(linkage_matrix, ax=ax)

# Relabel the y axis (values from Z[:,2]) so that it follows our definition
# of the `sim` parameter: evenly spaced ticks labelled from 1 down to 0.
n_ticks = 5
tick_positions = np.linspace(*ax.yaxis.get_data_interval(), n_ticks)
ax.yaxis.set_ticks(tick_positions)
ax.yaxis.set_ticklabels(np.linspace(1, 0, n_ticks))
ax.set_xlabel("image index")
ax.set_ylabel("sim")

fig.savefig('dendrogram.png')
plt.show()
from matplotlib import pyplot as plt
import numpy as np
from scipy.cluster.hierarchy import dendrogram

from imagecluster import calc as ic

# Load images, compute fingerprints and cluster them, also requesting the
# extra clustering output.
ias = ic.image_arrays('pics/', size=(224, 224))
model = ic.get_model()
fps = ic.fingerprints(ias, model)
clusters, extra = ic.cluster(fps, sim=0.5, extra_out=True)

# The linkage matrix Z is part of the extra output.
Z = extra['Z']

# Draw the dendrogram on a fresh figure.
fig, ax = plt.subplots()
dendrogram(Z, ax=ax)

# Relabel the y axis (values from Z[:,2]) so that it follows our definition
# of the `sim` parameter: evenly spaced ticks labelled from 1 down to 0.
y_low, y_high = ax.yaxis.get_data_interval()
tick_count = 5
ax.yaxis.set_ticks(np.linspace(y_low, y_high, tick_count))
ax.yaxis.set_ticklabels(np.linspace(1, 0, tick_count))
ax.set_xlabel("image index")
ax.set_ylabel("sim")

fig.savefig('dendrogram.png')
plt.show()
Exemplo n.º 9
0
#!/usr/bin/python3

# Minimal example. Use the convenience function io.get_image_data() without any
# extra arguments.

from imagecluster import calc, io as icio, postproc

# get_image_data() calls calc.fingerprints(), which is the bottleneck; all
# other operations are very fast. The fingerprints are written to disk and
# loaded again on later runs instead of being re-calculated.
image_data = icio.get_image_data('downloads/cart icon/')
images, fingerprints, timestamps = image_data

# Cluster the fingerprints, selecting clusters with similarity index sim=0.5.
clusters = calc.cluster(fingerprints, sim=0.5)

# Each cluster is represented by a directory holding links to its images.
links_dir = 'downloads/cart icon/imagecluster/clusters'
postproc.make_links(clusters, links_dir)

# Show the images arranged in clusters.
postproc.visualize(clusters, images)
Exemplo n.º 10
0
def main(imagedir, sim=0.5, layer='fc2', size=(224,224), links=True, vis=False,
         max_csize=None, pca=False, pca_params=dict(n_components=0.9)):
    """Run the example clustering pipeline on the images in `imagedir`.

    The first run builds the image array and fingerprint databases and
    writes them to disk. Later runs load them and only redo
        * clustering
        * creation of links to files in clusters
        * visualization (if `vis=True`)

    That makes it cheap to experiment with parameters such as `sim`, which
    only influence clustering.

    Parameters
    ----------
    imagedir : str
        path to directory with images
    sim : float (0..1)
        similarity index (see :func:`calc.cluster`)
    layer : str
        which layer to use as feature vector (see
        :func:`calc.get_model`)
    size : tuple
        input image size (width, height), must match `model`, e.g. (224,224)
    links : bool
        create dirs with links
    vis : bool
        plot images in clusters
    max_csize : max number of images per cluster for visualization (see
        :mod:`~postproc`)
    pca : bool
        Perform PCA on fingerprints before clustering, using `pca_params`.
    pca_params : dict
        kwargs to sklearn's PCA

    Notes
    -----
    imagedir : To select only a subset of the images, create an `imagedir` and
        symlink your selected images there. In the future, we may add support
        for passing a list of files, should the need arise. But then again,
        this function is only an example front-end.
    """
    fps_fn = pj(imagedir, ic_base_dir, 'fingerprints.pk')
    ias_fn = pj(imagedir, ic_base_dir, 'images.pk')
    ias = None

    if os.path.exists(fps_fn):
        # Fast path: reuse the fingerprints computed by an earlier run.
        print(f"loading fingerprints database {fps_fn} ...")
        fps = co.read_pk(fps_fn)
    else:
        print(f"no fingerprints database {fps_fn} found")
        os.makedirs(os.path.dirname(fps_fn), exist_ok=True)
        model = ic.get_model(layer=layer)
        if os.path.exists(ias_fn):
            ias = co.read_pk(ias_fn)
        else:
            print(f"create image array database {ias_fn}")
            ias = ic.image_arrays(imagedir, size=size)
            co.write_pk(ias, ias_fn)
        print("running all images through NN model ...")
        fps = ic.fingerprints(ias, model)
        co.write_pk(fps, fps_fn)

    if pca:
        fps = ic.pca(fps, **pca_params)
        first_fingerprint = list(fps.values())[0]
        print("pca dims:", first_fingerprint.shape[0])

    print("clustering ...")
    clusters = ic.cluster(fps, sim)

    if links:
        clusters_dir = pj(imagedir, ic_base_dir, 'clusters')
        pp.make_links(clusters, clusters_dir)

    if vis:
        # The image arrays are only in memory if fingerprints were computed
        # in this run; otherwise load them from disk.
        arrays = co.read_pk(ias_fn) if ias is None else ias
        pp.visualize(clusters, arrays, max_csize=max_csize)