def test_cluster():
    """Run the API pipeline on the test image dir and check the clusters.

    Images are loaded with ``ic.image_arrays``, fingerprinted with the model
    from ``ic.get_model``, PCA-reduced (``n_components=0.95``) and clustered
    with ``sim=0.5``. The result must have the same keys as the reference
    ``ctx.clusters``, and each cluster must contain the same members as its
    reference counterpart (member order is ignored).
    """
    with ImagedirCtx() as ctx:
        arrays = ic.image_arrays(ctx.imagedir, size=(224, 224))
        net = ic.get_model()
        reduced = ic.pca(ic.fingerprints(arrays, net), n_components=0.95)
        found = ic.cluster(reduced, sim=0.5)

        # Same set of keys in result and reference.
        assert set(found.keys()) == set(ctx.clusters.keys())

        # Pairwise comparison of clusters stored under the same key.
        for key in ctx.clusters.keys():
            pairs = zip(found[key], ctx.clusters[key])
            for val_clus, ref_clus in pairs:
                msg = f"ref_clus: {ref_clus}, val_clus: {val_clus}"
                assert set(ref_clus) == set(val_clus), msg
def search_cluster(keyword):
    """Cluster the images located in ``downloads/<keyword>``.

    Steps visible here: create a ``crawler``, read the images from the
    keyword's download directory, compute fingerprints with the model from
    ``calc.get_model``, reduce them by PCA, cluster them with ``sim=0.5``,
    create link directories for the clusters and plot them.

    keyword
        search term; it is interpolated into the download directory path and
        into the progress message
    """
    directory = 'downloads/%s' % keyword

    print('Starting crawler')

    searcher = crawler()
    # NOTE(review): the next two lines are indented deeper than the assignment
    # above although no block is opened, so this function does not parse as
    # shown. A block header (e.g. `with ...:` or `try:`) appears to be missing
    # -- confirm against the original file.
        print('Searching for %s' % keyword)
        # presumably 32 is the number of images to download -- TODO confirm.
        # `files` is not used further down.
        files = searcher.download(32)

    images = icio.read_images(directory, size=(224, 224))

    # Create Keras NN model.
    model = calc.get_model()

    # Feed images through the model and extract fingerprints (feature vectors).
    print('Feeding images to the neural network to extract features')
    fingerprints = calc.fingerprints(images, model)

    # Optionally run a PCA on the fingerprints to compress the dimensions. Use a
    # cumulative explained variance ratio of 0.95.
    fingerprints = calc.pca(fingerprints, n_components=0.95)

    # Run clustering on the fingerprints. Select clusters with similarity index
    # sim=0.5.
    clusters = calc.cluster(fingerprints, sim=0.5)

    # Create dirs with links to images. Dirs represent the clusters the images
    # belong to.
    postproc.make_links(clusters, directory + '/imagecluster/clusters')

    # Plot images arranged in clusters.
    # NOTE(review): the returned fig/ax are neither saved nor returned here.
    fig, ax = postproc.plot_clusters(clusters, images)
def test_low_level_api_and_clustering():
    """Call every low-level pipeline function and validate each stage.

    Checks, in order:

    * every raw fingerprint is a ``np.ndarray`` of length 4096
    * after PCA (``n_components=0.95``) and clustering (``sim=0.5``) the
      cluster keys equal those of the reference ``ctx.clusters``
    * the fingerprint keys are exactly ``ctx.image_fns``
    * each cluster has the same members as its reference counterpart
      (member order is ignored)
    """
    with ImagedirCtx() as ctx:
        imgs = icio.read_images(ctx.imagedir, size=(224, 224))
        net = ic.get_model()

        # Raw feature vectors straight from the model.
        raw_fps = ic.fingerprints(imgs, net)
        for vec in raw_fps.values():
            assert isinstance(vec, np.ndarray)
            assert len(vec) == 4096, len(vec)

        reduced = ic.pca(raw_fps, n_components=0.95)
        found = ic.cluster(reduced, sim=0.5)

        assert set(found.keys()) == set(ctx.clusters.keys())
        # One fingerprint per test image, keyed by the image file names.
        assert len(reduced.keys()) == len(ctx.image_fns)
        assert set(reduced.keys()) == set(ctx.image_fns)

        for key in ctx.clusters.keys():
            pairs = zip(found[key], ctx.clusters[key])
            for val_clus, ref_clus in pairs:
                msg = f"ref_clus: {ref_clus}, val_clus: {val_clus}"
                assert set(ref_clus) == set(val_clus), msg
from imagecluster import calc, io as icio, postproc

# Create image database in memory. This helps to feed images to the NN model
# quickly.
images = icio.read_images('pics/', size=(224, 224))

# Create Keras NN model.
model = calc.get_model()

# Feed images through the model and extract fingerprints (feature vectors).
fingerprints = calc.fingerprints(images, model)

# Optionally run a PCA on the fingerprints to compress the dimensions. Use a
# cumulative explained variance ratio of 0.95.
fingerprints = calc.pca(fingerprints, n_components=0.95)

# Read image timestamps. They are needed to calculate the time distance, which
# can be used in clustering.
timestamps = icio.read_timestamps('pics/')

# Run clustering on the fingerprints. Select clusters with similarity index
# sim=0.5. Mix 80% content distance with 20% timestamp distance (alpha=0.2).
# NOTE(review): the call below is cut off after its first argument in this
# view; the remaining arguments (presumably sim, timestamps and alpha, per the
# comment above) and the closing parenthesis are missing -- confirm against
# the original file.
clusters = calc.cluster(fingerprints,
from imagecluster import calc as ic
from imagecluster import postproc as pp

# Build the in-memory image database from 'pics/'. Keeping the images in
# memory helps to feed them to the NN model quickly.
ias = ic.image_arrays('pics/', size=(224, 224))

# Keras NN model used to compute the fingerprints.
model = ic.get_model()

# Fingerprints (feature vectors) are obtained by feeding the images through
# the model. They are then optionally compressed by a PCA which uses a
# cumulative explained variance ratio of 0.95.
fps = ic.pca(
    ic.fingerprints(ias, model),
    n_components=0.95,
)

# Cluster the fingerprints, selecting clusters with similarity index sim=0.5.
clusters = ic.cluster(fps, sim=0.5)

# One directory per cluster, filled with links to the images that belong to
# that cluster.
pp.make_links(clusters, 'pics/imagecluster/clusters')

# Show the images arranged in clusters.
pp.visualize(clusters, ias)

from matplotlib import pyplot as plt
import numpy as np
from scipy.cluster.hierarchy import dendrogram

from imagecluster import calc as ic
from imagecluster import io as icio

# Read the images, compute their fingerprints and cluster them. With
# extra_out=True, `ic.cluster` additionally returns `extra`, whose 'Z' entry
# is used below as the linkage matrix.
images = icio.read_images('pics/', size=(224, 224))
model = ic.get_model()
fingerprints = ic.fingerprints(images, model)
clusters, extra = ic.cluster(fingerprints, sim=0.5, extra_out=True)

# linkage matrix Z
fig, ax = plt.subplots()
dendrogram(extra['Z'], ax=ax)

# Adjust yaxis labels (values from Z[:,2]) to our definition of the `sim`
# parameter: five evenly spaced ticks over the plotted data range, labelled
# from 1 (bottom) down to 0 (top).
ymin, ymax = ax.yaxis.get_data_interval()
tlocs = np.linspace(ymin, ymax, 5)
tlabels = np.linspace(1, 0, len(tlocs))
# The tick positions and labels were computed but never applied to the axis;
# set them so the plot actually shows `sim` values.
ax.yaxis.set_ticks(tlocs)
ax.yaxis.set_ticklabels([f"{label:g}" for label in tlabels])
ax.set_xlabel("image index")

def main(imagedir, sim=0.5, layer='fc2', size=(224,224), links=True, vis=False,
         max_csize=None, pca=False, pca_params=dict(n_components=0.9)):
    """Example main app using this library.

    Upon first invocation, the image and fingerprint databases are built and
    written to disk. Each new invocation loads those and only repeats
        * clustering
        * creation of links to files in clusters
        * visualization (if `vis=True`)

    This is good for playing around with the `sim` parameter, for
    instance, which only influences clustering.

    Parameters
    ----------
    imagedir : str
        path to directory with images
    sim : float (0..1)
        similarity index (see :func:`calc.cluster`)
    layer : str
        which layer to use as feature vector (passed to ``ic.get_model``)
    size : tuple
        input image size (width, height), must match `model`, e.g. (224,224)
    links : bool
        create dirs with links
    vis : bool
        plot images in clusters
    max_csize : int, optional
        max number of images per cluster for visualization (passed to
        ``pp.visualize``)
    pca : bool
        Perform PCA on fingerprints before clustering, using `pca_params`.
    pca_params : dict
        kwargs to sklearn's PCA. The default dict is only unpacked, never
        modified, so sharing it between calls is harmless.

    Notes
    -----
    imagedir : To select only a subset of the images, create an `imagedir` and
        symlink your selected images there. In the future, we may add support
        for passing a list of files, should the need arise. But then again,
        this function is only an example front-end.
    """
    fps_fn = pj(imagedir, ic_base_dir, 'fingerprints.pk')
    ias_fn = pj(imagedir, ic_base_dir, 'images.pk')
    # Image arrays are only loaded when needed: for computing fingerprints or
    # for visualization.
    ias = None
    if not os.path.exists(fps_fn):
        print(f"no fingerprints database {fps_fn} found")
        os.makedirs(os.path.dirname(fps_fn), exist_ok=True)
        model = ic.get_model(layer=layer)
        if not os.path.exists(ias_fn):
            print(f"create image array database {ias_fn}")
            ias = ic.image_arrays(imagedir, size=size)
            co.write_pk(ias, ias_fn)
        else:
            # Reuse the image database written by an earlier run instead of
            # reading and resizing all images again.
            ias = co.read_pk(ias_fn)
        print("running all images through NN model ...")
        fps = ic.fingerprints(ias, model)
        co.write_pk(fps, fps_fn)
    else:
        # Fingerprints already exist on disk: skip the model entirely.
        print(f"loading fingerprints database {fps_fn} ...")
        fps = co.read_pk(fps_fn)
    if pca:
        fps = ic.pca(fps, **pca_params)
        print("pca dims:", list(fps.values())[0].shape[0])
    print("clustering ...")
    clusters = ic.cluster(fps, sim)
    if links:
        pp.make_links(clusters, pj(imagedir, ic_base_dir, 'clusters'))
    if vis:
        # `ias` is still None when the fingerprints were loaded from disk.
        if ias is None:
            ias = co.read_pk(ias_fn)
        pp.visualize(clusters, ias, max_csize=max_csize)
def main_kmeans(imagedir,
                size=(224, 224),
    """Example main app using this library.

    Upon first invocation, the image and fingerprint databases are built and
    written to disk. Each new invocation loads those and only repeats
        * clustering
        * creation of links to files in clusters
        * visualization (if `vis=True`)

    This is good for playing around with the `sim` parameter, for
    instance, which only influences clustering.

    imagedir : str
        path to directory with images
    n_cluster : int (1...999)
        num of kmeans cluster (see :func:`calc.cluster_kmeans`)
    layer : str
        which layer to use as feature vector (see
    size : tuple
        input image size (width, height), must match `model`, e.g. (224,224)
    links : bool
        create dirs with links
    pca : bool
        Perform PCA on fingerprints before clustering, using `pca_params`.
    pca_params : dict
        kwargs to sklearn's PCA

    imagedir : To select only a subset of the images, create an `imagedir` and
        symlink your selected images there. In the future, we may add support
        for passing a list of files, should the need arise. But then again,
        this function is only an example front-end.
    fps_fn = pj(imagedir, ic_base_dir, 'fingerprints.pk')
    ias_fn = pj(imagedir, ic_base_dir, 'images.pk')
    ias = None
    logger_kmeans = log(logger_name='kmeans').logger
        if not os.path.exists(fps_fn):
            print("no fingerprints database {} found".format(fps_fn))
                "no fingerprints database {} found".format(fps_fn))
            os.makedirs(os.path.dirname(fps_fn), exist_ok=True)
                model = ic.get_model(layer=layer)
            except Exception as e:
            if not os.path.exists(ias_fn):
                    "create image array database {}".format(ias_fn))
                print("create image array database {}".format(ias_fn))
                ias = ic.image_arrays(imagedir, size=size)
                co.write_pk(ias, ias_fn)
                ias = co.read_pk(ias_fn)
            print("running all images through NN model ...")
            fps = ic.fingerprints(ias, model)
            co.write_pk(fps, fps_fn)
            print("loading fingerprints database {} ...".format(fps_fn))
            fps = co.read_pk(fps_fn)
        if pca:
            fps = ic.pca(fps, **pca_params)
            print("pca dims:", list(fps.values())[0].shape[0])
            logger_kmeans.info("pca dims: " +
        print("clustering ...")
        logger_kmeans.info("clustering ...")
        clusters = ic.cluster_kmeans(fps, n_clusters=n_clusters)
        if links:
            pp.make_links_v2(clusters, pj(imagedir, ic_base_dir, 'clusters'))
    except Exception as e: