Example #1
def kmeans(config):
    """ Kmeans with merging and counting of perceptive hashes and 
    ward hashes among clusters."""
    measures = sc.pickleFile(hdfs_path(config, "map_each_image", "measures"))
    data = measures.map(lambda x: (x[1]["id"], flatten_hist_cen(x[1]), x[1]["phash"], x[1]["ward"])).cache()
    K = config["n_clusters_group"]
    convergeDist = config["kmeans_group_converge"]
    sample = data.takeSample(False, K, 1)
    kPoints = [k[1] for k in sample]
    tempDist = 10 * convergeDist
    idx = 0
    within_set_sse = []
    while tempDist > convergeDist:
        max_len = config["in_memory_set_len"] / K
        ward_max_len = int(0.5 * max_len)
        phash_max_len = int(max_len - ward_max_len)
        closest = data.map(partial(km_map, kPoints))
        pointStats = closest.reduceByKey(partial(reduce_dist, ward_max_len, phash_max_len))
        pts_hash_union = pointStats.map(lambda (x, (y, z, u, w)): (x, (y / z, u, w)))
        tempDist = pts_hash_union.map(lambda (x, (y, u, w)): np.sum((kPoints[x] - y) ** 2)).sum()
        newPoints = pts_hash_union.map(lambda (x, (y, u, w)): (x, np.array(y, dtype="int32"))).collect()
        idx += 1
        if idx > config["max_iter_group"]:
            break
        print("kmeans did iteration: ", idx, file=sys.stderr)
    for (x, y) in newPoints:
        kPoints[x] = y
    phash_unions = pts_hash_union.map(lambda (x, (y, u, w)): u)
    phash_unions.saveAsPickleFile(hdfs_path(config, "km", "phash_unions"))
    ward_unions = pts_hash_union.map(lambda (x, (y, u, w)): w)
    ward_unions.saveAsPickleFile(hdfs_path(config, "km", "ward_unions"))
    # The rest of the function deals with writing various lookup tables.

    # save the fit data and the meta stats together in one pickled RDD
    kpsave = sc.parallelize([kPoints, tempDist, within_set_sse])
    kpsave.saveAsPickleFile(hdfs_path(config, "km", "cluster_center_meta"))

    def flat(field_to_field):
        flat_map = partial(flat_map_indicators, config["phash_chunk_len"], kPoints, {field_to_field: True})
        data.flatMap(lambda x: flat_map(*x)).saveAsPickleFile(hdfs_path(config, "km", field_to_field))

    options = options_template.copy()
    options.update(config["kmeans_output"])
    for k, v in options.items():
        if v:
            flat(k)
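The loop above relies on two helpers that are not shown here, km_map and reduce_dist. A minimal sketch of what they plausibly do, assuming each cached record is (id, features, phash_list, ward_list) and that the reducer sums feature vectors, counts points, and unions the hash sets up to the given size caps (all names and shapes below are assumptions, not the project's code):

import numpy as np

def km_map(kPoints, rec):
    # Hypothetical: assign one record to its nearest center and emit a
    # partial cluster summary (sum of features, count, phash set, ward set).
    _id, feat, phash, ward = rec
    feat = np.asarray(feat, dtype="float64")
    dists = [np.sum((np.asarray(c) - feat) ** 2) for c in kPoints]
    return (int(np.argmin(dists)), (feat, 1, set(phash), set(ward)))

def reduce_dist(ward_max_len, phash_max_len, a, b):
    # Hypothetical: merge two partial summaries, capping the hash-set sizes
    # so the unions stay within config["in_memory_set_len"].
    feat_a, n_a, ph_a, wd_a = a
    feat_b, n_b, ph_b, wd_b = b
    phash = set(sorted(ph_a | ph_b)[:phash_max_len])
    ward = set(sorted(wd_a | wd_b)[:ward_max_len])
    return (feat_a + feat_b, n_a + n_b, phash, ward)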
Example #2
def find_similar(sc, config):
    """Use cluster to hash and hash to key 
    joins to find_similar images.

    TODO: more rounds of search, and have an 
    option to do ward OR perceptive hash OR both.
    Ward is more expansive (false positives) than 
    perceptive hashes, so the join can get slow with many
    matches.  Maybe ward hashes should be a second try."""
    kmeans_meta = sc.pickleFile(hdfs_path(config, 'km','cluster_center_meta'))
    kmeans_meta = kmeans_meta.map(lambda x:x).collect()
    kPoints, tempDist, within_set_sse = kmeans_meta
    phash_unions = sc.pickleFile(
                    hdfs_path(config, 'km', 'phash_unions')
                ).map(
                    lambda x:x
                ).collect()
    ward_unions = sc.pickleFile(
                    hdfs_path(config, 'km', 'ward_unions')
                ).map(lambda x:x).collect()
    if not config.get('candidate_has_mapped'):
        scores = map_each_image(sc, 
                config, 
                config['candidate_spec'], 
                config['candidate_measures_spec'])
    else:
        scores = sc.pickleFile(config['candidate_measures_spec'])
    scores.cache()
    for net_round in range(config['search_rounds']):
        samples = join_nearest(sc,
                                config,
                                kPoints, 
                                phash_unions,
                                ward_unions,
                            scores)
        
        #TODO logic here for more rounds of sampling
    return samples
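find_similar reads several config keys. A hypothetical fragment showing the keys it touches; the values are placeholders, only the key names come from the code above:

config_fragment = {
    "candidate_has_mapped": False,             # True: reuse pickled candidate measures
    "candidate_spec": "hdfs:///candidates/",   # placeholder path to candidate images
    "candidate_measures_spec": "hdfs:///candidates/measures",  # placeholder path
    "search_rounds": 1,                        # extra rounds are still a TODO
}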
def fuzzify(config, fname, hdfs_name):
    from PIL import Image
    img = Image.open(fname)
    n = np.array(img)
    for i in range(0, n.shape[0] - filterx, filterx):
        for j in range(0, n.shape[1] - filtery, filtery):
            if random.uniform(0, 1) < change_perc:
                for z in range(3):
                    n[i:i + filterx, j:j + filtery,
                      z] = np.median(n[i:i + filterx, j:j + filtery, z])
    new = Image.fromarray(np.array(np.round(n), dtype=np.uint8))
    loc_name = fname + 'fuz'
    new.save(loc_name, format="png")
    print(
        sp.Popen([
            'hadoop', 'fs', '-put', loc_name,
            hdfs_path(config, config['fuzzy_example_data'], hdfs_name)
        ]).communicate())
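fuzzify assumes filterx, filtery and change_perc (and np, random, sp) are defined at module level. The nested loops replace randomly chosen filterx-by-filtery blocks with their per-channel median, blurring local detail. A self-contained sketch of the same operation, with illustrative block size and probability:

import random
import numpy as np

def blur_blocks(arr, fx=8, fy=8, change_perc=0.3):
    # Replace randomly chosen fx-by-fy blocks with their per-channel median.
    out = arr.copy()
    for i in range(0, out.shape[0] - fx, fx):
        for j in range(0, out.shape[1] - fy, fy):
            if random.uniform(0, 1) < change_perc:
                for z in range(out.shape[2]):
                    out[i:i + fx, j:j + fy, z] = np.median(out[i:i + fx, j:j + fy, z])
    return out

demo = blur_blocks(np.random.randint(0, 256, (64, 64, 3)).astype(np.uint8))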
def fuzzify(config, fname, hdfs_name):
    from PIL import Image
    img = Image.open(fname)
    n = np.array(img)
    for i in range(0, n.shape[0] - filterx, filterx):
        for j in range(0, n.shape[1] - filtery, filtery):
            if random.uniform(0, 1) < change_perc:
                for z in range(3):
                    n[i:i + filterx, j:j + filtery,
                      z] = np.median(n[i:i + filterx, j:j + filtery, z])
    new = Image.fromarray(np.array(np.round(n), dtype=np.uint8))
    loc_name = fname + 'fuz'
    new.save(loc_name, format="png")
    print(
        sp.Popen([
            'hadoop', 'fs', '-put', loc_name,
            hdfs_path(config, config['fuzzy_example_data'], hdfs_name)
        ]).communicate())
Example #5
def join_nearest(sc, 
                config, 
                kPoints, 
                phash_unions, 
                ward_unions, 
                scores):
    
    """Use candidates' scores to assign them to best clusters based 
    on euclidean distance and number of matching hashes, ward or perceptive.
    Join those assigned clusters to perceptive and ward hashes from training
    and then join hashes to keys."""
    pw = tuple(zip(('phash', 'ward'), (phash_unions, ward_unions)))
    def _best_cluster(x):
        counts =[]
        for ward_or_phash, unions in pw:
            counts.append([])
            for u in unions:
                p = 0
                for item in x[1][ward_or_phash]:
                    p += u.get(item, 0)
                counts[-1].append(p)
        best = list(map(np.argmax, counts))
        distances = [np.sum((kPoints[i] - flatten_hist_cen(x[1]))**2) for i in range(len(kPoints))]
        best.append(np.argmin(distances))
        # TODO: I think the following line has a typo; distances[b] is what it should be.
        return [(x[0], (b, 'self', distances[best[-1]], x[1]['ward'], x[1]['phash'])) for b in best]
    best_clusters = scores.flatMap(_best_cluster)
    # sort candidates by distance and cache; the sorted RDD must be reassigned
    best_clusters = best_clusters.sortBy(lambda x: x[1][2]).cache()
    phash_c = best_clusters.flatMap(
                    partial(cluster_chunk, 
                                config,
                                'phash'))
    phash_c_id = phash_c.map(lambda x:x[1])
    ward_c = best_clusters.flatMap(partial(cluster_chunk, 
                                        config,
                                        'ward'))
    ward_c_id = ward_c.map(lambda x:x[1])
    cluster_to_phash = sc.pickleFile(hdfs_path(config, 
                                                'km', 
                                                'cluster_to_phash'))
    cluster_to_ward = sc.pickleFile(hdfs_path(config, 
                                            'km', 
                                            'cluster_to_ward'))
    rdds = ( ward_c, phash_c)
    rdds2 = (ward_c_id, phash_c_id)
    table_names = ('ward_matches','phash_matches')
    labels = ('ward_to_key','phash_to_key')
    out = {}
    to_join = []
    for table, rdd, rdd2, label in zip(table_names, rdds, rdds2, labels):
    
        join_on_cluster = rdd.join(
            cluster_to_phash if table == 'phash_matches' else cluster_to_ward
        )
        map_ward_or_phash = join_on_cluster.map(lambda x:(x[1][0][0], x))
        to_key = sc.pickleFile(hdfs_path(config, 'km', label))
        hash_joined = map_ward_or_phash.join(
            to_key
        )
        hash_joined2 = rdd2.join(to_key)
        
        # pulling the two image keys out into pairs
        cand_key_to_key = hash_joined.map(
            lambda x: (x[1][0][1][0][1], x[1][-1])
        )
        samp = cand_key_to_key.take(config['search_sample_step'])
        out[table] = samp
        as_key_counts = cand_key_to_key.groupByKey(
            ).map(
            count_keys
            )
        as_key_counts.cache()
        as_key_counts.saveAsPickleFile(
            hdfs_path(config, 'candidates', config['candidate_batch'], "%s_counts" % label)
        )
        to_join.append(as_key_counts)
    # map each candidate id to its best-matching hash key, along with indicators of fit
    def map_best(x):
        """The key, (best agreeing key, vote count for agreeing, total votes) """
        (key, ((best_match, agree_count), dict_)) = x
        return (key, (best_match, agree_count, sum(dict_.values())))
    
    # join the ward best key with phash best key
    joined_final_matches = to_join[0].map(
                            map_best
                        ).join(
                            to_join[1].map(map_best)
                        )
    joined_final_matches.saveAsPickleFile(
            hdfs_path(config, 'candidates',config['candidate_batch'], 'joined_final_matches')
        )
    out['joined'] = joined_final_matches.take(config['search_sample_step'])
    return out
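map_best expects each element of as_key_counts to look like (key, ((best_match, agree_count), count_dict)), so the count_keys helper (not shown in these examples) presumably tallies the grouped match keys. A hypothetical version consistent with that shape:

from collections import Counter

def count_keys(kv):
    # Hypothetical: (key, iterable_of_matched_keys) ->
    # (key, ((most_common_key, its_count), {matched_key: count, ...}))
    key, matches = kv
    counts = Counter(matches)
    best_match, agree_count = counts.most_common(1)[0]
    return (key, ((best_match, agree_count), dict(counts)))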
Example #6
 def flat(field_to_field):
     flat_map = partial(flat_map_indicators, config["phash_chunk_len"], kPoints, {field_to_field: True})
     data.flatMap(lambda x: flat_map(*x)).saveAsPickleFile(hdfs_path(config, "km", field_to_field))
Example #7
    def flat(field_to_field):
        flat_map = partial(flat_map_indicators, config["phash_chunk_len"], kPoints, {field_to_field: True})
        data.flatMap(lambda x: flat_map(*x)).saveAsPickleFile(hdfs_path(config, "km", field_to_field))

    options = options_template.copy()
    options.update(config["kmeans_output"])
    for k, v in options.items():
        if v:
            flat(k)


if __name__ == "__main__":
    if config.get("random_state"):
        config["random_state"] = np.random.RandomState(config["random_state"])
    else:
        config["random_state"] = np.random.RandomState(None)
    import datetime

    started = datetime.datetime.now()
    print("started at:::", started)
    actions = config["actions"]
    make_hdfs_dirs(config)
    if "map_each_image" in actions:
        map_each_image(sc, config, config["input_spec"], hdfs_path(config, "map_each_image", "measures"))
    if "kmeans" in actions:
        kmeans(config)
    if "find_similar" in actions:
        search.find_similar(sc, config)
    ended = datetime.datetime.now()
    print("Elapsed Time (seconds):::", (ended - started).total_seconds(), "\nAt", ended.isoformat())
Example #8
 def flat(field_to_field):
     flat_map = partial(flat_map_indicators, config['phash_chunk_len'],
                        kPoints, {field_to_field: True})
     data.flatMap(lambda x: flat_map(*x)).saveAsPickleFile(
         hdfs_path(config, 'km', field_to_field))
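flat_map_indicators itself does not appear in these examples. A hypothetical sketch of what it might emit for one record (id_, vec, phash, ward), consistent with the lookup tables that join_nearest later reads (cluster_to_phash, phash_to_key, and the ward equivalents); all of this is an assumption, not the project's code:

import numpy as np

def _chunks(hashes, chunk_len):
    # split each hash string into fixed-length chunks
    return [h[i:i + chunk_len] for h in hashes for i in range(0, len(h), chunk_len)]

def flat_map_indicators(chunk_len, kPoints, which, id_, vec, phash, ward):
    # Hypothetical: emit (cluster, hash-chunk) and/or (hash-chunk, image id)
    # pairs for whichever lookup table was requested in `which`.
    cluster = int(np.argmin([np.sum((np.asarray(c) - vec) ** 2) for c in kPoints]))
    out = []
    if which.get("cluster_to_phash"):
        out += [(cluster, c) for c in _chunks(phash, chunk_len)]
    if which.get("phash_to_key"):
        out += [(c, id_) for c in _chunks(phash, chunk_len)]
    if which.get("cluster_to_ward"):
        out += [(cluster, c) for c in _chunks(ward, chunk_len)]
    if which.get("ward_to_key"):
        out += [(c, id_) for c in _chunks(ward, chunk_len)]
    return out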
Example #9
def kmeans(config):
    """ Kmeans with merging and counting of perceptive hashes and 
    ward hashes among clusters."""
    measures = sc.pickleFile(hdfs_path(config, 'map_each_image', 'measures'))
    data = measures.map(lambda x: (x[1]['id'], flatten_hist_cen(x[1]), x[1][
        'phash'], x[1]['ward'])).cache()
    K = config['n_clusters_group']
    convergeDist = config['kmeans_group_converge']
    sample = data.takeSample(False, K, 1)
    kPoints = [k[1] for k in sample]
    if convergeDist is not None:
        tempDist = 10 * convergeDist
    else:
        tempDist = 1e12
    idx = 0
    within_set_sse = []
    while tempDist > convergeDist:
        max_len = config['in_memory_set_len'] / K
        ward_max_len = int(.5 * max_len)
        phash_max_len = int(max_len - ward_max_len)
        closest = data.map(partial(km_map, kPoints))
        pointStats = closest.reduceByKey(
            partial(reduce_dist, ward_max_len, phash_max_len))
        pts_hash_union = pointStats.map(lambda (x, (y, z, u, w)):
                                        (x, (y / z, u, w)))
        if convergeDist is not None:
            tempDist = pts_hash_union.map(lambda (x, (y, u, w)): np.sum(
                (kPoints[x] - y)**2)).sum()
        newPoints = pts_hash_union.map(
            lambda (x, (y, u, w)): (x, np.array(y, dtype="int32"))).collect()
        idx += 1
        if idx > config['max_iter_group']:
            break
        print('kmeans did iteration: ', idx, file=sys.stderr)
    for (x, y) in newPoints:
        kPoints[x] = y
    phash_unions = pts_hash_union.map(lambda (x, (y, u, w)): u)
    phash_unions.saveAsPickleFile(hdfs_path(config, 'km', 'phash_unions'))
    ward_unions = pts_hash_union.map(lambda (x, (y, u, w)): w)
    ward_unions.saveAsPickleFile(hdfs_path(config, 'km', 'ward_unions'))
    # The rest of the function deals with writing various lookup tables.

    # save the fit data and the meta stats together in one pickled RDD
    kpsave = sc.parallelize([
        kPoints,
        tempDist,
        within_set_sse,
    ])
    kpsave.saveAsPickleFile(hdfs_path(config, 'km', 'cluster_center_meta'))

    def flat(field_to_field):
        flat_map = partial(flat_map_indicators, config['phash_chunk_len'],
                           kPoints, {field_to_field: True})
        data.flatMap(lambda x: flat_map(*x)).saveAsPickleFile(
            hdfs_path(config, 'km', field_to_field))

    options = options_template.copy()
    options.update(config['kmeans_output'])
    for k, v in options.items():
        if v:
            flat(k)
Example #10
        data.flatMap(lambda x: flat_map(*x)).saveAsPickleFile(
            hdfs_path(config, 'km', field_to_field))

    options = options_template.copy()
    options.update(config['kmeans_output'])
    for k, v in options.items():
        if v:
            flat(k)


if __name__ == "__main__":
    if config.get('random_state'):
        config['random_state'] = np.random.RandomState(config['random_state'])
    else:
        config['random_state'] = np.random.RandomState(None)
    import datetime
    started = datetime.datetime.now()
    print('started at:::', started)
    actions = config['actions']
    make_hdfs_dirs(config)
    if 'map_each_image' in actions:
        map_each_image(sc, config, config['input_spec'],
                       hdfs_path(config, 'map_each_image', 'measures'))
    if 'kmeans' in actions:
        kmeans(config)
    if 'find_similar' in actions:
        search.find_similar(sc, config)
    ended = datetime.datetime.now()
    print('Elapsed Time (seconds):::', (ended - started).total_seconds(),
          '\nAt', ended.isoformat())