def kmeans(config):
    """Kmeans with merging and counting of perceptive hashes and ward
    hashes among clusters."""
    measures = sc.pickleFile(hdfs_path(config, 'map_each_image', 'measures'))
    # Each measures record is (key, info_dict); keep the id, the flattened
    # histogram/centroid vector, and the two hash lists.
    data = measures.map(lambda x: (x[1]['id'],
                                   flatten_hist_cen(x[1]),
                                   x[1]['phash'],
                                   x[1]['ward'])).cache()
    K = config['n_clusters_group']
    convergeDist = config['kmeans_group_converge']
    sample = data.takeSample(False, K, 1)
    kPoints = [k[1] for k in sample]
    if convergeDist is not None:
        tempDist = 10 * convergeDist
    else:
        # No convergence threshold: iterate until max_iter_group is hit.
        tempDist = 1e12
    idx = 0
    within_set_sse = []
    while tempDist > convergeDist:
        max_len = config['in_memory_set_len'] / K
        ward_max_len = int(.5 * max_len)
        phash_max_len = int(max_len - ward_max_len)
        closest = data.map(partial(km_map, kPoints))
        pointStats = closest.reduceByKey(
            partial(reduce_dist, ward_max_len, phash_max_len))
        pts_hash_union = pointStats.map(
            lambda (x, (y, z, u, w)): (x, (y / z, u, w)))
        if convergeDist is not None:
            tempDist = pts_hash_union.map(
                lambda (x, (y, u, w)): np.sum((kPoints[x] - y) ** 2)).sum()
        newPoints = pts_hash_union.map(
            lambda (x, (y, u, w)): (x, np.array(y, dtype="int32"))).collect()
        idx += 1
        if idx > config['max_iter_group']:
            break
        print('kmeans did iteration: ', idx, file=sys.stderr)
        for (x, y) in newPoints:
            kPoints[x] = y
    phash_unions = pts_hash_union.map(lambda (x, (y, u, w)): u)
    phash_unions.saveAsPickleFile(hdfs_path(config, 'km', 'phash_unions'))
    ward_unions = pts_hash_union.map(lambda (x, (y, u, w)): w)
    ward_unions.saveAsPickleFile(hdfs_path(config, 'km', 'ward_unions'))
    # The rest of the function deals with writing various lookup tables.
    # Save the fit data and the meta stats as a single item in a list.
    kpsave = sc.parallelize([kPoints, tempDist, within_set_sse])
    kpsave.saveAsPickleFile(hdfs_path(config, 'km', 'cluster_center_meta'))

    def flat(field_to_field):
        flat_map = partial(flat_map_indicators, config['phash_chunk_len'],
                           kPoints, {field_to_field: True})
        data.flatMap(lambda x: flat_map(*x)).saveAsPickleFile(
            hdfs_path(config, 'km', field_to_field))

    options = options_template.copy()
    options.update(config['kmeans_output'])
    for k, v in options.items():
        if v:
            flat(k)
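# kmeans() above leans on two helpers, km_map and reduce_dist, defined
# elsewhere in the project.  The sketch below (hypothetical *_sketch names,
# not the real implementations) only illustrates the record shapes the loop
# assumes: km_map emits (cluster_index, (vector, 1, phash_counts, ward_counts))
# and reduce_dist merges two such values while capping the hash-count dicts.
import numpy as np  # mirrors the module-level import assumed throughout


def km_map_sketch(kPoints, row):
    # row is (image_id, flattened_vector, phash_list, ward_list)
    _, vec, phashes, wards = row
    idx = int(np.argmin([np.sum((p - vec) ** 2) for p in kPoints]))
    return (idx, (vec, 1,
                  dict((p, 1) for p in phashes),
                  dict((w, 1) for w in wards)))


def reduce_dist_sketch(ward_max_len, phash_max_len, a, b):
    vec_a, n_a, phash_a, ward_a = a
    vec_b, n_b, phash_b, ward_b = b
    # Merge the hash-count dicts, never letting them grow past the caps.
    for src, dst, cap in ((phash_b, phash_a, phash_max_len),
                          (ward_b, ward_a, ward_max_len)):
        for key, count in src.items():
            if key in dst or len(dst) < cap:
                dst[key] = dst.get(key, 0) + count
    return (vec_a + vec_b, n_a + n_b, phash_a, ward_a)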
def find_similar(sc, config):
    """Use cluster-to-hash and hash-to-key joins to find similar images.

    TODO: more rounds of search, and an option to use ward hashes OR
    perceptive hashes OR both.  Ward hashes are more expansive (more false
    positives) than perceptive hashes, so the join can get slow with many
    matches.  Maybe ward hashes should be a second try."""
    kmeans_meta = sc.pickleFile(hdfs_path(config, 'km', 'cluster_center_meta'))
    kmeans_meta = kmeans_meta.map(lambda x: x).collect()
    kPoints, tempDist, within_set_sse = kmeans_meta
    phash_unions = sc.pickleFile(
        hdfs_path(config, 'km', 'phash_unions')).map(lambda x: x).collect()
    ward_unions = sc.pickleFile(
        hdfs_path(config, 'km', 'ward_unions')).map(lambda x: x).collect()
    if not config.get('candidate_has_mapped'):
        # Candidates have not been measured yet: run map_each_image on them.
        scores = map_each_image(sc, config,
                                config['candidate_spec'],
                                config['candidate_measures_spec'])
    else:
        scores = sc.pickleFile(config['candidate_measures_spec'])
    scores.cache()
    for net_round in range(config['search_rounds']):
        samples = join_nearest(sc, config, kPoints, phash_unions,
                               ward_unions, scores)
        # TODO: logic here for more rounds of sampling
    return samples
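# Each record produced by map_each_image() -- the `scores` RDD here and the
# `measures` RDD in kmeans() -- is consumed as (key, info_dict), where the
# dict carries at least 'id', 'phash', 'ward' and whatever histogram /
# centroid fields flatten_hist_cen() reads.  A purely illustrative record
# (field contents are assumptions, not the actual output format):
example_measure_record = (
    'some_image_key',
    {'id': 'some_image_key',
     'phash': ['phash_chunk_1', 'phash_chunk_2'],
     'ward': ['ward_chunk_1', 'ward_chunk_2'],
     # ...plus the histogram/centroid fields used by flatten_hist_cen()...
     },
)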
def fuzzify(config, fname, hdfs_name):
    """Blur an image block-by-block (median filter on randomly chosen
    blocks) and push the result to HDFS as fuzzy example data.

    Note: filterx, filtery (block size) and change_perc (probability a
    block is smoothed) are free names here, assumed to be module-level
    settings."""
    from PIL import Image
    img = Image.open(fname)
    n = np.array(img)
    for i in range(0, n.shape[0] - filterx, filterx):
        for j in range(0, n.shape[1] - filtery, filtery):
            if random.uniform(0, 1) < change_perc:
                for z in range(3):
                    n[i:i + filterx, j:j + filtery, z] = np.median(
                        n[i:i + filterx, j:j + filtery, z])
    new = Image.fromarray(np.array(np.round(n), dtype=np.uint8))
    loc_name = fname + 'fuz'
    new.save(loc_name, format="png")
    print(sp.Popen(['hadoop', 'fs', '-put', loc_name,
                    hdfs_path(config, config['fuzzy_example_data'],
                              hdfs_name)]).communicate())
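# A hypothetical driver for fuzzify(): blur every image in a local directory
# and push each result to HDFS under config['fuzzy_example_data'].  The
# block-size and probability settings must be bound at module level first,
# e.g. filterx = filtery = 8 and change_perc = 0.2 (illustrative values).
def fuzzify_dir_sketch(config, local_dir):
    import os
    for name in os.listdir(local_dir):
        fuzzify(config, os.path.join(local_dir, name), name)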
def join_nearest(sc, config, kPoints, phash_unions, ward_unions, scores):
    """Use candidates' scores to assign them to the best clusters based on
    euclidean distance and on the number of matching hashes (ward or
    perceptive).  Join those assigned clusters to the perceptive and ward
    hashes from training, and then join the hashes to keys."""
    pw = tuple(zip(('phash', 'ward'), (phash_unions, ward_unions)))

    def _best_cluster(x):
        # For each hash type, count how many of the candidate's hashes
        # appear in each cluster's union of hashes.
        counts = []
        for ward_or_phash, unions in pw:
            counts.append([])
            for u in unions:
                p = 0
                for item in x[1][ward_or_phash]:
                    p += u.get(item, 0)
                counts[-1].append(p)
        best = list(map(np.argmax, counts))
        distances = [np.sum((kPoints[i] - flatten_hist_cen(x[1])) ** 2)
                     for i in range(len(kPoints))]
        best.append(np.argmin(distances))
        # TODO: I think the following line has a typo; distances[b] is what
        # it should be.
        return [(x[0], (b, 'self', distances[best[-1]],
                        x[1]['ward'], x[1]['phash'])) for b in best]

    best_clusters = scores.flatMap(_best_cluster)
    # Sort candidates by distance to their cluster and cache for the joins below.
    best_clusters = best_clusters.sortBy(lambda x: x[1][2]).cache()
    phash_c = best_clusters.flatMap(partial(cluster_chunk, config, 'phash'))
    phash_c_id = phash_c.map(lambda x: x[1])
    ward_c = best_clusters.flatMap(partial(cluster_chunk, config, 'ward'))
    ward_c_id = ward_c.map(lambda x: x[1])
    cluster_to_phash = sc.pickleFile(hdfs_path(config, 'km', 'cluster_to_phash'))
    cluster_to_ward = sc.pickleFile(hdfs_path(config, 'km', 'cluster_to_ward'))
    rdds = (ward_c, phash_c)
    rdds2 = (ward_c_id, phash_c_id)
    table_names = ('ward_matches', 'phash_matches')
    labels = ('ward_to_key', 'phash_to_key')
    out = {}
    to_join = []
    for table, rdd, rdd2, label in zip(table_names, rdds, rdds2, labels):
        join_on_cluster = rdd.join(
            cluster_to_phash if table == 'phash_matches' else cluster_to_ward)
        map_ward_or_phash = join_on_cluster.map(lambda x: (x[1][0][0], x))
        to_key = sc.pickleFile(hdfs_path(config, 'km', label))
        hash_joined = map_ward_or_phash.join(to_key)
        hash_joined2 = rdd2.join(to_key)
        # pull the two image keys out into pairs
        cand_key_to_key = hash_joined.map(
            lambda x: (x[1][0][1][0][1], x[1][-1]))
        samp = cand_key_to_key.take(config['search_sample_step'])
        out[table] = samp
        as_key_counts = cand_key_to_key.groupByKey().map(count_keys)
        as_key_counts.cache()
        as_key_counts.saveAsPickleFile(
            hdfs_path(config, 'candidates', config['candidate_batch'],
                      "%s_counts" % label))
        to_join.append(as_key_counts)

    # map the candidate id to its best hash match, with indicators of fit
    def map_best(x):
        """The key, (best agreeing key, vote count for agreeing, total votes)"""
        (key, ((best_match, agree_count), dict_)) = x
        return (key, (best_match, agree_count, sum(dict_.values())))

    # join the ward best key with the phash best key
    joined_final_matches = to_join[0].map(map_best).join(
        to_join[1].map(map_best))
    joined_final_matches.saveAsPickleFile(
        hdfs_path(config, 'candidates', config['candidate_batch'],
                  'joined_final_matches'))
    out['joined'] = joined_final_matches.take(config['search_sample_step'])
    return out
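# join_nearest() assumes a count_keys helper defined elsewhere in the
# project.  From its usage above -- cand_key_to_key.groupByKey().map(count_keys)
# feeding map_best -- it must turn (candidate_key, matched_keys) into
# (candidate_key, ((most_common_key, its_count), counts_dict)).  A minimal
# sketch under that assumption (hypothetical name, not the real helper):
from collections import Counter


def count_keys_sketch(kv):
    cand_key, matched_keys = kv
    counts = Counter(matched_keys)
    best_match, agree_count = counts.most_common(1)[0]
    return (cand_key, ((best_match, agree_count), dict(counts)))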
if __name__ == "__main__":
    if config.get('random_state'):
        config['random_state'] = np.random.RandomState(config['random_state'])
    else:
        config['random_state'] = np.random.RandomState(None)
    import datetime
    started = datetime.datetime.now()
    print('started at:::', started)
    actions = config['actions']
    make_hdfs_dirs(config)
    if 'map_each_image' in actions:
        map_each_image(sc, config, config['input_spec'],
                       hdfs_path(config, 'map_each_image', 'measures'))
    if 'kmeans' in actions:
        kmeans(config)
    if 'find_similar' in actions:
        search.find_similar(sc, config)
    ended = datetime.datetime.now()
    print('Elapsed Time (seconds):::',
          (ended - started).total_seconds(),
          '\nAt', ended.isoformat())
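# The __main__ block expects a module-level `config` dict (and a SparkContext
# `sc`) set up elsewhere in the project.  The keys referenced in this file
# are collected below as a hypothetical example; every value is illustrative
# only.
example_config_sketch = {
    'actions': ['map_each_image', 'kmeans', 'find_similar'],
    'input_spec': 'training_images_spec',
    'n_clusters_group': 100,                 # K for kmeans()
    'kmeans_group_converge': 0.1,            # or None: run until max_iter_group
    'max_iter_group': 10,
    'in_memory_set_len': 1000000,            # cap on merged hash dicts per cluster
    'phash_chunk_len': 8,
    'kmeans_output': {},                     # which lookup tables flat() writes
    'candidate_spec': 'candidate_images_spec',
    'candidate_measures_spec': 'candidate_measures_spec',
    'candidate_has_mapped': False,
    'candidate_batch': 'batch_001',
    'search_rounds': 1,
    'search_sample_step': 100,
    'fuzzy_example_data': 'fuzzy_examples',
    'random_state': None,
}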