from __future__ import print_function

from functools import partial
import sys

import numpy as np

# Module-level names assumed from the rest of this project: `sc` (the
# SparkContext), hdfs_path, flatten_hist_cen, km_map, reduce_dist,
# flat_map_indicators, options_template, `kPoints` (the cluster centers),
# and `pw`, a sequence of (ward_or_phash, unions) pairs with one
# union-count dict per cluster.


def _best_cluster(x):
    '''For one (id, measures) record, pick the best cluster per union
    table in `pw` and the nearest cluster center in `kPoints`.'''
    counts = []
    for ward_or_phash, unions in pw:
        counts.append([])
        for u in unions:
            # Sum how often this image's ward/phash items appear in the
            # union-count dict of each cluster.
            p = 0
            for item in x[1][ward_or_phash]:
                p += u.get(item, 0)
            counts[-1].append(p)
    best = list(map(np.argmax, counts))
    distances = [np.sum((kPoints[i] - flatten_hist_cen(x[1])) ** 2)
                 for i in range(len(kPoints))]
    best.append(np.argmin(distances))
    # Report the distance to each candidate cluster b (this fixes the typo
    # flagged by the old TODO, which used distances[best[-1]] for every b).
    return [(x[0], (b, 'self', distances[b], x[1]['ward'], x[1]['phash']))
            for b in best]
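# A minimal usage sketch for _best_cluster, assuming a pipeline like the
# one in kmeans() below; `_demo_best_cluster` and `candidates` are
# hypothetical names, not part of this module.
def _demo_best_cluster(config):
    # Load the per-image measures written by the map_each_image step and
    # emit one (id, (cluster, 'self', distance, ward, phash)) record per
    # candidate cluster found by _best_cluster.
    measures = sc.pickleFile(hdfs_path(config, 'map_each_image', 'measures'))
    candidates = measures.flatMap(_best_cluster)
    return candidates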
def kmeans(config):
    '''Kmeans with merging and counting of perceptive hashes and ward
    hashes among clusters.'''
    measures = sc.pickleFile(hdfs_path(config, 'map_each_image', 'measures'))
    data = measures.map(lambda x: (x[1]['id'],
                                   flatten_hist_cen(x[1]),
                                   x[1]['phash'],
                                   x[1]['ward'])).cache()
    K = config['n_clusters_group']
    convergeDist = config['kmeans_group_converge']
    sample = data.takeSample(False, K, 1)
    kPoints = [k[1] for k in sample]
    if convergeDist is not None:
        tempDist = 10 * convergeDist
    else:
        tempDist = 1e12
    idx = 0
    within_set_sse = []
    # NOTE: the tuple-unpacking lambdas below are Python 2 syntax
    # (removed from Python 3 by PEP 3113).
    while tempDist > convergeDist:
        # Budget the in-memory union sets evenly between ward and phash.
        max_len = config['in_memory_set_len'] / K
        ward_max_len = int(.5 * max_len)
        phash_max_len = int(max_len - ward_max_len)
        closest = data.map(partial(km_map, kPoints))
        pointStats = closest.reduceByKey(
            partial(reduce_dist, ward_max_len, phash_max_len))
        pts_hash_union = pointStats.map(
            lambda (x, (y, z, u, w)): (x, (y / z, u, w)))
        if convergeDist is not None:
            tempDist = pts_hash_union.map(
                lambda (x, (y, u, w)): np.sum((kPoints[x] - y) ** 2)).sum()
        newPoints = pts_hash_union.map(
            lambda (x, (y, u, w)): (x, np.array(y, dtype="int32"))).collect()
        idx += 1
        if idx > config['max_iter_group']:
            break
        print('kmeans did iteration: ', idx, file=sys.stderr)
        for (x, y) in newPoints:
            kPoints[x] = y
    phash_unions = pts_hash_union.map(lambda (x, (y, u, w)): u)
    phash_unions.saveAsPickleFile(hdfs_path(config, 'km', 'phash_unions'))
    ward_unions = pts_hash_union.map(lambda (x, (y, u, w)): w)
    ward_unions.saveAsPickleFile(hdfs_path(config, 'km', 'ward_unions'))
    # The rest of the function deals with writing various lookup tables.
    # Save the fit data and the meta stats as a single item in a list.
    kpsave = sc.parallelize([kPoints, tempDist, within_set_sse])
    kpsave.saveAsPickleFile(hdfs_path(config, 'km', 'cluster_center_meta'))

    def flat(field_to_field):
        flat_map = partial(flat_map_indicators, config['phash_chunk_len'],
                           kPoints, {field_to_field: True})
        data.flatMap(lambda x: flat_map(*x)).saveAsPickleFile(
            hdfs_path(config, 'km', field_to_field))

    options = options_template.copy()
    options.update(config['kmeans_output'])
    for k, v in options.items():
        if v:
            flat(k)
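# A hedged example of the config keys kmeans() reads; every value below is
# an illustrative placeholder, not a project default.
example_config = {
    'n_clusters_group': 12,         # K, the number of group-level clusters
    'kmeans_group_converge': 0.1,   # or None to rely on max_iter_group alone
    'in_memory_set_len': 12000,     # total budget for ward/phash union dicts
    'max_iter_group': 20,           # hard cap on kmeans iterations
    'phash_chunk_len': 8,           # chunk length passed to flat_map_indicators
    'kmeans_output': {},            # per-field overrides merged onto options_template
}
# kmeans(example_config) would then write phash_unions, ward_unions, and
# cluster_center_meta as pickle files under the 'km' HDFS directory.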