def output_exemplars(hdfs_input, hdfs_output, num=2, output_type='box', output_path='exemplars'):
    with open('image_box_fns.pkl', 'w') as fp:
        image_box_fns = {}
        for (image_id, box, score), _ in hadoopy.readtb(hdfs_output + 'exemplars-%d' % num):
            image_box_fns.setdefault(image_id, []).append((box, 'exemplar-%.5d-%s-%s.png' % (score, image_id, box)))
        pickle.dump(image_box_fns, fp, -1)
    hadoopy.launch_frozen(hdfs_input + '1-tr', hdfs_output + 'exemplars-%d-clip' % num, 'clip_boxes.py',
                          files=['image_box_fns.pkl'], remove_output=True, cmdenvs=['TYPE=%s' % output_type])
    try:
        shutil.rmtree(output_path)
    except OSError:
        pass
    os.makedirs(output_path)
    for x, y in hadoopy.readtb(hdfs_output + 'exemplars-%d-clip' % num):
        open(output_path + '/%s' % (x,), 'w').write(y)
def run_classifier_labels(hdfs_input_pos, hdfs_input_neg, hdfs_output, classifier_name,
                          classifier_extra, local_labels, classifier, **kw):
    """TODO Finish docstring

    Args:
        hdfs_output: Path to hdfs temporary output or None if execution should
            be performed locally using hadoopy.launch_local.
    """
    labels = {}
    try:
        labels = file_parse.load(local_labels)
    except IOError:
        pass
    if hdfs_output is None:
        j = hadoopy.launch_local(hdfs_input_pos, None, _lf('collect_keys.py'))
        pos_keys = sum((x[1] for x in j['output']), [])
        j = hadoopy.launch_local(hdfs_input_neg, None, _lf('collect_keys.py'))
        neg_keys = sum((x[1] for x in j['output']), [])
    else:
        hdfs_output_pos = hdfs_output + '/pos'
        hdfs_output_neg = hdfs_output + '/neg'
        picarus._launch_frozen(hdfs_input_pos, hdfs_output_pos, _lf('collect_keys.py'))
        picarus._launch_frozen(hdfs_input_neg, hdfs_output_neg, _lf('collect_keys.py'))
        pos_keys = sum((x[1] for x in hadoopy.readtb(hdfs_output_pos)), [])
        neg_keys = sum((x[1] for x in hadoopy.readtb(hdfs_output_neg)), [])
    labels[classifier_name] = {'labels': {'1': pos_keys, '-1': neg_keys},
                               'classifier': classifier,
                               'classifier_extra': classifier_extra}
    file_parse.dump(labels, local_labels)
def compute_db_hadoop(self, hdfs_path):
    import json
    si = picarus.api.SearchIndex()
    si.name = '%s.%s' % (self.__class__.__module__, self.__class__.__name__)
    si.feature = json.dumps(self.feature_dict)  # TODO: What to do with the pkl file?
    with hadoopy_helper.hdfs_temp() as hdfs_output:
        picarus.vision.run_image_clean(hdfs_path, hdfs_output + '/clean', max_side=self.max_side)
        # Compute features (map)
        picarus.vision.run_image_feature(hdfs_output + '/clean', hdfs_output + '/feature',
                                         self.feature_dict, files=self.required_files)
        # Randomly sample features for hashes (map) and train hasher (reduce)
        hadoopy.launch_frozen(hdfs_output + '/feature', hdfs_output + '/hasher', _lf('train_hasher.py'),
                              cmdenvs={'KV_PROB': 1., 'HASH_BITS': 128})
        hasher = hadoopy.readtb(hdfs_output + '/hasher').next()[1]
        si.hash = pickle.dumps(hasher, -1)
        si.hash_format = si.PICKLE
        # Compute feature hashes (map) and build database (reduce)
        open('hasher.pkl', 'w').write(si.hash)
        hadoopy.launch_frozen(hdfs_output + '/feature', hdfs_output + '/db', _lf('build_db.py'),
                              files=['hasher.pkl'])
        metadata, hashes = hadoopy.readtb(hdfs_output + '/db').next()
        self.metadata = metadata
        si.metadata.extend(metadata.tolist())
        self.index = image_search.LinearHashDB().store_hashes(hashes, np.arange(len(metadata), dtype=np.uint64))
        si.index = pickle.dumps(self.index, -1)
        si.index_format = si.PICKLE
        open('index.pb', 'w').write(si.SerializeToString())
def report_clusters(hdfs_input, category, make_faces, **kw):
    """
    NOTE: This transfers much more image data than is necessary!  Really
    this operation should be done directly on hdfs.
    """
    def make_face_image(facestr):
        name, ext = os.path.splitext(facestr)
        m = re.match('(\w+)-face-x0(\d+)-y0(\d+)-x1(\d+)-y1(\d+)', name)
        hash, l, t, r, b = m.groups()
        l, t, r, b = map(int, (l, t, r, b))
        return {'hash': hash,
                'categories': ['faces'],
                'faces': [{'boundingbox': ((l, t), (r, b))}],
                'video': []}
    # Collect all the clusters as a set of lists
    clusters = {}
    cluster_samples = {}

    def update(cluster_index, image_name, clusters):
        cluster = clusters.setdefault(cluster_index, [])
        if make_faces:
            face_image = make_face_image(image_name)
            cluster.append(face_image)
        else:
            cluster.append({'hash': image_name,
                            'categories': [category],
                            'faces': [],
                            'video': []})
    for cluster_index, (image_name, _) in hadoopy.readtb(hdfs_input + '/partition'):
        update(cluster_index, image_name, clusters)
    for cluster_index, (image_name, _) in hadoopy.readtb(hdfs_input + '/samples'):
        update(cluster_index, image_name, cluster_samples)
    # Gather each cluster
    clusters = [{'sample_images': samples,  # Sample images uniformly
                 'all_images': images,
                 'size': len(images),
                 'children': [],
                 'std': 0.0,
                 'position': [0.0, 0.0]}
                for ((_, images), (_, samples)) in zip(sorted(clusters.items()),
                                                       sorted(cluster_samples.items()))]
    report = {category: clusters}
    return report
def run_hac(hdfs_input, **kw):
    import scipy as sp
    import scipy.cluster
    import scipy.spatial.distance
    x = np.array([x[1] for x in hadoopy.readtb(hdfs_input)])
    y = sp.spatial.distance.pdist(x)
    return sp.cluster.hierarchy.linkage(y)
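# Illustrative follow-up for run_hac (an assumption, not part of the original
# code): cut the returned linkage matrix into a fixed number of flat clusters.
# 'n_clusters' and the input path are hypothetical.
import scipy.cluster.hierarchy

def hac_flat_clusters(hdfs_input, n_clusters=10):
    linkage = run_hac(hdfs_input)
    # fcluster maps each original observation to a cluster id in [1, n_clusters]
    return scipy.cluster.hierarchy.fcluster(linkage, t=n_clusters, criterion='maxclust')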
def main2():
    exemplar_name = 'e05c099586f744a6d9e70b334e79da08-[0.5217391304347826, 0.0, 0.8695652173913043, 0.9523809523809523]'
    path = 'exemplarbank/output/1341790878.92/val_pred_pos_kern2'
    exemplars = pickle.load(open('exemplars.pkl'))
    exemplar_path = 'exemplars'
    exemplar_ids = {}
    exemplar_num = None
    for exemplar_num, ((image_id, box, _), _) in enumerate(exemplars):
        if exemplar_name == '%s-%s' % (image_id, box):
            break
    for y, x in enumerate(exemplars):
        x = x[0][0]
        exemplar_ids.setdefault(x, []).append(y)
    try:
        shutil.rmtree('hik_pairs_specific')
    except OSError:
        pass
    os.makedirs('hik_pairs_specific')
    pq = LeakyPriorityQueue(100)
    for (kernel, row_num), columns in hadoopy.readtb(path):
        if kernel != 'hik' or row_num != exemplar_num:
            continue
        print(row_num)
        # Blacklist all exemplars from the same image
        columns[exemplar_ids[exemplars[row_num][0][0]]] = -np.inf
        for column_num, val in enumerate(columns[:row_num]):
            pq.add(-val, (row_num, column_num))
    for num, (score, (row_num, max_col)) in enumerate(pq.items_sorted()):
        shutil.copy(_find_exemplar_fn(exemplar_path, exemplars[row_num][0]),
                    'hik_pairs_specific/%.5d-a-%f.png' % (num, -score))
        shutil.copy(_find_exemplar_fn(exemplar_path, exemplars[max_col][0]),
                    'hik_pairs_specific/%.5d-b-%f.png' % (num, -score))
def main():
    path = 'exemplarbank/output/1341790878.92/val_pred_pos_kern2'
    exemplars = pickle.load(open('exemplars.pkl'))
    exemplar_path = 'exemplars'
    exemplar_ids = {}
    for y, x in enumerate(exemplars):
        x = x[0][0]
        exemplar_ids.setdefault(x, []).append(y)
    try:
        shutil.rmtree('hik_pairs')
    except OSError:
        pass
    os.makedirs('hik_pairs')
    pq = LeakyPriorityQueue(100)
    for (kernel, row_num), columns in hadoopy.readtb(path):
        if kernel != 'hik':
            continue
        print(row_num)
        # Blacklist all exemplars from the same image
        columns[exemplar_ids[exemplars[row_num][0][0]]] = -np.inf
        for column_num, val in enumerate(columns[:row_num]):
            pq.add(-val, (row_num, column_num))
    for num, (score, (row_num, max_col)) in enumerate(pq.items_sorted()):
        shutil.copy(_find_exemplar_fn(exemplar_path, exemplars[row_num][0]),
                    'hik_pairs/%.5d-a-%f.png' % (num, -score))
        shutil.copy(_find_exemplar_fn(exemplar_path, exemplars[max_col][0]),
                    'hik_pairs/%.5d-b-%f.png' % (num, -score))
def test_err(self):
    nonsense_path = 'sdfskjdfksjdkfjskdfksjdfksdkfjskdjfksjdk'
    self.assertFalse(hadoopy.exists(nonsense_path))
    self.assertEquals(hadoopy.abspath(nonsense_path).rsplit('/')[-1], nonsense_path)
    self.assertRaises(IOError, hadoopy.ls, nonsense_path)
    self.assertRaises(IOError, hadoopy.readtb(nonsense_path).next)
def save_display_images(path_hdfs, path_local, min_count, max_count, key_to_path=None):
    """Saves the first max_count images obtained by calling hadoopy.readtb(path_hdfs).

    Each item in the sequence is assumed to be of the form
    (key, (imagedata, boxes)).  Items with index below min_count are skipped.
    The boxes are drawn on each image before it is saved to the local path.
    If key_to_path is provided, which maps a key to a path, the image
    corresponding to that key will be saved in key_to_path[key].
    """
    if key_to_path is None:
        key_to_path = {}
    count = 0
    for k, (i, bs) in hadoopy.readtb(path_hdfs):
        if count >= min_count:
            if k in key_to_path:
                path = key_to_path[k]
            else:
                path = path_local
            filename = '%s/%s.jpg' % (path, k)
            im = imfeat.convert_image(Image.open(StringIO.StringIO(i)), [('opencv', 'bgr', 8)])
            print(k)
            for b in bs:
                cv.Rectangle(im, (b[0], b[1]), (b[2], b[3]), cv.CV_RGB(255, 0, 0), 3)
            cv.SaveImage(filename, im)
        # update count and break loop if necessary
        # TODO(Vlad): can we use slice notation on a list of generators?
        count += 1
        if count > max_count:
            break
def main():
    exemplars = sorted(pickle.load(open('exemplars.pkl')), key=lambda x: x[0][2], reverse=True)[:100]
    with open('exemplars_best.pkl', 'w') as fp:
        pickle.dump(exemplars, fp, -1)
    hdfs_output = 'exemplarbank/output/%s/' % '1341790878.92'
    #hadoopy.launch_frozen('/user/brandyn/aladdin_results/keyframe/9/keyframe', hdfs_output + 'frame_pred', 'predict_video_frame.py', cmdenvs=['EXEMPLARS=exemplars_best.pkl', 'CELL_SKIP=1'], remove_output=True, files=['exemplars_best.pkl'])
    local_out = 'frame_preds/'
    try:
        shutil.rmtree(local_out)
    except OSError:
        pass
    os.makedirs(local_out)
    for num, (data, (pyramid, num_boxes)) in enumerate(hadoopy.readtb(hdfs_output + 'frame_pred')):
        f = imfeat.image_fromstring(data['frame'])
        if np.sum(pyramid):
            pyramid_norm = pyramid / float(num_boxes)
            pyramid_prob = np.sqrt(pyramid / float(np.max(pyramid)))
            p = np.sum(pyramid_norm)
            pyramid_prob_frame = cv2.resize(pyramid_prob, (f.shape[1], f.shape[0]))
            pyramid_prob_frame_color = COLORS[(pyramid_prob_frame * 255).astype(np.int), :]
            alpha = .5
            beta = alpha * pyramid_prob_frame
            beta = beta.reshape((beta.shape[0], beta.shape[1], 1))
        else:
            # No detections: leave the frame unchanged
            p = 0.
            beta = 0.
            pyramid_prob_frame_color = 0.
        f = ((1 - beta) * f + beta * pyramid_prob_frame_color).astype(np.uint8)
        print(p)
        open(local_out + '%f-%d.jpg' % (p, num), 'w').write(imfeat.image_tostring(f, 'jpg'))
def __init__(self, input_path, output_path, temp_path):
    self.input_path = input_path
    self.output_path = output_path
    self.temp_path = temp_path
    self.vect_1 = {}
    for k, v in hadoopy.readtb(self.input_path):
        self.vect_1[k] = v
def run_video_frame_classification(train_dir):
    try:
        neg_dir = train_dir + '/0'
        pos_dir = train_dir + '/1'
        while 1:
            # Train using initial pos/neg
            c = vidfeat.SyntheticFrameFeature().train(vidfeat.load_label_frames(train_dir))
            # Predict on dataset
            hdfs_input = random.sample(hadoopy.ls('/user/brandyn/aladdin/mp4_devt/'), 96)
            start_time = '%f' % time.time()
            hdfs_output = '/user/brandyn/aladdin_results/video_grep/%s' % start_time
            picarus.vision.run_video_grep_frames(hdfs_input, hdfs_output, c)
            unsorted_dir = tempfile.mkdtemp()
            try:
                for _, y in hadoopy.readtb(hdfs_output):
                    open('%s/%s.jpg' % (unsorted_dir, hashlib.sha1(y).hexdigest()), 'w').write(y)
                # Present results to user and add to list
                try:
                    cmd = 'python -m interactive_learning.image_selector %s %s %s --port 8083' % (unsorted_dir, pos_dir, neg_dir)
                    print(cmd)
                    subprocess.call(cmd.split())
                except OSError:
                    pass
            finally:
                shutil.rmtree(unsorted_dir)
    finally:
        #shutil.rmtree(temp_root)
        pass
def calibrate(hdfs_input, hdfs_output):
    # Predict on pos/neg sets
    hadoopy.launch_frozen(hdfs_input + '1-v', hdfs_output + 'val_pos', 'image_predict.py',
                          cmdenvs=['EXEMPLARS=exemplars.pkl', 'CELL_SKIP=16'],
                          remove_output=True, num_reducers=10, files=['exemplars.pkl'])
    hadoopy.launch_frozen(hdfs_input + '0-v', hdfs_output + 'val_neg', 'image_predict.py',
                          cmdenvs=['EXEMPLARS=exemplars.pkl', 'CELL_SKIP=1'],
                          remove_output=True, num_reducers=10, files=['exemplars.pkl'])
    # Calibrate threshold using pos/neg validation set #1
    hadoopy.launch_frozen([hdfs_output + 'val_neg', hdfs_output + 'val_pos', hdfs_output + 'exemplars-1'],
                          hdfs_output + 'exemplars-2', 'calibrate_thresholds.py',
                          num_reducers=50, remove_output=True)
    exemplar_out = sorted(hadoopy.readtb(hdfs_output + 'exemplars-2'), key=lambda x: x[0])
    with open('exemplars.pkl', 'w') as fp:
        pickle.dump(exemplar_out, fp, -1)
def _run_wc(self, orig_fn, launcher=hadoopy.launch_frozen):
    fn = 'out-%f-%s' % (time.time(), orig_fn)
    in_path = self.data_path + fn
    out_path = self.data_path + fn + '.out'
    print(os.path.abspath('.'))
    hadoopy.put(orig_fn, in_path)
    # We also do a few hdfs checks here
    self.assertEquals(len(hadoopy.ls(in_path)), 1)
    self.assertEquals(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])
    self.assertTrue(hadoopy.exists(in_path))
    self.assertFalse(hadoopy.exists(out_path))
    self.assertFalse(hadoopy.isdir(in_path))
    self.assertFalse(hadoopy.isempty(in_path))
    # Don't let the file split, CDH3 has a bug and will try to split gz's
    launcher(in_path, out_path, 'wc.py',
             jobconfs=['mapred.min.split.size=100000000', 'mapreduce.task.userlog.limit.kb=1000'])
    if launcher == hadoopy.launch_frozen:
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    elif launcher == hadoopy.launch_local:
        self.assertFalse(hadoopy.isdir(out_path))
        self.assertFalse(hadoopy.isempty(out_path))
    else:
        raise ValueError('Launcher not recognized')
    wc = dict(hadoopy.readtb(out_path))
    self.assertEqual(wc['the'], 1664)
    self.assertEqual(wc['Alice'], 221)
def run_classifier_labels(hdfs_input_pos, hdfs_input_neg, hdfs_output, classifier_name,
                          classifier_extra, local_labels, classifier, **kw):
    labels = {}
    try:
        labels = file_parse.load(local_labels)
    except IOError:
        pass
    hdfs_output_pos = hdfs_output + '/pos'
    hdfs_output_neg = hdfs_output + '/neg'
    hadoopy.launch_frozen(hdfs_input_pos, hdfs_output_pos, 'collect_keys.py')
    hadoopy.launch_frozen(hdfs_input_neg, hdfs_output_neg, 'collect_keys.py')
    pos_keys = sum((x[1] for x in hadoopy.readtb(hdfs_output_pos)), [])
    neg_keys = sum((x[1] for x in hadoopy.readtb(hdfs_output_neg)), [])
    labels[classifier_name] = {'labels': {'1': pos_keys, '-1': neg_keys},
                               'classifier': classifier,
                               'classifier_extra': classifier_extra}
    file_parse.dump(labels, local_labels)
def dump_local(hdfs_input, local_output, extension='', **kw):
    """Read data from hdfs and store the contents as hash.ext

    Args:
        hdfs_input: HDFS input path in either 'kv' or 'record' format
        local_output: Local directory output path
        extension: Use this file extension if none available (kv format or
            record with missing extension) (default '')
    """
    try:
        os.makedirs(local_output)
    except OSError:
        pass
    for k, v in hadoopy.readtb(hdfs_input):
        if not isinstance(k, str):
            raise ValueError("Key must be a string. If you are reading data in 'record' form use the 'records' file and not the directory it is in.")
        if isinstance(v, dict):  # record
            try:
                extension = '.' + v['extension'] if v['extension'] else extension
            except KeyError:
                pass
            _record_to_file(v, os.path.join(local_output, k + extension))
        else:
            out_path = os.path.join(local_output, k + ('.' + extension if extension else ''))
            with open(out_path, 'wb') as fp:
                fp.write(v)
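# Minimal usage sketch for dump_local; the HDFS path and local directory are
# hypothetical placeholders.
if __name__ == '__main__':
    # Writes each value to local_images/<key>.jpg (keys must be strings)
    dump_local('/user/example/images', 'local_images', extension='jpg')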
def random_sample(hdfs_input, m, n=None, p=.01, hdfs_temp_dir=None):
    """Return an iterator of m kv pairs selected uniformly from the input

    Finds an alpha such that X = np.sum(np.random.rand(n) < alpha) satisfies
    X >= m with probability (1 - p).  If more kv pairs are returned from
    Hadoop, then they are ignored.  The resulting kv pairs are uniformly
    random from the input.

    Args:
        m: Desired number of samples (you will get this many as long as
            n >= m with probability (1 - p))
        n: Number of total values (default None uses count_kvs to compute this)
        p: Failure probability (default .01 means there is 1 failure out of 100 runs)

    Yields:
        Sample k/v pairs
    """
    if n is None:
        n = count_kvs(hdfs_input)
    alpha = _random_sample_alpha(n, m, p=p)
    num_outputs = 0
    with hadoopy_helper.hdfs_temp(hdfs_temp_dir=hdfs_temp_dir) as hdfs_output:
        hadoopy.launch_frozen(hdfs_input, hdfs_output, _lf('random_sample.py'), cmdenvs={'ALPHA': alpha})
        for kv in hadoopy.readtb(hdfs_output):
            if num_outputs >= m:
                return
            yield kv
            num_outputs += 1
    if num_outputs < m:
        logging.warn('random_sampler: num_outputs[%d] when m[%d]. To prevent this, call with a smaller value of p (currently [%f]).' % (num_outputs, m, p))
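# Illustrative driver for random_sample; the feature path and sample size are
# assumptions, not from the original code.
samples = list(random_sample('/user/example/features', m=100))
print('Sampled %d kv pairs' % len(samples))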
def initial_train(hdfs_input, hdfs_output):
    hadoopy.launch_frozen(hdfs_input + '0-tr', hdfs_output + 'neg', 'compute_exemplar_features.py', remove_output=True)
    hadoopy.launch_frozen(hdfs_input + '1-tr', hdfs_output + 'pos', 'compute_exemplar_features.py', remove_output=True)
    # Compute desired probability
    num_val = 5000
    num_neg_train = 5000
    toggle_launch()
    if 0:
        neg_samples = list(hadoopy_helper.jobs.random_sample(hdfs_output + 'neg', num_val + num_neg_train))
        neg_samples = [x[1] for x in neg_samples]
        with open('neg_feats.pkl', 'w') as fp:
            pickle.dump(np.array(neg_samples[num_val:]), fp, -1)
        with open('neg_val_feats.pkl', 'w') as fp:
            pickle.dump(np.array(neg_samples[:num_val]), fp, -1)
        del neg_samples
        gc.collect()
        pos_samples = list(hadoopy_helper.jobs.random_sample(hdfs_output + 'pos', num_val / 2))  # Twice as many neg as positive
        pos_samples = [x[1] for x in pos_samples]
        with open('pos_val_feats.pkl', 'w') as fp:
            pickle.dump(np.array(pos_samples), fp, -1)
        del pos_samples
        gc.collect()
    cmdenvs = {'NEG_FEATS': 'neg_feats.pkl',
               'POS_VAL_FEATS': 'pos_val_feats.pkl',
               'NEG_VAL_FEATS': 'neg_val_feats.pkl'}
    files = cmdenvs.values()
    cmdenvs['SAMPLE_SIZE'] = 1000
    hadoopy.launch_frozen(hdfs_output + 'pos', hdfs_output + 'exemplars-0', 'uniform_selection.py',
                          cmdenvs=cmdenvs, remove_output=True, files=files)
    exemplar_out = sorted(hadoopy.readtb(hdfs_output + 'exemplars-0'), key=lambda x: x[0])
    with open('exemplars.pkl', 'w') as fp:
        pickle.dump(exemplar_out, fp, -1)
def report_categories(hdfs_join_predictions_input, local_output, image_limit, local_thumb_output, **kw):
    # Output a cluster for each category
    # FIXME: This is hardcoded for indoor_outdoor; it will have to change when
    # there are multiple classifiers (indoor, outdoor, photos, documents, etc.)
    hashes = {-1: [], 1: []}
    totals = {-1: 0, 1: 0}
    # First pass: find images for each category
    for image_hash, (classifier_preds, image_data) in hadoopy.readtb(hdfs_join_predictions_input):
        for classifier, preds in classifier_preds.items():
            posname, negname = classifier.split('_')
            for conf, label in preds:
                totals[label] += 1
                if len(hashes[label]) < image_limit:
                    heapq.heappush(hashes[label], (conf, image_hash))
                else:
                    heapq.heappushpop(hashes[label], (conf, image_hash))
    print negname, len(hashes[-1]), totals[-1]
    print posname, len(hashes[1]), totals[1]
    categories = {}
    categories[posname] = report_output.make_random_clusters([h for _, h in hashes[1]], posname)
    categories[negname] = report_output.make_random_clusters([h for _, h in hashes[-1]], negname)
    try:
        os.makedirs(os.path.dirname(local_output))
    except OSError:
        pass
    file_parse.dump(categories, local_output)
    # Second pass: make image thumbnails
    if local_thumb_output:
        try:
            os.makedirs(local_thumb_output)
        except OSError:
            pass
        hashset = set([h for _, h in hashes[-1] + hashes[1]])
        for image_hash, (classifier_preds, image_data) in hadoopy.readtb(hdfs_join_predictions_input):
            if image_hash in hashset:
                s = StringIO.StringIO()
                s.write(image_data)
                s.seek(0)
                frame = Image.open(s)
                frame.thumbnail((100, 100))
                path = '%s/%s.jpg' % (local_thumb_output, image_hash)
                frame.save(path)
def report_clusters(hdfs_input, local_json_output, sample, category, make_faces, **kw):
    """
    NOTE: This transfers much more image data than is necessary!  Really
    this operation should be done directly on hdfs.
    """
    def make_face_image(facestr):
        name, ext = os.path.splitext(facestr)
        m = re.match('(\w+)-face-x0(\d+)-y0(\d+)-x1(\d+)-y1(\d+)', name)
        print name
        try:
            hash, l, t, r, b = m.groups()
            l, t, r, b = map(int, (l, t, r, b))
            return {'hash': hash,
                    'categories': ['faces'],
                    'faces': [{'boundingbox': ((l, t), (r, b))}],
                    'video': []}
        except:
            return {}
    # Collect all the clusters as a set of lists
    clusters = {}
    count = 0
    for cluster_index, (image_name, _) in hadoopy.readtb(hdfs_input):
        count += 1
        if count % 100 == 0:
            print count
        cluster = clusters.setdefault(cluster_index, [])
        if make_faces:
            face_image = make_face_image(image_name)
            cluster.append(face_image)
        else:
            cluster.append({'hash': image_name,
                            'categories': [category],
                            'faces': [],
                            'video': []})
    # Gather each cluster
    print len(clusters), 'clusters'
    clusters = [{'sample_images': random.sample(image_set, min(len(image_set), sample)),  # Sample images uniformly
                 'all_images': image_set,
                 'size': len(image_set),
                 'children': [],
                 'std': 0.0,
                 'position': [0.0, 0.0]}
                for image_set in clusters.values()]
    try:
        os.makedirs(os.path.dirname(local_json_output))
    except OSError:
        pass
    report = {category: clusters}
    file_parse.dump(report, local_json_output)
def compute_database(flickr_data):
    r = 'image_search/%f/' % time.time()
    f_path = r + 'features/'
    m_path = r + 'median/'
    h_path = r + 'hashes/'
    j_path = r + 'hash_metadata/'
    hadoopy.launch_frozen(flickr_data, f_path, 'build_features.py')
    hadoopy.launch_frozen(f_path, m_path, 'calc_median_feature.py')
    median = np.array([x for _, x in sorted(hadoopy.readtb(m_path))])
    pickle.dump(median, open('median.pkl', 'w'), -1)
    hadoopy.launch_frozen(f_path, h_path, 'compute_hashes.py', files=['median.pkl'])
    hadoopy.launch_frozen([h_path, flickr_data], j_path, 'join.py', num_reducers=10)
    hashes, metadatas = zip(*[x[1] for x in hadoopy.readtb(j_path)])
    hashes = np.array([x.ravel() for x in hashes])
    with open('database.pkl', 'w') as fp:
        pickle.dump((hashes, metadatas, median), fp, -1)
def read_hdfs_as_generator(path, read_all_at_once=False):
    """Reads a path at HDFS and returns it line by line as a generator

    Args:
        path (str): HDFS path
        read_all_at_once (bool): If True, buffer everything in memory before
            yielding

    Returns:
        strings (lines of the file)
    """
    if read_all_at_once:
        lines = [i for i in hadoopy.readtb(path)]
        for i in lines:
            yield i
    else:
        for i in hadoopy.readtb(path):
            yield i
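# Usage sketch for read_hdfs_as_generator; '/user/example/log' is a
# hypothetical path.
for num, line in enumerate(read_hdfs_as_generator('/user/example/log')):
    print(line)
    if num >= 9:  # inspect only the first 10 entries
        break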
def main():
    word_counts = dict(hadoopy.readtb(hdfs_index))
    for word in word_counts:
        batch = table_index.batch()
        for url in word_counts[word]:
            tfidf = word_counts[word][url]
            batch.put(word.encode('utf-8'), {"wiki:" + url: str(tfidf).encode('utf-8')})
        batch.send()
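# The snippet above assumes 'hdfs_index' and 'table_index' already exist; a
# plausible setup (an assumption -- e.g. using happybase for the HBase table
# handle) might look like:
import happybase
hdfs_index = '/user/example/wiki_index'  # hypothetical HDFS path
connection = happybase.Connection('localhost')
table_index = connection.table('wiki_index')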
def extractInfo(file_name):
    line_raw = dict(hadoopy.readtb(file_name))
    line_raw = pd.DataFrame(line_raw.values(),
                            columns=['DATE', 'TIME', 'LINE', 'BUS_NUM', 'X_COORDINATE', 'Y_COORDINATE'])
    line = data_extraction.getCoord(line_raw)
    return line
def _run_face(self, fn, out_path, **kw):
    in_path = self.data_path + fn
    hdfs_out_path = '%sout-%s-%f' % (self.data_path, fn, time.time())
    if not hadoopy.exists(in_path):
        hadoopy.put(fn, in_path)
    hadoopy.launch_frozen(in_path, hdfs_out_path, 'face_finder.py',
                          files=['haarcascade_frontalface_default.xml'], **kw)
    for num, ((image_name, box), image_data) in enumerate(hadoopy.readtb(hdfs_out_path)):
        with open(out_path + 'img%.8d.png' % num, 'w') as fp:
            fp.write(image_data)
def _inner():
    with open('image_box_fns.pkl', 'w') as fp:
        image_box_fns = {}
        for (image_id, box, score), negs in hadoopy.readtb(hdfs_output + 'hard_neg'):
            for score2, image_id2, box2 in negs:
                image_box_fns.setdefault(image_id2, []).append((box2, [image_id, box, score]))
        pickle.dump(image_box_fns, fp, -1)
    del image_box_fns
    gc.collect()
def _run_face(self, fn, **kw):
    in_path = self.data_path + fn
    out_path = "%sout-%s-%f" % (self.data_path, fn, time.time())
    if not hadoopy.exists(in_path):
        hadoopy.put(fn, in_path)
    hadoopy.launch_frozen(in_path, out_path, "face_finder.py",
                          files=["haarcascade_frontalface_default.xml"], **kw)
    for num, (image_name, (image_data, faces)) in enumerate(hadoopy.readtb(out_path)):
        with open(self.out_path + "img%.8d.jpg" % num, "w") as fp:
            fp.write(image_data)
def _run_face(self, fn):
    in_path = self.data_path + fn
    out_path = self.data_path + 'out-' + fn
    cmd = 'hadoop fs -put %s %s' % (fn, in_path)
    subprocess.check_call(cmd.split())
    hadoopy.launch_frozen(in_path, out_path, 'face_finder.py', reducer=False,
                          files=['haarcascade_frontalface_default.xml'])
    for num, (image_name, (image_data, faces)) in enumerate(hadoopy.readtb(out_path)):
        with open(self.out_path + 'img%.8d.jpg' % num, 'w') as fp:
            fp.write(image_data)
def test_tb(path):
    """
    This function tests the sequence file at 'path' (on hdfs) by reading
    the images from it.
    """
    # test that we can read each file using _load_cv_image
    for (key, val) in hadoopy.readtb(path):
        print(key)
        i = imfeat.convert_image(Image.open(StringIO.StringIO(val)), [('opencv', 'gray', 8)])
def report_video_keyframe(hdfs_input, **kw):
    videos = {}
    for (kind, hash), v in hadoopy.readtb(hdfs_input):
        if kind == 'video':
            videos[hash] = v
    if not len(videos):  # Sanity check
        print "No videos returned by readtb(%s). This is probably the wrong keyframe path" % hdfs_input
    report = {'videos': videos}
    return report
def exemplar_boxes(hdfs_input, hdfs_output):
    exemplar_name = 'ad813d130f4803e948124823a67cdd7b-[0.0, 0.16326530612244897, 0.3448275862068966, 0.5714285714285714]'
    st = time.time()
    exemplar_out = hadoopy.abspath(hdfs_output + 'exemplar_boxes/%s' % st) + '/'
    for kv in hadoopy.readtb(hdfs_output + 'exemplars-2'):
        (image_id, box, score), _ = kv
        if exemplar_name == '%s-%s' % (image_id, box):
            print('Found it')
            with open('exemplars-patch.pkl', 'w') as fp:
                pickle.dump([kv], fp, -1)
    hadoopy.launch_frozen(hdfs_input + '1-v', exemplar_out + 'val_pos', 'hard_predictions.py',
                          cmdenvs=['EXEMPLARS=exemplars-patch.pkl', 'MAX_HARD=100', 'OUTPUT_FORMAT=score_image_box'],
                          files=['exemplars-patch.pkl'], num_reducers=10)
    hadoopy.launch_frozen(hdfs_input + '0-v', exemplar_out + 'val_neg', 'hard_predictions.py',
                          cmdenvs=['EXEMPLARS=exemplars-patch.pkl', 'MAX_HARD=100', 'OUTPUT_FORMAT=score_image_box'],
                          files=['exemplars-patch.pkl'], num_reducers=10)
    with open('image_box_fns.pkl', 'w') as fp:
        image_box_fns = {}
        pos_boxes = [(score, image_id, box, 1) for score, image_id, box in sorted(hadoopy.readtb(exemplar_out + 'val_pos').next()[1])]
        neg_boxes = [(score, image_id, box, 0) for score, image_id, box in sorted(hadoopy.readtb(exemplar_out + 'val_neg').next()[1])]
        for num, (score, image_id, box, pol) in enumerate(sorted(pos_boxes + neg_boxes, reverse=True)):
            image_box_fns.setdefault(image_id, []).append((box, 'exemplar-%.5d-%d-%f.png' % (num, pol, score)))
        pickle.dump(image_box_fns, fp, -1)
    hadoopy.launch_frozen([hdfs_input + '1-v', hdfs_input + '0-v'], exemplar_out + 'boxes_cropped', 'clip_boxes.py',
                          files=['image_box_fns.pkl'], remove_output=True, cmdenvs={'TYPE': 'image'})
    out_dir = 'exemplars_similar_cropped/'
    try:
        shutil.rmtree('exemplars_similar_cropped')
    except OSError:
        pass
    print('Outputting cropped')
    os.makedirs(out_dir)
    print(exemplar_out + 'boxes_cropped')
    for x, y in hadoopy.readtb(exemplar_out + 'boxes_cropped'):
        open(out_dir + x, 'w').write(y)
    hadoopy.launch_frozen([hdfs_input + '1-v', hdfs_input + '0-v'], exemplar_out + 'boxes', 'clip_boxes.py',
                          files=['image_box_fns.pkl'], remove_output=True, cmdenvs={'TYPE': 'box'})
    out_dir = 'exemplars_similar/'
    try:
        shutil.rmtree('exemplars_similar')
    except OSError:
        pass
    print('Outputting boxes')
    os.makedirs(out_dir)
    for x, y in hadoopy.readtb(exemplar_out + 'boxes'):
        open(out_dir + x, 'w').write(y)
def _run_wc(self, orig_fn, script_name='wc.py', launcher=hadoopy.launch_frozen, **kw):
    fn = 'out-%f-%s' % (time.time(), orig_fn)
    in_path = self.data_path + fn
    out_path = self.data_path + fn + '.out'
    print(os.path.abspath('.'))
    if not hadoopy.exists(in_path):
        hadoopy.put(orig_fn, in_path)
    # We also do a few hdfs checks here
    self.assertEquals(len(hadoopy.ls(in_path)), 1)
    #self.assertEquals(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])  # This is no longer true in CDH4
    self.assertTrue(hadoopy.exists(in_path))
    self.assertFalse(hadoopy.exists(out_path))
    self.assertFalse(hadoopy.isdir(in_path))
    self.assertFalse(hadoopy.isempty(in_path))
    # Don't let the file split, CDH3 has a bug and will try to split gz's
    if not isinstance(launcher, str):
        launcher(in_path, out_path + '_list_jobconfs', script_name,
                 jobconfs=['mapred.min.split.size=100000000',
                           'mapreduce.task.userlog.limit.kb=1000'], **kw)
        launcher(in_path, out_path, script_name,
                 jobconfs={'mapred.min.split.size': '100000000',
                           'mapreduce.task.userlog.limit.kb': '1000'}, **kw)
    if launcher == hadoopy.launch_frozen:
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    elif launcher == hadoopy.launch_local:
        self.assertFalse(hadoopy.isdir(out_path))
        self.assertFalse(hadoopy.isempty(out_path))
    elif launcher == 'launch_frozen_cmd':
        cmd = ('python %s launch_frozen %s %s '
               '-jobconf "mapred.min.split.size=100000000" '
               '-jobconf "mapreduce.task.userlog.limit.kb=1000"' % (script_name, in_path, out_path))
        print(cmd)
        subprocess.call(cmd.split())
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    else:
        raise ValueError('Launcher not recognized')
    wc = dict(hadoopy.readtb(out_path))
    self.assertEqual(wc['the'], 1664)
    self.assertEqual(wc['Alice'], 221)
def _run_hdfs(self, orig_fn):
    fn = "%f-%s" % (time.time(), orig_fn)
    file_path = "%s/%s" % (self.data_path, fn)
    hadoopy.put(orig_fn, file_path)
    cat_output = [_ for _ in hadoopy.readtb(file_path)]
    line = (331, "Title: Alice's Adventures in Wonderland")
    self.assertTrue(line in cat_output)
    ls_output = hadoopy.ls(self.data_path)
    self.assertTrue([x for x in ls_output if x.rsplit("/", 1)[-1] == fn])
    ls_output = hadoopy.ls(file_path)
    self.assertTrue(ls_output[0].rsplit("/", 1)[-1] == fn)
def make_training_set(path_hdfs, pos_disp_dir, neg_disp_dir, pos_dir, neg_dir, pos_file, neg_file, max_count):
    """
    Makes a training set by downloading the original images (w/o overlayed
    boxes) corresponding to the positives and negatives from the display
    directories.  The file lists used as input by opencv_createsamples and
    by opencv_haartraining are also created.
    """
    key_to_path = {}
    for (d1, d2) in [(pos_disp_dir, pos_dir), (neg_disp_dir, neg_dir)]:
        if not os.path.exists(d2):
            os.makedirs(d2)
        key_to_path.update([(os.path.splitext(os.path.basename(f))[0], d2) for f in glob.glob('%s/*' % d1)])
    # save the bounding boxes in a pickle file in each directory
    boxes = {pos_dir: {}, neg_dir: {}}
    # the following two files will contain the list of positive/negative
    # images for training the OpenCV face detector
    pos_fp = open(pos_file, 'w')
    neg_fp = open(neg_file, 'w')
    count = 0
    for k, (i, bs) in hadoopy.readtb(path_hdfs):
        try:
            path = key_to_path[k]
            # save the face bounding boxes for this image
            boxes[path][k] = bs
            # save the original image
            filename = '%s/%s.jpg' % (path, k)
            print(filename)
            with open(filename, 'wb') as f:
                f.write(i)
            # update the positive/negative training lists
            if path == pos_dir:
                pos_fp.write('%s %i' % (filename, len(bs)))
                for b in bs:
                    pos_fp.write(' %i %i %i %i' % (b[0], b[1], b[2] - b[0] + 1, b[3] - b[1] + 1))
                pos_fp.write('\n')
            else:
                neg_fp.write('%s\n' % filename)
        except KeyError:
            pass
        # update count and break loop if necessary
        # TODO(Vlad): can we use slice notation on a list of generators?
        count += 1
        if count > max_count:
            break
    pos_fp.close()
    neg_fp.close()
    # save the bounding boxes in a pickle file
    for (path, bs) in boxes.items():
        with open('%s/boxes.pkl' % path, 'wb') as f:
            pickle.dump(bs, f)
def fetch_assignments_from_hdfs(hdfs_input):
    """Fetch remote assignments and store locally

    Args:
        hdfs_input: HDFS input path

    Returns:
        NamedTemporaryFile holding the assignment data
    """
    assignments_fp = tempfile.NamedTemporaryFile()
    assignments = list(hadoopy.readtb(hdfs_input))
    pickle.dump(assignments, assignments_fp, -1)
    assignments_fp.seek(0)
    return assignments_fp
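# Usage sketch: load the pickled assignments back from the temporary file.
# The HDFS path is a hypothetical placeholder.
assignments_fp = fetch_assignments_from_hdfs('/user/example/assignments')
assignments = pickle.load(assignments_fp)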
def _run_face(self, fn, out_path, **kw):
    bfn = os.path.basename(fn)
    in_path = self.data_path + bfn
    hdfs_out_path = '%sout-%s-%f' % (self.data_path, bfn, time.time())
    if not hadoopy.exists(in_path):
        hadoopy.put(fn, in_path)
    hadoopy.launch_frozen(in_path, hdfs_out_path, 'face_finder.py',
                          files=['haarcascade_frontalface_default.xml'], **kw)
    for num, ((image_name, box), image_data) in enumerate(hadoopy.readtb(hdfs_out_path)):
        with open(out_path + 'img%.8d.png' % num, 'w') as fp:
            fp.write(image_data)
def readHDFS(path):
    data_raw = dict(hadoopy.readtb(path))
    coordinate = []
    for row in data_raw.itervalues():
        if 'Xrec' in row:
            coordinate.append(row)
    if coordinate == []:
        return pd.DataFrame()
    length = len(sorted(coordinate, key=len, reverse=True)[0])
    coordinate_list = [x.encode('UTF8').split(';') for x in coordinate]
    gps_data = np.array([xi + [None] * (length - len(xi)) for xi in coordinate_list])
    gps_data = pd.DataFrame(gps_data)
    gps_data = gps_data.iloc[:, [0, 12, 2, 7, 14, 16]]
    gps_data.columns = ['Date', 'Time', 'Line', 'Bus_num', 'X_coordinate', 'Y_coordinate']
    return gps_data
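# Usage sketch for readHDFS; the HDFS path is a hypothetical placeholder.
gps_data = readHDFS('/user/example/bus_gps')
if not gps_data.empty:
    print(gps_data[['Line', 'X_coordinate', 'Y_coordinate']].head())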
def run_predict_windows(hdfs_input, hdfs_classifier_input, feature, hdfs_output, image_height, image_width, **kw):
    import classipy
    # NOTE: Adds necessary files
    files = glob.glob(classipy.__path__[0] + "/lib/*")
    fp = tempfile.NamedTemporaryFile(suffix='.pkl.gz')
    file_parse.dump(list(hadoopy.readtb(hdfs_classifier_input)), fp.name)
    files.append(fp.name)
    files.append(_lf('data/haarcascade_frontalface_default.xml'))
    cmdenvs = ['CLASSIFIERS_FN=%s' % os.path.basename(fp.name)]
    cmdenvs += ['IMAGE_HEIGHT=%d' % image_height,
                'IMAGE_WIDTH=%d' % image_width,
                'FEATURE=%s' % feature]
    picarus._launch_frozen(hdfs_input, hdfs_output, _lf('predict_windows.py'),
                           cmdenvs=cmdenvs, files=files, dummy_arg=fp)
def _run_haystack(fn, script_name):
    cur_time = time.time()
    hdfs_base_path = 'hadoopy-test-data/%f/' % cur_time
    print('Storing HDFS temp files and output in [%s]' % hdfs_base_path)
    in_path = hdfs_base_path + os.path.basename(fn)
    out_path = hdfs_base_path + 'out-' + os.path.basename(fn)
    hadoopy.put(fn, in_path)
    print('Launching job [%s]' % script_name)
    hadoopy.launch_frozen(in_path, out_path, script_name, files=[data_path + 'target.jpg'])
    print('Storing local output in [%s]' % local_out)
    for num, (image_name, image_data) in enumerate(hadoopy.readtb(out_path)):
        open('%s%s-img%.8d-%s.jpg' % (local_out, script_name, num, image_name), 'w').write(image_data)
def report_thumbnails(hdfs_input, local_thumb_output, **kw):
    """Collect thumbnails of all images in hdfs://${hdfs_input}"""
    counter = 0
    for image_hash, image_data in hadoopy.readtb(hdfs_input):
        path = '%s/%s/%s/%s.jpg' % (local_thumb_output, image_hash[:2], image_hash[2:4], image_hash)
        try:
            os.makedirs(os.path.dirname(path))
        except OSError:
            pass
        with open(path, 'w') as f:
            f.write(image_data)
        counter += 1
    if not counter:
        print 'There were no images in readtb(%s). This is probably not a thumbnail path' % hdfs_input
def main():
    path = 'exemplarbank/output/1341790878.92/val_pred_pos'
    pyramid, num_boxes = hadoopy.readtb(path).next()[1]
    try:
        shutil.rmtree('priors')
    except OSError:
        pass
    os.makedirs('priors')
    exemplars = pickle.load(open('exemplars.pkl'))
    for exemplar_num in range(pyramid.shape[0]):
        print(exemplar_num)
        p = pyramid[exemplar_num, :, :] / float(np.max(pyramid[exemplar_num, :, :]))
        p = (p * 255).astype(np.uint8)
        print p
        cv2.imwrite('priors/%.5d-%.5d.png' % (exemplars[exemplar_num][0][2], exemplar_num), p)
def fetch_clusters_from_hdfs(hdfs_input):
    """Fetch remote clusters and store locally

    Clusters are sorted to allow comparing between iterations

    Args:
        hdfs_input: HDFS input path

    Returns:
        NamedTemporaryFile holding the cluster data
    """
    clusters_fp = tempfile.NamedTemporaryFile()
    clusters = [v.tolist() for k, v in hadoopy.readtb(hdfs_input)]
    clusters.sort()
    clusters = np.ascontiguousarray(clusters, dtype=np.float64)
    pickle.dump(clusters, clusters_fp, -1)
    clusters_fp.seek(0)
    return clusters_fp
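# Usage sketch mirroring fetch_assignments_from_hdfs: read back the sorted,
# contiguous float64 cluster array. The path is a hypothetical placeholder.
clusters_fp = fetch_clusters_from_hdfs('/user/example/clusters')
clusters = pickle.load(clusters_fp)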
def main():
    exemplar_feats = list(hadoopy.readtb('exemplarbank/output/1341790878.92/pos_sample'))
    feats = np.vstack([x[1] for x in exemplar_feats])
    print(feats.shape)
    try:
        shutil.rmtree('clusters')
    except OSError:
        pass
    os.makedirs('clusters')
    for exemplar_num, cluster_num in enumerate(sp.cluster.vq.kmeans2(feats, 20, minit='points')[1]):
        fn = _find_exemplar_fn('exemplars', exemplar_feats[exemplar_num][0])
        cluster_path = 'clusters/%d/' % cluster_num
        try:
            os.makedirs(cluster_path)
        except OSError:
            pass
        shutil.copy(fn, cluster_path)
def hard_train(hdfs_input, hdfs_output):
    hadoopy.launch_frozen(hdfs_input + '0-tr', hdfs_output + 'hard_neg', 'hard_predictions.py',
                          cmdenvs=['EXEMPLARS=exemplars.pkl', 'MAX_HARD=100', 'OUTPUT_FORMAT=score_image_box'],
                          num_reducers=10, files=['exemplars.pkl'], remove_output=True)

    def _inner():
        with open('image_box_fns.pkl', 'w') as fp:
            image_box_fns = {}
            for (image_id, box, score), negs in hadoopy.readtb(hdfs_output + 'hard_neg'):
                for score2, image_id2, box2 in negs:
                    image_box_fns.setdefault(image_id2, []).append((box2, [image_id, box, score]))
            pickle.dump(image_box_fns, fp, -1)
        del image_box_fns
        gc.collect()
    _inner()
    hadoopy.launch_frozen(hdfs_input + '0-tr', hdfs_output + 'hard_neg_clip', 'clip_boxes.py',
                          files=['image_box_fns.pkl'], remove_output=True, cmdenvs=['TYPE=feature'])
    hadoopy.launch_frozen([hdfs_output + 'pos_sample', hdfs_output + 'hard_neg_clip'],
                          hdfs_output + 'exemplars-1', 'train_exemplars_hard.py',
                          cmdenvs=['NEG_FEATS=neg_feats.pkl', 'MAX_HARD=200'],
                          files=['neg_feats.pkl'], remove_output=True, num_reducers=10)
    exemplar_out = sorted(hadoopy.readtb(hdfs_output + 'exemplars-1'), key=lambda x: x[0])
    with open('exemplars.pkl', 'w') as fp:
        pickle.dump(exemplar_out, fp, -1)