def run_face_finder(hdfs_input, hdfs_output, image_length, boxes, **kw):
    cmdenvs = ['IMAGE_LENGTH=%d' % image_length]
    if boxes:
        cmdenvs.append('OUTPUT_BOXES=True')
    hadoopy.launch_frozen(hdfs_input, hdfs_output, 'face_finder.py', reducer=False,
                          cmdenvs=cmdenvs,
                          files=['haarcascade_frontalface_default.xml'])
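# A hedged sketch of the face_finder.py script launched throughout this
# collection, reconstructed from how its output is read back
# ((image_name, (image_data, faces)) pairs) and from the Haar cascade file
# shipped via files=; this is an assumption, not the original script.
import StringIO

import cv2
import hadoopy
import Image
import numpy as np

_cascade = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')

def mapper(image_name, image_data):
    # Decode the image, detect faces, and emit the image with its face boxes
    image = np.asarray(Image.open(StringIO.StringIO(image_data)).convert('L'))
    faces = [(tuple(int(v) for v in rect), 1) for rect in _cascade.detectMultiScale(image)]
    if faces:
        yield image_name, (image_data, faces)

if __name__ == '__main__':
    hadoopy.run(mapper)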
def main():
    # in_path = '/tmp/bwhite/input/1266413011.32-003-fn-data'
    # out_path = '/tmp/bwhite/output/tp/face/run-%f' % time.time()
    in_path = '/user/brandyn/flickr/voc07/flickr_photo_conv4-1297714349.074514'
    out_path = '/user/brandyn/tp/facefinder/run-%f' % time.time()
    hadoopy.launch_frozen(in_path, out_path, 'face_finder.py', reducer=False,
                          files=['haarcascade_frontalface_default.xml'])
def output_exemplars(hdfs_input, hdfs_output, num=2, output_type='box', output_path='exemplars'):
    with open('image_box_fns.pkl', 'w') as fp:
        image_box_fns = {}
        for (image_id, box, score), _ in hadoopy.readtb(hdfs_output + 'exemplars-%d' % num):
            image_box_fns.setdefault(image_id, []).append((box, 'exemplar-%.5d-%s-%s.png' % (score, image_id, box)))
        pickle.dump(image_box_fns, fp, -1)
    hadoopy.launch_frozen(hdfs_input + '1-tr', hdfs_output + 'exemplars-%d-clip' % num, 'clip_boxes.py',
                          files=['image_box_fns.pkl'], remove_output=True,
                          cmdenvs=['TYPE=%s' % output_type])
    try:
        shutil.rmtree(output_path)
    except OSError:
        pass
    os.makedirs(output_path)
    for x, y in hadoopy.readtb(hdfs_output + 'exemplars-%d-clip' % num):
        open(output_path + '/%s' % (x,), 'w').write(y)
def random_sample(hdfs_input, m, n=None, p=.01, hdfs_temp_dir=None):
    """Return an iterator of m kv pairs selected uniformly from the input

    Finds an alpha such that X = np.sum(np.random.rand(n) < alpha) satisfies
    X >= m with probability (1 - p).  If more kv pairs are returned from
    Hadoop, then they are ignored.  The resulting kv pairs are uniformly
    random from the input.

    Args:
        m: Desired number of samples (you will get this many as long as
            n >= m with probability (1-p))
        n: Number of total values (default None uses count_kvs to compute this)
        p: Failure probability (default .01 means there is 1 failure out of 100 runs)

    Yields:
        Sample k/v pairs
    """
    if n is None:
        n = count_kvs(hdfs_input)
    alpha = _random_sample_alpha(n, m, p=p)
    num_outputs = 0
    with hadoopy_helper.hdfs_temp(hdfs_temp_dir=hdfs_temp_dir) as hdfs_output:
        hadoopy.launch_frozen(hdfs_input, hdfs_output, _lf('random_sample.py'),
                              cmdenvs={'ALPHA': alpha})
        for kv in hadoopy.readtb(hdfs_output):
            if num_outputs >= m:
                return
            yield kv
            num_outputs += 1
        if num_outputs < m:
            logging.warn('random_sampler: num_outputs[%d] when m[%d]. To prevent this, call with a smaller value of p (currently [%f]).' % (num_outputs, m, p))
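# A hedged sketch of what _random_sample_alpha (not shown above) might
# compute: each mapper keeps a kv pair with probability alpha, so the
# number kept is X ~ Binomial(n, alpha), and we want the smallest alpha
# with P(X < m) <= p.  This implementation is an assumption, not the
# library's.
from scipy.stats import binom

def _random_sample_alpha(n, m, p=.01):
    alpha = min(float(m) / n, 1.)
    while alpha < 1. and binom.cdf(m - 1, n, alpha) > p:
        alpha = min(alpha * 1.1, 1.)  # grow alpha until the failure probability drops below p
    return alpha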
def compute_db_hadoop(self, hdfs_path):
    import json
    si = picarus.api.SearchIndex()
    si.name = '%s.%s' % (self.__class__.__module__, self.__class__.__name__)
    si.feature = json.dumps(self.feature_dict)  # TODO: What to do with the pkl file?
    with hadoopy_helper.hdfs_temp() as hdfs_output:
        picarus.vision.run_image_clean(hdfs_path, hdfs_output + '/clean', max_side=self.max_side)
        # Compute features (map)
        picarus.vision.run_image_feature(hdfs_output + '/clean', hdfs_output + '/feature',
                                         self.feature_dict, files=self.required_files)
        # Random sample features for hashes (map) and train hasher (reduce)
        hadoopy.launch_frozen(hdfs_output + '/feature', hdfs_output + '/hasher', _lf('train_hasher.py'),
                              cmdenvs={'KV_PROB': 1., 'HASH_BITS': 128})
        hasher = hadoopy.readtb(hdfs_output + '/hasher').next()[1]
        si.hash = pickle.dumps(hasher, -1)
        si.hash_format = si.PICKLE
        # Compute feature hashes (map) and build database (reduce)
        open('hasher.pkl', 'w').write(si.hash)
        hadoopy.launch_frozen(hdfs_output + '/feature', hdfs_output + '/db', _lf('build_db.py'),
                              files=['hasher.pkl'])
        metadata, hashes = hadoopy.readtb(hdfs_output + '/db').next()
        self.metadata = metadata
        si.metadata.extend(metadata.tolist())
        self.index = image_search.LinearHashDB().store_hashes(hashes, np.arange(len(metadata), dtype=np.uint64))
        si.index = pickle.dumps(self.index, -1)
        si.index_format = si.PICKLE
        open('index.pb', 'w').write(si.SerializeToString())
def cluster(hdfs_input, hdfs_output):
    hadoopy.launch_frozen(hdfs_input + '1-v', hdfs_output + 'val_pred_pos', 'predict_spatial_pyramid_fine.py',
                          cmdenvs=['EXEMPLARS=exemplars.pkl'], remove_output=True,
                          files=['exemplars.pkl'], num_reducers=1)
def run_train_classifier(hdfs_input, hdfs_output, local_labels, **kw):
    import classipy
    # NOTE: Adds necessary files
    files = glob.glob(classipy.__path__[0] + "/lib/*")
    files.append(local_labels)
    hadoopy.launch_frozen(hdfs_input, hdfs_output, 'train_classifier.py', files=files,
                          cmdenvs=['LOCAL_LABELS_FN=%s' % os.path.basename(local_labels)])
def _run_face(self, fn, **kw):
    in_path = self.data_path + fn
    out_path = "%sout-%s-%f" % (self.data_path, fn, time.time())
    if not hadoopy.exists(in_path):
        hadoopy.put(fn, in_path)
    hadoopy.launch_frozen(in_path, out_path, "face_finder.py",
                          files=["haarcascade_frontalface_default.xml"], **kw)
    for num, (image_name, (image_data, faces)) in enumerate(hadoopy.readtb(out_path)):
        with open(self.out_path + "img%.8d.jpg" % num, "w") as fp:
            fp.write(image_data)
def _run_face(self, fn, out_path, **kw):
    in_path = self.data_path + fn
    hdfs_out_path = '%sout-%s-%f' % (self.data_path, fn, time.time())
    if not hadoopy.exists(in_path):
        hadoopy.put(fn, in_path)
    hadoopy.launch_frozen(in_path, hdfs_out_path, 'face_finder.py',
                          files=['haarcascade_frontalface_default.xml'], **kw)
    for num, ((image_name, box), image_data) in enumerate(hadoopy.readtb(hdfs_out_path)):
        with open(out_path + 'img%.8d.png' % num, 'w') as fp:
            fp.write(image_data)
def _run_face(self, fn):
    in_path = self.data_path + fn
    out_path = self.data_path + 'out-' + fn
    cmd = 'hadoop fs -put %s %s' % (fn, in_path)
    subprocess.check_call(cmd.split())
    hadoopy.launch_frozen(in_path, out_path, 'face_finder.py', reducer=False,
                          files=['haarcascade_frontalface_default.xml'])
    for num, (image_name, (image_data, faces)) in enumerate(hadoopy.readtb(out_path)):
        with open(self.out_path + 'img%.8d.jpg' % num, 'w') as fp:
            fp.write(image_data)
def run_thresh_predictions(hdfs_predictions_input, hdfs_input, hdfs_output,
                           class_name, class_thresh, output_class, **kw):
    inputs = [hdfs_predictions_input]
    if isinstance(hdfs_input, list):
        inputs += hdfs_input
    else:
        inputs.append(hdfs_input)
    hadoopy.launch_frozen(inputs, hdfs_output, 'thresh_predictions.py',
                          cmdenvs=['CLASSIFIER_NAME=%s' % class_name,
                                   'CLASSIFIER_THRESH=%f' % class_thresh,
                                   'OUTPUT_CLASS=%d' % output_class])
def _run(self, fn):
    in_path = self.data_path + fn
    out_path = self.data_path + 'out-' + fn
    cmd = 'hadoop fs -put %s %s' % (fn, in_path)
    subprocess.check_call(cmd.split())
    # Don't let the file split, CDH3 has a bug and will try to split gz's
    hadoopy.launch_frozen(in_path, out_path, 'wc.py',
                          jobconfs='mapred.min.split.size=100000000')
    wc = dict(hadoopy.cat(out_path))
    self.assertEqual(wc['the'], 1664)
    self.assertEqual(wc['Alice'], 221)
def starter(args, launch=True):
    """ The function that calls hadoopy.launch_frozen """
    gopts.args = args

    mat = args.get('mat', None)
    if mat is None:
        raise NameError("'mat' option not specified on the command line")

    input = mat
    matname, matext = os.path.splitext(mat)

    gopts.getintkey('blocksize', 3)
    schedule = gopts.getstrkey('reduce_schedule', '1')

    # clear the output
    output = args.get('output', '%s-qrr%s' % (matname, matext))
    if hadoopy.exists(output):
        print "Removing %s" % (output)
        hadoopy.rm(output)

    outputnamefunc = lambda x: output + "_iter%i" % (x)
    steps = schedule.split(',')

    jobconfs = []

    # determine the split size
    if 'split_size' in args:
        splitsize = args['split_size']
        jobconfs.append('mapreduce.input.fileinputformat.split.minsize=' + str(splitsize))

    for i, step in enumerate(steps):
        if i > 0:
            input = curoutput
            mapper = 'org.apache.hadoop.mapred.lib.IdentityMapper'
        else:
            mapper = True  # use the command line mapper

        if i + 1 == len(steps):
            curoutput = output
        else:
            curoutput = output + "_iter%i" % (i + 1)
            if hadoopy.exists(curoutput):
                hadoopy.rm(curoutput)

        gopts.setkey('iter', i)

        if launch:
            hadoopy.launch_frozen(input, curoutput, __file__,
                                  mapper=mapper, cmdenvs=gopts.cmdenv(),
                                  num_reducers=int(step), jobconfs=jobconfs)
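# Illustrative invocation of starter (values hypothetical, and assuming
# gopts has been initialized so it can resolve keys against args): a
# 'reduce_schedule' of '4,1' chains two MapReduce passes over the matrix,
# the first with 4 reducers and the second with 1, each pass consuming the
# previous pass's output.
starter({'mat': 'A.mtx', 'output': 'A-qrr.mtx', 'reduce_schedule': '4,1'})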
def gen_data(num_clusters, num_points, num_dims):
    hadoopy.launch_frozen(in_name='/tmp/bwhite/input/synth_clusters/dummy',
                          out_name='/tmp/bwhite/input/synth_clusters/%d-%d-%d' % (num_clusters, num_points, num_dims),
                          script_path='generate_data.py',
                          remove_dir=True,
                          cmdenvs=['NUM_CLUSTERS=%d' % (num_clusters),
                                   'NUM_POINTS=%d' % (num_points),
                                   'NUM_DIMS=%d' % (num_dims)],
                          #reducer=None,
                          jobconfs='mapred.reduce.tasks=30',
                          frozen_path='frozen')
def _run_haystack(fn, script_name):
    cur_time = time.time()
    hdfs_base_path = "hadoopy-test-data/%f/" % cur_time
    print("Storing HDFS temp files and output in [%s]" % hdfs_base_path)
    in_path = hdfs_base_path + os.path.basename(fn)
    out_path = hdfs_base_path + "out-" + os.path.basename(fn)
    hadoopy.put(fn, in_path)
    print("Launching job [%s]" % script_name)
    hadoopy.launch_frozen(in_path, out_path, script_name, files=[data_path + "target.jpg"])
    print("Storing local output in [%s]" % local_out)
    for num, (image_name, image_data) in enumerate(hadoopy.readtb(out_path)):
        open("%s%s-img%.8d-%s.jpg" % (local_out, script_name, num, image_name), "w").write(image_data)
def launch_frozen(in_name, out_name, script_path, hbase_in=True, hbase_out=False,
                  columns=(), start_row=None, stop_row=None, single_value=None, **kw):
    _launch_args(hbase_in, hbase_out, columns, start_row, stop_row, single_value, kw)
    hadoopy.launch_frozen(in_name, out_name, script_path, **kw)
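# Hypothetical usage of the HBase-aware wrapper above; the table name,
# column, row bounds, and wrapped script are illustrative only, not from
# the original source.
launch_frozen('images', 'hdfs://host/user/me/out', 'feature_job.py',
              hbase_in=True, columns=['data:image'],
              start_row='00000000', stop_row='99999999')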
def run_predict_classifier(hdfs_input, hdfs_classifier_input, hdfs_output, **kw):
    import classipy
    # NOTE: Adds necessary files
    files = glob.glob(classipy.__path__[0] + "/lib/*")
    fp = tempfile.NamedTemporaryFile(suffix='.pkl.gz')
    print('------------------------BEFORE READTB')
    file_parse.dump(list(hadoopy.readtb(hdfs_classifier_input)), fp.name)
    print('------------------------AFTER READTB [%s, %s]' % (fp.name, os.path.exists(fp.name)))
    files.append(fp.name)
    hadoopy.launch_frozen(hdfs_input, hdfs_output, 'predict_classifier.py',
                          files=files, reducer=None,
                          cmdenvs=['CLASSIFIERS_FN=%s' % os.path.basename(fp.name)],
                          dummy_arg=fp)
def starter(args, launch=True):
    """ The function that calls hadoopy.launch_frozen """
    gopts.args = args

    mat = args.get('mat', None)
    if mat is None:
        raise NameError("'mat' option not specified on the command line")

    input = mat
    matname, matext = os.path.splitext(mat)

    gopts.getintkey('blocksize', 3)
    schedule = gopts.getstrkey('reduce_schedule', '1')

    # clear the output
    output = args.get('output', '%s-normal%s' % (matname, matext))
    if hadoopy.exists(output):
        print "Removing %s" % (output)
        hadoopy.rm(output)

    outputnamefunc = lambda x: output + "_iter%i" % (x)
    steps = schedule.split(',')

    for i, step in enumerate(steps):
        if i > 0:
            input = curoutput

        if i + 1 == len(steps):
            curoutput = output
        else:
            curoutput = output + "_iter%i" % (i + 1)
            if hadoopy.exists(curoutput):
                hadoopy.rm(curoutput)

        gopts.setkey('iter', i)

        if launch:
            if i > 0:
                mapper = "org.apache.hadoop.mapred.lib.IdentityMapper"
                hadoopy.launch_frozen(input, curoutput, __file__,
                                      mapper=mapper, cmdenvs=gopts.cmdenv(),
                                      num_reducers=int(step))
            else:
                hadoopy.launch_frozen(input, curoutput, __file__,
                                      cmdenvs=gopts.cmdenv(),
                                      num_reducers=int(step))
def run_video_keyframe(hdfs_input, hdfs_output, min_resolution, max_resolution, ffmpeg, **kw):
    if not ffmpeg:
        hadoopy.launch_frozen(hdfs_input, hdfs_output, 'video_keyframe.py', reducer=None,
                              cmdenvs=['MIN_RESOLUTION=%d' % min_resolution,
                                       'MAX_RESOLUTION=%f' % max_resolution])
    else:
        fp = vidfeat.freeze_ffmpeg()
        hadoopy.launch_frozen(hdfs_input, hdfs_output, 'video_keyframe.py', reducer=None,
                              cmdenvs=['MIN_RESOLUTION=%d' % min_resolution,
                                       'MAX_RESOLUTION=%f' % max_resolution],
                              files=fp.__enter__(),
                              dummy_arg=fp)
def _run_face(self, fn, out_path, **kw):
    bfn = os.path.basename(fn)
    in_path = self.data_path + bfn
    hdfs_out_path = '%sout-%s-%f' % (self.data_path, bfn, time.time())
    if not hadoopy.exists(in_path):
        hadoopy.put(fn, in_path)
    hadoopy.launch_frozen(in_path, hdfs_out_path, 'face_finder.py',
                          files=['haarcascade_frontalface_default.xml'], **kw)
    for num, ((image_name, box), image_data) in enumerate(hadoopy.readtb(hdfs_out_path)):
        with open(out_path + 'img%.8d.png' % num, 'w') as fp:
            fp.write(image_data)
def run_classifier_labels(hdfs_input_pos, hdfs_input_neg, hdfs_output,
                          classifier_name, classifier_extra, local_labels, classifier, **kw):
    labels = {}
    try:
        labels = file_parse.load(local_labels)
    except IOError:
        pass
    hdfs_output_pos = hdfs_output + '/pos'
    hdfs_output_neg = hdfs_output + '/neg'
    hadoopy.launch_frozen(hdfs_input_pos, hdfs_output_pos, 'collect_keys.py')
    hadoopy.launch_frozen(hdfs_input_neg, hdfs_output_neg, 'collect_keys.py')
    pos_keys = sum((x[1] for x in hadoopy.readtb(hdfs_output_pos)), [])
    neg_keys = sum((x[1] for x in hadoopy.readtb(hdfs_output_neg)), [])
    labels[classifier_name] = {'labels': {'1': pos_keys, '-1': neg_keys},
                               'classifier': classifier,
                               'classifier_extra': classifier_extra}
    file_parse.dump(labels, local_labels)
def main():
    dense_path = 'exemplarbank/output/1341790878.92/pos'
    image_path = 'exemplarbank/data/sun_labelme_person/1-tr'
    image_box_fns = {}
    id_box_features = dict(hash_features(dense_path))
    print id_box_features.items()[0]
    for (image_id, box), feature in id_box_features.items():
        image_box_fns.setdefault(image_id, []).append((box, (image_id, box)))
    with open('image_box_fns.pkl', 'w') as fp:
        pickle.dump(image_box_fns, fp, -1)
    with hadoopy_helper.hdfs_temp() as hdfs_output:
        hadoopy.launch_frozen(image_path, hdfs_output, 'clip_boxes.py',
                              files=['image_box_fns.pkl'], remove_output=True,
                              cmdenvs=['TYPE=feature'])
        id_box_features2 = dict(hash_features(hdfs_output))
    with open('compare.pkl', 'w') as fp:
        pickle.dump((id_box_features, id_box_features2), fp, -1)
def run_join_predictions(hdfs_predictions_input, hdfs_input, hdfs_output, local_image_output, **kw):
    inputs = [hdfs_predictions_input]
    if isinstance(hdfs_input, list):
        inputs += hdfs_input
    else:
        inputs.append(hdfs_input)
    hadoopy.launch_frozen(inputs, hdfs_output, 'join_predictions.py')
    if local_image_output:
        for image_hash, (classifier_preds, image_data) in hadoopy.readtb(hdfs_output):
            for classifier, preds in classifier_preds.items():
                for conf, label in preds:
                    path = '%s/%s/label_%d/%8.8f-%s.jpg' % (local_image_output, classifier,
                                                            label, conf, image_hash)
                    try:
                        os.makedirs(os.path.dirname(path))
                    except OSError:
                        pass
                    with open(path, 'w') as fp:
                        fp.write(image_data)
def _run_face(fn):
    cur_time = time.time()
    hdfs_base_path = 'hadoopy-test-data/%f/' % cur_time
    print('Storing HDFS temp files and output in [%s]' % hdfs_base_path)
    in_path = hdfs_base_path + os.path.basename(fn)
    out_path = hdfs_base_path + 'out-' + os.path.basename(fn)
    cmd = 'hadoop fs -put %s %s' % (fn, in_path)
    subprocess.check_call(cmd.split())
    hadoopy.launch_frozen(in_path, out_path, 'face_finder.py',
                          files=[data_path + 'haarcascade_frontalface_default.xml'])
    local_out = 'out-%f' % cur_time
    try:
        os.makedirs(local_out)
    except OSError:
        pass
    print('Storing local output in [%s]' % local_out)
    for num, (image_name, (image_data, faces)) in enumerate(hadoopy.readtb(out_path)):
        image = np.asarray(Image.open(StringIO.StringIO(image_data)))
        for (x, y, w, h), n in faces:
            cv2.rectangle(image, (x, y), (x + w, y + h), (0, 0, 255), 3)
        cv2.imwrite('%s/img%.8d.jpg' % (local_out, num), image[:, :, ::-1].copy())
def initial_train(hdfs_input, hdfs_output):
    hadoopy.launch_frozen(hdfs_input + '0-tr', hdfs_output + 'neg', 'compute_exemplar_features.py',
                          remove_output=True)
    hadoopy.launch_frozen(hdfs_input + '1-tr', hdfs_output + 'pos', 'compute_exemplar_features.py',
                          remove_output=True)
    # Compute desired probability
    num_val = 5000
    num_neg_train = 5000
    toggle_launch()
    if 0:
        neg_samples = list(hadoopy_helper.jobs.random_sample(hdfs_output + 'neg', num_val + num_neg_train))
        neg_samples = [x[1] for x in neg_samples]
        with open('neg_feats.pkl', 'w') as fp:
            pickle.dump(np.array(neg_samples[num_val:]), fp, -1)
        with open('neg_val_feats.pkl', 'w') as fp:
            pickle.dump(np.array(neg_samples[:num_val]), fp, -1)
        del neg_samples
        gc.collect()
        pos_samples = list(hadoopy_helper.jobs.random_sample(hdfs_output + 'pos', num_val / 2))  # Twice as many neg as positive
        pos_samples = [x[1] for x in pos_samples]
        with open('pos_val_feats.pkl', 'w') as fp:
            pickle.dump(np.array(pos_samples), fp, -1)
        del pos_samples
        gc.collect()
    cmdenvs = {'NEG_FEATS': 'neg_feats.pkl',
               'POS_VAL_FEATS': 'pos_val_feats.pkl',
               'NEG_VAL_FEATS': 'neg_val_feats.pkl'}
    files = cmdenvs.values()
    cmdenvs['SAMPLE_SIZE'] = 1000
    hadoopy.launch_frozen(hdfs_output + 'pos', hdfs_output + 'exemplars-0', 'uniform_selection.py',
                          cmdenvs=cmdenvs, remove_output=True, files=files)
    exemplar_out = sorted(hadoopy.readtb(hdfs_output + 'exemplars-0'), key=lambda x: x[0])
    with open('exemplars.pkl', 'w') as fp:
        pickle.dump(exemplar_out, fp, -1)
def calibrate(hdfs_input, hdfs_output):
    # Predict on pos/neg sets
    hadoopy.launch_frozen(hdfs_input + '1-v', hdfs_output + 'val_pos', 'image_predict.py',
                          cmdenvs=['EXEMPLARS=exemplars.pkl', 'CELL_SKIP=16'],
                          remove_output=True, num_reducers=10, files=['exemplars.pkl'])
    hadoopy.launch_frozen(hdfs_input + '0-v', hdfs_output + 'val_neg', 'image_predict.py',
                          cmdenvs=['EXEMPLARS=exemplars.pkl', 'CELL_SKIP=1'],
                          remove_output=True, num_reducers=10, files=['exemplars.pkl'])
    # Calibrate threshold using pos/neg validation set #1
    hadoopy.launch_frozen([hdfs_output + 'val_neg', hdfs_output + 'val_pos', hdfs_output + 'exemplars-1'],
                          hdfs_output + 'exemplars-2', 'calibrate_thresholds.py',
                          num_reducers=50, remove_output=True)
    exemplar_out = sorted(hadoopy.readtb(hdfs_output + 'exemplars-2'), key=lambda x: x[0])
    with open('exemplars.pkl', 'w') as fp:
        pickle.dump(exemplar_out, fp, -1)
def run_kmeans(hdfs_input, hdfs_prev_clusters, hdfs_image_data, hdfs_output,
               num_clusters, num_iters, num_samples, metric, local_json_output=None, **kw):
    frozen_tar_path = None
    for cur_iter_num in range(num_iters):
        clusters_fp = fetch_clusters_from_hdfs(hdfs_prev_clusters)
        clusters_fn = os.path.basename(clusters_fp.name)
        cur_output = '%s/clust%.6d' % (hdfs_output, cur_iter_num)
        frozen_tar_path = hadoopy.launch_frozen(hdfs_input, cur_output, 'kmeans.py',
                                                cmdenvs=['CLUSTERS_FN=%s' % clusters_fn],
                                                files=[clusters_fp.name],
                                                num_reducers=max(1, num_clusters / 2),
                                                frozen_tar_path=frozen_tar_path,
                                                dummy_arg=clusters_fp)['frozen_tar_path']
        hdfs_prev_clusters = cur_output
    print('Clusters[%s]' % hdfs_prev_clusters)
    # Compute K-Means assignment/samples
    # TODO: Do full assignment, then sample
    clusters_fp = fetch_clusters_from_hdfs(hdfs_prev_clusters)
    clusters_fn = os.path.basename(clusters_fp.name)
    cur_output = '%s/assign' % hdfs_output
    hadoopy.launch_frozen(hdfs_input, cur_output, 'kmeans_assign.py',
                          cmdenvs=['CLUSTERS_FN=%s' % clusters_fn,
                                   'NUM_SAMPLES=%d' % num_samples,
                                   'mapred.text.key.partitioner.options=-k1'],
                          files=[clusters_fp.name],
                          num_reducers=max(1, num_clusters / 2),
                          partitioner='org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner',
                          dummy_arg=clusters_fp)
    print('Assignment[%s]' % cur_output)
    # Filter the samples
    assignments_fp = fetch_assignments_from_hdfs(cur_output)
    assignments_fn = os.path.basename(assignments_fp.name)
    cur_output = '%s/samples' % hdfs_output
    hadoopy.launch_frozen(hdfs_image_data, cur_output, 'filter_samples.py',
                          cmdenvs=['ASSIGNMENTS_FN=%s' % os.path.basename(assignments_fn)],
                          files=[assignments_fp.name],
                          reducer=None,
                          dummy_arg=assignments_fp)
    print('Samples[%s]' % cur_output)
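# A hedged sketch of the fetch_clusters_from_hdfs helper used above (its
# real implementation is not shown): read the current cluster centers off
# HDFS into a local temp file so launch_frozen can ship them via files=,
# returning the open file object so the caller can keep it alive (see the
# dummy_arg pattern above).
import pickle
import tempfile

import hadoopy

def fetch_clusters_from_hdfs(hdfs_path):
    clusters = [v for _, v in hadoopy.readtb(hdfs_path)]
    fp = tempfile.NamedTemporaryFile(suffix='.pkl')
    pickle.dump(clusters, fp, -1)
    fp.flush()
    return fp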
def run_face_ranker(hdfs_input, hdfs_output, feature_pkl, exemplar_fn):
    """Runs the face_ranker.py hadoopy script.

    The output consists of the distance of each image to an exemplar as key,
    and the input tuple of (key, imagedata) as value.

    Inputs:
    - hdfs_input: path to hdfs input: (key, imagedata) pairs
    - hdfs_output: path to the hdfs output tuples: (dist, (key, imagedata))
      where dist is the distance in Eigenfaces feature space to the exemplar image
    - feature_pkl: pickle file containing a trained Eigenfaces feature
    - exemplar_fn: filename of the exemplar image
    """
    fp = tempfile.NamedTemporaryFile()
    _compute_exemplar_feature(exemplar_fn, feature_pkl, fp)
    fp.flush()
    hadoopy.launch_frozen(hdfs_input, hdfs_output, 'face_ranker.py',
                          cmdenvs=['EXEMPLAR_FN=%s' % os.path.basename(fp.name),
                                   'FEATURE_FN=%s' % os.path.basename(feature_pkl)],
                          files=[feature_pkl, fp.name])
def main(input_path, output_path, num_clusters, cluster_path, num_reducers):
    def inc_path():
        global iter_cnt
        iter_cnt += 1
        return '%s/%d' % (output_path, iter_cnt)

    def prev_path():
        return '%s/%d' % (output_path, iter_cnt)

    consolidate_clusters(cluster_path, 'clusters.pkl')
    if 1:
        hadoopy.launch_frozen(in_name=input_path,
                              out_name=inc_path(),
                              script_path='kmeans_cluster_single.py',
                              reducer=None,
                              cmdenvs=['CLUSTERS_PKL=%s' % ('clusters.pkl'),
                                       'NN_MODULE=nn_l2sqr_c'],
                              #combiner=True,
                              files=['nn_l2sqr_c.py', 'clusters.pkl'],
                              shared_libs=SHARED_LIBS,
                              modules=['vitrieve_algorithms', 'nn_l2sqr_c'],
                              remove_dir=True,
                              jobconfs=['mapred.min.split.size=999999999999',
                                        'mapred.reduce.tasks=%d' % (num_reducers)])
def flickr_images(tags, images_per_tag, hdfs_output, num_files=20, max_iters=1, max_pages=1,
                  output_meta=False, api_key=None, api_secret=None, remove_output=False):
    tags = list(tags)
    if api_key is None or api_secret is None:
        api_key = os.environ['FLICKR_API_KEY']
        api_secret = os.environ['FLICKR_API_SECRET']
    tags_per_chunk = max(len(tags) / num_files, 1)
    if remove_output and hadoopy.exists(hdfs_output):
        print('Removing output dir[%s]' % hdfs_output)
        hadoopy.rmr(hdfs_output)
    cmdenvs = {'FLICKR_API_KEY': api_key,
               'FLICKR_API_SECRET': api_secret,
               'MAX_ITERS': str(max_iters),
               'MAX_PAGES': str(max_pages)}
    for chunk_num, chunk_tags in enumerate(_chunks(tags, tags_per_chunk)):
        hadoopy.writetb(hdfs_output + '/tags/%d' % chunk_num, [(images_per_tag, tag) for tag in chunk_tags])
    hadoopy.launch_frozen(hdfs_output + '/tags', hdfs_output + '/metadata', _lf('flickr_bulk.py'),
                          cmdenvs=cmdenvs, num_reducers=num_files)
    output_type = 'meta' if output_meta else 'image'
    hadoopy.launch_frozen(hdfs_output + '/metadata', hdfs_output + '/image_metadata',
                          _lf('file_downloader.py'), cmdenvs={'OUTPUT_TYPE': output_type})
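# A minimal sketch of the _chunks helper assumed above (not shown in the
# original): yield consecutive slices of at most chunk_size tags.
def _chunks(seq, chunk_size):
    for i in range(0, len(seq), chunk_size):
        yield seq[i:i + chunk_size]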
def compute_database(flickr_data):
    r = 'image_search/%f/' % time.time()
    f_path = r + 'features/'
    m_path = r + 'median/'
    h_path = r + 'hashes/'
    j_path = r + 'hash_metadata/'
    hadoopy.launch_frozen(flickr_data, f_path, 'build_features.py')
    hadoopy.launch_frozen(f_path, m_path, 'calc_median_feature.py')
    median = np.array([x for _, x in sorted(hadoopy.readtb(m_path))])
    pickle.dump(median, open('median.pkl', 'w'), -1)
    hadoopy.launch_frozen(f_path, h_path, 'compute_hashes.py', files=['median.pkl'])
    hadoopy.launch_frozen([h_path, flickr_data], j_path, 'join.py', num_reducers=10)
    hashes, metadatas = zip(*[x[1] for x in hadoopy.readtb(j_path)])
    hashes = np.array([x.ravel() for x in hashes])
    with open('database.pkl', 'w') as fp:
        pickle.dump((hashes, metadatas, median), fp, -1)
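# A hedged sketch of what the compute_hashes.py mapper above could look
# like, assuming the hash thresholds each feature dimension against the
# per-dimension median shipped via files=['median.pkl']; this is a guess
# at the scheme, not the original script.
import pickle

import hadoopy
import numpy as np

_median = pickle.load(open('median.pkl'))

def mapper(image_id, feature):
    # One bit per dimension: 1 iff the feature value exceeds the median
    yield image_id, (np.asarray(feature) > _median).astype(np.uint8)

if __name__ == '__main__':
    hadoopy.run(mapper)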
def exemplar_boxes(hdfs_input, hdfs_output):
    exemplar_name = 'ad813d130f4803e948124823a67cdd7b-[0.0, 0.16326530612244897, 0.3448275862068966, 0.5714285714285714]'
    st = time.time()
    exemplar_out = hadoopy.abspath(hdfs_output + 'exemplar_boxes/%s' % st) + '/'
    for kv in hadoopy.readtb(hdfs_output + 'exemplars-2'):
        (image_id, box, score), _ = kv
        if exemplar_name == '%s-%s' % (image_id, box):
            print('Found it')
            with open('exemplars-patch.pkl', 'w') as fp:
                pickle.dump([kv], fp, -1)
    hadoopy.launch_frozen(hdfs_input + '1-v', exemplar_out + 'val_pos', 'hard_predictions.py',
                          cmdenvs=['EXEMPLARS=exemplars-patch.pkl', 'MAX_HARD=100',
                                   'OUTPUT_FORMAT=score_image_box'],
                          files=['exemplars-patch.pkl'], num_reducers=10)
    hadoopy.launch_frozen(hdfs_input + '0-v', exemplar_out + 'val_neg', 'hard_predictions.py',
                          cmdenvs=['EXEMPLARS=exemplars-patch.pkl', 'MAX_HARD=100',
                                   'OUTPUT_FORMAT=score_image_box'],
                          files=['exemplars-patch.pkl'], num_reducers=10)
    with open('image_box_fns.pkl', 'w') as fp:
        image_box_fns = {}
        pos_boxes = [(score, image_id, box, 1)
                     for score, image_id, box in sorted(hadoopy.readtb(exemplar_out + 'val_pos').next()[1])]
        neg_boxes = [(score, image_id, box, 0)
                     for score, image_id, box in sorted(hadoopy.readtb(exemplar_out + 'val_neg').next()[1])]
        for num, (score, image_id, box, pol) in enumerate(sorted(pos_boxes + neg_boxes, reverse=True)):
            image_box_fns.setdefault(image_id, []).append((box, 'exemplar-%.5d-%d-%f.png' % (num, pol, score)))
        pickle.dump(image_box_fns, fp, -1)
    hadoopy.launch_frozen([hdfs_input + '1-v', hdfs_input + '0-v'], exemplar_out + 'boxes_cropped',
                          'clip_boxes.py', files=['image_box_fns.pkl'], remove_output=True,
                          cmdenvs={'TYPE': 'image'})
    out_dir = 'exemplars_similar_cropped/'
    try:
        shutil.rmtree('exemplars_similar_cropped')
    except OSError:
        pass
    print('Outputting cropped')
    os.makedirs(out_dir)
    print(exemplar_out + 'boxes_cropped')
    for x, y in hadoopy.readtb(exemplar_out + 'boxes_cropped'):
        open(out_dir + x, 'w').write(y)
    hadoopy.launch_frozen([hdfs_input + '1-v', hdfs_input + '0-v'], exemplar_out + 'boxes',
                          'clip_boxes.py', files=['image_box_fns.pkl'], remove_output=True,
                          cmdenvs={'TYPE': 'box'})
    out_dir = 'exemplars_similar/'
    try:
        shutil.rmtree('exemplars_similar')
    except OSError:
        pass
    print('Outputting boxes')
    os.makedirs(out_dir)
    for x, y in hadoopy.readtb(exemplar_out + 'boxes'):
        open(out_dir + x, 'w').write(y)
def _launch_frozen(in_path, out_path, script_path, jobconfs_default=(), *args, **kw):
    import hadoopy
    import os
    kw = dict(kw)  # Make a copy as we will be mutating it
    kw['frozen_tar_path'] = _freeze_script(script_path)['frozen_tar_path']
    if 'reducer' not in kw and 'num_reducers' not in kw:
        kw['num_reducers'] = 1
    if 'jobconfs' in kw:
        kw['jobconfs'] = kw['jobconfs'] + GLOBAL_JOBCONFS
    else:
        kw['jobconfs'] = GLOBAL_JOBCONFS
    if 'jobconfs' not in kw:
        kw['jobconfs'] = []
    if jobconfs_default:
        jobconfs_dict = dict(x.split('=', 1) for x in kw['jobconfs'])
        jobconfs_default_dict = dict(x.split('=', 1) for x in jobconfs_default)
        for jobconf_name, jobconf_value in jobconfs_default_dict.items():
            if jobconf_name not in jobconfs_dict:
                jobconfs_dict[jobconf_name] = jobconf_value
        kw['jobconfs'] = ['%s=%s' % x for x in jobconfs_dict.items()]
    if 'image_hashes' in kw and kw['image_hashes'] is not None:
        import tempfile
        fp = tempfile.NamedTemporaryFile(suffix='.pkl.gz')
        file_parse.dump(kw['image_hashes'], fp.name)
        try:
            kw['files'].append(fp.name)
        except KeyError:
            kw['files'] = [fp.name]
        try:
            kw['cmdenvs'].append('PICARUS_VALID_IMAGE_HASHES=%s' % os.path.basename(fp.name))
        except KeyError:
            kw['cmdenvs'] = ['PICARUS_VALID_IMAGE_HASHES=%s' % os.path.basename(fp.name)]
        kw['_internal_dummy_arg'] = fp  # Keep the object alive
        del kw['image_hashes']
    return hadoopy.launch_frozen(in_path, out_path, script_path, *args, **kw)
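# Illustrative call of the wrapper above (paths, script, and jobconf values
# are hypothetical; _freeze_script and GLOBAL_JOBCONFS must exist in the
# module): explicit jobconfs and GLOBAL_JOBCONFS take precedence, while
# jobconfs_default only fills in keys the caller left unset.
_launch_frozen('hdfs://host/in', 'hdfs://host/out', 'job.py',
               jobconfs_default=('mapred.output.compress=true',
                                 'mapred.task.timeout=1200000'),
               jobconfs=['mapred.output.compress=false'])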
def hard_train(hdfs_input, hdfs_output):
    hadoopy.launch_frozen(hdfs_input + '0-tr', hdfs_output + 'hard_neg', 'hard_predictions.py',
                          cmdenvs=['EXEMPLARS=exemplars.pkl', 'MAX_HARD=100',
                                   'OUTPUT_FORMAT=score_image_box'],
                          num_reducers=10, files=['exemplars.pkl'], remove_output=True)

    def _inner():
        with open('image_box_fns.pkl', 'w') as fp:
            image_box_fns = {}
            for (image_id, box, score), negs in hadoopy.readtb(hdfs_output + 'hard_neg'):
                for score2, image_id2, box2 in negs:
                    image_box_fns.setdefault(image_id2, []).append((box2, [image_id, box, score]))
            pickle.dump(image_box_fns, fp, -1)
        del image_box_fns
        gc.collect()
    _inner()
    hadoopy.launch_frozen(hdfs_input + '0-tr', hdfs_output + 'hard_neg_clip', 'clip_boxes.py',
                          files=['image_box_fns.pkl'], remove_output=True,
                          cmdenvs=['TYPE=feature'])
    hadoopy.launch_frozen([hdfs_output + 'pos_sample', hdfs_output + 'hard_neg_clip'],
                          hdfs_output + 'exemplars-1', 'train_exemplars_hard.py',
                          cmdenvs=['NEG_FEATS=neg_feats.pkl', 'MAX_HARD=200'],
                          files=['neg_feats.pkl'], remove_output=True, num_reducers=10)
    exemplar_out = sorted(hadoopy.readtb(hdfs_output + 'exemplars-1'), key=lambda x: x[0])
    with open('exemplars.pkl', 'w') as fp:
        pickle.dump(exemplar_out, fp, -1)
import hadoopy
import random

for i in range(5):
    prefix = str(random.random())
    print(prefix)
    hadoopy.launch_frozen('/tmp/bwhite/input/pets2006.video_frame_data.tb',
                          '/tmp/bwhite/output/pets2006.video_frame_data.b/' + prefix,
                          'bgsub.py',
                          partitioner='org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner',
                          jobconfs=['mapred.text.key.partitioner.options=-k1,1',
                                    #'mapred.reduce.tasks=500',
                                    'mapred.min.split.size=999999999999',
                                    'mapred.reduce.tasks=1',
                                    'mapred.output.compress=true',
                                    'mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec'],
                          shared_libs=['libbgsub_fast.so'],
                          frozen_path='frozen')
import hadoopy
import time

# Setup paths
data_path = 'hadoopy-test-data/%f/' % time.time()
input_path = data_path + 'wc-input'
output_path = data_path + 'wc-output'

# Write data to HDFS in the form of (term #, term)
input_data = enumerate('Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industrys standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.'.split())
hadoopy.writetb(input_path, input_data)

# Launch the job
hadoopy.launch_frozen(input_path, output_path, 'wc.py')

# Read the output and spot-check a few word counts
word_counts = dict(hadoopy.readtb(output_path))
for probe_word, expected_count in [('the', 6), ('Lorem', 4), ('of', 4)]:
    print('word_counts[%s] = %d' % (probe_word, word_counts[probe_word]))
    assert expected_count == word_counts[probe_word]
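# A minimal sketch of a hadoopy word-count script like the wc.py launched
# above (assumed for illustration; the actual wc.py is not shown here).
import hadoopy

def mapper(key, value):
    for word in value.split():
        yield word, 1

def reducer(word, counts):
    yield word, sum(counts)

if __name__ == '__main__':
    hadoopy.run(mapper, reducer, reducer)  # the reducer doubles as a combiner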
def test_cluster_info(self):
    hadoopy.writetb(self.data_path + 'cluster_info_input', [(0, 0)])
    hadoopy.launch_frozen(self.data_path + 'cluster_info_input',
                          self.data_path + 'cluster_info', 'cluster_info.py')
    pprint.pprint(dict(hadoopy.readtb(self.data_path + 'cluster_info')))
import hadoopy
import time

# Setup paths
data_path = 'hadoopy-test-data/%f/' % time.time()
input_path = data_path + 'input'
output_path_a = data_path + 'output_a'
output_path_b = data_path + 'output_b'
output_path_c = data_path + 'output_c'
output_path_d = data_path + 'output_d'

# Write diverse KV data to HDFS
input_data = [(1, 5), ('dsfs', {'a': 3}), ([1, 2], 'sdflk')]
hadoopy.writetb(input_path, input_data)

# Launch the jobs
hadoopy.launch_frozen(input_path, output_path_a, 'identity.py')
hadoopy.launch_frozen(input_path, output_path_b, 'identity.py')
hadoopy.launch_frozen(output_path_b, output_path_c, 'identity.py')
hadoopy.launch_frozen([input_path, output_path_a, output_path_b, output_path_c],
                      output_path_d, 'identity.py')

# Read the first KV pair of each path
print('KV Input[%s]' % str(hadoopy.readtb(input_path).next()))
print('KV Output a[%s]' % str(hadoopy.readtb(output_path_a).next()))
print('KV Output b[%s]' % str(hadoopy.readtb(output_path_b).next()))
print('KV Output c[%s]' % str(hadoopy.readtb(output_path_c).next()))
print('KV Output d[%s]' % str(hadoopy.readtb(output_path_d).next()))
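# A minimal sketch of the identity.py job chained above (assumed): pass
# every key/value pair through unchanged, with no reducer.
import hadoopy

def mapper(key, value):
    yield key, value

if __name__ == '__main__':
    hadoopy.run(mapper)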
from hadoopy import launch_frozen

input_path = 'hdfs://laserson-1.ent.cloudera.com/ngrams'
output_path = 'hdfs://laserson-1.ent.cloudera.com/output-hadoopy-frozen'

launch_frozen(input_path, output_path, 'ngrams.py',
              use_seqoutput=False,
              num_reducers=10,
              hstreaming='/usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.1.2.jar')
# This fragment assumes an HDFS run prefix r defined earlier in the original script.
#hadoopy.launch_local(r + 'out/flickr_metadata', r + 'out/flickr_images', 'file_downloader.py',
#                     worker_queue_maxsize=10)  # , max_input=100
print('Downloaded images')
import glob
import os
import shutil

import cv2
import hadoopy

for fn in glob.glob('*.JPG'):
    img = cv2.imread(fn)
    img = cv2.resize(img, (int(img.shape[1] / 2.5), int(img.shape[0] / 2.5)))
    try:
        os.remove('target.jpg')
    except OSError:
        pass
    cv2.imwrite('target.jpg', img)
    hadoopy.launch_frozen(['flickr_data_picarus/run-1343747418.029870/out/down',
                           'flickr_data_picarus/run-1343712226.822338/out/flickr_images'],
                          r + 'tiles', 'picnic_job.py', files=['target.jpg'],
                          remove_output=True)
    base = os.path.basename(fn) + '_tiles/'
    try:
        os.makedirs(base)
    except OSError:
        pass
    for k, v in hadoopy.readtb(r + 'tiles'):
        with open(base + k, 'w') as fp:
            fp.write(v)