import glob
import json
import os
import pickle
import tempfile
import zlib

import hadoopy
import imfeat
import viderator

import picarus
# NOTE: _lf, file_parse, fetch_clusters_from_hdfs, and
# fetch_assignments_from_hdfs are helpers defined elsewhere in this package.


def run_compute_kernels(hdfs_input, hdfs_output, local_labels_x, local_labels_y,
                        cols_per_chunk=1000, rows_per_chunk=100000, **kw):
    if local_labels_y is None or local_labels_x is None:
        raise ValueError('local_labels_* must not be None!')
    cmdenvs = ['LOCAL_LABELS_FN_Y=%s' % os.path.basename(local_labels_y),
               'ROWS_PER_CHUNK=%d' % rows_per_chunk,
               'COLS_PER_CHUNK=%d' % cols_per_chunk]
    files = [local_labels_y]
    cmdenvs.append('LOCAL_LABELS_FN_X=%s' % os.path.basename(local_labels_x))
    files.append(local_labels_x)
    picarus._launch_frozen(hdfs_input, hdfs_output, _lf('compute_kernels.py'),
                           cmdenvs=cmdenvs,
                           partitioner='org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner',
                           jobconfs_default=['mapred.task.timeout=6000000',
                                             'mapred.text.key.partitioner.options=-k1,2'],
                           files=files, **kw)

def run_classifier_labels(hdfs_input_pos, hdfs_input_neg, hdfs_output,
                          classifier_name, classifier_extra, local_labels,
                          classifier, **kw):
    """Collect image keys from positive/negative inputs into a local labels file.

    Keys are gathered with collect_keys.py (locally or as a frozen Hadoop job)
    and stored under classifier_name with polarities '1' (positive) and '-1'
    (negative), along with the classifier and its extra parameters.

    Args:
        hdfs_input_pos: HDFS path of positive examples.
        hdfs_input_neg: HDFS path of negative examples.
        hdfs_output: Path to HDFS temporary output, or None if execution
            should be performed locally using hadoopy.launch_local.
        classifier_name: Key under which the labels are stored.
        classifier_extra: Extra classifier parameters (stored verbatim).
        local_labels: Local labels file path (loaded if present, then rewritten).
        classifier: Classifier identifier (stored verbatim).
    """
    labels = {}
    try:
        labels = file_parse.load(local_labels)
    except IOError:
        pass
    if hdfs_output is None:
        j = hadoopy.launch_local(hdfs_input_pos, None, _lf('collect_keys.py'))
        pos_keys = sum((x[1] for x in j['output']), [])
        j = hadoopy.launch_local(hdfs_input_neg, None, _lf('collect_keys.py'))
        neg_keys = sum((x[1] for x in j['output']), [])
    else:
        hdfs_output_pos = hdfs_output + '/pos'
        hdfs_output_neg = hdfs_output + '/neg'
        picarus._launch_frozen(hdfs_input_pos, hdfs_output_pos, _lf('collect_keys.py'))
        picarus._launch_frozen(hdfs_input_neg, hdfs_output_neg, _lf('collect_keys.py'))
        pos_keys = sum((x[1] for x in hadoopy.readtb(hdfs_output_pos)), [])
        neg_keys = sum((x[1] for x in hadoopy.readtb(hdfs_output_neg)), [])
    labels[classifier_name] = {'labels': {'1': pos_keys, '-1': neg_keys},
                               'classifier': classifier,
                               'classifier_extra': classifier_extra}
    file_parse.dump(labels, local_labels)

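# Example (sketch; the paths, classifier name, and file names below are
# hypothetical, not values from this codebase): record positive/negative image
# keys for a 'face' classifier in a local labels file.
#
#   run_classifier_labels('/data/faces_pos', '/data/faces_neg', '/tmp/face_keys',
#                         'face', {}, 'labels.js', 'svmlinear')
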
def run_image_clean(hdfs_input, hdfs_output, max_side=None, filter_side=None, **kw):
    cmdenvs = {}
    if max_side is not None:
        cmdenvs['MAX_SIDE'] = max_side
    if filter_side is not None:
        cmdenvs['FILTER_SIDE'] = filter_side
    picarus._launch_frozen(hdfs_input, hdfs_output, _lf('image_clean.py'),
                           cmdenvs=cmdenvs, **kw)

def make_thumbnails(hdfs_input, hdfs_output, thumb_size, image_type, **kw):
    picarus._launch_frozen(hdfs_input, hdfs_output, _lf('make_thumbnails.py'),
                           cmdenvs=['THUMB_SIZE=%d' % thumb_size,
                                    'IMAGE_TYPE=%s' % image_type], **kw)

def run_face_finder(hdfs_input, hdfs_output, image_length, boxes, image_hashes=None, **kw):
    cmdenvs = ['IMAGE_LENGTH=%d' % image_length]
    if boxes:
        cmdenvs.append('OUTPUT_BOXES=True')
    picarus._launch_frozen(hdfs_input, hdfs_output, _lf('face_finder.py'),
                           reducer=None, cmdenvs=cmdenvs,
                           files=[_lf('data/haarcascade_frontalface_default.xml')],
                           image_hashes=image_hashes, **kw)

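# Example (sketch, hypothetical paths): run the OpenCV frontal-face cascade
# over an image collection, emitting detection boxes with each hit.
#
#   run_face_finder('/data/images', '/data/faces', image_length=512, boxes=True)
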
def run_video_features(hdfs_input, hdfs_output, **kw):
    # fp is entered manually and passed via dummy_arg so a reference (and the
    # frozen ffmpeg files) stays alive until the job is launched.
    fp = viderator.freeze_ffmpeg()
    picarus._launch_frozen(hdfs_input, hdfs_output + '/features',
                           _lf('video_combined_features.py'), cmdenvs=[],
                           jobconfs=['mapred.child.java.opts=-Xmx512M',
                                     'mapred.task.timeout=12000000',
                                     'mapred.map.max.attempts=10'],
                           files=[fp.__enter__(),
                                  _lf('data/haarcascade_frontalface_default.xml')],
                           dummy_arg=fp, **kw)

def run_train_classifier(hdfs_input, hdfs_output, local_labels, **kw):
    import classipy  # NOTE: Adds necessary files
    files = glob.glob(classipy.__path__[0] + "/lib/*")
    files.append(local_labels)
    picarus._launch_frozen(hdfs_input, hdfs_output, _lf('train_classifier.py'),
                           files=files,
                           cmdenvs=['LOCAL_LABELS_FN=%s' % os.path.basename(local_labels)],
                           jobconfs_default=['mapred.task.timeout=6000000'], **kw)

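# Example (sketch, hypothetical paths): train one classifier per entry in the
# label file produced by run_classifier_labels.
#
#   run_train_classifier('/data/features', '/data/classifiers', 'labels.js')
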
def run_image_feature_point(hdfs_input, hdfs_output, feature, image_length=None,
                            image_height=None, image_width=None, **kw):
    if image_length:
        image_height = image_width = image_length
    if image_height is None or image_width is None:
        raise ValueError('Please specify image_height/image_width or image_length')
    picarus._launch_frozen(hdfs_input, hdfs_output, _lf('feature_point_compute.py'),
                           cmdenvs=['IMAGE_HEIGHT=%d' % image_height,
                                    'IMAGE_WIDTH=%d' % image_width,
                                    'FEATURE=%s' % feature],
                           files=([_lf('data/eigenfaces_lfw_cropped.pkl')] +
                                  glob.glob(imfeat.__path__[0] + "/_object_bank/data/*")),
                           **kw)

def run_predict_classifier(hdfs_input, hdfs_classifier_input, hdfs_output,
                           classes=None, image_hashes=None, **kw):
    import classipy  # NOTE: Adds necessary files
    files = glob.glob(classipy.__path__[0] + "/lib/*")
    fp = tempfile.NamedTemporaryFile(suffix='.pkl.gz')
    file_parse.dump([x for x in hadoopy.readtb(hdfs_classifier_input)
                     if classes is None or x[0] in classes], fp.name)
    files.append(fp.name)
    picarus._launch_frozen(hdfs_input, hdfs_output, _lf('predict_classifier.py'),
                           files=files, reducer=None,
                           cmdenvs=['CLASSIFIERS_FN=%s' % os.path.basename(fp.name)],
                           image_hashes=image_hashes, dummy_arg=fp, **kw)

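# Example (sketch, hypothetical paths): score images against the trained
# classifiers, keeping only the 'face' class.
#
#   run_predict_classifier('/data/features', '/data/classifiers',
#                          '/data/predictions', classes=['face'])
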
def run_video_keyframe(hdfs_input, hdfs_output, frame_skip=1, min_interval=5,
                       max_interval=float('inf'), max_time=float('inf'),
                       keyframer='uniform', **kw):
    fp = viderator.freeze_ffmpeg()
    picarus._launch_frozen(hdfs_input, hdfs_output + '/keyframe', _lf('video_keyframe.py'),
                           cmdenvs=['MIN_INTERVAL=%f' % min_interval,
                                    'MAX_INTERVAL=%f' % max_interval,
                                    'FRAME_SKIP=%d' % frame_skip,
                                    'KEYFRAMER=%s' % keyframer,
                                    'MAX_TIME=%f' % max_time],
                           jobconfs=['mapred.child.java.opts=-Xmx768M',
                                     'mapred.task.timeout=12000000',
                                     'mapred.map.max.attempts=10'],
                           files=[fp.__enter__()], dummy_arg=fp, **kw)

def run_predict_windows(hdfs_input, hdfs_classifier_input, feature, hdfs_output,
                        image_height, image_width, **kw):
    import classipy  # NOTE: Adds necessary files
    files = glob.glob(classipy.__path__[0] + "/lib/*")
    fp = tempfile.NamedTemporaryFile(suffix='.pkl.gz')
    file_parse.dump(list(hadoopy.readtb(hdfs_classifier_input)), fp.name)
    files.append(fp.name)
    files.append(_lf('data/haarcascade_frontalface_default.xml'))
    cmdenvs = ['CLASSIFIERS_FN=%s' % os.path.basename(fp.name),
               'IMAGE_HEIGHT=%d' % image_height,
               'IMAGE_WIDTH=%d' % image_width,
               'FEATURE=%s' % feature]
    picarus._launch_frozen(hdfs_input, hdfs_output, _lf('predict_windows.py'),
                           cmdenvs=cmdenvs, files=files, dummy_arg=fp, **kw)

def run_join_predictions(hdfs_predictions_input, hdfs_input, hdfs_output,
                         local_image_output, **kw):
    inputs = [hdfs_predictions_input]
    if isinstance(hdfs_input, list):
        inputs += hdfs_input
    else:
        inputs.append(hdfs_input)
    picarus._launch_frozen(inputs, hdfs_output, _lf('join_predictions.py'), **kw)
    if local_image_output:
        for image_hash, (classifier_preds, image_data) in hadoopy.readtb(hdfs_output):
            for classifier, preds in classifier_preds.items():
                for conf, label in preds:
                    path = '%s/%s/label_%d/%8.8f-%s.jpg' % (local_image_output, classifier,
                                                            label, conf, image_hash)
                    try:
                        os.makedirs(os.path.dirname(path))
                    except OSError:
                        pass  # Directory already exists
                    with open(path, 'wb') as fp:  # Binary mode: image_data is raw JPEG bytes
                        fp.write(image_data)

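# Example (sketch, hypothetical paths): join predictions back onto the image
# data and also dump the labeled images locally for inspection.
#
#   run_join_predictions('/data/predictions', '/data/images',
#                        '/data/joined', 'out/images')
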
def run_image_feature(hdfs_input, hdfs_output, feature, files=(), **kw):
    files = list(files)
    if isinstance(feature, dict):
        # Dict features are serialized as zlib-compressed JSON
        feature = zlib.compress(json.dumps(feature), 9)
    feature_fp = tempfile.NamedTemporaryFile()
    feature_fp.write(feature)
    feature_fp.flush()
    # This allows for replacing the default models
    cur_files = set([os.path.basename(x) for x in files])
    for x in ([_lf('data/hog_8_2_clusters.pkl'), _lf('data/eigenfaces_lfw_cropped.pkl')] +
              glob.glob(imfeat.__path__[0] + "/_object_bank/data/*")):
        if os.path.basename(x) not in cur_files:
            files.append(x)
            cur_files.add(os.path.basename(x))
    files.append(feature_fp.name)
    picarus._launch_frozen(hdfs_input, hdfs_output, _lf('feature_compute.py'),
                           cmdenvs=['FEATURE=%s' % os.path.basename(feature_fp.name)],
                           files=files, dummy_arg=feature_fp, **kw)

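# Example (sketch; the paths and feature-dict keys below are hypothetical):
# a feature given as a dict is JSON-encoded and compressed before being
# shipped to the job.
#
#   run_image_feature('/data/images', '/data/features',
#                     {'name': 'hog', 'cell_size': 8})
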
def run_video_grep_frames(hdfs_input, hdfs_output, feature, max_frames_per_video=None,
                          max_outputs_per_video=None, output_frame=True, **kw):
    fp = viderator.freeze_ffmpeg()
    feature_fp = tempfile.NamedTemporaryFile(suffix='.pkl')
    pickle.dump(feature, feature_fp, -1)
    feature_fp.flush()
    cmdenvs = ['FEATURE_FN=%s' % os.path.basename(feature_fp.name)]
    if max_frames_per_video is not None:
        cmdenvs.append('MAX_FRAMES_PER_VIDEO=%d' % max_frames_per_video)
    if max_outputs_per_video is not None:
        cmdenvs.append('MAX_OUTPUTS_PER_VIDEO=%d' % max_outputs_per_video)
    cmdenvs.append('OUTPUT_FRAME=%d' % int(output_frame))
    picarus._launch_frozen(hdfs_input, hdfs_output, _lf('video_grep_frames.py'),
                           cmdenvs=cmdenvs,
                           jobconfs=['mapred.child.java.opts=-Xmx512M',
                                     'mapred.task.timeout=12000000',
                                     'mapred.map.max.attempts=10'],
                           files=[fp.__enter__(), feature_fp.name],
                           dummy_arg=(fp, feature_fp), **kw)

def run_video_predicate_frames(hdfs_input, hdfs_output, features,
                               max_frames_per_video=None, **kw):
    fp = viderator.freeze_ffmpeg()
    features_fp = tempfile.NamedTemporaryFile(suffix='.pkl')
    pickle.dump(features, features_fp, -1)
    features_fp.flush()
    cmdenvs = ['FEATURES_FN=%s' % os.path.basename(features_fp.name)]
    if max_frames_per_video is not None:
        cmdenvs.append('MAX_FRAMES_PER_VIDEO=%d' % max_frames_per_video)
    picarus._launch_frozen(hdfs_input, hdfs_output + '/predicate_frames',
                           _lf('video_predicate_frames.py'), cmdenvs=cmdenvs,
                           jobconfs=['mapred.child.java.opts=-Xmx768M',
                                     'mapred.skip.attempts.to.start.skipping=2',
                                     'mapred.skip.map.max.skip.records=1',
                                     'mapred.skip.mode.enabled=true',
                                     'mapred.skip.reduce.auto.incr.proc.count=false',
                                     'mapred.skip.map.auto.incr.proc.count=false',
                                     'mapred.task.timeout=12000000',
                                     'mapred.map.max.attempts=10'],
                           files=[fp.__enter__(), features_fp.name],
                           dummy_arg=(fp, features_fp), **kw)

def run_kmeans(hdfs_input, hdfs_prev_clusters, hdfs_image_data, hdfs_output,
               num_clusters, num_iters, num_samples, metric='l2sqr',
               local_json_output=None, image_hashes=None, **kw):
    # Iteratively refine the clusters
    for cur_iter_num in range(num_iters):
        clusters_fp = fetch_clusters_from_hdfs(hdfs_prev_clusters)
        clusters_fn = os.path.basename(clusters_fp.name)
        cur_output = '%s/clust%.6d' % (hdfs_output, cur_iter_num)
        picarus._launch_frozen(hdfs_input, cur_output, _lf('kmeans.py'),
                               cmdenvs=['CLUSTERS_FN=%s' % clusters_fn],
                               files=[clusters_fp.name],
                               num_reducers=max(1, num_clusters // 2),
                               dummy_arg=clusters_fp)
        hdfs_prev_clusters = cur_output
        print('Clusters[%s]' % hdfs_prev_clusters)
    # Compute K-Means assignment/samples
    clusters_fp = fetch_clusters_from_hdfs(hdfs_prev_clusters)
    clusters_fn = os.path.basename(clusters_fp.name)
    cur_output = '%s/partition' % hdfs_output
    picarus._launch_frozen([hdfs_input, hdfs_image_data], cur_output,
                           _lf('kmeans_partition.py'),
                           cmdenvs=['CLUSTERS_FN=%s' % clusters_fn],
                           files=[clusters_fp.name],
                           num_reducers=max(1, num_clusters // 2),
                           image_hashes=image_hashes, dummy_arg=clusters_fp)
    cur_output = '%s/assign' % hdfs_output
    picarus._launch_frozen(hdfs_input, cur_output, _lf('kmeans_assign.py'),
                           cmdenvs=['CLUSTERS_FN=%s' % clusters_fn,
                                    'NUM_SAMPLES=%d' % num_samples],
                           jobconfs=['mapred.text.key.partitioner.options=-k1'],
                           files=[clusters_fp.name],
                           num_reducers=max(1, num_clusters // 2),
                           partitioner='org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner',
                           dummy_arg=clusters_fp)
    print('Assignment[%s]' % cur_output)
    # Filter the samples
    assignments_fp = fetch_assignments_from_hdfs(cur_output)
    assignments_fn = os.path.basename(assignments_fp.name)
    cur_output = '%s/samples' % hdfs_output
    picarus._launch_frozen(hdfs_image_data, cur_output, _lf('filter_samples.py'),
                           cmdenvs=['ASSIGNMENTS_FN=%s' % assignments_fn],
                           files=[assignments_fp.name], reducer=None,
                           dummy_arg=assignments_fp)
    print('Samples[%s]' % cur_output)

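# Example (sketch, hypothetical paths): refine initial clusters for 10
# iterations, then write partition/assign/samples outputs under /data/kmeans.
#
#   run_kmeans('/data/feature_sample', '/data/clusters_init', '/data/images',
#              '/data/kmeans', num_clusters=100, num_iters=10, num_samples=20)
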
def run_join_predictions_by_class(hdfs_input, hdfs_output, **kw):
    picarus._launch_frozen(hdfs_input, hdfs_output,
                           _lf('join_predictions_by_class.py'), **kw)

def run_assemble_kernels(hdfs_input, hdfs_output, **kw):
    picarus._launch_frozen(hdfs_input, hdfs_output, _lf('assemble_kernels.py'),
                           jobconfs_default=['mapred.task.timeout=6000000'], **kw)

def run_whiten(hdfs_input, hdfs_output, image_hashes=None, **kw):
    picarus._launch_frozen(hdfs_input, hdfs_output, _lf('whiten.py'),
                           image_hashes=image_hashes, **kw)

def run_multiple_kernel_combine(hdfs_input, hdfs_output, **kw):
    picarus._launch_frozen(hdfs_input, hdfs_output, _lf('multiple_kernel_combine.py'),
                           jobconfs_default=['mapred.task.timeout=6000000'], **kw)

def run_sample(hdfs_input, hdfs_output, num_clusters, **kw):
    picarus._launch_frozen(hdfs_input, hdfs_output, _lf('random_sample.py'),
                           cmdenvs=['SAMPLE_SIZE=%d' % num_clusters], **kw)