Example #1
def run_face_finder(hdfs_input, hdfs_output, image_length, boxes, **kw):
    cmdenvs = ['IMAGE_LENGTH=%d' % image_length]
    if boxes:
        cmdenvs.append('OUTPUT_BOXES=True')
    hadoopy.launch_frozen(hdfs_input, hdfs_output, 'face_finder.py', reducer=False,
                          cmdenvs=cmdenvs,
                          files=['haarcascade_frontalface_default.xml'])
Example #2
def main():
#    in_path = '/tmp/bwhite/input/1266413011.32-003-fn-data'
#    out_path = '/tmp/bwhite/output/tp/face/run-%f' % time.time()
    in_path = '/user/brandyn/flickr/voc07/flickr_photo_conv4-1297714349.074514'
    out_path = '/user/brandyn/tp/facefinder/run-%f' % time.time()
    hadoopy.launch_frozen(in_path, out_path, 'face_finder.py', reducer=False,
                          files=['haarcascade_frontalface_default.xml'])
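The face_finder.py script launched in the examples above is not shown on this page. Below is only a hypothetical sketch of the kind of hadoopy mapper such a script could define; the class name, the OpenCV calls, and the output format are assumptions inferred from how later examples read the results back, not the actual implementation.

import hadoopy
import cv2
import numpy as np


class Mapper(object):

    def __init__(self):
        # The cascade XML reaches every task via files=[...] in launch_frozen
        self._cascade = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')

    def map(self, image_name, image_data):
        # Decode the raw image bytes to grayscale and run the detector
        image = cv2.imdecode(np.frombuffer(image_data, dtype=np.uint8), 0)
        faces = self._cascade.detectMultiScale(image)
        if len(faces):
            yield image_name, (image_data, [((int(x), int(y), int(w), int(h)), 1)
                                            for x, y, w, h in faces])


if __name__ == '__main__':
    hadoopy.run(Mapper)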
Example #3
def output_exemplars(hdfs_input,
                     hdfs_output,
                     num=2,
                     output_type='box',
                     output_path='exemplars'):
    with open('image_box_fns.pkl', 'w') as fp:
        image_box_fns = {}
        for (image_id, box,
             score), _ in hadoopy.readtb(hdfs_output + 'exemplars-%d' % num):
            image_box_fns.setdefault(image_id, []).append(
                (box, 'exemplar-%.5d-%s-%s.png' % (score, image_id, box)))
        pickle.dump(image_box_fns, fp, -1)
    hadoopy.launch_frozen(hdfs_input + '1-tr',
                          hdfs_output + 'exemplars-%d-clip' % num,
                          'clip_boxes.py',
                          files=['image_box_fns.pkl'],
                          remove_output=True,
                          cmdenvs=['TYPE=%s' % output_type])
    try:
        shutil.rmtree(output_path)
    except OSError:
        pass
    os.makedirs(output_path)
    for x, y in hadoopy.readtb(hdfs_output + 'exemplars-%d-clip' % num):
        open(output_path + '/%s' % (x, ), 'w').write(y)
Example #4
def random_sample(hdfs_input, m, n=None, p=.01, hdfs_temp_dir=None):
    """Return an iterator of m kv pairs selected uniformly from the input

    Finds an alpha such that X = np.sum(np.random.rand(n) < alpha) satisfies X >= m with probability (1 - p).
    If more kv pairs than needed are returned from Hadoop, the extras are ignored.  The resulting kv pairs
    are a uniform random sample of the input.

    Args:
        m: Desired number of samples (you will get this many as long as n >= m with probability (1-p))
        n: Number of total values (default None uses count_kvs to compute this)
        p: Failure probability (default .01 means there is 1 failure out of 100 runs)

    Yields:
        Sample k/v pairs
    """
    if n is None:
        n = count_kvs(hdfs_input)
    alpha = _random_sample_alpha(n, m, p=p)
    num_outputs = 0
    with hadoopy_helper.hdfs_temp(hdfs_temp_dir=hdfs_temp_dir) as hdfs_output:
        hadoopy.launch_frozen(hdfs_input, hdfs_output, _lf('random_sample.py'),
                              cmdenvs={'ALPHA': alpha})
        for kv in hadoopy.readtb(hdfs_output):
            if num_outputs >= m:
                return
            yield kv
            num_outputs += 1
    if num_outputs < m:
        logging.warn('random_sample: num_outputs[%d] < m[%d].  To prevent this, call with a smaller value of p (currently [%f]).' % (num_outputs, m, p))
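The helper _random_sample_alpha is used above but not shown. Purely to illustrate the idea in the docstring, here is a minimal sketch of what such a helper might compute, assuming a binomial model and scipy; it is not the library's implementation.

from scipy.stats import binom


def _random_sample_alpha_sketch(n, m, p=.01):
    # Find a per-item sampling probability alpha such that a Binomial(n, alpha)
    # draw X satisfies X >= m with probability at least (1 - p), i.e. P(X < m) <= p.
    assert m <= n
    alpha = float(m) / n
    while alpha < 1.0 and binom.cdf(m - 1, n, alpha) > p:
        alpha = min(1.0, alpha * 1.1)
    return alpha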
Example #5
 def compute_db_hadoop(self, hdfs_path):
     import json
     si = picarus.api.SearchIndex()
     si.name = '%s.%s' % (self.__class__.__module__, self.__class__.__name__)
     si.feature = json.dumps(self.feature_dict)  # TODO: What to do with the pkl file?
     with hadoopy_helper.hdfs_temp() as hdfs_output:
         picarus.vision.run_image_clean(hdfs_path, hdfs_output + '/clean', max_side=self.max_side)
         # Compute features (map)
         picarus.vision.run_image_feature(hdfs_output + '/clean', hdfs_output + '/feature', self.feature_dict, files=self.required_files)
         # Random sample features for hashes (map) and train hasher (reduce)
         hadoopy.launch_frozen(hdfs_output + '/feature', hdfs_output + '/hasher', _lf('train_hasher.py'), cmdenvs={'KV_PROB': 1.,
                                                                                                                   'HASH_BITS': 128})
         hasher = hadoopy.readtb(hdfs_output + '/hasher').next()[1]
         si.hash = pickle.dumps(hasher, -1)
         si.hash_format = si.PICKLE
         # Compute features hashes (map) and build database (reduce)
         open('hasher.pkl', 'w').write(si.hash)
         hadoopy.launch_frozen(hdfs_output + '/feature', hdfs_output + '/db', _lf('build_db.py'), files=['hasher.pkl'])
         metadata, hashes = hadoopy.readtb(hdfs_output + '/db').next()
         self.metadata = metadata
         si.metadata.extend(metadata.tolist())
         self.index = image_search.LinearHashDB().store_hashes(hashes, np.arange(len(metadata), dtype=np.uint64))
         si.index = pickle.dumps(self.index, -1)
         si.index_format = si.PICKLE
         open('index.pb', 'w').write(si.SerializeToString())
Example #6
def cluster(hdfs_input, hdfs_output):
    hadoopy.launch_frozen(hdfs_input + '1-v',
                          hdfs_output + 'val_pred_pos',
                          'predict_spatial_pyramid_fine.py',
                          cmdenvs=['EXEMPLARS=exemplars.pkl'],
                          remove_output=True,
                          files=['exemplars.pkl'],
                          num_reducers=1)
Example #7
def run_train_classifier(hdfs_input, hdfs_output, local_labels, **kw):
    import classipy
    # NOTE: Adds necessary files
    files = glob.glob(classipy.__path__[0] + "/lib/*")
    files.append(local_labels)
    hadoopy.launch_frozen(hdfs_input, hdfs_output, 'train_classifier.py',
                          files=files,
                          cmdenvs=['LOCAL_LABELS_FN=%s' % os.path.basename(local_labels)])
Example #8
 def _run_face(self, fn, **kw):
     in_path = self.data_path + fn
     out_path = "%sout-%s-%f" % (self.data_path, fn, time.time())
     if not hadoopy.exists(in_path):
         hadoopy.put(fn, in_path)
     hadoopy.launch_frozen(in_path, out_path, "face_finder.py", files=["haarcascade_frontalface_default.xml"], **kw)
     for num, (image_name, (image_data, faces)) in enumerate(hadoopy.readtb(out_path)):
         with open(self.out_path + "img%.8d.jpg" % num, "w") as fp:
             fp.write(image_data)
Example #9
 def _run_face(self, fn, out_path, **kw):
     in_path = self.data_path + fn
     hdfs_out_path = '%sout-%s-%f' % (self.data_path, fn, time.time())
     if not hadoopy.exists(in_path):
         hadoopy.put(fn, in_path)
     hadoopy.launch_frozen(in_path, hdfs_out_path, 'face_finder.py', files=['haarcascade_frontalface_default.xml'], **kw)
     for num, ((image_name, box), image_data) in enumerate(hadoopy.readtb(hdfs_out_path)):
         with open(out_path + 'img%.8d.png' % num, 'w') as fp:
             fp.write(image_data)
Example #10
 def _run_face(self, fn):
     in_path = self.data_path + fn
     out_path = self.data_path + 'out-' + fn
     cmd = 'hadoop fs -put %s %s' % (fn, in_path)
     subprocess.check_call(cmd.split())
     hadoopy.launch_frozen(in_path, out_path, 'face_finder.py', reducer=False, files=['haarcascade_frontalface_default.xml'])
     for num, (image_name, (image_data, faces)) in enumerate(hadoopy.readtb(out_path)):
         with open(self.out_path + 'img%.8d.jpg' % num, 'w') as fp:
             fp.write(image_data)
Example #11
def run_thresh_predictions(hdfs_predictions_input, hdfs_input, hdfs_output, class_name, class_thresh, output_class, **kw):
    inputs = [hdfs_predictions_input]
    if isinstance(hdfs_input, list):
        inputs += hdfs_input
    else:
        inputs.append(hdfs_input)
    hadoopy.launch_frozen(inputs, hdfs_output, 'thresh_predictions.py',
                          cmdenvs=['CLASSIFIER_NAME=%s' % class_name,
                                   'CLASSIFIER_THRESH=%f' % class_thresh,
                                   'OUTPUT_CLASS=%d' % output_class])
Example #12
 def _run(self, fn):
     in_path = self.data_path + fn
     out_path = self.data_path + 'out-' + fn
     cmd = 'hadoop fs -put %s %s' % (fn,  in_path)
     subprocess.check_call(cmd.split())
     # Don't let the file split, CDH3 has a bug and will try to split gz's
     hadoopy.launch_frozen(in_path, out_path, 'wc.py', jobconfs='mapred.min.split.size=100000000')
     wc = dict(hadoopy.cat(out_path))
     self.assertEqual(wc['the'], 1664)
     self.assertEqual(wc['Alice'], 221)
Example #13
def starter(args, launch=True):
    """ The function that calls hadoopy.launch_frozen """
    gopts.args = args

    mat = args.get('mat', None)
    if mat is None:
        raise NameError("'mat' option not specified on the command line")

    input = mat
    matname, matext = os.path.splitext(mat)

    gopts.getintkey('blocksize', 3)
    schedule = gopts.getstrkey('reduce_schedule', '1')

    # clear the output
    output = args.get('output', '%s-qrr%s' % (matname, matext))
    if hadoopy.exists(output):
        print "Removing %s" % (output)
        hadoopy.rm(output)

    outputnamefunc = lambda x: output + "_iter%i" % (x)
    steps = schedule.split(',')

    jobconfs = []

    # determine the split size
    if 'split_size' in args:
        splitsize = args['split_size']
        jobconfs.append('mapreduce.input.fileinputformat.split.minsize=' +
                        str(splitsize))

    for i, step in enumerate(steps):
        if i > 0:
            input = curoutput
            mapper = 'org.apache.hadoop.mapred.lib.IdentityMapper'
        else:
            mapper = True  # use the command line mapper

        if i + 1 == len(steps):
            curoutput = output
        else:
            curoutput = output + "_iter%i" % (i + 1)
            if hadoopy.exists(curoutput):
                hadoopy.rm(curoutput)

        gopts.setkey('iter', i)

        if launch:
            hadoopy.launch_frozen(input,
                                  curoutput,
                                  __file__,
                                  mapper=mapper,
                                  cmdenvs=gopts.cmdenv(),
                                  num_reducers=int(step),
                                  jobconfs=jobconfs)
Example #14
def gen_data(num_clusters, num_points, num_dims):
    hadoopy.launch_frozen(in_name='/tmp/bwhite/input/synth_clusters/dummy',
                          out_name='/tmp/bwhite/input/synth_clusters/%d-%d-%d' % (num_clusters, num_points, num_dims),
                          script_path='generate_data.py',
                          remove_dir=True,
                          cmdenvs=['NUM_CLUSTERS=%d' % (num_clusters),
                                   'NUM_POINTS=%d' % (num_points),
                                   'NUM_DIMS=%d' % (num_dims)],
                          #reducer=None,
                          jobconfs='mapred.reduce.tasks=30',
                          frozen_path='frozen')
Example #15
def _run_haystack(fn, script_name):
    cur_time = time.time()
    hdfs_base_path = "hadoopy-test-data/%f/" % cur_time
    print("Storing HDFS temp files and output in [%s]" % hdfs_base_path)
    in_path = hdfs_base_path + os.path.basename(fn)
    out_path = hdfs_base_path + "out-" + os.path.basename(fn)
    hadoopy.put(fn, in_path)
    print("Launching job [%s]" % script_name)
    hadoopy.launch_frozen(in_path, out_path, script_name, files=[data_path + "target.jpg"])
    print("Storing local output in [%s]" % local_out)
    for num, (image_name, image_data) in enumerate(hadoopy.readtb(out_path)):
        open("%s%s-img%.8d-%s.jpg" % (local_out, script_name, num, image_name), "w").write(image_data)
Example #16
def launch_frozen(in_name,
                  out_name,
                  script_path,
                  hbase_in=True,
                  hbase_out=False,
                  columns=(),
                  start_row=None,
                  stop_row=None,
                  single_value=None,
                  **kw):
    _launch_args(hbase_in, hbase_out, columns, start_row, stop_row,
                 single_value, kw)
    hadoopy.launch_frozen(in_name, out_name, script_path, **kw)
Example #17
def run_predict_classifier(hdfs_input, hdfs_classifier_input, hdfs_output, **kw):
    import classipy
    # NOTE: Adds necessary files
    files = glob.glob(classipy.__path__[0] + "/lib/*")
    fp = tempfile.NamedTemporaryFile(suffix='.pkl.gz')
    print('------------------------BEFORE READTB')
    file_parse.dump(list(hadoopy.readtb(hdfs_classifier_input)), fp.name)
    print('------------------------AFTER  READTB [%s, %s]' % (fp.name, os.path.exists(fp.name)))
    files.append(fp.name)
    hadoopy.launch_frozen(hdfs_input, hdfs_output, 'predict_classifier.py',
                          files=files, reducer=None,
                          cmdenvs=['CLASSIFIERS_FN=%s' % os.path.basename(fp.name)],
                          dummy_arg=fp)
Example #18
def starter(args, launch=True):
    """ The function that calls hadoopy.launch_frozen """
    gopts.args = args

    mat = args.get('mat', None)
    if mat is None:
        raise NameError("'mat' option not specified on the command line")

    input = mat
    matname, matext = os.path.splitext(mat)

    gopts.getintkey('blocksize', 3)
    schedule = gopts.getstrkey('reduce_schedule', '1')

    # clear the output
    output = args.get('output', '%s-normal%s' % (matname, matext))
    if hadoopy.exists(output):
        print "Removing %s" % (output)
        hadoopy.rm(output)

    outputnamefunc = lambda x: output + "_iter%i" % (x)
    steps = schedule.split(',')

    for i, step in enumerate(steps):
        if i > 0:
            input = curoutput

        if i + 1 == len(steps):
            curoutput = output
        else:
            curoutput = output + "_iter%i" % (i + 1)
            if hadoopy.exists(curoutput):
                hadoopy.rm(curoutput)

        gopts.setkey('iter', i)

        if launch:
            if i > 0:
                mapper = "org.apache.hadoop.mapred.lib.IdentityMapper"
                hadoopy.launch_frozen(input,
                                      curoutput,
                                      __file__,
                                      mapper=mapper,
                                      cmdenvs=gopts.cmdenv(),
                                      num_reducers=int(step))
            else:
                hadoopy.launch_frozen(input,
                                      curoutput,
                                      __file__,
                                      cmdenvs=gopts.cmdenv(),
                                      num_reducers=int(step))
Example #19
def output_exemplars(hdfs_input, hdfs_output, num=2, output_type='box', output_path='exemplars'):
    with open('image_box_fns.pkl', 'w') as fp:
        image_box_fns = {}
        for (image_id, box, score), _ in hadoopy.readtb(hdfs_output + 'exemplars-%d' % num):
            image_box_fns.setdefault(image_id, []).append((box, 'exemplar-%.5d-%s-%s.png' % (score, image_id, box)))
        pickle.dump(image_box_fns, fp, -1)
    hadoopy.launch_frozen(hdfs_input + '1-tr', hdfs_output + 'exemplars-%d-clip' % num, 'clip_boxes.py', files=['image_box_fns.pkl'], remove_output=True, cmdenvs=['TYPE=%s' % output_type])
    try:
        shutil.rmtree(output_path)
    except OSError:
        pass
    os.makedirs(output_path)
    for x, y in hadoopy.readtb(hdfs_output + 'exemplars-%d-clip' % num):
        open(output_path + '/%s' % (x,), 'w').write(y)
Example #20
def run_video_keyframe(hdfs_input, hdfs_output, min_resolution, max_resolution, ffmpeg, **kw):
    if not ffmpeg:
        hadoopy.launch_frozen(hdfs_input, hdfs_output, 'video_keyframe.py',
                              reducer=None,
                              cmdenvs=['MIN_RESOLUTION=%d' % min_resolution,
                                       'MAX_RESOLUTION=%f' % max_resolution])
    else:
        fp = vidfeat.freeze_ffmpeg()
        hadoopy.launch_frozen(hdfs_input, hdfs_output, 'video_keyframe.py',
                              reducer=None,
                              cmdenvs=['MIN_RESOLUTION=%d' % min_resolution,
                                       'MAX_RESOLUTION=%f' % max_resolution],
                              files=fp.__enter__(),
                              dummy_arg=fp)
Example #21
 def _run_face(self, fn, out_path, **kw):
     bfn = os.path.basename(fn)
     in_path = self.data_path + bfn
     hdfs_out_path = '%sout-%s-%f' % (self.data_path, bfn, time.time())
     if not hadoopy.exists(in_path):
         hadoopy.put(fn, in_path)
     hadoopy.launch_frozen(in_path,
                           hdfs_out_path,
                           'face_finder.py',
                           files=['haarcascade_frontalface_default.xml'],
                           **kw)
     for num, ((image_name, box),
               image_data) in enumerate(hadoopy.readtb(hdfs_out_path)):
         with open(out_path + 'img%.8d.png' % num, 'w') as fp:
             fp.write(image_data)
Example #22
def run_classifier_labels(hdfs_input_pos, hdfs_input_neg, hdfs_output, classifier_name, classifier_extra, local_labels, classifier, **kw):
    labels = {}
    try:
        labels = file_parse.load(local_labels)
    except IOError:
        pass
    hdfs_output_pos = hdfs_output + '/pos'
    hdfs_output_neg = hdfs_output + '/neg'
    hadoopy.launch_frozen(hdfs_input_pos, hdfs_output_pos, 'collect_keys.py')
    hadoopy.launch_frozen(hdfs_input_neg, hdfs_output_neg, 'collect_keys.py')
    pos_keys = sum((x[1] for x in hadoopy.readtb(hdfs_output_pos)), [])
    neg_keys = sum((x[1] for x in hadoopy.readtb(hdfs_output_neg)), [])
    labels[classifier_name] = {'labels': {'1': pos_keys, '-1': neg_keys},
                               'classifier': classifier,
                               'classifier_extra': classifier_extra}
    file_parse.dump(labels, local_labels)
Example #23
def main():
    dense_path = 'exemplarbank/output/1341790878.92/pos'
    image_path = 'exemplarbank/data/sun_labelme_person/1-tr'
    image_box_fns = {}
    id_box_features = dict(hash_features(dense_path))
    print id_box_features.items()[0]
    for (image_id, box), feature in id_box_features.items():
        image_box_fns.setdefault(image_id, []).append((box, (image_id, box)))
    with open('image_box_fns.pkl', 'w') as fp:
        pickle.dump(image_box_fns, fp, -1)
    with hadoopy_helper.hdfs_temp() as hdfs_output:
        hadoopy.launch_frozen(image_path, hdfs_output, 'clip_boxes.py', files=['image_box_fns.pkl'], remove_output=True,
                              cmdenvs=['TYPE=feature'])
        id_box_features2 = dict(hash_features(hdfs_output))
        with open('compare.pkl', 'w') as fp:
            pickle.dump((id_box_features, id_box_features2), fp, -1)
Example #24
def _run_haystack(fn, script_name):
    cur_time = time.time()
    hdfs_base_path = 'hadoopy-test-data/%f/' % cur_time
    print('Storing HDFS temp files and output in [%s]' % hdfs_base_path)
    in_path = hdfs_base_path + os.path.basename(fn)
    out_path = hdfs_base_path + 'out-' + os.path.basename(fn)
    hadoopy.put(fn, in_path)
    print('Launching job [%s]' % script_name)
    hadoopy.launch_frozen(in_path,
                          out_path,
                          script_name,
                          files=[data_path + 'target.jpg'])
    print('Storing local output in [%s]' % local_out)
    for num, (image_name, image_data) in enumerate(hadoopy.readtb(out_path)):
        open('%s%s-img%.8d-%s.jpg' % (local_out, script_name, num, image_name),
             'w').write(image_data)
Example #25
def starter(args, launch=True):
    """ The function that calls hadoopy.launch_frozen """
    gopts.args = args

    mat = args.get('mat', None)
    if mat is None:
        raise NameError("'mat' option not specified on the command line")

    input = mat
    matname, matext = os.path.splitext(mat)

    gopts.getintkey('blocksize', 3)
    schedule = gopts.getstrkey('reduce_schedule', '1')

    # clear the output
    output = args.get('output', '%s-normal%s' % (matname, matext))
    if hadoopy.exists(output):
        print "Removing %s" % (output)
        hadoopy.rm(output)

    outputnamefunc = lambda x: output + "_iter%i" % (x)
    steps = schedule.split(',')

    for i, step in enumerate(steps):
        if i > 0:
            input = curoutput

        if i + 1 == len(steps):
            curoutput = output
        else:
            curoutput = output + "_iter%i" % (i + 1)
            if hadoopy.exists(curoutput):
                hadoopy.rm(curoutput)

        gopts.setkey('iter', i)

        if launch:
            if i > 0:
                mapper = "org.apache.hadoop.mapred.lib.IdentityMapper"
                hadoopy.launch_frozen(input, curoutput, __file__,
                                      mapper=mapper,
                                      cmdenvs=gopts.cmdenv(), num_reducers=int(step))
            else:
                hadoopy.launch_frozen(input, curoutput, __file__,
                                      cmdenvs=gopts.cmdenv(), num_reducers=int(step))
Example #26
def run_join_predictions(hdfs_predictions_input, hdfs_input, hdfs_output, local_image_output, **kw):
    inputs = [hdfs_predictions_input]
    if isinstance(hdfs_input, list):
        inputs += hdfs_input
    else:
        inputs.append(hdfs_input)
    hadoopy.launch_frozen(inputs, hdfs_output, 'join_predictions.py')
    if local_image_output:
        for image_hash, (classifier_preds, image_data) in hadoopy.readtb(hdfs_output):
            for classifier, preds in classifier_preds.items():
                for conf, label in preds:
                    path = '%s/%s/label_%d/%8.8f-%s.jpg' % (local_image_output, classifier, label, conf, image_hash)
                    try:
                        os.makedirs(os.path.dirname(path))
                    except OSError:
                        pass
                    with open(path, 'w') as fp:
                        fp.write(image_data)
Example #27
def _run_face(fn):
    cur_time = time.time()
    hdfs_base_path = 'hadoopy-test-data/%f/' % cur_time
    print('Storing HDFS temp files and output in [%s]' % hdfs_base_path)
    in_path = hdfs_base_path + os.path.basename(fn)
    out_path = hdfs_base_path + 'out-' + os.path.basename(fn)
    cmd = 'hadoop fs -put %s %s' % (fn, in_path)
    subprocess.check_call(cmd.split())
    hadoopy.launch_frozen(in_path, out_path, 'face_finder.py', files=[data_path + 'haarcascade_frontalface_default.xml'])
    local_out = 'out-%f' % cur_time
    try:
        os.makedirs(local_out)
    except OSError:
        pass
    print('Storing local output in [%s]' % local_out)
    for num, (image_name, (image_data, faces)) in enumerate(hadoopy.readtb(out_path)):
        image = np.asarray(Image.open(StringIO.StringIO(image_data)))
        for (x, y, w, h), n in faces:
            cv2.rectangle(image, (x, y), (x + w, y + h), (0, 0, 255), 3)
        cv2.imwrite('%s/img%.8d.jpg' % (local_out, num), image[:, :, ::-1].copy())
Example #29
def main():
    dense_path = 'exemplarbank/output/1341790878.92/pos'
    image_path = 'exemplarbank/data/sun_labelme_person/1-tr'
    image_box_fns = {}
    id_box_features = dict(hash_features(dense_path))
    print id_box_features.items()[0]
    for (image_id, box), feature in id_box_features.items():
        image_box_fns.setdefault(image_id, []).append((box, (image_id, box)))
    with open('image_box_fns.pkl', 'w') as fp:
        pickle.dump(image_box_fns, fp, -1)
    with hadoopy_helper.hdfs_temp() as hdfs_output:
        hadoopy.launch_frozen(image_path,
                              hdfs_output,
                              'clip_boxes.py',
                              files=['image_box_fns.pkl'],
                              remove_output=True,
                              cmdenvs=['TYPE=feature'])
        id_box_features2 = dict(hash_features(hdfs_output))
        with open('compare.pkl', 'w') as fp:
            pickle.dump((id_box_features, id_box_features2), fp, -1)
Example #30
 def compute_db_hadoop(self, hdfs_path):
     import json
     si = picarus.api.SearchIndex()
     si.name = '%s.%s' % (self.__class__.__module__,
                          self.__class__.__name__)
     si.feature = json.dumps(
         self.feature_dict)  # TODO: What to do with the pkl file?
     with hadoopy_helper.hdfs_temp() as hdfs_output:
         picarus.vision.run_image_clean(hdfs_path,
                                        hdfs_output + '/clean',
                                        max_side=self.max_side)
         # Compute features (map)
         picarus.vision.run_image_feature(hdfs_output + '/clean',
                                          hdfs_output + '/feature',
                                          self.feature_dict,
                                          files=self.required_files)
         # Random sample features for hashes (map) and train hasher (reduce)
         hadoopy.launch_frozen(hdfs_output + '/feature',
                               hdfs_output + '/hasher',
                               _lf('train_hasher.py'),
                               cmdenvs={
                                   'KV_PROB': 1.,
                                   'HASH_BITS': 128
                               })
         hasher = hadoopy.readtb(hdfs_output + '/hasher').next()[1]
         si.hash = pickle.dumps(hasher, -1)
         si.hash_format = si.PICKLE
         # Compute features hashes (map) and build database (reduce)
         open('hasher.pkl', 'w').write(si.hash)
         hadoopy.launch_frozen(hdfs_output + '/feature',
                               hdfs_output + '/db',
                               _lf('build_db.py'),
                               files=['hasher.pkl'])
         metadata, hashes = hadoopy.readtb(hdfs_output + '/db').next()
         self.metadata = metadata
         si.metadata.extend(metadata.tolist())
         self.index = image_search.LinearHashDB().store_hashes(
             hashes, np.arange(len(metadata), dtype=np.uint64))
         si.index = pickle.dumps(self.index, -1)
         si.index_format = si.PICKLE
         open('index.pb', 'w').write(si.SerializeToString())
Example #31
def initial_train(hdfs_input, hdfs_output):
    hadoopy.launch_frozen(hdfs_input + '0-tr', hdfs_output + 'neg', 'compute_exemplar_features.py', remove_output=True)
    hadoopy.launch_frozen(hdfs_input + '1-tr', hdfs_output + 'pos', 'compute_exemplar_features.py', remove_output=True)
    # Compute desired probability
    num_val = 5000
    num_neg_train = 5000
    toggle_launch()
    if 0:
        neg_samples = list(hadoopy_helper.jobs.random_sample(hdfs_output + 'neg', num_val + num_neg_train))
        neg_samples = [x[1] for x in neg_samples]
        with open('neg_feats.pkl', 'w') as fp:
            pickle.dump(np.array(neg_samples[num_val:]), fp, -1)
        with open('neg_val_feats.pkl', 'w') as fp:
            pickle.dump(np.array(neg_samples[:num_val]), fp, -1)
        del neg_samples
        gc.collect()
        pos_samples = list(hadoopy_helper.jobs.random_sample(hdfs_output + 'pos', num_val / 2))  # Twice as many neg as positive
        pos_samples = [x[1] for x in pos_samples]
        with open('pos_val_feats.pkl', 'w') as fp:
            pickle.dump(np.array(pos_samples), fp, -1)
        del pos_samples
    gc.collect()
    cmdenvs = {'NEG_FEATS': 'neg_feats.pkl',
               'POS_VAL_FEATS': 'pos_val_feats.pkl',
               'NEG_VAL_FEATS': 'neg_val_feats.pkl'}
    files = cmdenvs.values()
    cmdenvs['SAMPLE_SIZE'] = 1000
    hadoopy.launch_frozen(hdfs_output + 'pos', hdfs_output + 'exemplars-0', 'uniform_selection.py',
                          cmdenvs=cmdenvs, remove_output=True, files=files)
    exemplar_out = sorted(hadoopy.readtb(hdfs_output + 'exemplars-0'), key=lambda x: x[0])
    with open('exemplars.pkl', 'w') as fp:
        pickle.dump(exemplar_out, fp, -1)
Example #32
def calibrate(hdfs_input, hdfs_output):
    # Predict on pos/neg sets
    hadoopy.launch_frozen(hdfs_input + '1-v',
                          hdfs_output + 'val_pos',
                          'image_predict.py',
                          cmdenvs=['EXEMPLARS=exemplars.pkl', 'CELL_SKIP=16'],
                          remove_output=True,
                          num_reducers=10,
                          files=['exemplars.pkl'])
    hadoopy.launch_frozen(hdfs_input + '0-v',
                          hdfs_output + 'val_neg',
                          'image_predict.py',
                          cmdenvs=['EXEMPLARS=exemplars.pkl', 'CELL_SKIP=1'],
                          remove_output=True,
                          num_reducers=10,
                          files=['exemplars.pkl'])
    # Calibrate threshold using pos/neg validation set #1
    hadoopy.launch_frozen([
        hdfs_output + 'val_neg', hdfs_output + 'val_pos',
        hdfs_output + 'exemplars-1'
    ],
                          hdfs_output + 'exemplars-2',
                          'calibrate_thresholds.py',
                          num_reducers=50,
                          remove_output=True)
    exemplar_out = sorted(hadoopy.readtb(hdfs_output + 'exemplars-2'),
                          key=lambda x: x[0])
    with open('exemplars.pkl', 'w') as fp:
        pickle.dump(exemplar_out, fp, -1)
Example #33
def run_kmeans(hdfs_input, hdfs_prev_clusters, hdfs_image_data, hdfs_output, num_clusters,
               num_iters, num_samples, metric, local_json_output=None, **kw):
    frozen_tar_path = None
    for cur_iter_num in range(num_iters):
        clusters_fp = fetch_clusters_from_hdfs(hdfs_prev_clusters)
        clusters_fn = os.path.basename(clusters_fp.name)
        cur_output = '%s/clust%.6d' % (hdfs_output, cur_iter_num)
        frozen_tar_path = hadoopy.launch_frozen(hdfs_input, cur_output, 'kmeans.py',
                                                cmdenvs=['CLUSTERS_FN=%s' % clusters_fn],
                                                files=[clusters_fp.name],
                                                num_reducers=max(1, num_clusters / 2),
                                                frozen_tar_path=frozen_tar_path,
                                                dummy_arg=clusters_fp)['frozen_tar_path']
        hdfs_prev_clusters = cur_output
    print('Clusters[%s]' % hdfs_prev_clusters)
    # Compute K-Means assignment/samples
    # TODO Do full assignment, then sample
    clusters_fp = fetch_clusters_from_hdfs(hdfs_prev_clusters)
    clusters_fn = os.path.basename(clusters_fp.name)
    cur_output = '%s/assign' % hdfs_output
    hadoopy.launch_frozen(hdfs_input, cur_output, 'kmeans_assign.py',
                          cmdenvs=['CLUSTERS_FN=%s' % clusters_fn,
                                   'NUM_SAMPLES=%d' % num_samples,
                                   'mapred.text.key.partitioner.options=-k1'],
                          files=[clusters_fp.name],
                          num_reducers=max(1, num_clusters / 2),
                          partitioner='org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner',
                          dummy_arg=clusters_fp)
    print('Assignment[%s]' % cur_output)
    # Filter the samples
    assignments_fp = fetch_assignments_from_hdfs(cur_output)
    assignments_fn = os.path.basename(assignments_fp.name)
    cur_output = '%s/samples' % hdfs_output
    hadoopy.launch_frozen(hdfs_image_data, cur_output, 'filter_samples.py',
                          cmdenvs=['ASSIGNMENTS_FN=%s' % os.path.basename(assignments_fn)],
                          files=[assignments_fp.name],
                          reducer=None,
                          dummy_arg=assignments_fp)
    print('Samples[%s]' % cur_output)
Example #34
def run_face_ranker(hdfs_input, hdfs_output,
                    feature_pkl, exemplar_fn):
    """
    Runs the face_ranker.py hadoopy script.  The output consists of
    the distance of each image to an exemplar as key, and the
    input tuple of (key, imagedata) as value.
    Inputs:
    - hdfs_input: path to hdfs input: (key, imagedata) pairs
    - hdfs_output: path to the hdfs output tuples: (dist, (key, imagedata))
      where dist is the distance in Eigenfaces feature space to the exemplar
      image
    - feature_pkl: pickle file containing a trained Eigenfaces feature
    - exemplar_fn: filename of the exemplar image
    """
    fp = tempfile.NamedTemporaryFile()
    _compute_exemplar_feature(exemplar_fn, feature_pkl, fp)
    fp.flush()
    hadoopy.launch_frozen(hdfs_input, hdfs_output,
                          'face_ranker.py',
                          cmdenvs=['EXEMPLAR_FN=%s' % os.path.basename(fp.name),
                                   'FEATURE_FN=%s' % os.path.basename(feature_pkl)],
                          files=[feature_pkl, fp.name])
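Since the job above keys its output by distance, the ranked results can be pulled back with hadoopy.readtb and sorted. A hypothetical usage sketch (the output path and local file names are made up for illustration):

import hadoopy

# Sort by the distance key and save the ten closest faces locally
ranked = sorted(hadoopy.readtb('face_ranker_out'))
for dist, (image_key, image_data) in ranked[:10]:
    with open('match-%08.3f-%s.jpg' % (dist, image_key), 'w') as fp:
        fp.write(image_data)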
Example #35
def main(input_path, output_path, num_clusters, cluster_path, num_reducers):
    def inc_path():
        global iter_cnt
        iter_cnt += 1
        return '%s/%d' % (output_path, iter_cnt)
    def prev_path():
        return '%s/%d' % (output_path, iter_cnt)
    consolidate_clusters(cluster_path, 'clusters.pkl')
    if 1:
        hadoopy.launch_frozen(in_name=input_path,
                              out_name=inc_path(),
                              script_path='kmeans_cluster_single.py',
                              reducer=None,
                              cmdenvs=['CLUSTERS_PKL=%s' % ('clusters.pkl'),
                                       'NN_MODULE=nn_l2sqr_c'],
                              #combiner=True,
                              files=['nn_l2sqr_c.py', 'clusters.pkl'],
                              shared_libs=SHARED_LIBS,
                              modules=['vitrieve_algorithms', 'nn_l2sqr_c',],
                              remove_dir=True,
                              jobconfs=['mapred.min.split.size=999999999999',
                                        'mapred.reduce.tasks=%d' % (num_reducers)])
Example #36
def calibrate(hdfs_input, hdfs_output):
    # Predict on pos/neg sets
    hadoopy.launch_frozen(hdfs_input + '1-v', hdfs_output + 'val_pos', 'image_predict.py', cmdenvs=['EXEMPLARS=exemplars.pkl', 'CELL_SKIP=16'], remove_output=True, num_reducers=10, files=['exemplars.pkl'])
    hadoopy.launch_frozen(hdfs_input + '0-v', hdfs_output + 'val_neg', 'image_predict.py', cmdenvs=['EXEMPLARS=exemplars.pkl', 'CELL_SKIP=1'], remove_output=True, num_reducers=10, files=['exemplars.pkl'])
    # Calibrate threshold using pos/neg validation set #1
    hadoopy.launch_frozen([hdfs_output + 'val_neg', hdfs_output + 'val_pos', hdfs_output + 'exemplars-1'], hdfs_output + 'exemplars-2', 'calibrate_thresholds.py', num_reducers=50, remove_output=True)
    exemplar_out = sorted(hadoopy.readtb(hdfs_output + 'exemplars-2'), key=lambda x: x[0])
    with open('exemplars.pkl', 'w') as fp:
        pickle.dump(exemplar_out, fp, -1)
Example #37
def flickr_images(tags,
                  images_per_tag,
                  hdfs_output,
                  num_files=20,
                  max_iters=1,
                  max_pages=1,
                  output_meta=False,
                  api_key=None,
                  api_secret=None,
                  remove_output=False):
    tags = list(tags)
    if api_key is None or api_secret is None:
        api_key = os.environ['FLICKR_API_KEY']
        api_secret = os.environ['FLICKR_API_SECRET']
    tags_per_chunk = max(len(tags) / num_files, 1)
    if remove_output and hadoopy.exists(hdfs_output):
        print('Removing output dir[%s]' % hdfs_output)
        hadoopy.rmr(hdfs_output)
    cmdenvs = {
        'FLICKR_API_KEY': api_key,
        'FLICKR_API_SECRET': api_secret,
        'MAX_ITERS': str(max_iters),
        'MAX_PAGES': str(max_pages)
    }
    for chunk_num, chunk_tags in enumerate(_chunks(tags, tags_per_chunk)):
        hadoopy.writetb(hdfs_output + '/tags/%d' % chunk_num,
                        [(images_per_tag, tag) for tag in chunk_tags])
    hadoopy.launch_frozen(hdfs_output + '/tags',
                          hdfs_output + '/metadata',
                          _lf('flickr_bulk.py'),
                          cmdenvs=cmdenvs,
                          num_reducers=num_files)
    output_type = 'meta' if output_meta else 'image'
    hadoopy.launch_frozen(hdfs_output + '/metadata',
                          hdfs_output + '/image_metadata',
                          _lf('file_downloader.py'),
                          cmdenvs={'OUTPUT_TYPE': output_type})
Example #38
def compute_database(flickr_data):
    r = 'image_search/%f/' % time.time()
    f_path = r + 'features/'
    m_path = r + 'median/'
    h_path = r + 'hashes/'
    j_path = r + 'hash_metadata/'
    hadoopy.launch_frozen(flickr_data, f_path, 'build_features.py')
    hadoopy.launch_frozen(f_path, m_path, 'calc_median_feature.py')
    median = np.array([x for _, x in sorted(hadoopy.readtb(m_path))])
    pickle.dump(median, open('median.pkl', 'w'), -1)
    hadoopy.launch_frozen(f_path, h_path, 'compute_hashes.py', files=['median.pkl'])
    hadoopy.launch_frozen([h_path, flickr_data], j_path, 'join.py',
                          num_reducers=10)
    hashes, metadatas = zip(*[x[1] for x in hadoopy.readtb(j_path)])
    hashes = np.array([x.ravel() for x in hashes])
    with open('database.pkl', 'w') as fp:
        pickle.dump((hashes, metadatas, median), fp, -1)
Example #39
def run_face_ranker(hdfs_input, hdfs_output, feature_pkl, exemplar_fn):
    """
    Runs the face_ranker.py hadoopy script.  The output consists of
    the distance of each image to an exemplar as key, and the
    input tuple of (key, imagedata) as value.
    Inputs:
    - hdfs_input: path to hdfs input: (key, imagedata) pairs
    - hdfs_output: path to the hdfs output tuples: (dist, (key, imagedata))
      where dist is the distance in Eigenfaces feature space to the exemplar
      image
    - feature_pkl: pickle file containing a trained Eigenfaces feature
    - exemplar_fn: filename of the exemplar image
    """
    fp = tempfile.NamedTemporaryFile()
    _compute_exemplar_feature(exemplar_fn, feature_pkl, fp)
    fp.flush()
    hadoopy.launch_frozen(hdfs_input,
                          hdfs_output,
                          'face_ranker.py',
                          cmdenvs=[
                              'EXEMPLAR_FN=%s' % os.path.basename(fp.name),
                              'FEATURE_FN=%s' % os.path.basename(feature_pkl)
                          ],
                          files=[feature_pkl, fp.name])
Example #40
def exemplar_boxes(hdfs_input, hdfs_output):
    exemplar_name = 'ad813d130f4803e948124823a67cdd7b-[0.0, 0.16326530612244897, 0.3448275862068966, 0.5714285714285714]'
    st = time.time()
    exemplar_out = hadoopy.abspath(hdfs_output + 'exemplar_boxes/%s' % st) + '/'
    for kv in hadoopy.readtb(hdfs_output + 'exemplars-2'):
        (image_id, box, score), _ = kv
        if exemplar_name == '%s-%s' % (image_id, box):
            print('Found it')
            with open('exemplars-patch.pkl', 'w') as fp:
                pickle.dump([kv], fp, -1)
    hadoopy.launch_frozen(hdfs_input + '1-v', exemplar_out + 'val_pos', 'hard_predictions.py', cmdenvs=['EXEMPLARS=exemplars-patch.pkl', 'MAX_HARD=100', 'OUTPUT_FORMAT=score_image_box'], files=['exemplars-patch.pkl'],
                          num_reducers=10)
    hadoopy.launch_frozen(hdfs_input + '0-v', exemplar_out + 'val_neg', 'hard_predictions.py', cmdenvs=['EXEMPLARS=exemplars-patch.pkl', 'MAX_HARD=100', 'OUTPUT_FORMAT=score_image_box'], files=['exemplars-patch.pkl'],
                          num_reducers=10)
    with open('image_box_fns.pkl', 'w') as fp:
        image_box_fns = {}
        pos_boxes = [(score, image_id, box, 1) for score, image_id, box in sorted(hadoopy.readtb(exemplar_out + 'val_pos').next()[1])]
        neg_boxes = [(score, image_id, box, 0) for score, image_id, box in sorted(hadoopy.readtb(exemplar_out + 'val_neg').next()[1])]
        for num, (score, image_id, box, pol) in enumerate(sorted(pos_boxes + neg_boxes, reverse=True)):
            image_box_fns.setdefault(image_id, []).append((box, 'exemplar-%.5d-%d-%f.png' % (num, pol, score)))
        pickle.dump(image_box_fns, fp, -1)
    hadoopy.launch_frozen([hdfs_input + '1-v', hdfs_input + '0-v'], exemplar_out + 'boxes_cropped', 'clip_boxes.py', files=['image_box_fns.pkl'], remove_output=True, cmdenvs={'TYPE': 'image'})
    out_dir = 'exemplars_similar_cropped/'
    try:
        shutil.rmtree('exemplars_similar_cropped')
    except OSError:
        pass
    print('Outputting cropped')
    os.makedirs(out_dir)
    print(exemplar_out + 'boxes_cropped')
    for x, y in hadoopy.readtb(exemplar_out + 'boxes_cropped'):
        open(out_dir + x, 'w').write(y)

    hadoopy.launch_frozen([hdfs_input + '1-v', hdfs_input + '0-v'], exemplar_out + 'boxes', 'clip_boxes.py', files=['image_box_fns.pkl'], remove_output=True, cmdenvs={'TYPE': 'box'})
    out_dir = 'exemplars_similar/'
    try:
        shutil.rmtree('exemplars_similar')
    except OSError:
        pass
    print('Outputting boxes')
    os.makedirs(out_dir)
    for x, y in hadoopy.readtb(exemplar_out + 'boxes'):
        open(out_dir + x, 'w').write(y)
Example #41
def initial_train(hdfs_input, hdfs_output):
    hadoopy.launch_frozen(hdfs_input + '0-tr',
                          hdfs_output + 'neg',
                          'compute_exemplar_features.py',
                          remove_output=True)
    hadoopy.launch_frozen(hdfs_input + '1-tr',
                          hdfs_output + 'pos',
                          'compute_exemplar_features.py',
                          remove_output=True)
    # Compute desired probability
    num_val = 5000
    num_neg_train = 5000
    toggle_launch()
    if 0:
        neg_samples = list(
            hadoopy_helper.jobs.random_sample(hdfs_output + 'neg',
                                              num_val + num_neg_train))
        neg_samples = [x[1] for x in neg_samples]
        with open('neg_feats.pkl', 'w') as fp:
            pickle.dump(np.array(neg_samples[num_val:]), fp, -1)
        with open('neg_val_feats.pkl', 'w') as fp:
            pickle.dump(np.array(neg_samples[:num_val]), fp, -1)
        del neg_samples
        gc.collect()
        pos_samples = list(
            hadoopy_helper.jobs.random_sample(
                hdfs_output + 'pos',
                num_val / 2))  # Twice as many neg as positive
        pos_samples = [x[1] for x in pos_samples]
        with open('pos_val_feats.pkl', 'w') as fp:
            pickle.dump(np.array(pos_samples), fp, -1)
        del pos_samples
    gc.collect()
    cmdenvs = {
        'NEG_FEATS': 'neg_feats.pkl',
        'POS_VAL_FEATS': 'pos_val_feats.pkl',
        'NEG_VAL_FEATS': 'neg_val_feats.pkl'
    }
    files = cmdenvs.values()
    cmdenvs['SAMPLE_SIZE'] = 1000
    hadoopy.launch_frozen(hdfs_output + 'pos',
                          hdfs_output + 'exemplars-0',
                          'uniform_selection.py',
                          cmdenvs=cmdenvs,
                          remove_output=True,
                          files=files)
    exemplar_out = sorted(hadoopy.readtb(hdfs_output + 'exemplars-0'),
                          key=lambda x: x[0])
    with open('exemplars.pkl', 'w') as fp:
        pickle.dump(exemplar_out, fp, -1)
Example #42
def _launch_frozen(in_path, out_path, script_path, jobconfs_default=(), *args, **kw):
    import hadoopy
    import os
    kw = dict(kw)  # Make a copy as we will be mutating it
    kw['frozen_tar_path'] = _freeze_script(script_path)['frozen_tar_path']
    if 'reducer' not in kw and 'num_reducers' not in kw:
        kw['num_reducers'] = 1
    if 'jobconfs' in kw:
        kw['jobconfs'] = kw['jobconfs'] + GLOBAL_JOBCONFS
    else:
        kw['jobconfs'] = GLOBAL_JOBCONFS
    if 'jobconfs' not in kw:
        kw['jobconfs'] = []
    if jobconfs_default:
        jobconfs_dict = dict(x.split('=', 1) for x in kw['jobconfs'])
        jobconfs_default_dict = dict(x.split('=', 1) for x in jobconfs_default)
        for jobconf_name, jobconf_value in jobconfs_default_dict.items():
            if jobconf_name not in jobconfs_dict:
                jobconfs_dict[jobconf_name] = jobconf_value
        kw['jobconfs'] = ['%s=%s' % x for x in jobconfs_dict.items()]
    if 'image_hashes' in kw and kw['image_hashes'] is not None:
        import tempfile
        fp = tempfile.NamedTemporaryFile(suffix='.pkl.gz')
        file_parse.dump(kw['image_hashes'], fp.name)
        try:
            kw['files'].append(fp.name)
        except KeyError:
            kw['files'] = [fp.name]
        try:
            kw['cmdenvs'].append('PICARUS_VALID_IMAGE_HASHES=%s' % os.path.basename(fp.name))
        except KeyError:
            kw['cmdenvs'] = ['PICARUS_VALID_IMAGE_HASHES=%s' % os.path.basename(fp.name)]
        kw['_internal_dummy_arg'] = fp  # Keep the object alive
        del kw['image_hashes']
        
    return hadoopy.launch_frozen(in_path, out_path, script_path, *args, **kw)
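As a usage note on the jobconf handling above: values in jobconfs_default are applied only when the caller (or GLOBAL_JOBCONFS) has not already set the same key. A hypothetical call illustrating this (paths and script name are placeholders):

# The caller's mapred.reduce.tasks=50 wins; the default of 10 is dropped,
# while the compression default is added because no caller value exists.
_launch_frozen('in_path', 'out_path', 'job.py',
               jobconfs=['mapred.reduce.tasks=50'],
               jobconfs_default=('mapred.reduce.tasks=10',
                                 'mapred.output.compress=true'))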
Example #43
def hard_train(hdfs_input, hdfs_output):
    hadoopy.launch_frozen(hdfs_input + '0-tr',
                          hdfs_output + 'hard_neg',
                          'hard_predictions.py',
                          cmdenvs=[
                              'EXEMPLARS=exemplars.pkl', 'MAX_HARD=100',
                              'OUTPUT_FORMAT=score_image_box'
                          ],
                          num_reducers=10,
                          files=['exemplars.pkl'],
                          remove_output=True)

    def _inner():
        with open('image_box_fns.pkl', 'w') as fp:
            image_box_fns = {}
            for (image_id, box,
                 score), negs in hadoopy.readtb(hdfs_output + 'hard_neg'):
                for score2, image_id2, box2 in negs:
                    image_box_fns.setdefault(image_id2, []).append(
                        (box2, [image_id, box, score]))
            pickle.dump(image_box_fns, fp, -1)
        del image_box_fns
        gc.collect()

    _inner()
    hadoopy.launch_frozen(hdfs_input + '0-tr',
                          hdfs_output + 'hard_neg_clip',
                          'clip_boxes.py',
                          files=['image_box_fns.pkl'],
                          remove_output=True,
                          cmdenvs=['TYPE=feature'])
    hadoopy.launch_frozen(
        [hdfs_output + 'pos_sample', hdfs_output + 'hard_neg_clip'],
        hdfs_output + 'exemplars-1',
        'train_exemplars_hard.py',
        cmdenvs=['NEG_FEATS=neg_feats.pkl', 'MAX_HARD=200'],
        files=['neg_feats.pkl'],
        remove_output=True,
        num_reducers=10)
    exemplar_out = sorted(hadoopy.readtb(hdfs_output + 'exemplars-1'),
                          key=lambda x: x[0])
    with open('exemplars.pkl', 'w') as fp:
        pickle.dump(exemplar_out, fp, -1)
Example #44
def hard_train(hdfs_input, hdfs_output):
    hadoopy.launch_frozen(hdfs_input + '0-tr', hdfs_output + 'hard_neg', 'hard_predictions.py', cmdenvs=['EXEMPLARS=exemplars.pkl',
                                                                                                         'MAX_HARD=100',
                                                                                                         'OUTPUT_FORMAT=score_image_box'],
                          num_reducers=10, files=['exemplars.pkl'], remove_output=True)

    def _inner():
        with open('image_box_fns.pkl', 'w') as fp:
            image_box_fns = {}
            for (image_id, box, score), negs in hadoopy.readtb(hdfs_output + 'hard_neg'):
                for score2, image_id2, box2 in negs:
                    image_box_fns.setdefault(image_id2, []).append((box2, [image_id, box, score]))
            pickle.dump(image_box_fns, fp, -1)
        del image_box_fns
        gc.collect()
    _inner()
    hadoopy.launch_frozen(hdfs_input + '0-tr', hdfs_output + 'hard_neg_clip', 'clip_boxes.py', files=['image_box_fns.pkl'], remove_output=True, cmdenvs=['TYPE=feature'])
    hadoopy.launch_frozen([hdfs_output + 'pos_sample',
                           hdfs_output + 'hard_neg_clip'], hdfs_output + 'exemplars-1', 'train_exemplars_hard.py',
                          cmdenvs=['NEG_FEATS=neg_feats.pkl', 'MAX_HARD=200'], files=['neg_feats.pkl'],
                          remove_output=True, num_reducers=10)
    exemplar_out = sorted(hadoopy.readtb(hdfs_output + 'exemplars-1'), key=lambda x: x[0])
    with open('exemplars.pkl', 'w') as fp:
        pickle.dump(exemplar_out, fp, -1)
Example #45
import hadoopy
import random
for i in range(5):
    prefix = str(random.random())
    print(prefix)
    hadoopy.launch_frozen('/tmp/bwhite/input/pets2006.video_frame_data.tb',
                          '/tmp/bwhite/output/pets2006.video_frame_data.b/' + prefix,
                          'bgsub.py',
                          partitioner='org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner',
                          jobconfs=['mapred.text.key.partitioner.options=-k1,1',
                                    #'mapred.reduce.tasks=500',
                                    'mapred.min.split.size=999999999999',
                                    'mapred.reduce.tasks=1',
                                    'mapred.output.compress=true',
                                    'mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec'],
                          shared_libs=['libbgsub_fast.so'],
                          frozen_path='frozen') 
Example #46
import hadoopy
import time

# Setup paths
data_path = 'hadoopy-test-data/%f/' % time.time()
input_path = data_path + 'wc-input'
output_path = data_path + 'wc-output'

# Write data to HDFS in the form of (term #, term)
input_data = enumerate(
    'Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industrys standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.'
    .split())
hadoopy.writetb(input_path, input_data)

# Launch the job
hadoopy.launch_frozen(input_path, output_path, 'wc.py')

# Read back the output and check a few word counts
word_counts = dict(hadoopy.readtb(output_path))
for probe_word, expected_count in [('the', 6), ('Lorem', 4), ('of', 4)]:
    print('word_counts[%s] = %d' % (probe_word, word_counts[probe_word]))
    assert expected_count == word_counts[probe_word]
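The wc.py script itself is not included on this page. A minimal sketch of what a hadoopy word-count script typically looks like (an assumption based on the standard hadoopy.run mapper/reducer interface):

import hadoopy


def mapper(key, value):
    # value is a chunk of text; emit one count per word
    for word in value.split():
        yield word, 1


def reducer(word, counts):
    yield word, sum(counts)


if __name__ == '__main__':
    hadoopy.run(mapper, reducer, reducer)  # the reducer doubles as a combiner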
Example #47
 def test_cluster_info(self):
     hadoopy.writetb(self.data_path + 'cluster_info_input', [(0, 0)])
     hadoopy.launch_frozen(self.data_path + 'cluster_info_input',
                           self.data_path + 'cluster_info',
                           'cluster_info.py')
     pprint.pprint(dict(hadoopy.readtb(self.data_path + 'cluster_info')))
Example #48
def launch_frozen(in_name, out_name, script_path, hbase_in=True, hbase_out=False, columns=(), start_row=None, stop_row=None, single_value=None, **kw):
    _launch_args(hbase_in, hbase_out, columns, start_row, stop_row, single_value, kw)
    hadoopy.launch_frozen(in_name, out_name, script_path, **kw)
Example #49
import hadoopy
import time

# Setup paths
data_path = 'hadoopy-test-data/%f/' % time.time()
input_path = data_path + 'input'
output_path_a = data_path + 'output_a'
output_path_b = data_path + 'output_b'
output_path_c = data_path + 'output_c'
output_path_d = data_path + 'output_d'

# Write diverse KV data to HDFS
input_data = [(1, 5), ('dsfs', {
    'a': 3
}), ([1, 2], 'sdflk')]  # Diverse KV input
hadoopy.writetb(input_path, input_data)

# Launch the jobs
hadoopy.launch_frozen(input_path, output_path_a, 'identity.py')
hadoopy.launch_frozen(input_path, output_path_b, 'identity.py')
hadoopy.launch_frozen(output_path_b, output_path_c, 'identity.py')
hadoopy.launch_frozen(
    [input_path, output_path_a, output_path_b, output_path_c], output_path_d,
    'identity.py')

# Read the first KV pair
print('KV Input[%s]' % str(hadoopy.readtb(input_path).next()))
print('KV Output a[%s]' % str(hadoopy.readtb(output_path_a).next()))
print('KV Output b[%s]' % str(hadoopy.readtb(output_path_b).next()))
print('KV Output c[%s]' % str(hadoopy.readtb(output_path_c).next()))
print('KV Output d[%s]' % str(hadoopy.readtb(output_path_d).next()))
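The identity.py script launched above is likewise not shown; a minimal pass-through sketch (an assumption, following the same hadoopy.run pattern):

import hadoopy


def mapper(key, value):
    # Pass every key/value pair through unchanged
    yield key, value


if __name__ == '__main__':
    hadoopy.run(mapper)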
Example #50
from hadoopy import launch_frozen

input_path = 'hdfs://laserson-1.ent.cloudera.com/ngrams'
output_path = 'hdfs://laserson-1.ent.cloudera.com/output-hadoopy-frozen'

launch_frozen(
    input_path,
    output_path,
    'ngrams.py',
    use_seqoutput=False,
    num_reducers=10,
    hstreaming=
    '/usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.1.2.jar'
)
Example #51
def exemplar_boxes(hdfs_input, hdfs_output):
    exemplar_name = 'ad813d130f4803e948124823a67cdd7b-[0.0, 0.16326530612244897, 0.3448275862068966, 0.5714285714285714]'
    st = time.time()
    exemplar_out = hadoopy.abspath(hdfs_output +
                                   'exemplar_boxes/%s' % st) + '/'
    for kv in hadoopy.readtb(hdfs_output + 'exemplars-2'):
        (image_id, box, score), _ = kv
        if exemplar_name == '%s-%s' % (image_id, box):
            print('Found it')
            with open('exemplars-patch.pkl', 'w') as fp:
                pickle.dump([kv], fp, -1)
    hadoopy.launch_frozen(hdfs_input + '1-v',
                          exemplar_out + 'val_pos',
                          'hard_predictions.py',
                          cmdenvs=[
                              'EXEMPLARS=exemplars-patch.pkl', 'MAX_HARD=100',
                              'OUTPUT_FORMAT=score_image_box'
                          ],
                          files=['exemplars-patch.pkl'],
                          num_reducers=10)
    hadoopy.launch_frozen(hdfs_input + '0-v',
                          exemplar_out + 'val_neg',
                          'hard_predictions.py',
                          cmdenvs=[
                              'EXEMPLARS=exemplars-patch.pkl', 'MAX_HARD=100',
                              'OUTPUT_FORMAT=score_image_box'
                          ],
                          files=['exemplars-patch.pkl'],
                          num_reducers=10)
    with open('image_box_fns.pkl', 'w') as fp:
        image_box_fns = {}
        pos_boxes = [(score, image_id, box, 1)
                     for score, image_id, box in sorted(
                         hadoopy.readtb(exemplar_out + 'val_pos').next()[1])]
        neg_boxes = [(score, image_id, box, 0)
                     for score, image_id, box in sorted(
                         hadoopy.readtb(exemplar_out + 'val_neg').next()[1])]
        for num, (score, image_id, box,
                  pol) in enumerate(sorted(pos_boxes + neg_boxes,
                                           reverse=True)):
            image_box_fns.setdefault(image_id, []).append(
                (box, 'exemplar-%.5d-%d-%f.png' % (num, pol, score)))
        pickle.dump(image_box_fns, fp, -1)
    hadoopy.launch_frozen([hdfs_input + '1-v', hdfs_input + '0-v'],
                          exemplar_out + 'boxes_cropped',
                          'clip_boxes.py',
                          files=['image_box_fns.pkl'],
                          remove_output=True,
                          cmdenvs={'TYPE': 'image'})
    out_dir = 'exemplars_similar_cropped/'
    try:
        shutil.rmtree('exemplars_similar_cropped')
    except OSError:
        pass
    print('Outputting cropped')
    os.makedirs(out_dir)
    print(exemplar_out + 'boxes_cropped')
    for x, y in hadoopy.readtb(exemplar_out + 'boxes_cropped'):
        open(out_dir + x, 'w').write(y)

    hadoopy.launch_frozen([hdfs_input + '1-v', hdfs_input + '0-v'],
                          exemplar_out + 'boxes',
                          'clip_boxes.py',
                          files=['image_box_fns.pkl'],
                          remove_output=True,
                          cmdenvs={'TYPE': 'box'})
    out_dir = 'exemplars_similar/'
    try:
        shutil.rmtree('exemplars_similar')
    except OSError:
        pass
    print('Outputting boxes')
    os.makedirs(out_dir)
    for x, y in hadoopy.readtb(exemplar_out + 'boxes'):
        open(out_dir + x, 'w').write(y)
Example #52
#hadoopy.launch_local(r + 'out/flickr_metadata', r + 'out/flickr_images', 'file_downloader.py',
#                     worker_queue_maxsize=10)  # , max_input=100
print('Downloaded images')
import glob
import os
import shutil
import cv2
import hadoopy
for fn in glob.glob('*.JPG'):
    img = cv2.imread(fn)
    img = cv2.resize(img, (int(img.shape[1] / 2.5), int(img.shape[0] / 2.5)))
    try:
        os.remove('target.jpg')
    except OSError:
        pass
    cv2.imwrite('target.jpg', img)
    hadoopy.launch_frozen([
        'flickr_data_picarus/run-1343747418.029870/out/down',
        'flickr_data_picarus/run-1343712226.822338/out/flickr_images'
    ],
                          r + 'tiles',
                          'picnic_job.py',
                          files=['target.jpg'],
                          remove_output=True)
    base = os.path.basename(fn) + '_tiles/'
    try:
        os.makedirs(base)
    except OSError:
        pass
    for k, v in hadoopy.readtb(r + 'tiles'):
        with open(base + k, 'w') as fp:
            fp.write(v)