def _read_files(fns, prev_hashes, hdfs_output, output_format, max_record_size):
    """Read files, skip duplicates, and yield their contents for HDFS output.

    Args:
        fns: Iterator of file names.
        prev_hashes: Set of sha1 hashes already seen; files whose hash is in
            this set are skipped.  New hashes are added to it (mutated).
        hdfs_output: HDFS output root; oversized files are stored under
            '<hdfs_output>/_blobs/' instead of being inlined.
        output_format: 'kv' yields (sha1, data); 'record' yields (sha1, dict)
            with keys sha1/full_path/extension plus data or hdfs_path.
        max_record_size: Maximum file size in bytes to inline into a record
            (only checked for 'record' format).  None disables the limit.

    Yields:
        Tuple of (data_hash, data) where data_hash is a sha1 hash.
    """
    for fn in fns:
        sha1_hash = _sha1(fn)
        if sha1_hash in prev_hashes:
            continue  # Duplicate content: skip to keep output unique
        prev_hashes.add(sha1_hash)
        # os.path.getsize replaces the opaque os.stat(fn)[6] (same value)
        if (output_format == 'record' and max_record_size is not None
                and max_record_size < os.path.getsize(fn)):
            # Too large to inline: put the file into the remote location
            hdfs_path = hadoopy.abspath('%s/_blobs/%s_%s' % (hdfs_output, sha1_hash, os.path.basename(fn)))
            data = ''
            hadoopy.put(fn, hdfs_path)
        else:
            hdfs_path = ''
            # 'with' closes the handle; the original open(fn).read() leaked it
            with open(fn) as fp:
                data = fp.read()
        if output_format == 'kv':
            yield sha1_hash, data
        elif output_format == 'record':
            out = {'sha1': sha1_hash, 'full_path': fn,
                   'extension': os.path.splitext(fn)[1][1:]}
            if data:
                out['data'] = data
            if hdfs_path:
                out['hdfs_path'] = hdfs_path
            yield sha1_hash, out
def hdfs_temp(hdfs_temp_dir=None):
    """Yield a unique HDFS temp path and remove it when the consumer is done.

    Generator intended for context-manager use (e.g. wrapped with
    contextlib.contextmanager at the decoration site).

    Args:
        hdfs_temp_dir: Directory in which to create the temp path
            (default: HDFS_TEMP_DIR).

    Yields:
        Unique HDFS temp path string (timestamp + random suffix).
    """
    if hdfs_temp_dir is None:
        hdfs_temp_dir = HDFS_TEMP_DIR
    temp_path = hadoopy.abspath('%s/%f-%f' % (hdfs_temp_dir, time.time(), random.random()))
    try:
        yield temp_path
    finally:
        # try/finally ensures cleanup runs even if the consumer raised while
        # using temp_path (the original skipped cleanup on exception).
        if hadoopy.exists(temp_path):
            hadoopy.rmr(temp_path)
def _run_wc(self, orig_fn, launcher=hadoopy.launch_frozen):
    """Run the wc.py word-count job on orig_fn via launcher and verify output.

    Args:
        orig_fn: Local input file to upload and process.
        launcher: hadoopy.launch_frozen or hadoopy.launch_local.

    Raises:
        ValueError: If launcher is not one of the recognized launchers.
    """
    fn = 'out-%f-%s' % (time.time(), orig_fn)
    in_path = self.data_path + fn
    out_path = self.data_path + fn + '.out'
    print(os.path.abspath('.'))
    hadoopy.put(orig_fn, in_path)
    # We also do a few hdfs checks here
    # assertEqual: assertEquals is a deprecated unittest alias
    self.assertEqual(len(hadoopy.ls(in_path)), 1)
    self.assertEqual(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])
    self.assertTrue(hadoopy.exists(in_path))
    self.assertFalse(hadoopy.exists(out_path))
    self.assertFalse(hadoopy.isdir(in_path))
    self.assertFalse(hadoopy.isempty(in_path))
    # Don't let the file split, CDH3 has a bug and will try to split gz's
    launcher(in_path, out_path, 'wc.py',
             jobconfs=['mapred.min.split.size=100000000',
                       'mapreduce.task.userlog.limit.kb=1000'])
    if launcher == hadoopy.launch_frozen:
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    elif launcher == hadoopy.launch_local:
        self.assertFalse(hadoopy.isdir(out_path))
        self.assertFalse(hadoopy.isempty(out_path))
    else:
        raise ValueError('Launcher not recognized')
    wc = dict(hadoopy.readtb(out_path))
    self.assertEqual(wc['the'], 1664)
    self.assertEqual(wc['Alice'], 221)
def test_err(self):
    """Nonexistent path: predicates return False/identity, reads raise IOError."""
    nonsense_path = 'sdfskjdfksjdkfjskdfksjdfksdkfjskdjfksjdk'
    self.assertFalse(hadoopy.exists(nonsense_path))
    # assertEqual: assertEquals is a deprecated unittest alias
    self.assertEqual(
        hadoopy.abspath(nonsense_path).rsplit('/')[-1], nonsense_path)
    self.assertRaises(IOError, hadoopy.ls, nonsense_path)
    # next() builtin (2.6+/3.x) replaces the Python-2-only .next method
    self.assertRaises(IOError, lambda: next(hadoopy.readtb(nonsense_path)))
def _run_wc(self, orig_fn, script_name="wc.py", launcher=hadoopy.launch_frozen, **kw):
    """Run script_name on orig_fn with the given launcher and verify output.

    Args:
        orig_fn: Local input file to upload and process.
        script_name: Hadoopy job script to run (default "wc.py").
        launcher: hadoopy.launch_frozen / hadoopy.launch_local, or the
            string "launch_frozen_cmd" to exercise the command-line interface.
        **kw: Extra keyword args forwarded to a callable launcher.

    Raises:
        ValueError: If launcher is not one of the recognized launchers.
    """
    fn = "out-%f-%s" % (time.time(), orig_fn)
    in_path = self.data_path + fn
    out_path = self.data_path + fn + ".out"
    print(os.path.abspath("."))
    if not hadoopy.exists(in_path):
        hadoopy.put(orig_fn, in_path)
    # We also do a few hdfs checks here
    # assertEqual: assertEquals is a deprecated unittest alias
    self.assertEqual(len(hadoopy.ls(in_path)), 1)
    self.assertEqual(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])
    self.assertTrue(hadoopy.exists(in_path))
    self.assertFalse(hadoopy.exists(out_path))
    self.assertFalse(hadoopy.isdir(in_path))
    self.assertFalse(hadoopy.isempty(in_path))
    # Don't let the file split, CDH3 has a bug and will try to split gz's
    if not isinstance(launcher, str):
        launcher(in_path, out_path, script_name,
                 jobconfs=["mapred.min.split.size=100000000",
                           "mapreduce.task.userlog.limit.kb=1000"], **kw)
    if launcher == hadoopy.launch_frozen:
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    elif launcher == hadoopy.launch_local:
        self.assertFalse(hadoopy.isdir(out_path))
        self.assertFalse(hadoopy.isempty(out_path))
    elif launcher == "launch_frozen_cmd":
        # Build argv as a list: the original shell-style string run through
        # str.split() passed literal '"' characters inside the -jobconf args.
        cmd = ["python", script_name, "launch_frozen", in_path, out_path,
               "-jobconf", "mapred.min.split.size=100000000",
               "-jobconf", "mapreduce.task.userlog.limit.kb=1000"]
        print(" ".join(cmd))
        subprocess.call(cmd)
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    else:
        raise ValueError("Launcher not recognized")
    wc = dict(hadoopy.readtb(out_path))
    self.assertEqual(wc["the"], 1664)
    self.assertEqual(wc["Alice"], 221)
def exemplar_boxes(hdfs_input, hdfs_output):
    """Locate a hard-coded exemplar patch, run hard-prediction and clipping
    jobs against it, and download the resulting box/cropped images locally.

    Args:
        hdfs_input: HDFS input root (uses '1-v' positive and '0-v' negative
            subpaths).
        hdfs_output: HDFS output root (reads 'exemplars-2'; job outputs go
            under 'exemplar_boxes/<timestamp>/').
    """
    exemplar_name = 'ad813d130f4803e948124823a67cdd7b-[0.0, 0.16326530612244897, 0.3448275862068966, 0.5714285714285714]'
    st = time.time()
    exemplar_out = hadoopy.abspath(hdfs_output + 'exemplar_boxes/%s' % st) + '/'
    # Scan exemplars-2 for the one exemplar we care about and dump it locally
    for kv in hadoopy.readtb(hdfs_output + 'exemplars-2'):
        (image_id, box, score), _ = kv
        if exemplar_name == '%s-%s' % (image_id, box):
            print('Found it')
            with open('exemplars-patch.pkl', 'w') as fp:
                pickle.dump([kv], fp, -1)
    hadoopy.launch_frozen(hdfs_input + '1-v', exemplar_out + 'val_pos', 'hard_predictions.py',
                          cmdenvs=['EXEMPLARS=exemplars-patch.pkl', 'MAX_HARD=100',
                                   'OUTPUT_FORMAT=score_image_box'],
                          files=['exemplars-patch.pkl'], num_reducers=10)
    hadoopy.launch_frozen(hdfs_input + '0-v', exemplar_out + 'val_neg', 'hard_predictions.py',
                          cmdenvs=['EXEMPLARS=exemplars-patch.pkl', 'MAX_HARD=100',
                                   'OUTPUT_FORMAT=score_image_box'],
                          files=['exemplars-patch.pkl'], num_reducers=10)
    with open('image_box_fns.pkl', 'w') as fp:
        image_box_fns = {}
        # next() builtin (2.6+/3.x) replaces the Python-2-only .next() method
        pos_boxes = [(score, image_id, box, 1) for score, image_id, box in
                     sorted(next(hadoopy.readtb(exemplar_out + 'val_pos'))[1])]
        neg_boxes = [(score, image_id, box, 0) for score, image_id, box in
                     sorted(next(hadoopy.readtb(exemplar_out + 'val_neg'))[1])]
        # Rank all boxes (pos+neg) by descending score; filenames encode rank,
        # polarity, and score.
        for num, (score, image_id, box, pol) in enumerate(sorted(pos_boxes + neg_boxes, reverse=True)):
            image_box_fns.setdefault(image_id, []).append((box, 'exemplar-%.5d-%d-%f.png' % (num, pol, score)))
        pickle.dump(image_box_fns, fp, -1)
    hadoopy.launch_frozen([hdfs_input + '1-v', hdfs_input + '0-v'], exemplar_out + 'boxes_cropped',
                          'clip_boxes.py', files=['image_box_fns.pkl'], remove_output=True,
                          cmdenvs={'TYPE': 'image'})
    out_dir = 'exemplars_similar_cropped/'
    try:
        shutil.rmtree('exemplars_similar_cropped')
    except OSError:
        pass  # Directory didn't exist; nothing to remove
    print('Outputting cropped')
    os.makedirs(out_dir)
    print(exemplar_out + 'boxes_cropped')
    for x, y in hadoopy.readtb(exemplar_out + 'boxes_cropped'):
        # 'with' closes each output handle promptly (original leaked them)
        with open(out_dir + x, 'w') as out_fp:
            out_fp.write(y)
    hadoopy.launch_frozen([hdfs_input + '1-v', hdfs_input + '0-v'], exemplar_out + 'boxes',
                          'clip_boxes.py', files=['image_box_fns.pkl'], remove_output=True,
                          cmdenvs={'TYPE': 'box'})
    out_dir = 'exemplars_similar/'
    try:
        shutil.rmtree('exemplars_similar')
    except OSError:
        pass  # Directory didn't exist; nothing to remove
    print('Outputting boxes')
    os.makedirs(out_dir)
    for x, y in hadoopy.readtb(exemplar_out + 'boxes'):
        with open(out_dir + x, 'w') as out_fp:
            out_fp.write(y)
def exemplar_boxes(hdfs_input, hdfs_output):
    """Locate a hard-coded exemplar patch, run hard-prediction and clipping
    jobs against it, and download the resulting box/cropped images locally.

    Args:
        hdfs_input: HDFS input root (uses '1-v' positive and '0-v' negative
            subpaths).
        hdfs_output: HDFS output root (reads 'exemplars-2'; job outputs go
            under 'exemplar_boxes/<timestamp>/').
    """
    exemplar_name = 'ad813d130f4803e948124823a67cdd7b-[0.0, 0.16326530612244897, 0.3448275862068966, 0.5714285714285714]'
    st = time.time()
    exemplar_out = hadoopy.abspath(hdfs_output + 'exemplar_boxes/%s' % st) + '/'
    # Scan exemplars-2 for the one exemplar we care about and dump it locally
    for kv in hadoopy.readtb(hdfs_output + 'exemplars-2'):
        (image_id, box, score), _ = kv
        if exemplar_name == '%s-%s' % (image_id, box):
            print('Found it')
            with open('exemplars-patch.pkl', 'w') as fp:
                pickle.dump([kv], fp, -1)
    hadoopy.launch_frozen(hdfs_input + '1-v', exemplar_out + 'val_pos',
                          'hard_predictions.py',
                          cmdenvs=[
                              'EXEMPLARS=exemplars-patch.pkl',
                              'MAX_HARD=100',
                              'OUTPUT_FORMAT=score_image_box'
                          ],
                          files=['exemplars-patch.pkl'], num_reducers=10)
    hadoopy.launch_frozen(hdfs_input + '0-v', exemplar_out + 'val_neg',
                          'hard_predictions.py',
                          cmdenvs=[
                              'EXEMPLARS=exemplars-patch.pkl',
                              'MAX_HARD=100',
                              'OUTPUT_FORMAT=score_image_box'
                          ],
                          files=['exemplars-patch.pkl'], num_reducers=10)
    with open('image_box_fns.pkl', 'w') as fp:
        image_box_fns = {}
        # next() builtin (2.6+/3.x) replaces the Python-2-only .next() method
        pos_boxes = [(score, image_id, box, 1) for score, image_id, box in sorted(
            next(hadoopy.readtb(exemplar_out + 'val_pos'))[1])]
        neg_boxes = [(score, image_id, box, 0) for score, image_id, box in sorted(
            next(hadoopy.readtb(exemplar_out + 'val_neg'))[1])]
        # Rank all boxes (pos+neg) by descending score; filenames encode rank,
        # polarity, and score.
        for num, (score, image_id, box, pol) in enumerate(sorted(pos_boxes + neg_boxes, reverse=True)):
            image_box_fns.setdefault(image_id, []).append(
                (box, 'exemplar-%.5d-%d-%f.png' % (num, pol, score)))
        pickle.dump(image_box_fns, fp, -1)
    hadoopy.launch_frozen([hdfs_input + '1-v', hdfs_input + '0-v'],
                          exemplar_out + 'boxes_cropped', 'clip_boxes.py',
                          files=['image_box_fns.pkl'], remove_output=True,
                          cmdenvs={'TYPE': 'image'})
    out_dir = 'exemplars_similar_cropped/'
    try:
        shutil.rmtree('exemplars_similar_cropped')
    except OSError:
        pass  # Directory didn't exist; nothing to remove
    print('Outputting cropped')
    os.makedirs(out_dir)
    print(exemplar_out + 'boxes_cropped')
    for x, y in hadoopy.readtb(exemplar_out + 'boxes_cropped'):
        # 'with' closes each output handle promptly (original leaked them)
        with open(out_dir + x, 'w') as out_fp:
            out_fp.write(y)
    hadoopy.launch_frozen([hdfs_input + '1-v', hdfs_input + '0-v'],
                          exemplar_out + 'boxes', 'clip_boxes.py',
                          files=['image_box_fns.pkl'], remove_output=True,
                          cmdenvs={'TYPE': 'box'})
    out_dir = 'exemplars_similar/'
    try:
        shutil.rmtree('exemplars_similar')
    except OSError:
        pass  # Directory didn't exist; nothing to remove
    print('Outputting boxes')
    os.makedirs(out_dir)
    for x, y in hadoopy.readtb(exemplar_out + 'boxes'):
        with open(out_dir + x, 'w') as out_fp:
            out_fp.write(y)
def test_err(self):
    """Nonexistent path: predicates return False/identity, reads raise IOError."""
    nonsense_path = "sdfskjdfksjdkfjskdfksjdfksdkfjskdjfksjdk"
    self.assertFalse(hadoopy.exists(nonsense_path))
    # assertEqual: assertEquals is a deprecated unittest alias
    self.assertEqual(hadoopy.abspath(nonsense_path).rsplit("/")[-1], nonsense_path)
    self.assertRaises(IOError, hadoopy.ls, nonsense_path)
    # next() builtin (2.6+/3.x) replaces the Python-2-only .next method
    self.assertRaises(IOError, lambda: next(hadoopy.readtb(nonsense_path)))
def canonicalize_path(path):
    """Return the canonical (absolute HDFS) form of *path*.

    Thin wrapper around hadoopy.abspath; hadoopy is imported lazily so the
    module can be loaded without it.
    """
    import hadoopy as _hadoopy
    return _hadoopy.abspath(path)