def download(self):
    tar_name = self.url.rsplit("/", 1)[-1]

    def _report(count, block_size, total_size):
        perc = 100 * count * block_size / total_size
        sys.stdout.write("\r>> Getting %s %.1f%%" % (tar_name, perc))
        sys.stdout.flush()

    tempd = tempfile.mkdtemp(prefix="pydeep_")
    tar_path = os.path.join(tempd, tar_name)
    tar_path, _ = urllib.request.urlretrieve(self.url, tar_path, _report)
    print()
    dest_dir = hdfs.path.dirname(self.path)
    if dest_dir:
        hdfs.mkdir(dest_dir)
    with tarfile.open(tar_path, "r:gz") as tar:
        try:
            info = tar.getmember(self.filename)
        except KeyError:
            raise ValueError("{} not found in {}".format(
                self.filename, tar_name))
        f_in = tar.extractfile(info)
        with hdfs.open(self.path, "wb") as f_out:
            while True:
                chunk = f_in.read(PAGESIZE)
                if not chunk:
                    break
                f_out.write(chunk)
    shutil.rmtree(tempd)
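A minimal self-contained sketch of the same chunked copy-to-HDFS pattern, assuming a plain local source file rather than a downloaded tarball; copy_local_to_hdfs and the 4 KiB chunk size are illustrative, not taken from the snippet above:

import pydoop.hdfs as hdfs

PAGESIZE = 4096  # illustrative chunk size

def copy_local_to_hdfs(local_path, hdfs_path):
    # Create the parent directory on HDFS if needed, then stream in chunks.
    dest_dir = hdfs.path.dirname(hdfs_path)
    if dest_dir:
        hdfs.mkdir(dest_dir)
    with open(local_path, "rb") as f_in, hdfs.open(hdfs_path, "wb") as f_out:
        while True:
            chunk = f_in.read(PAGESIZE)
            if not chunk:
                break
            f_out.write(chunk)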
def run_alignments(bcl_output_dir, output_dir):
    sample_directories = _get_samples_from_bcl_output(bcl_output_dir)
    logger.info("Found %d samples in bcl output directory", len(sample_directories))
    logger.debug("Making base output directory %s", output_dir)
    phdfs.mkdir(output_dir)
    # launch all the jobs
    base_cmd = [
        get_exec('seal'), 'seqal', '--align-only',
        '-D', 'seal.seqal.nthreads={:d}'.format(GlobalConf['seqal_nthreads']),
        '-D', 'mapreduce.map.cpu.vcores={:d}'.format(GlobalConf['seqal_yarn_cores']),
        '--input-format', GlobalConf.get('seqal_input_fmt', 'prq'),
        '--output-format', GlobalConf.get('seqal_output_fmt', 'sam'),
        '--ref-archive', GlobalConf['reference_archive'],
    ]

    def start_job(sample_dir):
        sample_output_dir = phdfs.path.join(output_dir, os.path.basename(sample_dir))
        cmd = base_cmd + [sample_dir, sample_output_dir]
        # LP: should refactor to start the job within the AlignJob object
        job = AlignJob(cmd=cmd, inputp=sample_dir, outputp=sample_output_dir)
        logger.info("Launching alignment of sample %s", os.path.basename(sample_dir))
        logger.debug("executing command: %s", cmd)
        job.popen_obj = subprocess.Popen(map(str, cmd), bufsize=4096)
        job.popen_obj.poll()
        logger.debug("job running with PID %d", job.popen_obj.pid)
        return job

    jobs = [start_job(s) for s in sample_directories]
    ok = _wait(jobs, GlobalConf['remove_output'])
    if not ok:
        errored_jobs = [j for j in jobs if j.failed]
        logger.error("%d alignment jobs failed", len(errored_jobs))
        logger.error("Here are the return codes: %s",
                     ', '.join(str(j.retcode) for j in errored_jobs))
        raise RuntimeError("Some alignment jobs failed")
def _write(self, data):
    "Internal Write API"
    schema = self.schema
    wmaid = self.wmaid(data)
    year, month, _ = today()
    hdir = '%s/%s/%s' % (self.hdir, year, month)
    if not hdfs.path.isdir(hdir):
        hdfs.mkdir(hdir)
    fname = file_name(hdir, wmaid, self.compress)

    # create Avro writer and binary encoder
    writer = avro.io.DatumWriter(schema)
    bytes_writer = io.BytesIO()
    if self.compress:
        # use gzip'ed writer with BytesIO file object
        gzip_writer = gzip.GzipFile(fileobj=bytes_writer, mode='wb')
        encoder = avro.io.BinaryEncoder(gzip_writer)
    else:
        # plain binary encoder
        encoder = avro.io.BinaryEncoder(bytes_writer)

    # write records from given data stream to binary writer
    writer.write(data, encoder)

    # close gzip stream if necessary
    if self.compress:
        gzip_writer.flush()
        gzip_writer.close()

    # store raw data to hadoop via HDFS
    hdfs.dump(bytes_writer.getvalue(), fname)

    # close bytes stream
    bytes_writer.close()
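A complementary read path can be sketched under the assumption that a file was produced by the _write method above (same schema and compress flag); read_record is a hypothetical helper, not part of the original class:

import gzip
import io

import avro.io
import pydoop.hdfs as hdfs

def read_record(fname, schema, compress=False):
    # Load the raw bytes back from HDFS and decode a single Avro record.
    raw = hdfs.load(fname)
    stream = io.BytesIO(raw)
    if compress:
        stream = gzip.GzipFile(fileobj=stream, mode='rb')
    decoder = avro.io.BinaryDecoder(stream)
    reader = avro.io.DatumReader(schema)
    return reader.read(decoder)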
def mk_hdfs_temp_dir(prefix):
    found = True
    while found:
        tmp = os.path.basename(tempfile.mktemp(prefix=prefix))
        found = phdfs.path.exists(tmp)
    phdfs.mkdir(tmp)
    return tmp
def __setup_remote_paths(self):
    """
    Actually create the working directory and copy the module into it.

    Note: the script has to be readable by Hadoop; though this may not
    generally be a problem on HDFS, where the Hadoop user is usually
    the superuser, things may be different if our working directory is
    on a shared POSIX filesystem.  Therefore, we make the directory
    and the script accessible by all.
    """
    self.logger.debug("remote_wd: %s", self.remote_wd)
    self.logger.debug("remote_exe: %s", self.remote_exe)
    self.logger.debug("remotes: %s", self.files_to_upload)
    if self.args.module:
        self.logger.debug(
            'Generated pipes_code:\n\n %s', self._generate_pipes_code()
        )
    if not self.args.pretend:
        hdfs.mkdir(self.remote_wd)
        hdfs.chmod(self.remote_wd, "a+rx")
        self.logger.debug("created and chmod-ed: %s", self.remote_wd)
        pipes_code = self._generate_pipes_code()
        hdfs.dump(pipes_code, self.remote_exe)
        self.logger.debug("dumped pipes_code to: %s", self.remote_exe)
        hdfs.chmod(self.remote_exe, "a+rx")
        self.__warn_user_if_wd_maybe_unreadable(self.remote_wd)
        for (l, h, _) in self.files_to_upload:
            self.logger.debug("uploading: %s to %s", l, h)
            hdfs.cp(l, h)
    self.logger.debug("Created%sremote paths:" % (
        ' [simulation] ' if self.args.pretend else ' '
    ))
def hdfs_file(odir, name):
    """
    Given HDFS dir and file name create appropriate dir structure on HDFS
    and return full path of the file. We rely on odir/YYYY/MM/DD dir structure.
    """
    # each file is in form YYYYMMDD_HHMM.ext
    tstamp = name.split('/')[-1].split('_')[0]
    if not PAT_YYYYMMDD.match(tstamp):
        raise Exception(
            "Given file name '%s' does not contain YYYYMMDD stamp" % name)
    year = tstamp[:4]
    if not PAT_YYYY.match(year):
        raise Exception("Given file name '%s' does not contain YYYY stamp" % name)
    month = tstamp[4:6]
    if not PAT_MM.match(month):
        raise Exception("Given file name '%s' does not contain MM stamp" % name)
    day = tstamp[6:8]
    if not PAT_DD.match(day):
        raise Exception("Given file name '%s' does not contain DD stamp" % name)
    if not hdfs.path.isdir(odir):
        hdfs.mkdir(odir)
    for subdir in [year, month, day]:
        odir = os.path.join(odir, subdir)
        if not hdfs.path.isdir(odir):
            hdfs.mkdir(odir)
    return os.path.join(odir, name)
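The PAT_* patterns are referenced but not shown in the snippet; a plausible set of definitions, assuming they simply validate the digit groups named in the error messages (these exact regexes are an assumption):

import re

# Hypothetical definitions: the function above only references these names.
PAT_YYYYMMDD = re.compile(r'^[0-9]{8}$')
PAT_YYYY = re.compile(r'^[0-9]{4}$')
PAT_MM = re.compile(r'^[0-9]{2}$')
PAT_DD = re.compile(r'^[0-9]{2}$')

# Example: hdfs_file('/data', '20230401_1200.ext') would create
# /data/2023, /data/2023/04 and /data/2023/04/01 as needed and return
# '/data/2023/04/01/20230401_1200.ext'.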
def mkdir(self):
    for wd in self.local_wd, self.hdfs_wd:
        d1 = "%s/d1" % wd
        d2 = "%s/d2" % d1
        hdfs.mkdir(d2)
        dir_list = hdfs.ls(d1)
        self.assertEqual(len(dir_list), 1)
        self.assertTrue(dir_list[0].endswith(d2))
def __init__(self, prefix=None, logger=None):
    self.wd = self.exe = self.input = self.output = None
    self.logger = logger or utils.NullLogger()
    if prefix:
        self.wd = utils.make_random_str(prefix=prefix)
        hdfs.mkdir(self.wd)
        for n in "input", "output":
            setattr(self, n, hdfs.path.join(self.wd, n))
def mk_hdfs_temp_dir(prefix):
    if not pydoop_here:
        raise NotImplementedError("Pydoop not available on this system")
    found = True
    while found:
        tmp = os.path.basename(tempfile.mktemp(prefix=prefix))
        found = phdfs.path.exists(tmp)
    phdfs.mkdir(tmp)
    return tmp
def __cp_dir(self, wd):
    src_dir = "%s/src_dir" % wd
    hdfs.mkdir(src_dir)
    copy_on_wd = "%s/src_dir_copy" % wd
    copy_on_copy_on_wd = "%s/src_dir" % copy_on_wd
    hdfs.cp(src_dir, copy_on_wd)
    self.assertTrue(hdfs.path.exists(copy_on_wd))
    hdfs.cp(src_dir, copy_on_wd)
    self.assertTrue(hdfs.path.exists(copy_on_copy_on_wd))
    self.assertRaises(IOError, hdfs.cp, src_dir, copy_on_wd)
def __cp_dir(self, wd):
    src_dir = "%s/src_dir" % wd
    hdfs.mkdir(src_dir)
    copy_on_wd = "%s/src_dir_copy" % wd
    copy_on_copy_on_wd = "%s/src_dir" % copy_on_wd
    hdfs.cp(src_dir, copy_on_wd, mode="wb")
    self.assertTrue(hdfs.path.exists(copy_on_wd))
    hdfs.cp(src_dir, copy_on_wd, mode="wb")
    self.assertTrue(hdfs.path.exists(copy_on_copy_on_wd))
    self.assertRaises(IOError, hdfs.cp, src_dir, copy_on_wd)
def __make_tree(self, wd):
    d1 = "%s/d1" % wd
    t1 = FSTree(d1)
    d2 = "%s/d2" % d1
    t2 = t1.add(d2)
    hdfs.mkdir(d2)
    for t, d, bn in ((t1, d1, "f1"), (t2, d2, "f2")):
        f = "%s/%s" % (d, bn)
        hdfs.dump(self.data, f)
        t.add(f, 0)
    return t1
def __init__(self, prefix=None, logger=None):
    hadoop_version_info = pydoop.hadoop_version_info()
    if hadoop_version_info.is_local():
        raise pydoop.LocalModeNotSupported()
    self.wd = self.exe = self.input = self.output = None
    self.logger = logger or utils.NullLogger()
    if prefix:
        self.wd = utils.make_random_str(prefix=prefix)
        hdfs.mkdir(self.wd)
        for n in "input", "output":
            setattr(self, n, hdfs.path.join(self.wd, n))
def __cp_file(self, wd):
    fn = "%s/fn" % wd
    hdfs.dump(self.data, fn)
    dest_dir = "%s/dest_dir" % wd
    hdfs.mkdir(dest_dir)
    fn_copy_on_wd = "%s/fn_copy" % wd
    hdfs.cp(fn, fn_copy_on_wd)
    self.assertEqual(hdfs.load(fn_copy_on_wd), self.data)
    self.assertRaises(IOError, hdfs.cp, fn, fn_copy_on_wd)
    fn_copy_on_dest_dir = "%s/fn" % dest_dir
    hdfs.cp(fn, dest_dir)
    self.assertEqual(hdfs.load(fn_copy_on_dest_dir), self.data)
    self.assertRaises(IOError, hdfs.cp, fn, dest_dir)
def mapper(_, record, writer, conf):
    out_dir = conf.get('out.dir', utils.make_random_str())
    if not hdfs.path.isdir(out_dir):
        hdfs.mkdir(out_dir)
        hdfs.chmod(out_dir, 'g+rwx')
    img_path = record.strip()
    a = get_array(img_path)
    out_a = calc_features(a)
    out_path = hdfs.path.join(out_dir, '%s.out' % hdfs.path.basename(img_path))
    with hdfs.open(out_path, 'w') as fo:
        np.save(fo, out_a)  # actual output
    hdfs.chmod(out_path, 'g+rw')
    writer.emit(img_path, fo.name)  # info (tab-separated input-output)
def __cp_file(self, wd):
    fn = "%s/fn" % wd
    hdfs.dump(self.data, fn, mode="wb")
    dest_dir = "%s/dest_dir" % wd
    hdfs.mkdir(dest_dir)
    fn_copy_on_wd = "%s/fn_copy" % wd
    hdfs.cp(fn, fn_copy_on_wd, mode="wb")
    self.assertEqual(hdfs.load(fn_copy_on_wd), self.data)
    self.assertRaises(IOError, hdfs.cp, fn, fn_copy_on_wd)
    fn_copy_on_dest_dir = "%s/fn" % dest_dir
    hdfs.cp(fn, dest_dir, mode="wb")
    self.assertEqual(hdfs.load(fn_copy_on_dest_dir), self.data)
    self.assertRaises(IOError, hdfs.cp, fn, dest_dir)
def test_isdir(self):
    for path in self.path, self.u_path:
        self.assertFalse(hdfs.path.isdir(path))
        try:
            hdfs.dump("foo\n", path)
            self.assertFalse(hdfs.path.isdir(path))
            hdfs.rmr(path)
            hdfs.mkdir(path)
            self.assertTrue(hdfs.path.isdir(path))
        finally:
            try:
                hdfs.rmr(path)
            except IOError:
                pass
def test_isdir(self):
    path = utils.make_random_str()
    self.assertFalse(hdfs.path.isdir(path))
    try:
        hdfs.dump("foo\n", path)
        self.assertFalse(hdfs.path.isdir(path))
        hdfs.rmr(path)
        hdfs.mkdir(path)
        self.assertTrue(hdfs.path.isdir(path))
    finally:
        try:
            hdfs.rmr(path)
        except IOError:
            pass
def test_kind(self):
    path = utils.make_random_str()
    self.assertTrue(hdfs.path.kind(path) is None)
    try:
        hdfs.dump("foo\n", path)
        self.assertEqual('file', hdfs.path.kind(path))
        hdfs.rmr(path)
        hdfs.mkdir(path)
        self.assertEqual('directory', hdfs.path.kind(path))
    finally:
        try:
            hdfs.rmr(path)
        except IOError:
            pass
def test_kind(self):
    for path in self.path, self.u_path:
        self.assertTrue(hdfs.path.kind(path) is None)
        try:
            hdfs.dump("foo\n", path)
            self.assertEqual('file', hdfs.path.kind(path))
            hdfs.rmr(path)
            hdfs.mkdir(path)
            self.assertEqual('directory', hdfs.path.kind(path))
        finally:
            try:
                hdfs.rmr(path)
            except IOError:
                pass
def __init__(self, file_prefix, loadexist=False, readonly=False):
    CustomStorage.__init__(self)
    if not loadexist:
        if hdfs.path.exists('{0}_0'.format(file_prefix)):
            file_prefix += '_0'
        while hdfs.path.exists('{0}_0'.format(file_prefix)):
            insert_index = file_prefix.rfind('_')
            file_prefix = '{0}_{1}'.format(
                file_prefix[:insert_index],
                int(file_prefix[insert_index + 1:]) + 1)
    self.file_prefix = file_prefix
    self.read_only = readonly
    self.clear()
    logger.info('init hdfs storage from hdfs file_prefix {0}'.format(self.file_prefix))
    try:
        total_start = timeit.default_timer()
        prefix_split = hdfs.path.splitpath(self.file_prefix)
        folder_path = prefix_split[0]
        real_prefix = prefix_split[1] + '_'
        if not hdfs.path.exists(folder_path):
            hdfs.mkdir(folder_path)
        files_info = hdfs.lsl(folder_path)
        # files_info = hdfs.lsl('{0}_*'.format(self.file_prefix))
        logger.debug('files_info:{0}'.format(files_info))
        sizecount = 0
        for file_info in files_info:
            start_time = timeit.default_timer()
            file_name = hdfs.path.splitpath(file_info['path'])[1]
            if file_name.startswith(real_prefix) and file_info['kind'] == 'file':
                logger.debug('file info: {0}'.format(file_info))
                page_id = file_name[len(real_prefix):]
                if not page_id.isdigit():
                    continue
                logger.debug('file {0} page id :{1}#'.format(file_info['path'], page_id))
                # if page_id.isdigit():
                logger.info('load {0}# page file {1}'.format(page_id, file_info['path']))
                content = hdfs.load(file_info['path'], mode='r')
                # logger.debug('{0}# page content:{1}'.format(page_id, content))
                self.pagedict[int(page_id)] = content
                logger.debug('{0}# page load complete'.format(page_id))
                end_time = timeit.default_timer()
                eval(generate_timer_log_str.format(
                    'load {0} {1} byte'.format(
                        file_name, len(self.pagedict[int(page_id)])),
                    start_time, end_time))
                sizecount += len(self.pagedict[int(page_id)])
    except IOError as ie:
        logger.debug(traceback.format_exc())
def calc_bottlenecks(model, img_map, out_dir):
    projector = tflow.BottleneckProjector(model)
    for in_subd, img_paths in img_map.items():
        cls = hdfs.path.basename(in_subd)
        out_subd = hdfs.path.join(out_dir, cls)
        hdfs.mkdir(out_subd)
        bnecks_path = hdfs.path.join(out_subd, BNECKS_BASENAME)
        LOGGER.info("computing bottlenecks for: %s", cls)
        with hdfs.open(bnecks_path, "wb") as out_f:
            for path in img_paths:
                with hdfs.open(path, "rb") as in_f:
                    data = in_f.read()
                checksum = md5(data).digest()
                bneck = projector.project(data)
                out_f.write(checksum + bneck.tobytes())
    projector.close_session()
def __make_tree(self, wd, root="d1", create=True):
    """
    d1
    |-- d2
    |   `-- f2
    `-- f1
    """
    d1 = "%s/%s" % (wd, root)
    t1 = FSTree(d1)
    d2 = "%s/d2" % d1
    t2 = t1.add(d2)
    if create:
        hdfs.mkdir(d2)
    for t, d, bn in ((t1, d1, "f1"), (t2, d2, "f2")):
        f = "%s/%s" % (d, bn)
        if create:
            hdfs.dump(self.data, f, mode="wb")
        t.add(f, 0)
    return t1
def capture(outpath, max_count='3'):
    """
    fab cam.capture:/tmp/cam1,3
    """
    max_count = int(max_count)
    import os
    import cv2
    import copy
    import pydoop.hdfs as hdfs
    cv2.namedWindow('Window1')
    vc = cv2.VideoCapture()
    vc.open(0)
    skip = 50
    max_count *= skip
    basename = os.path.basename(outpath)
    count = 1
    hdfs.mkdir('hdfs://gnn-f02-01' + outpath)
    while True:
        retval, image = vc.read()
        try:
            if count % skip == 0:
                tmpImage = copy.copy(image)
                filename = '%05d.jpg' % (count / skip)
                hdfspath = 'hdfs://gnn-f02-01%(outpath)s/%(filename)s' % locals()
                cv2.putText(tmpImage, filename, (50, 50),
                            cv2.FONT_HERSHEY_SIMPLEX, 2, 2)
                cv2.imshow('Window1', tmpImage)
                cv2.waitKey(1)
                cv2.imwrite(basename + '_' + filename, image)
                hdfs.put(basename + '_' + filename, hdfspath)
                print basename + '_' + filename, hdfspath
            else:
                cv2.imshow('Window1', image)
                cv2.waitKey(1)
        except KeyboardInterrupt:
            break
        count += 1
        if 0 < max_count < count:
            break
    vc.release()
    cv2.destroyWindow('Window1')
def setup(self):
    """
    * Creates an hdfs directory with the name of this test
      (self.make_hdfs_test_path())
    * uploads the local 'input' directory into the hdfs directory
    """
    self.logger.debug("Test setup")
    #hadut.run_hadoop_cmd_e("dfsadmin", args_list=["-safemode", "wait"])
    #self.logger.debug("hdfs out of safe mode")
    if hdfs.path.exists(self.make_hdfs_test_path()):
        error_msg = "hdfs test path '%s' already exists. Please remove it" % \
            self.make_hdfs_test_path()
        self.logger.fatal(error_msg)
        raise RuntimeError(error_msg)
    hdfs.mkdir(self.make_hdfs_test_path())
    local_input = self.make_local_input_path()
    hdfs_input = self.make_hdfs_input_path()
    hdfs.put(local_input, hdfs_input)
    self.logger.info("Copied local input %s to %s", local_input, hdfs_input)
    self.logger.debug("Setup complete")
def execute(self, logger, env=None):
    """
    Executes the command.

    This method calls self.command to build the command array and then
    executes the command. If provided, the specified `env` will be used.
    """
    cmd = self.command(env)
    logger.debug("attempting to remove output path %s", self.output_str)
    try:
        phdfs.rmr(self.output_str)
    except IOError as e:
        logger.warning(e)
    if not phdfs.path.exists(phdfs.path.dirname(self.output_str)):
        phdfs.mkdir(phdfs.path.dirname(self.output_str))
        logger.debug("Created parent of output directory")
    logger.info("Executing command: %s", cmd)
    logger.debug("PATH: %s", (env or os.environ).get('PATH'))
    subprocess.check_call(cmd, env=env)
def __cp_recursive(self, wd):
    src_t = self.__make_tree(wd)
    src = src_t.name
    copy_on_wd = "%s_copy" % src
    src_bn, copy_on_wd_bn = [
        hdfs.path.basename(d) for d in (src, copy_on_wd)
    ]
    hdfs.cp(src, copy_on_wd, mode="wb")
    exp_t = self.__make_tree(wd, root=copy_on_wd_bn, create=False)
    for t, exp_t in czip(src_t.walk(), exp_t.walk()):
        self.assertTrue(hdfs.path.exists(exp_t.name))
        if t.kind == 0:
            self.assertEqual(hdfs.load(exp_t.name), self.data)
    # check semantics when target dir already exists
    hdfs.rmr(copy_on_wd)
    hdfs.mkdir(copy_on_wd)
    hdfs.cp(src, copy_on_wd, mode="wb")
    exp_t = self.__make_tree(copy_on_wd, root=src_bn, create=False)
    for t, exp_t in czip(src_t.walk(), exp_t.walk()):
        self.assertTrue(hdfs.path.exists(exp_t.name))
        if t.kind == 0:
            self.assertEqual(hdfs.load(exp_t.name), self.data)
def __setup_remote_paths(self):
    """
    Actually create the working directory and copy the module into it.

    Note: the script has to be readable by Hadoop; though this may not
    generally be a problem on HDFS, where the Hadoop user is usually
    the superuser, things may be different if our working directory is
    on a shared POSIX filesystem.  Therefore, we make the directory
    and the script accessible by all.
    """
    pipes_code = self.__generate_pipes_code()
    hdfs.mkdir(self.remote_wd)
    hdfs.chmod(self.remote_wd, "a+rx")
    hdfs.dump(pipes_code, self.remote_exe)
    hdfs.chmod(self.remote_exe, "a+rx")
    hdfs.put(self.args.module, self.remote_module)
    hdfs.chmod(self.remote_module, "a+r")
    self.__warn_user_if_wd_maybe_unreadable(self.remote_wd)
    self.logger.debug("Created remote paths:")
    self.logger.debug(self.remote_wd)
    self.logger.debug(self.remote_exe)
    self.logger.debug(self.remote_module)
def mkdir(hdfs_path, project=None):
    """
    Create a directory and its parents as needed.

    Args:
        :hdfs_path: You can specify either a full hdfs pathname or a relative
            one (relative to project_name in HDFS).
        :project: If the supplied hdfs_path is a relative path, it will look
            for that file in this project's subdir in HDFS.
    """
    if project is None:
        project = project_name()
    hdfs_path = _expand_path(hdfs_path, project, exists=False)
    return hdfs.mkdir(hdfs_path)
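A hypothetical usage sketch, assuming a project named "demo" and the relative-path resolution described in the docstring (the exact expansion depends on _expand_path, which is not shown):

# Relative path: resolved against the "demo" project's directory in HDFS.
mkdir("Resources/models", project="demo")
# Parent directories are created as needed, like a recursive mkdir.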
def run_locally(model, input_dirs, output_dir, collate=False):
    hdfs.mkdir(output_dir)
    if collate:
        all_w, all_b = {}, {}
    for d in input_dirs:
        bn = hdfs.path.basename(d)
        weights, biases = get_all_wb(model, d)
        if collate:
            all_w.update({"%s_%s" % (d, t): w for (t, w) in weights.items()})
            all_b.update({"%s_%s" % (d, t): b for (t, b) in biases.items()})
        else:
            w_path = hdfs.path.join(output_dir, "%s_weights.npz" % bn)
            b_path = hdfs.path.join(output_dir, "%s_biases.npz" % bn)
            with hdfs.open(w_path, "wb") as f:
                np.savez(f, **weights)
            with hdfs.open(b_path, "wb") as f:
                np.savez(f, **biases)
    if collate:
        with hdfs.open(hdfs.path.join(output_dir, "weights.npz"), "wb") as f:
            np.savez(f, **all_w)
        with hdfs.open(hdfs.path.join(output_dir, "biases.npz"), "wb") as f:
            np.savez(f, **all_b)
def main():
    # these are hdfs directories
    src_dir = str(sys.argv[1])
    dst_dir = str(sys.argv[2])
    # create dst_dir if it does not exist
    if not pyhdfs.path.exists(dst_dir):
        pyhdfs.mkdir(dst_dir)
    # create sparkcontext
    spark = SparkSession.builder.getOrCreate()
    sc = spark.sparkContext
    # create children path rdd
    children_paths = pyhdfs.ls(src_dir)
    children_paths_rdd = sc.parallelize(children_paths, len(children_paths))
    # each executor task copies one child path
    children_paths_rdd.foreach(lambda file_path: copy_file(
        file_path, os.path.join(dst_dir, os.path.basename(file_path))))
    # stop sparkcontext
    sc.stop()
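The copy_file helper is referenced but not shown; a plausible single-file implementation, sketched under the assumption that pydoop is importable on the Spark executors (the 1 MiB chunk size is illustrative):

import pydoop.hdfs as pyhdfs

def copy_file(src_path, dst_path):
    # Hypothetical helper: stream one HDFS file to its destination path.
    with pyhdfs.open(src_path, "rb") as f_in, \
            pyhdfs.open(dst_path, "wb") as f_out:
        while True:
            chunk = f_in.read(1 << 20)  # 1 MiB chunks (illustrative)
            if not chunk:
                break
            f_out.write(chunk)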
def run_task(factory, port=None, istream=None, ostream=None,
             private_encoding=True, context_class=TaskContext,
             cmd_file=None, fast_combiner=False, auto_serialize=True):
    """
    Run the assigned task in the framework.

    :rtype: bool
    :return: :obj:`True` if the task succeeded.
    """
    connections = resolve_connections(
        port, istream=istream, ostream=ostream, cmd_file=cmd_file,
        auto_serialize=auto_serialize
    )
    context = context_class(connections.up_link,
                            private_encoding=private_encoding,
                            fast_combiner=fast_combiner)
    stream_runner = StreamRunner(factory, context, connections.cmd_stream)
    pstats_dir = os.getenv(PSTATS_DIR)
    if pstats_dir:
        pstats_fmt = os.getenv(PSTATS_FMT, DEFAULT_PSTATS_FMT)
        hdfs.mkdir(pstats_dir)
        fd, pstats_fn = tempfile.mkstemp(suffix=".pstats")
        os.close(fd)
        cProfile.runctx(
            "stream_runner.run()", {"stream_runner": stream_runner},
            globals(), filename=pstats_fn
        )
        name = pstats_fmt % (
            "r" if context.is_reducer() else "m",
            context.get_task_partition(),
            os.path.basename(pstats_fn)
        )
        hdfs.put(pstats_fn, hdfs.path.join(pstats_dir, name))
    else:
        stream_runner.run()
    context.close()
    connections.close()
    return True
def setUp(self):
    self.path = make_random_str() + UNI_CHR
    hdfs.mkdir(self.path)
try:
    app = sys.argv[1].lower()
    city = sys.argv[2].lower()
except IndexError:
    print "Error in application name (Uber/Lyft) or city name (SF/NYC)."
    sys.exit()
if app not in {"uber", "lyft"} or city not in {"sf", "nyc"}:
    print "Error in application name (Uber/Lyft) or city name (SF/NYC)."
    sys.exit()
# Get paths then delete old results and create new path.
file_path = "hdfs://megatron.ccs.neu.edu/user/jiangshan/ridesharing/raw/" + \
    app + "_" + city + "_raw_response"
raw_measurement_path = "hdfs://megatron.ccs.neu.edu/user/jiangshan/ridesharing/proc/" + \
    app + "_" + city + "_raw_measurement"
user_info_path = "resources/" + city + "_user_info.txt"
try:
    hdfs.rmr(raw_measurement_path)
except:
    hdfs.mkdir(raw_measurement_path)
# Get user info dictionary.
user_info = get_user_info(user_info_path=user_info_path)
# Start spark SQL session.
spark = SparkSession.builder.appName(
    "raw_response_to_raw_measurement").getOrCreate()
# Traverse each measurement location.
for data_path in hdfs.ls(file_path):
    data_check = re.search("[A-Z][A-Z]-[0-9][0-9]-[0-9][0-9]", data_path)
    if not data_check:
        continue
    user = user_info[data_check.group()]
    # Load response data RDD.
def perform_copy(options):
    with open(options.src_pathset) as f:
        input_pathset = FilePathset.from_file(f)
    # set up workspace
    workspace = options.workspace
    log.info("Workspace set to %s", workspace)
    if not phdfs.path.exists(workspace):
        log.info("Workspace directory %s doesn't exist. Creating it.", workspace)
        phdfs.mkdir(workspace)
    src_paths = [p for p in input_pathset]
    log.debug("Source paths (first 5 or less): %s", src_paths[0:5])
    # dest_path is a unique path under the workspace whose name should be the
    # same as the Galaxy dataset name.
    dest_path = phdfs.path.join(workspace,
                                phdfs.path.basename(options.output_dataset))
    log.info("Destination path: %s", dest_path)
    if phdfs.path.exists(dest_path):
        raise RuntimeError(
            "Destination path %s already exists. Did you provide a valid "
            "Galaxy output dataset argument?" % dest_path)
    # We need to run a separate copy operation for each "leaf" destination
    # directory. E.g.,
    #   /tmp/dirA/file1 /tmp/dirA/file2 -> workspace/dirA/
    #   /tmp/dirB/file1 -> workspace/dirB/
    #
    # As shown in the example, in general we cannot be sure the source paths
    # have unique basenames. We also cannot rename multiple files on-the-fly
    # (to a new name guaranteed to be unique, such as a uuid4). So, to reduce
    # the number of distcp or cp invocations, we group the source paths
    # together by destination directory (in the example, dirA and dirB).
    # expand for wildcards
    src_uris = [u for wild in src_paths for u in expand_paths(urlparse(wild))]
    log.debug("first 5 src_uris: %s", src_uris[0:5])
    destination_uris = [src_to_dest_path(dest_path, u) for u in src_uris]
    log.debug("first 5 destination_uris: %s", destination_uris[0:5])
    copy_groups = _group_by_dest_dir(src_uris, destination_uris)
    if log.isEnabledFor(logging.DEBUG) and len(copy_groups) > 0:
        tpl = next(copy_groups.iteritems())
        log.debug("one copy group:\n\tdest: %s\n\tsrc: %s", tpl[0], tpl[1])
    try:
        if options.distcp:
            perform_distcp(copy_groups)
        else:
            perform_simple_cp(copy_groups)
    except Exception as e:
        log.critical("Failed to copy data to %s", dest_path)
        log.exception(e)
        log.info("Cleaning up %s, if it exists", dest_path)
        try:
            phdfs.rmr(dest_path)
        except IOError:
            log.debug("Failed to clean-up destination path %s. "
                      "Maybe it was never created.", dest_path)
        raise e
    output_pathset = FilePathset(*copy_groups.iterkeys())
    output_pathset.set_datatype(input_pathset.datatype)
    output_pathset.comment = "Copied from\n" + '\n'.join(src_paths)
    with open(options.output_dataset, 'w') as f:
        output_pathset.write(f)
def main(argv):
    logger = logging.getLogger("main")
    logger.setLevel(logging.DEBUG)
    with Timer() as total_time:
        parser = make_parser()
        args = parser.parse_args(argv)
        if args.dataset:
            print args.dataset
            create_dataset(logger, args.dataset)
        if args.script:
            piped_code_file = args.script
        else:
            piped_code_file = DEFAULT_SCRIPT
        if not os.path.exists(piped_code_file):
            raise IOError("script {0} not found !!!".format(piped_code_file))
        with open(piped_code_file) as f:
            pipes_code = pts.add_sys_path(f.read())
        dataset = [d for d in os.listdir("dataset") if d.endswith("MB")]
        dataset.sort(cmp=lambda x, y: cmp(
            int(x.replace("MB", "")), int(y.replace("MB", ""))
        ))
        logger.info(" Uploading dataset: { %s }", ', '.join(dataset))
        if not hadut.path_exists(os.path.join(DATASET_DIR)):
            logger.info(" dataset folder created")
            hdfs.mkdir(DATASET_DIR)
        for data_filename in dataset:
            source_path = os.path.join(DATASET_DIR, data_filename)
            dest_path = os.path.join(DATASET_DIR, data_filename)
            if not hadut.path_exists(os.path.join(DATASET_DIR, data_filename)):
                logger.info(" -> uploading %s...", source_path)
                hdfs.put(source_path, dest_path)
        update_conf(args)
        results = dict()
        for data_input in dataset:
            with Timer() as t:
                runner = hadut.PipesRunner(prefix=PREFIX, logger=logger)
                logger.info("Running the script %s with data input %s..",
                            piped_code_file, data_input)
                data_input_path = os.path.join(DATASET_DIR, data_input)
                runner.set_input(data_input_path, put=False)
                runner.set_exe(pipes_code)
                runner.run(properties=CONF, hadoop_conf_dir=HADOOP_CONF_DIR,
                           logger=logger)
                res = runner.collect_output()
                print data_input_path
                local_wc = pts.LocalWordCount(data_input_path)
                logging.info(local_wc.check(res))
                #print res
                #runner.clean()
            results[data_input] = (t.secs, t.msecs)
    print "\n\n RESULTs"
    print "=" * (len(piped_code_file) + 15)
    print " * script: {0}".format(piped_code_file)
    print " * mappers: {0}".format(CONF["mapred.map.tasks"])
    print " * reducers: {0}".format(CONF["mapred.reduce.tasks"])
    print " * dataset: [{0}]".format(",".join(dataset))
    print " * times (input -> secs):"
    for data_input in dataset:
        print "   - {0} -> {1} secs.".format(data_input, results[data_input][0])
    print "\n => Total execution time: {0}".format(total_time.secs)
    print "=" * (len(piped_code_file) + 15)
    print "\n"
import os
import itertools
import swiftclient
import pydoop.hdfs as hdfs

container = 'w251-enron'
prefix = 'clean_v2'
hdfs_prefix = '/enron'

authurl = os.environ['SWIFT_AUTH_URL']
user = os.environ['SWIFT_USER']
key = os.environ['SWIFT_KEY']

conn = swiftclient.client.Connection(authurl=authurl, user=user, key=key)
header, objects = conn.get_container(container, prefix=prefix,
                                     full_listing=True)

hdfs.mkdir(hdfs_prefix)

total = len(objects)
count = 1
for obj in objects:
    name = obj['name']
    print 'Downloading %s (%d of %d)' % (name, count, total)
    header, contents = conn.get_object(container, name)
    filename = name.replace('/', '_')
    hdfs.dump(contents, '%s/%s' % (hdfs_prefix, filename))
    count += 1
def open_spider(self, spider):
    self.output_dir = spider.tmp_dir
    self.output_file = f"fg_{spider.html_format}.jsonlines"
    hdfs.mkdir(f"{self.output_dir}")
    self.f = hdfs.open(f"{self.output_dir}/{self.output_file}", "wt")
def create_remote_dir(remote_dir):
    hdfs.mkdir(remote_dir)
    logging.getLogger(__name__).debug(
        "Creating remote directory {0}".format(remote_dir))
def _poly_mkdir(path, *args, **kwargs):
    if path.startswith('hdfs:'):
        return hdfs.mkdir(path, *args, **kwargs)
    else:
        return os.mkdir(path)
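A quick usage sketch of the scheme-based dispatch above (the paths are illustrative):

_poly_mkdir('hdfs://namenode:8020/tmp/demo')  # routed to pydoop's hdfs.mkdir
_poly_mkdir('/tmp/demo')                      # routed to os.mkdir

Note that the two branches differ in semantics: hdfs.mkdir creates missing parent directories, while os.mkdir does not.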