def good(self):
    base_path = make_random_str()
    for path in base_path, base_path + UNI_CHR:
        hdfs.dump("foo\n", path)
        self.assertTrue(hdfs.path.exists(path))
        hdfs.rmr(path)
        self.assertFalse(hdfs.path.exists(path))
def samefile_link(self):
    wd_ = tempfile.mkdtemp(prefix='pydoop_', suffix=UNI_CHR)
    wd = 'file:%s' % wd_
    link = os.path.join(wd_, make_random_str())
    os.symlink(wd_, link)
    self.assertTrue(hdfs.path.samefile('file:%s' % link, 'file:%s' % wd_))
    hdfs.rmr(wd)
def clean_empty_dirs(remote_basedir):
    LOGGER = logging.getLogger(__name__)
    deleted_dirs = []
    # Directory structure is {remote_basedir}/{year}/{month}
    year_dirs = hdfs.ls(remote_basedir)
    # Do an ls to find all month dirs
    for year_dir in year_dirs:
        month_dirs = hdfs.ls(hdfs.path.join(remote_basedir, year_dir))
        # Check to see if month dirs are empty
        month_dirs_deleted = 0
        for month_dir in month_dirs:
            files = hdfs.ls(hdfs.path.join(remote_basedir, year_dir, month_dir))
            if not files:
                LOGGER.debug(
                    "Directory {0} is empty, deleting it".format(month_dir))
                hdfs.rmr(month_dir)
                deleted_dirs.append(month_dir)
                month_dirs_deleted += 1
        if month_dirs_deleted == len(month_dirs):
            # Deleted all month sub-directories, so delete year directory too
            LOGGER.debug(
                "Directory {0} is empty, deleting it".format(year_dir))
            hdfs.rmr(year_dir)
            deleted_dirs.append(year_dir)
    return deleted_dirs
def stat_on_local(self):
    wd_ = tempfile.mkdtemp(prefix='pydoop_', suffix=UNI_CHR)
    p_ = os.path.join(wd_, make_random_str())
    if hdfs.default_is_local():
        wd, p = wd_, p_
        host = "default"
    else:
        wd, p = ('file:%s' % _ for _ in (wd_, p_))
        host = ""
    fs = hdfs.hdfs(host, 0)
    with fs.open_file(p_, 'w') as fo:
        fo.write(make_random_str())
    info = fs.get_path_info(p_)
    fs.close()
    s = hdfs.path.stat(p)
    os_s = os.stat(p_)
    for n in dir(s):
        if n.startswith('st_'):
            try:
                exp_v = getattr(os_s, n)
            except AttributeError:
                try:
                    exp_v = info[self.NMAP[n]]
                except KeyError:
                    continue
            self.assertEqual(getattr(s, n), exp_v)
    self.__check_extra_args(s, info)
    self.__check_wrapper_funcs(p)
    hdfs.rmr(wd)
def run_mapred(model, input_dirs, output_dir, nmaps, log_level, collate=False):
    wd = tempfile.mkdtemp(prefix="pydeep_")
    zip_fn = os.path.join(wd, "{}.zip".format(PACKAGE))
    shutil.make_archive(*zip_fn.rsplit(".", 1), base_dir=PACKAGE)
    if nmaps > len(input_dirs):
        nmaps = len(input_dirs)
        LOGGER.warn("Not enough input dirs, will only do %d splits" % nmaps)
    splits = common.balanced_split(input_dirs, nmaps)
    splits_uri = "pydoop_splits_%s" % uuid.uuid4().hex
    with hdfs.open(splits_uri, 'wb') as f:
        write_opaques([OpaqueInputSplit(1, _) for _ in splits], f)
    submitter = PydoopSubmitter()
    properties = {
        common.GRAPH_ARCH_KEY: model.name,
        common.LOG_LEVEL_KEY: log_level,
        common.NUM_MAPS_KEY: nmaps,
        common.PYDOOP_EXTERNALSPLITS_URI_KEY: splits_uri,
    }
    submitter.set_args(argparse.Namespace(
        D=list(properties.items()),
        avro_input=None,
        avro_output=None,
        cache_archive=None,
        cache_file=None,
        disable_property_name_conversion=True,
        do_not_use_java_record_reader=True,
        do_not_use_java_record_writer=True,
        entry_point="__main__",
        hadoop_conf=None,
        input=input_dirs[0],  # does it matter?
        input_format=None,
        job_conf=None,
        job_name="dump_weights",
        keep_wd=False,
        libjars=None,
        log_level=log_level,
        module=os.path.splitext(os.path.basename(__file__))[0],
        no_override_env=False,
        no_override_home=False,
        no_override_ld_path=False,
        no_override_path=False,
        no_override_pypath=False,
        num_reducers=0,
        output=output_dir,
        output_format=None,
        pretend=False,
        pstats_dir=None,
        python_program=sys.executable,
        python_zip=[zip_fn],
        set_env=None,
        upload_archive_to_cache=None,
        upload_file_to_cache=[__file__],
    ))
    submitter.run()
    hdfs.rmr(splits_uri)
    if collate:
        collate_mapred_output(output_dir)
    shutil.rmtree(wd)
def _try_remove_hdfs_dir(path):
    try:
        phdfs.rmr(path)
        return True
    except StandardError as e:
        logger.error("Error while trying to remove directory %s", path)
        logger.exception(e)
        return False
def realpath(self):
    wd_ = tempfile.mkdtemp(prefix='pydoop_', suffix=UNI_CHR)
    wd = 'file:%s' % wd_
    link = os.path.join(wd_, make_random_str())
    os.symlink(wd_, link)
    expected_path = 'file:%s' % os.path.realpath(wd_)
    self.assertEqual(hdfs.path.realpath('file:%s' % link), expected_path)
    hdfs.rmr(wd)
def __clean_wd(self):
    if self.remote_wd:
        try:
            self.logger.debug(
                "Removing temporary working directory %s", self.remote_wd)
            hdfs.rmr(self.remote_wd)
        except IOError:
            pass
def remove_directory(self, hdfs_path):
    if hdfs_path == "":
        print "No directory specified to delete!"
        return False
    elif self.file_exist(hdfs_path) == False:
        return False
    hdfs.rmr(hdfs_path)
    return True
def cleanup(out_pathset):
    # clean-up job output
    for path in out_pathset:
        try:
            print >> sys.stderr, "Deleting output path", path
            phdfs.rmr(path)
        except StandardError as e:
            print >> sys.stderr, "Error!", str(e)
def _clean_up(*paths):
    for p in paths:
        try:
            log.debug("Removing path: %s", p)
            phdfs.rmr(p)
        except StandardError as e:
            log.warning("Error deleting path %s", p)
            log.exception(e)
def runSparkNumASesInROAs(sc, ip_type):
    roa_prefix_asn = "/hdfs-to-local-path/rpki/ripe/ripe-new-objects/roa-prefix-asn/*"
    savePath = "/hdfs-to-local-path/rpki/results/roas-covering-AScnt-%s" % ip_type
    localPath = "/home/tjchung/research/rpki/src/spark/results/roas-covering-AScnt-%s" % ip_type

    try:
        hdfs.rmr(savePath)
    except:
        pass

    tals = ["apnic", "apnic-iana", "apnic-afrinic", "apnic-arin", "apnic-lacnic",
            "apnic-ripe", "lacnic", "ripencc", "arin", "afrinic", "localcert"]

    k = sc.textFile(roa_prefix_asn)\
        .filter(lambda line: "#" not in line)\
        .map(lambda line: line.rstrip().split("\t"))\
        .filter(lambda (time, prefix_addr, prefix_len, maxlen, asID, num_ips, cc, tal):
                isIPv4v6(prefix_addr, ip_type))\
        .distinct()\
        .map(lambda (time, prefix_addr, prefix_len, maxlen, asID, num_ips, cc, tal):
             ((time, tal), asID))\
        .groupByKey()\
        .map(lambda ((time, tal), num_ases): (time, tal, len(set(num_ases))))

    sqlContext = SQLContext(sc)
    df = sqlContext.createDataFrame(k, ['date', 'tal', 'num_ASes'])

    grouped = df.rdd\
        .map(lambda row: (row.date, (row.tal, row.num_ASes)))\
        .groupByKey()

    def make_row(kv):
        k, vs = kv
        # time: [(-1, cnt), (0, cnt), (1, cnt)] ...
        tmp = dict(list(vs) + [("date", k)])
        return Row(**{k: tmp.get(k, 0) for k in ["date"] + tals})

    reshaped = sqlContext.createDataFrame(grouped.map(make_row))

    k = reshaped.rdd\
        .map(lambda row: (row['date'], row["apnic"], row["apnic-iana"],
                          row["apnic-afrinic"], row["apnic-arin"], row["apnic-lacnic"],
                          row["apnic-ripe"], row["lacnic"], row["ripencc"],
                          row["arin"], row["afrinic"], row["localcert"]))\
        .map(toTSV)
    k.saveAsTextFile(savePath)

    try:
        shutil.rmtree(localPath)
    except:
        pass
    try:
        os.makedirs(localPath)
    except:
        pass

    hdfs.get(savePath, localPath)
    mergeAndSort(localPath)
def realpath(self):
    wd_ = tempfile.mkdtemp(prefix='pydoop_', suffix=UNI_CHR)
    wd = 'file:%s' % wd_
    link = os.path.join(wd_, make_random_str())
    os.symlink(wd_, link)
    expected_path = ('file:%s%s' % ("/private", wd_)
                     if sys.platform == "darwin" else 'file:%s' % wd_)
    self.assertEqual(hdfs.path.realpath('file:%s' % link), expected_path)
    hdfs.rmr(wd)
def delete_files(remote_basedir, retention):
    inodes = walk_remotely(remote_basedir)
    now = time.time()
    deleted_files = []
    for inode in inodes:
        if now - inode['last_mod'] > retention and inode['kind'] == 'file':
            LOGGER.debug("Deleting file {0}".format(inode['path']))
            hdfs.rmr(inode['path'])
            deleted_files.append(inode['path'])
    return deleted_files
def runTest(self):
    path = make_random_str() + UNI_CHR
    hdfs.dump("foo\n", path)
    st = hdfs.path.stat(path)
    atime, mtime = [getattr(st, 'st_%stime' % _) for _ in 'am']
    new_atime, new_mtime = atime + 100, mtime + 200
    hdfs.path.utime(path, (new_atime, new_mtime))
    st = hdfs.path.stat(path)
    self.assertEqual(st.st_atime, new_atime)
    self.assertEqual(st.st_mtime, new_mtime)
    hdfs.rmr(path)
def runSparkNumPrefixWithMaxlen(sc, ip_type="ipv4"):
    roa_prefix_asn = "/hdfs-to-local-path/rpki/ripe/ripe-new-objects/roa-prefix-asn/*"
    localPath = "/home/tjchung/research/rpki/src/spark/results/roa-prefix-with-maxlength"
    savePath = "/hdfs-to-local-path/rpki/results/roa-prefix-with-maxlength"

    try:
        hdfs.rmr(savePath)
    except:
        pass

    k = sc.textFile(roa_prefix_asn)\
        .filter(lambda line: "#" not in line)\
        .map(lambda line: line.rstrip().split("\t"))\
        .filter(lambda (time, prefix_addr, prefix_len, maxlen, asID, num_ips, cc, tal):
                isIPv4v6(prefix_addr, ip_type))\
        .map(lambda (time, prefix_addr, prefix_len, maxlen, asID, num_ips, cc, tal):
             (time, prefix_addr, prefix_len, maxlen, asID, num_ips, cc, tal))\
        .distinct()\
        .map(lambda (time, prefix_addr, prefix_len, maxlen, asID, num_ips, cc, tal):
             ((time, str(int((prefix_len != maxlen) and maxlen != "None"))), 1))\
        .reduceByKey(lambda a, b: a + b)\
        .map(lambda ((time, hasMaxlen), cnt): (time, hasMaxlen, cnt))

    sqlContext = SQLContext(sc)
    df = sqlContext.createDataFrame(k, ['date', 'hasMaxlen', 'cnt'])

    grouped = df.rdd\
        .map(lambda row: (row.date, (row.hasMaxlen, row.cnt)))\
        .groupByKey()

    def make_row(kv):
        k, vs = kv
        tmp = dict(list(vs) + [("date", k)])
        # 1 means has a maxlen
        return Row(**{k: tmp.get(k, 0) for k in ["date", "0", "1"]})

    reshaped = sqlContext.createDataFrame(grouped.map(make_row))

    k = reshaped.rdd\
        .map(lambda row: (row['date'], row['0'], row['1']))\
        .map(toTSV)
    k.saveAsTextFile(savePath)

    try:
        shutil.rmtree(localPath)
    except:
        pass
    try:
        os.makedirs(localPath)
    except:
        pass

    hdfs.get(savePath, localPath)
    mergeAndSort(localPath)
def _tear_down_flink_session(app_id):
    if not app_id:
        raise ValueError("_tear_down_flink_session: empty app id!")
    cmd = ['yarn', 'application', '-kill', app_id]
    logger.info("Killing flink session with app id '%s'", app_id)
    logger.debug("Command: %s", cmd)
    subprocess.check_call(cmd)
    # clean up temporary yarn session files, if any
    path = ".flink/" + app_id
    if phdfs.path.exists(path):
        logger.info("Also removing the session's temporary files in %s", path)
        phdfs.rmr(path)
def execute(self):
    """
    Execute workflow in dedicated directory
    """
    hdfs_output_dir = "workflow_output_{}".format(time.time())
    logger.debug("Setting up workflow")
    logger.debug("CWD: %s", os.getcwd())
    logger.debug("workflow output directory: %s", hdfs_output_dir)
    cmd = [self._program] + [str(arg) for arg in self._args]
    cmd.append(self._input_dir)
    cmd.append(hdfs_output_dir)
    logger.debug("workflow command: %s", cmd)
    wf_logfile = os.path.abspath(GlobalConf['workflow_logfile'])
    logger.info("Executing workflow")
    logger.info("Writing workflow log to %s", wf_logfile)
    self._clear_caches()
    try:
        with open(wf_logfile, 'a') as f:
            logger.info("Starting workflow")
            start_time = time.time()
            retcode = subprocess.call(cmd, stdout=f, stderr=subprocess.STDOUT)
            end_time = time.time()
            run_time = end_time - start_time
            attempt_info = AttemptInfo(cmd, retcode, wf_logfile, run_time)
            if retcode == 0:
                logger.info("Workflow finished")
                logger.info("Attempt took %0.2f seconds", run_time)
                bcl, align = self._get_part_times_from_log(wf_logfile)
                attempt_info.bcl_secs = bcl
                attempt_info.align_secs = align
            else:
                logger.info("Workflow FAILED with exit code %s", retcode)
            return attempt_info
    finally:
        try:
            if phdfs.path.exists(hdfs_output_dir):
                logger.debug(
                    "Removing workflow's temporary output directory %s",
                    hdfs_output_dir)
                phdfs.rmr(hdfs_output_dir)
        except StandardError as e:
            logger.error(
                "Failed to clean up workflow's output directory %s",
                hdfs_output_dir)
            logger.exception(e)
def test_isdir(self):
    for path in self.path, self.u_path:
        self.assertFalse(hdfs.path.isdir(path))
        try:
            hdfs.dump("foo\n", path)
            self.assertFalse(hdfs.path.isdir(path))
            hdfs.rmr(path)
            hdfs.mkdir(path)
            self.assertTrue(hdfs.path.isdir(path))
        finally:
            try:
                hdfs.rmr(path)
            except IOError:
                pass
def test_isdir(self):
    path = utils.make_random_str()
    self.assertFalse(hdfs.path.isdir(path))
    try:
        hdfs.dump("foo\n", path)
        self.assertFalse(hdfs.path.isdir(path))
        hdfs.rmr(path)
        hdfs.mkdir(path)
        self.assertTrue(hdfs.path.isdir(path))
    finally:
        try:
            hdfs.rmr(path)
        except IOError:
            pass
def test_kind(self):
    path = utils.make_random_str()
    self.assertTrue(hdfs.path.kind(path) is None)
    try:
        hdfs.dump("foo\n", path)
        self.assertEqual('file', hdfs.path.kind(path))
        hdfs.rmr(path)
        hdfs.mkdir(path)
        self.assertEqual('directory', hdfs.path.kind(path))
    finally:
        try:
            hdfs.rmr(path)
        except IOError:
            pass
def test_kind(self):
    for path in self.path, self.u_path:
        self.assertTrue(hdfs.path.kind(path) is None)
        try:
            hdfs.dump("foo\n", path)
            self.assertEqual('file', hdfs.path.kind(path))
            hdfs.rmr(path)
            hdfs.mkdir(path)
            self.assertEqual('directory', hdfs.path.kind(path))
        finally:
            try:
                hdfs.rmr(path)
            except IOError:
                pass
def main(argv=None):
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    wd = tempfile.mkdtemp(prefix="pydeep_")
    zip_fn = os.path.join(wd, "{}.zip".format(PACKAGE))
    shutil.make_archive(*zip_fn.rsplit(".", 1), base_dir=PACKAGE)
    parser = make_parser()
    args, unknown_args = parser.parse_known_args(argv)
    args.job_name = WORKER
    args.module = WORKER
    args.upload_file_to_cache = ['%s.py' % WORKER]
    args.python_zip = [zip_fn]
    args.do_not_use_java_record_reader = True
    args.num_reducers = 0
    if args.seed:
        LOGGER.info("setting random seed to %d", args.seed)
        random.seed(args.seed)
    model = models.get_model_info(args.architecture)
    graph = model.load_prep()
    bneck_tensor = model.get_bottleneck(graph)
    bneck_store = ioformats.BottleneckStore(
        bneck_tensor.shape[1].value, bneck_tensor.dtype
    )
    bneck_map = bneck_store.build_map(args.input)
    LOGGER.info("%d subdirs, %r bottlenecks" %
                (len(bneck_map), [len(_) for _ in bneck_map.values()]))
    splits_path = os.path.join(args.input, '_' + uuid.uuid4().hex)
    generate_input_splits(args.num_maps, bneck_map, splits_path)
    submitter = PydoopSubmitter()
    submitter.set_args(args, [] if unknown_args is None else unknown_args)
    submitter.properties.update({
        common.BNECKS_DIR_KEY: args.input,
        common.EVAL_STEP_INTERVAL_KEY: args.eval_step_interval,
        common.GRAPH_ARCH_KEY: args.architecture,
        common.LEARNING_RATE_KEY: args.learning_rate,
        common.LOG_LEVEL_KEY: args.log_level,
        common.NUM_MAPS_KEY: args.num_maps,
        common.NUM_STEPS_KEY: args.num_steps,
        common.PYDOOP_EXTERNALSPLITS_URI_KEY: splits_path,
        common.TRAIN_BATCH_SIZE_KEY: args.train_batch_size,
        common.VALIDATION_BATCH_SIZE_KEY: args.validation_batch_size,
        common.VALIDATION_PERCENT_KEY: args.validation_percent,
    })
    if args.seed:
        submitter.properties[common.SEED_KEY] = args.seed
    submitter.run()
    hdfs.rmr(splits_path)
    shutil.rmtree(wd)
def upsert_a_folder(src_dir, hdfs_tgt_dir, filename, debug):
    src_fname = os.path.join(src_dir, filename)
    tgt_fname = os.path.join(hdfs_tgt_dir, filename)
    # get target file info
    tgt_dict = {}
    try:
        lsl = hdfs.lsl(hdfs_tgt_dir)
        for i in lsl:
            try:
                tgt_dict[os.path.basename(i["name"])] = i["last_mod"]
            except:
                pass
    except:
        pass
    print "hdfs tgt_dict=", tgt_dict
    # get source info
    src_fs = glob.glob(src_fname)
    print "src_fs=", src_fs
    for sf in src_fs:
        # get source file info
        try:
            src_ctime_int = int(os.path.getctime(sf))
        except:
            src_ctime_int = None
        print "src_ctime_int=", src_ctime_int
        src_bfname = os.path.basename(sf)
        tgt_fname = os.path.join(hdfs_tgt_dir, src_bfname)
        # put or rm/put
        try:
            if not src_bfname in tgt_dict:
                # insert new one
                if debug == 'N':
                    hdfs.put(sf, hdfs_tgt_dir)
                else:
                    print "DEBUG: put ", src_bfname, "to", hdfs_tgt_dir
            elif src_ctime_int > tgt_dict[src_bfname]:
                if debug == 'N':
                    hdfs.rmr(tgt_fname)
                    hdfs.put(sf, hdfs_tgt_dir)
                else:
                    print "DEBUG: replace ", tgt_fname, "by", sf
            else:
                print tgt_fname, "has a newer mdate than", sf, ":", src_ctime_int
        except:
            e = sys.exc_info()[0]
            print "Error: ", e
def main(argv):
    logger = logging.getLogger("main")
    logger.setLevel(logging.INFO)
    local_input = argv[1]
    with open(MR_SCRIPT) as f:
        pipes_code = pts.add_sys_path(f.read())
    runner = hadut.PipesRunner(prefix=PREFIX, logger=logger)
    runner.set_input(local_input, put=True)
    runner.set_exe(pipes_code)
    runner.run()
    res = runner.collect_output()
    runner.clean()
    hdfs.rmr(HDFS_WD)
    logger.info("checking results")
    expected_res = local_vc(local_input)
    logger.info(check(res, expected_res))
def clean_directory(dir, spam_life=spam_ttl):
    # Accepts a directory name and deletes anything older than TTL in days
    file_list = []
    # check the existence of the directory
    if hdfs.path.exists(dir):
        # get a list of all files there
        file_list = hdfs.lsl(dir)
        # loop through the file list
        for listing in file_list:
            # get the last access time of the file and compare to spam lifetime
            if time.time() - listing['last_access'] > 86400 * spam_life:  # 86400 seconds in a day
                # if it's too old, delete it and log that it was deleted
                logger.info('Deleting ' + listing['name'])
                hdfs.rmr(listing['name'])
def main(args):
    logger.setLevel(logging.DEBUG)
    options = parse_args(args)
    logger.setLevel(options.log_level)
    logger.info("Running workflow with the following configuration")
    logger.info("n_nodes: %d", options.n_nodes)
    logger.info("bcl converter jar %s", options.jar_path)
    logger.info("Other conf:\n%s", GlobalConf)
    start_time = time.time()
    try:
        if options.skip_bcl:
            logger.info("Skipping bcl conversion as requested")
            tmp_output_dir = options.input
        else:
            tmp_output_dir = mk_hdfs_temp_dir('bcl_output_')
            logger.debug("Temporary output directory on HDFS: %s", tmp_output_dir)
            run_bcl_converter(options.input, tmp_output_dir,
                              options.n_nodes, options.jar_path)
        time_after_bcl = time.time()
        run_alignments(tmp_output_dir, options.output)
        time_after_align = time.time()
    finally:
        if options.keep_intermediate:
            logger.info("Leaving intermediate data in directory %s", tmp_output_dir)
        elif not options.skip_bcl:
            # if we skipped bcl, tmp_conf_dir is the input directory
            try:
                phdfs.rmr(tmp_output_dir)
            except StandardError as e:
                logger.error(
                    "Error while trying to remove temporary output directory %s",
                    tmp_output_dir)
                logger.exception(e)
    finish_time = time.time()
    logger.info("Seconds for bcl conversion: %0.2f", (time_after_bcl - start_time))
    logger.info("Seconds for alignment: %0.2f", (time_after_align - time_after_bcl))
    logger.info("Total execution time: %0.2f", (finish_time - start_time))
def setUp(self):
    import gzip
    import shutil
    try:
        os.utime(landing_zone + '/badfile.txt', None)
    except OSError:
        f = open(landing_zone + '/badfile.txt', 'a')
        for x in range(0, 10):
            f.write('line ' + str(x) + '\n')
        f.close()
    with open(landing_zone + '/badfile.txt', 'rb') as f_in, \
            gzip.open(landing_zone + '/badfile.txt.gz', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    try:
        os.utime(landing_zone + '/badfile.txt', None)
    except OSError:
        open(landing_zone + '/badfile.txt', 'a').close()
    try:
        os.utime(landing_zone + '/sample.txt', None)
    except OSError:
        open(landing_zone + '/sample.txt', 'a').close()
    try:
        os.utime(landing_zone + '/sandbox.txt', None)
    except OSError:
        open(landing_zone + '/sandbox.txt', 'a').close()
    try:
        hdfs.rmr(hdfs.path.expanduser("~") + '/data/none/test')
    except IOError:
        pass
    try:
        if hdfs.path.exists(hdfs.path.expanduser("~") + '/data/duplicate/'):
            hdfs.rmr(hdfs.path.expanduser("~") + '/data/duplicate/')
    except IOError:
        pass
    try:
        if hdfs.path.exists(hdfs.path.expanduser("~") + '/data/spam/'):
            hdfs.rmr(hdfs.path.expanduser("~") + '/data/spam/')
    except IOError:
        pass
    try:
        if hdfs.path.exists(
                hdfs.path.expanduser("~") +
                '/data/sandbox/bria644/sandboxFile/sandbox.txt'):
            hdfs.rmr(
                hdfs.path.expanduser("~") +
                '/data/sandbox/bria644/sandboxFile/sandbox.txt')
    except IOError:
        pass
def stat(self):
    if hdfs.default_is_local():
        return
    bn = '%s%s' % (make_random_str(), UNI_CHR)
    fn = '/user/%s/%s' % (DEFAULT_USER, bn)
    fs = hdfs.hdfs("default", 0)
    p = "hdfs://%s:%s%s" % (fs.host, fs.port, fn)
    with fs.open_file(fn, 'w') as fo:
        fo.write(make_random_str())
    info = fs.get_path_info(fn)
    fs.close()
    s = hdfs.path.stat(p)
    for n1, n2 in self.NMAP.iteritems():
        attr = getattr(s, n1, None)
        self.assertFalse(attr is None)
        self.assertEqual(attr, info[n2])
    self.__check_extra_args(s, info)
    self.__check_wrapper_funcs(p)
    hdfs.rmr(p)
def run(self):
    exit_code = 1
    with tempfile.NamedTemporaryFile() as f:
        self.log.debug("opened scratch MR job input file %s", f.name)
        # We write the files to be compressed to a temporary file. Later we'll
        # re-read this temporary file to rename the files as well. I've opted
        # not to keep the table in memory in the hope of scaling better to jobs
        # with a large number of files (we reduce memory requirements).
        num_files = self.__write_mr_input(f)
        f.flush()
        self.log.debug("Finished writing temp input file")
        input_filename = tempfile.mktemp(dir=os.path.dirname(self.output_path),
                                         prefix="dist_txt_zipper_input")
        tmpfile_uri = "file://%s" % f.name
        try:
            self.log.debug("copying input from %s to %s", tmpfile_uri, input_filename)
            hdfs.cp(tmpfile_uri, input_filename)
            self.log.info("Run analyzed. Launching distributed job")
            # launch mr task
            pydoop_args = [
                'script', '--num-reducers', '0', '--kv-separator', '',
                '-Dmapred.map.tasks=%d' % num_files,
                '-Dmapred.input.format.class=org.apache.hadoop.mapred.lib.NLineInputFormat',
                '-Dmapred.line.input.format.linespermap=1',
                '-Dmapred.output.compress=true',
                '-Dmapred.output.compression.codec=%s' % 'org.apache.hadoop.io.compress.GzipCodec',
                text_zipper_mr.__file__,
                input_filename, self.output_path]
            self.log.debug("pydoop_args: %s", pydoop_args)
            self.log.info("Compressing %s files", num_files)
            pydoop_app.main(pydoop_args)
            self.log.info("Distributed job complete")
            self.rename_compressed_files(f)
            self.log.info("finished")
            exit_code = 0
        finally:
            try:
                self.log.debug("Removing temporary input file %s", input_filename)
                hdfs.rmr(input_filename)
            except IOError as e:
                self.log.warning("Problem cleaning up. Error deleting temporary "
                                 "input file %s", input_filename)
                self.log.exception(str(e))
    return exit_code
def execute(self, logger, env=None):
    """
    Executes the command.

    This method calls self.command to build the command array and then
    executes the command. If provided, the specified `env` will be used.
    """
    cmd = self.command(env)
    logger.debug("attempting to remove output path %s", self.output_str)
    try:
        phdfs.rmr(self.output_str)
    except IOError as e:
        logger.warning(e)
    if not phdfs.path.exists(phdfs.path.dirname(self.output_str)):
        phdfs.mkdir(phdfs.path.dirname(self.output_str))
        logger.debug("Created parent of output directory")
    logger.info("Executing command: %s", cmd)
    logger.debug("PATH: %s", (env or os.environ).get('PATH'))
    subprocess.check_call(cmd, env=env)
def clean_up(self):
    if sys.argv.count('--no-cleanup') > 0:
        self.logger.warn("User specified --no-cleanup. Not deleting temporary files")
        self.logger.warn("output dir: %s", self.output_dir)
        self.logger.warn("hdfs input path: %s", self.make_hdfs_input_path())
        self.logger.warn("hdfs output path: %s", self.make_hdfs_output_path())
        self.logger.warn("hdfs test path: %s", self.make_hdfs_test_path())
        return
    self.rm_output_dir()
    try:
        hdfs.rmr(self.make_hdfs_input_path())
    except Exception as e:
        self.logger.warning(e)
    try:
        hdfs.rmr(self.make_hdfs_output_path())
    except Exception as e:
        self.logger.warning(e)
    try:
        hdfs.rmr(self.make_hdfs_test_path())
    except Exception as e:
        self.logger.warning(e)
def runSparkClassifyHijackingUniquePrefixDuration(sc, dataset, ip_type):
    readPath = "/spark-hdfs-path/rpki/results/rpki-enabled-unique-prefix-asn-%s/%s" % (
        ip_type, dataset)
    savePath = "/spark-hdfs-path/rpki/results/rpki-enabled-unique-prefix-classify-hijack-duration-%s/%s" % (
        ip_type, dataset)
    localPath = "/local-spark-result-path/research/rpki/src/spark/results/rpki-unique-prefix-classify-hijack-duration-%s/%s" % (
        ip_type, dataset)

    try:
        hdfs.rmr(savePath)
    except:
        pass

    k = sc.textFile(readPath)\
        .map(lambda v: parseVerifyLineUniquePrefix(v))\
        .filter(lambda v: v is not None)\
        .filter(lambda v: notDataError(dataset, v))\
        .filter(lambda v: isIPv4v6(v, ip_type))\
        .filter(lambda v: classifyBGPAdvSparse(v) == "rpki-invalid")\
        .filter(lambda v: ip_type == "ipv6" or not isLargerSlash24(v))\
        .filter(lambda v: onlyHijackAttempt(v))\
        .map(lambda v: ((classifyHijack(v), v['prefix_addr'], v['prefix_len'],
                         v['origin_as']), v['time']))\
        .groupByKey()\
        .map(lambda ((classifyHijack, prefix_addr, prefix_len, origin), list_of_time):
             (classifyHijack, prefix_addr, prefix_len, origin, len(set(list_of_time))))\
        .map(toTSV)\
        .saveAsTextFile(savePath)

    try:
        shutil.rmtree(localPath)
    except:
        pass
    try:
        os.makedirs(localPath)
    except:
        pass

    hdfs.get(savePath, localPath)
    mergeAndSort(localPath)
def rmr(hdfs_path, project=None):
    """
    Recursively remove files and directories.

    Args:
        :hdfs_path: You can specify either a full hdfs pathname or a relative one
            (relative to project_name in HDFS).
        :project: If the supplied hdfs_path is a relative path, it will look for
            that file in this project's subdir in HDFS.
    """
    if project is None:
        project = project_name()
    hdfs_path = _expand_path(hdfs_path, project)
    return hdfs.rmr(hdfs_path)
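A minimal usage sketch for the `rmr()` wrapper above. The paths and the helper name `_example_rmr_usage` are hypothetical; the behavior assumed here is only what the docstring states (relative paths resolve under the project's HDFS subdirectory, full HDFS paths are used as given).

def _example_rmr_usage():
    # hypothetical paths; relative paths are expanded via _expand_path()
    rmr("Resources/old_output")                   # relative: resolved under the current project
    rmr("Resources/old_output", project="demo")   # relative: resolved under the "demo" project
    rmr("/Projects/demo/Resources/old_output")    # full pathname: passed to hdfs.rmr as given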
def __cp_recursive(self, wd):
    src_t = self.__make_tree(wd)
    src = src_t.name
    copy_on_wd = "%s_copy" % src
    src_bn, copy_on_wd_bn = [
        hdfs.path.basename(d) for d in (src, copy_on_wd)
    ]
    hdfs.cp(src, copy_on_wd, mode="wb")
    exp_t = self.__make_tree(wd, root=copy_on_wd_bn, create=False)
    for t, exp_t in czip(src_t.walk(), exp_t.walk()):
        self.assertTrue(hdfs.path.exists(exp_t.name))
        if t.kind == 0:
            self.assertEqual(hdfs.load(exp_t.name), self.data)
    # check semantics when target dir already exists
    hdfs.rmr(copy_on_wd)
    hdfs.mkdir(copy_on_wd)
    hdfs.cp(src, copy_on_wd, mode="wb")
    exp_t = self.__make_tree(copy_on_wd, root=src_bn, create=False)
    for t, exp_t in czip(src_t.walk(), exp_t.walk()):
        self.assertTrue(hdfs.path.exists(exp_t.name))
        if t.kind == 0:
            self.assertEqual(hdfs.load(exp_t.name), self.data)
def run(self):
    pydoop_exec = self.find_exec('pydoop')
    if pydoop_exec is None:
        raise RuntimeError("Can't find pydoop executable in PATH")
    with tempfile.NamedTemporaryFile() as f:
        num_records = self.__write_mr_input(f)
        f.flush()
        self.log.debug("Wrote temp input file %s", f.name)
        input_filename = tempfile.mktemp(dir=os.path.dirname(self.output_path),
                                         prefix="dist_bcl2qseq_input")
        tmpfile_uri = "file://%s" % f.name
        try:
            self.log.debug("copying input from %s to %s", tmpfile_uri, input_filename)
            hdfs.cp(tmpfile_uri, input_filename)
            self.log.info("Run analyzed. Launching distributed job")
            # launch mr task
            cmd = [
                'pydoop', 'script', '--num-reducers', '0', '--kv-separator', '',
                '-Dmapred.map.tasks=%d' % num_records,
                '-Dmapred.input.format.class=org.apache.hadoop.mapred.lib.NLineInputFormat',
                '-Dmapred.line.input.format.linespermap=1',
                bcl2qseq_mr.__file__,
                input_filename, self.output_path]
            self.log.debug(str(cmd))
            subprocess.check_call(cmd)
            self.log.info("Distributed job complete")
        except subprocess.CalledProcessError as e:
            self.log.exception(e)
            self.log.error("Error running pydoop script component")
            raise
        finally:
            try:
                hdfs.rmr(input_filename)
            except IOError as e:
                self.log.debug("Problem cleaning up. Error deleting temporary "
                               "input file %s", input_filename)
                self.log.debug(str(e))
def clean(self):
    """
    Remove the working directory, if any.
    """
    if self.wd:
        hdfs.rmr(self.wd)
def samefile_rel(self):
    p = make_random_str() + UNI_CHR
    hdfs.dump("foo\n", p)
    self.assertTrue(hdfs.path.samefile(p, hdfs.path.abspath(p)))
    hdfs.rmr(p)
def tearDown(self):
    hdfs.rmr(self.path)
def good(self):
    path = utils.make_random_str()
    hdfs.dump("foo\n", path)
    self.assertTrue(hdfs.path.exists(path))
    hdfs.rmr(path)
    self.assertFalse(hdfs.path.exists(path))