def clean_empty_dirs(remote_basedir):
    LOGGER = logging.getLogger(__name__)
    deleted_dirs = []
    # Directory structure is {remote_basedir}/{year}/{month}
    year_dirs = hdfs.ls(remote_basedir)
    for year_dir in year_dirs:
        # Do an ls to find all month dirs
        month_dirs = hdfs.ls(hdfs.path.join(remote_basedir, year_dir))
        # Check to see if month dirs are empty
        month_dirs_deleted = 0
        for month_dir in month_dirs:
            files = hdfs.ls(hdfs.path.join(remote_basedir, year_dir, month_dir))
            if not files:
                LOGGER.debug("Directory {0} is empty, deleting it".format(month_dir))
                hdfs.rmr(month_dir)
                deleted_dirs.append(month_dir)
                month_dirs_deleted += 1
        if month_dirs_deleted == len(month_dirs):
            # Deleted all month sub-directories, so delete the year directory too
            LOGGER.debug("Directory {0} is empty, deleting it".format(year_dir))
            hdfs.rmr(year_dir)
            deleted_dirs.append(year_dir)
    return deleted_dirs
def move(self):
    for wd in self.local_wd, self.hdfs_wd:
        t1 = self.__make_tree(wd)
        t2 = [_ for _ in t1.children if _.kind == 1][0]
        f2 = t2.children[0]
        hdfs.move(f2.name, t1.name)
        ls = [os.path.basename(_) for _ in hdfs.ls(t1.name)]
        self.assertTrue(os.path.basename(f2.name) in ls)
        self.assertEqual(len(hdfs.ls(t2.name)), 0)
def get_default_fs():
    root_ls = phdfs.ls('/')
    if root_ls:
        uri = Uri(urlparse.urlparse(root_ls[0]))
        return uri
    else:
        raise RuntimeError(
            "Could not determine URI of the default file system: root listing of '/' is empty.")
def server():
    print 'Server...'
    context = zmq.Context()
    socket = context.socket(zmq.REP)
    socket.bind(server_tcp)
    client_list = []
    hdfs_url = '/datasets/corpus/enwiki-11g'
    file_list = hdfs.ls(hdfs_url)
    print len(file_list)
    while True:
        message = socket.recv()
        if message.startswith("connect"):
            client_list.append(message.split(':')[1])
            socket.send("connected")
        elif message.startswith("read"):
            client = message.split(':')[1]
            print client
            if len(file_list) == 0:
                socket.send("done")
                client_list.remove(client)
                if len(client_list) == 0:
                    return
            if client in client_list:
                f = file_list.pop()
                print len(file_list)
                print f
                socket.send_string("file>" + f)
def get_all_wb(model, checkpoint_dir):
    """\
    Get all weights and biases from model checkpoints in checkpoint_dir.

    checkpoint_dir:
      part-m-00000.zip
      part-m-00001.zip
      ...

    return: {"00000": W0, "00001": W1, ...}, {"00000": b0, "00001": b1, ...}
    """
    paths = []
    tags = {}
    for p in hdfs.ls(checkpoint_dir):
        m = re.match(r"^part-m-(\d+)\.zip$", hdfs.path.basename(p))
        if m:
            paths.append(p)
            tags[p] = m.groups()[0]
    weights, biases = {}, {}
    for p in paths:
        t = tags[p]
        weights[t], biases[t] = get_wb(model, p)
        LOGGER.info("%s: W %r b %r", p, weights[t].shape, biases[t].shape)
    return weights, biases
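# Hypothetical usage sketch for get_all_wb() above; the model object and the
# checkpoint directory path are illustrative assumptions, not taken from the source.
#
#     weights, biases = get_all_wb(model, "hdfs:///user/me/checkpoints")
#     W0, b0 = weights["00000"], biases["00000"]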
def _get_hopsfs_dataset_files(training_dataset_location, split, filter_empty):
    path = training_dataset_location.replace("hopsfs", "hdfs")
    if split is None:
        path = hdfs.path.abspath(path)
    else:
        path = hdfs.path.abspath(path + "/" + str(split))

    input_files = []
    all_list = hdfs.ls(path, recursive=True)

    # Remove directories and spark '_SUCCESS'
    include_file = True
    for file in all_list:
        # remove empty file if any
        if filter_empty:
            _file_size = hdfs.path.getsize(file)
            if _file_size == 0:
                include_file = False
            else:
                include_file = True
        if (not hdfs.path.isdir(file) and not file.endswith("_SUCCESS")
                and include_file):
            input_files.append(file)

    return input_files
def evaluate():
    prefix = 'frozen_graph'
    # freeze_graph("prob")
    time0 = time.time()
    graph = load_frozen_graph(prefix=prefix)
    with tf.Session(graph=graph) as sess:
        sess.run(graph.get_operation_by_name(prefix + '/init_all_tables'))
        empty_X = {'feat_ids': [], 'feat_vals': []}
        label_list, hdfs_files = [], []
        for xfile in hdfs.ls(hdfs_dir):
            if hdfs.path.isdir(xfile):
                continue
            hdfs_files.append(xfile)
        pred_fp = open('pred', 'w')
        label_fp = open('label', 'w')
        print("Begin inference")
        for i in range(190, len(hdfs_files)):
            train_fp = hdfs.open(hdfs_files[i], 'rt')
            end_of_file = False
            while True:
                X_validate = copy.deepcopy(empty_X)
                read_line_num = 0
                while True:
                    line = train_fp.readline().strip().split(' ')
                    if len(line) != len(_COLUMNS):
                        end_of_file = True
                        break
                    X_validate['feat_ids'].append(
                        list(map(lambda x: [int(x.split(':')[0])], line[1:])))
                    X_validate['feat_vals'].append(
                        list(map(lambda x: [float(x.split(':')[1])], line[1:])))
                    label_list.append(line[0])
                    read_line_num += 1
                    if read_line_num == FLAGS.batch_size:
                        break
                input_feed = dict()
                input_feed[sess.graph.get_tensor_by_name(
                    prefix + "/" + 'IteratorGetNext:0')] = X_validate['feat_ids']
                input_feed[sess.graph.get_tensor_by_name(
                    prefix + "/" + 'IteratorGetNext:1')] = X_validate['feat_vals']
                prob = graph.get_operation_by_name(prefix + "/prob").outputs[-1]
                pred = sess.run(prob, feed_dict=input_feed)
                np.savetxt(pred_fp, pred, delimiter='\n', fmt='%s')
                if end_of_file:
                    break
            label_fp.writelines('\n'.join([str(x) for x in label_list]) + '\n')
            train_fp.close()
        pred_fp.close()
        label_fp.close()
        os.system("paste -d '\t' pred label > prob_label")
        os.system("python evaluate.py prob_label")
        time1 = time.time()
        print("evaluate cost: ", time1 - time0)
def mkdir(self):
    for wd in self.local_wd, self.hdfs_wd:
        d1 = "%s/d1" % wd
        d2 = "%s/d2" % d1
        hdfs.mkdir(d2)
        dir_list = hdfs.ls(d1)
        self.assertEqual(len(dir_list), 1)
        self.assertTrue(dir_list[0].endswith(d2))
def list_images(input_dir):
    ret = []
    p = re.compile(r".*\.jpe?g$", re.IGNORECASE)
    ls = [_['name'] for _ in hdfs.lsl(input_dir) if _['kind'] == 'directory']
    for d in ls:
        ret.extend([_ for _ in hdfs.ls(d) if p.match(_)])
    LOGGER.info("%d classes, %d total images", len(ls), len(ret))
    return ret
def read_csv_from_hdfs(path, cols, col_types=None):
    files = hdfs.ls(path)
    pieces = []
    for f in files:
        fhandle = hdfs.open(f)
        pieces.append(pd.read_csv(fhandle, names=cols, dtype=col_types))
        fhandle.close()
    return pd.concat(pieces, ignore_index=True)
def json_from_hdfs(url):
    assert hdfs.path.isdir(url)
    file_lists = hdfs.ls(url)
    for fi in file_lists:
        with hdfs.open(fi, "r") as f:
            items = f.read().strip().split('\n')
            for it in items:
                it = loads(it)
                it['md5'] = hashlib.md5(str(it)).hexdigest()
                yield it
def check(self, args):
    self.root = args["root"]
    self.hdfs_root = args["hdfs_root"]
    print("checking: %s" % self.root)
    print("checking hdfs: %s" % self.hdfs_root)
    if not path.isdir(self.hdfs_root):
        return False
    if not os.path.isdir(self.root):
        return False
    return self.walk(self.root, os.listdir(self.root),
                     self.hdfs_root, hdfs.ls(self.hdfs_root))
def __init__(self, uri, compress=True):
    "ctor with hdfs uri: hdfsio:/path/schema.avsc"
    Storage.__init__(self, uri)
    schema = self.uri
    if not hdfs.ls(schema):
        raise Exception("No avro schema file found in provided uri: %s" % uri)
    self.hdir = self.uri.rsplit('/', 1)[0]
    if not hdfs.path.isdir(self.hdir):
        raise Exception('HDFS path %s does not exist' % self.hdir)
    schema_doc = hdfs.load(schema)
    self.schema = avro.schema.parse(schema_doc)
    self.compress = compress
def ls(hdfs_path, recursive=False):
    """
    Lists a directory in HDFS.

    Args:
        :hdfs_path: You can specify either a full hdfs pathname or a relative one
            (relative to your Project's path in HDFS).

    Returns:
        A list of hdfs paths.
    """
    hdfs_path = _expand_path(hdfs_path)
    return hdfs.ls(hdfs_path, recursive=recursive)
def read_all_data(file_path='/home/ad/wujindou/text_0908'):
    valid_path = ['part-' + filename.split('part-')[1]
                  for filename in hdfs.ls(file_path) if 'part' in filename]
    data_all = []
    uniq = set()
    for filename in valid_path:
        with hdfs.open(file_path + '/' + filename) as f:
            for line in f:
                if line.decode() in uniq:
                    continue
                # add the decoded line (calling decode(), not the bound method)
                uniq.add(line.decode())
                data_all.append(line.decode())
    import random
    random.shuffle(data_all)
    return data_all
def read(self, storage_connector, data_format, read_options, location, split):
    df_list = []

    if storage_connector.connector_type == storage_connector.HOPSFS:
        # providing more informative error
        try:
            from pydoop import hdfs
        except ImportError as err:
            raise ModuleNotFoundError(
                "Reading training dataset from HopsFS requires `pydoop`") from err

        util.setup_pydoop()
        if split is None:
            path_list = hdfs.ls(location, recursive=True)
        else:
            path_list = hdfs.ls(location + "/" + str(split), recursive=True)

        for path in path_list:
            if (hdfs.path.isfile(path) and not path.endswith("_SUCCESS")
                    and hdfs.path.getsize(path) > 0):
                if data_format.lower() == "csv":
                    df_tmp = pd.read_csv(path)
                elif data_format.lower() == "tsv":
                    df_tmp = pd.read_csv(path, sep="\t")
                elif data_format.lower() == "parquet":
                    df_tmp = pd.read_parquet(path)
                else:
                    raise TypeError(
                        "{} training dataset format is not supported to read as pandas"
                        " dataframe. If you are using `tfrecord` use the `.tf_data`"
                        " helper functions.".format(data_format))
                df_list.append(df_tmp)
    else:
        raise NotImplementedError(
            "{} Storage Connectors for training datasets are not supported yet for"
            " external environments.".format(storage_connector.connector_type))
    return pd.concat(df_list, ignore_index=True)
def ls(hdfs_path, recursive=False, exclude_nn_addr=False):
    """
    Lists a directory in HDFS.

    Args:
        :hdfs_path: You can specify either a full hdfs pathname or a relative one
            (relative to your Project's path in HDFS).
        :exclude_nn_addr: if True, strip the namenode ip:port address from hdfs_path.

    Returns:
        A list of hdfs paths.
    """
    if exclude_nn_addr:
        hdfs_path = re.sub(r"\d+\.\d+\.\d+\.\d+:\d+", "", hdfs_path)
    hdfs_path = _expand_path(hdfs_path)
    return hdfs.ls(hdfs_path, recursive=recursive)
def __init__(self, uri, wmauri, yarn=''):
    "ctor with LTS uri (hdfs:///path/schema.avsc) and WMArchive uri"
    self.uri = uri
    if not hdfs.ls(self.uri):
        raise Exception("No avro schema file found in provided uri: %s" % uri)
    self.hdir = self.uri.rsplit('/', 1)[0]
    if not hdfs.path.isdir(self.hdir):
        raise Exception('HDFS path %s does not exist' % self.hdir)
    schema_doc = hdfs.load(self.uri)
    self.schema = avro.schema.parse(schema_doc)
    self.taskmgr = TaskManager()
    self.wmauri = wmauri  # WMArchive URL which will be used by submit
    if not self.wmauri.endswith('/wmarchive/data'):
        self.wmauri = '%s/wmarchive/data' % self.wmauri
    self.yarn = yarn
def ls(hdfs_path, recursive=False, project=None):
    """
    Returns all the pathnames in the supplied directory.

    Args:
        :hdfs_path: You can specify either a full hdfs pathname or a relative one
            (relative to project_name in HDFS).
        :recursive: if it is a directory and recursive is True, the list contains one
            item for every file or directory in the tree rooted at hdfs_path.
        :project: If the supplied hdfs_path is a relative path, it will look for that
            file in this project's subdir in HDFS.

    Returns:
        A possibly-empty list of path names stored in the supplied path.
    """
    if project is None:
        project = project_name()
    hdfs_path = _expand_path(hdfs_path, project)
    return hdfs.ls(hdfs_path, recursive=recursive)
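# Hypothetical usage sketch for the ls() wrapper above; the project-relative path is
# an illustrative assumption, not taken from the source.
#
#     for p in ls("Resources/my_dataset", recursive=True):
#         print(p)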
def run_mr_app(args, logger):
    logger.debug("local LIBHDFS_OPTS: %r" % (os.getenv("LIBHDFS_OPTS"),))
    logger.info("running MapReduce application")
    mr_out_dir = run_phase_one(args, logger)
    for nm in args.mappers[1:]:
        input_ = random_str(args)
        logger.info("running consolidation step, input=%r" % (input_,))
        with hdfs.open(input_, "w", user=args.hdfs_user) as fo:
            ls = [_ for _ in hdfs.ls(mr_out_dir, user=args.hdfs_user)
                  if hdfs.path.basename(_).startswith("part")]
            logger.debug("found %d data files in %r" % (len(ls), mr_out_dir))
            for fn in ls:
                fo.write("%s\n" % hdfs.path.abspath(fn, user=args.hdfs_user))
        mr_out_dir = run_phase_two(args, nm, input_, logger)
    return mr_out_dir
def _get_hopsfs_dataset_files(training_dataset_location, split):
    path = training_dataset_location.replace("hopsfs", "hdfs")
    if split is None:
        path = hdfs.path.abspath(path)
    else:
        path = hdfs.path.abspath(path + "/" + str(split))

    input_files = []
    all_list = hdfs.ls(path, recursive=True)

    # Remove directories and the spark '_SUCCESS' file, if any
    for file in all_list:
        if not hdfs.path.isdir(file) and not file.endswith("_SUCCESS"):
            input_files.append(file)

    return input_files
def main(argv=sys.argv):
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    parser = make_parser()
    args = parser.parse_args(argv[1:])
    logging.basicConfig()
    LOGGER.setLevel(args.log_level)
    paths = None
    if hdfs.path.isfile(args.stats_path):
        paths = [args.stats_path]
    else:
        try:
            ls = hdfs.ls(args.stats_path)
        except IOError as e:
            return "ERROR: %s: %s" % (args.stats_path, e)
        paths = [_ for _ in ls
                 if re.match(r"^part-m-\d+$", hdfs.path.basename(_))]
    gen_plots(paths, args.out_dir)
def collect_output(mr_out_dir, logger):
    builder = None
    for fn in hdfs.ls(mr_out_dir):
        if not hdfs.path.basename(fn).startswith("part"):
            continue
        logger.info("processing %r" % (fn,))
        with hdfs.open(fn) as f:
            s = zlib.decompress(f.read())
            if s == "":
                continue
            vectors = KinshipVectors.deserialize(s)  # ignores trailing newline char
            if builder is None:
                builder = KinshipBuilder(vectors)
            else:
                builder.vectors += vectors
    logger.info("building kinship matrix")
    return builder.build()
def collate_mapred_output(output_dir):
    data = {"weights": {}, "biases": {}}
    pattern = re.compile(r"part-m-\d+-(\d+)-(weights|biases)\.npz")
    for path in hdfs.ls(output_dir):
        LOGGER.debug("processing: %s", path)
        m = pattern.match(hdfs.path.basename(path))
        if not m:
            continue
        seed, what = m.groups()
        with hdfs.open(path, "rb") as f:
            npzf = np.load(f)
            data[what].update(
                {"%s_%s" % (seed, t): w for (t, w) in npzf.iteritems()})
    for k, v in data.items():
        out_path = hdfs.path.join(output_dir, "%s.npz" % k)
        LOGGER.info("saving collated %s to %s", k, out_path)
        with hdfs.open(out_path, "wb") as f:
            np.savez(f, **v)
def _read_hopsfs(self, location, data_format):
    # providing more informative error
    try:
        from pydoop import hdfs
    except ModuleNotFoundError:
        return self._read_hopsfs_rest(location, data_format)

    util.setup_pydoop()
    path_list = hdfs.ls(location, recursive=True)

    df_list = []
    for path in path_list:
        if (
            hdfs.path.isfile(path)
            and not path.endswith("_SUCCESS")
            and hdfs.path.getsize(path) > 0
        ):
            df_list.append(self._read_pandas(data_format, path))
    return df_list
def walk(self, parent_path, file_list, hdfs_parent_path, hdfs_file_list):
    print("%s %s" % (parent_path, hdfs_parent_path))
    if len(file_list) == 0 and len(hdfs_file_list) == 0:
        if os.path.basename(parent_path) == path.basename(hdfs_parent_path):
            return True
        return False
    elif len(file_list) != len(hdfs_file_list):
        print("No match: number of files in dirs")
        return False
    else:
        file_list.sort(
            key=lambda f: os.path.isfile(os.path.join(parent_path, f)))
        hdfs_file_list.sort(
            key=lambda f: path.isfile(path.join(hdfs_parent_path, f)))
        hIdx = 0
        for idx, sub_path in enumerate(file_list):
            full_path = os.path.join(parent_path, sub_path)
            hdfs_sub_path = hdfs_file_list[idx]
            hdfs_full_path = path.join(hdfs_parent_path, hdfs_sub_path)
            if os.path.basename(sub_path) != path.basename(hdfs_sub_path):
                print("No match: %s and %s" % (sub_path, hdfs_sub_path))
                return False
            if os.path.isdir(full_path):
                if not path.isdir(hdfs_full_path):
                    print("No match on directory: %s and %s" %
                          (full_path, hdfs_full_path))
                    return False
                return self.walk(full_path, os.listdir(full_path),
                                 hdfs_full_path, hdfs.ls(hdfs_full_path))
            elif os.path.isfile(full_path):
                sz = os.path.getsize(full_path)
                hdfs_size = path.getsize(hdfs_full_path)
                if hdfs_size != sz:
                    return False
    return True
def write_output(names, fin, fout):
    # # in similarity search when no results are found, results file is not present
    # # so create an empty local file to indicate no results found
    # if os.path.isdir(fin) == False:
    #     fd = open(fout, "w")
    #     fd.close()
    #     return
    files = hdfs.ls(fin)
    # find the file that has "part-" in its name; it holds the result
    for f in files:
        if "part-" in f:
            break
    with hdfs.open(f) as fd:
        result = pd.read_csv(fd, sep='\t', header=None,
                             names=["id", "id 2", "Euclidean Distance"])
    result = result.merge(names, on="id", how='inner')
    result.rename(columns={'name': 'Entity 1', 'id': 'id 1', 'id 2': 'id'},
                  inplace=True)
    result = result.merge(names, on="id", how='inner')
    result.rename(columns={'name': 'Entity 2', 'id': 'id 2'}, inplace=True)
    del result['id 1']
    del result['id 2']
    result = result.sort_values(by=["Euclidean Distance"])
    result[['Entity 1', 'Entity 2', 'Euclidean Distance']].to_csv(
        fout, index=False, sep='\t')
def rename_compressed_files(self, file_table):
    # find the extension
    output_files = hdfs.ls(self.output_path)
    if len(output_files) == 0:
        return
    compressor_extension = self.get_compressor_extension(output_files)
    self.log.debug("compressor extension is %s", compressor_extension)

    hdfs_host, hdfs_port, _ = hdfs.path.split(output_files[0])
    if hdfs_host == '':
        is_local_fs = True
    else:
        is_local_fs = False
        output_hdfs = hdfs.hdfs(hdfs_host, hdfs_port)

    file_table.seek(0)
    for mapid, line in enumerate(file_table.xreadlines()):
        _, _, relative_output_name = line.rstrip('\n').split('\t')
        # we expect the map task ids to be assigned in the same order as the input
        # file list, so we can match the input file to an output file by its position
        # in the input file list.
        hadoop_output = os.path.join(
            self.output_path, "part-%05d" % mapid) + compressor_extension
        desired_file_name = os.path.join(
            self.output_path, relative_output_name) + compressor_extension
        if hadoop_output != desired_file_name:
            self.log.debug("renaming %s to %s", hadoop_output, desired_file_name)
            if is_local_fs:
                # Though we could transparently use hdfs.move for both local fs and
                # hdfs, using native methods for the local fs should be faster.
                # os.renames automatically creates necessary parent directories for
                # the destination.
                os.renames(urlparse(hadoop_output).path,
                           urlparse(desired_file_name).path)
            else:
                # create the output subdirectory, if necessary
                dirname = os.path.dirname(relative_output_name)
                if dirname:
                    output_hdfs.create_directory(
                        os.path.join(self.output_path, dirname))
                if output_hdfs.exists(desired_file_name):
                    raise RuntimeError(
                        "Can't overwrite file in output directory: %s"
                        % desired_file_name)
                output_hdfs.move(hadoop_output, output_hdfs, desired_file_name)
def readText(filePath=""):
    import pydoop.hdfs as hdfs
    import os
    file_is_file = hdfs.path.isfile(filePath)
    file_is_dir = hdfs.path.isdir(filePath)
    file_exist = hdfs.path.exists(filePath)
    try:
        if file_is_file:
            files = hdfs.open(filePath)
        elif file_is_dir:
            files = []
            for pieceFile in hdfs.ls(filePath):
                files += hdfs.open(pieceFile)
        elif not file_exist:
            if os.path.exists(filePath):
                print "[WARN] file not found on hdfs, reading local file."
                files = open(filePath)
    except Exception as e:
        raise e
    finally:
        print type(files)
        return files
def main():
    # these are hdfs directories
    src_dir = str(sys.argv[1])
    dst_dir = str(sys.argv[2])

    # create dst_dir if it does not exist
    if not pyhdfs.path.exists(dst_dir):
        pyhdfs.mkdir(dst_dir)

    # create SparkContext
    spark = SparkSession.builder.getOrCreate()
    sc = spark.sparkContext

    # create RDD of child paths
    children_paths = pyhdfs.ls(src_dir)
    children_paths_rdd = sc.parallelize(children_paths, len(children_paths))

    # each executor task copies one child path
    children_paths_rdd.foreach(lambda file_path: copy_file(
        file_path, os.path.join(dst_dir, os.path.basename(file_path))))

    # stop SparkContext
    sc.stop()
def glob(hdfs_path, recursive=False, project=None):
    """
    Finds all the pathnames matching a specified pattern according to the rules used
    by the Unix shell, although results are returned in arbitrary order.

    Globbing gives you the list of files in a dir that matches a supplied pattern.

    >>> import glob
    >>> glob.glob('./[0-9].*')
    ['./1.gif', './2.txt']

    glob is implemented as os.listdir() and fnmatch.fnmatch().
    We implement glob as hdfs.ls() and fnmatch.filter().

    Args:
        :hdfs_path: You can specify either a full hdfs pathname or a relative one
            (relative to project_name in HDFS).
        :project: If the supplied hdfs_path is a relative path, it will look for that
            file in this project's subdir in HDFS.

    Raises:
        IOError if the supplied hdfs path does not exist

    Returns:
        A possibly-empty list of path names that match pathname, which must be a
        string containing a path specification.
    """
    # Get the full path to the dir for the input glob pattern
    # "hdfs://Projects/jim/blah/*.jpg" => "hdfs://Projects/jim/blah"
    # Then, ls on 'hdfs://Projects/jim/blah' and filter out the results
    if project is None:
        project = project_name()
    lastSep = hdfs_path.rfind("/")
    inputDir = hdfs_path[:lastSep]
    inputDir = _expand_path(inputDir, project)
    pattern = hdfs_path[lastSep + 1:]
    if not hdfs.path.exists(inputDir):
        raise IOError("Glob path %s not found" % inputDir)
    dirContents = hdfs.ls(inputDir, recursive=recursive)
    return fnmatch.filter(dirContents, pattern)
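# Hypothetical usage sketch for the glob() wrapper above; the pattern is an
# illustrative assumption, not taken from the source.
#
#     jpg_paths = glob("Resources/images/*.jpg")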
def xml_from_hdfs(url):
    assert hdfs.path.isdir(url)
    file_lists = hdfs.ls(url)
    # for fi in file_lists:
    for i in xrange(0, 1):
        fi = '/datasets/corpus/enwiki-11g/wiki_912'
        with hdfs.open(fi, "r") as f:
            lines = f.read().strip().split('\n')
        docs, doc = [], None
        for line in lines:
            if line.startswith('<doc'):
                doc = line
            elif line.startswith('</doc>'):
                docs.append(doc + line)
            else:
                # line = line.replace('&', '').replace('"', "'")
                doc += line.replace('"', "'")
        for doc in docs:
            dom = bs(doc).find('doc')
            doc = dom.attrs
            doc['content'] = dom.text
            doc['md5'] = hashlib.md5(str(doc)).hexdigest()
            yield doc
def _poly_ls(path, *args, **kwargs):
    if path.startswith('hdfs:'):
        return hdfs.ls(path, *args, **kwargs)
    else:
        return os.listdir(path)
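# Hypothetical usage sketch for _poly_ls() above; both paths are illustrative
# assumptions, not taken from the source. The same call handles an HDFS URI and a
# local directory.
#
#     hdfs_entries = _poly_ls("hdfs://namenode:8020/user/data")
#     local_entries = _poly_ls("/tmp")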
def iter_mr_out_files(mr_out_dir):
    for fn in hdfs.ls(mr_out_dir):
        if hdfs.path.basename(fn).startswith("part"):
            yield fn
def rmr(self):
    for wd in self.local_wd, self.hdfs_wd:
        t1 = self.__make_tree(wd)
        hdfs.rmr(t1.name)
        self.assertEqual(len(hdfs.ls(wd)), 0)
import glob
from elasticsearch import Elasticsearch
import urllib2
import re
import pydoop.hdfs as hdfs

EsHost = {"host": "localhost", "port": 9200}

HDFSfiles = []
for hdFiles in hdfs.ls("/gaana/gaanaLyrics"):  # "gaanaLyrics/gaanaLyrics"
    HDFSfiles.append(hdFiles[41:])

fileNames = []
indexName = 'music'
typeName = 'songs'
# IdField = 'songID'
bulkData = []

i = 1
for name in HDFSfiles:
    dataDict = {}
    fopen = hdfs.open("/gaana/gaanaLyrics/" + name)
    header = fopen.read()
def mrjob(options):
    "Generates and executes MR job script"
    user = os.getenv('USER')
    tstamp = int(time.time())
    hdir = hdfs_dir(options.hdir, options.hdfs_prefix)
    if PYDOOP:
        odir = hdfs.path.join(hdir, options.odir)
        idir = hdfs.path.join(hdir, options.idir)
        schema = hdfs.path.join(hdir, options.schema)
        for name in [hdir, odir, idir]:
            if options.verbose:
                print("Checking %s" % name)
            if not hdfs.path.isdir(name):
                if name in [hdir, idir]:
                    print("ERROR: %s does not exist" % name)
                    sys.exit(1)
                # else:
                #     print("  Creating output directory: %s" % name)
                #     hdfs.mkdir(name)
            elif name == odir:
                # in case odir exists and is not empty, move it somewhere and re-create
                if hdfs.ls(odir):
                    ocache = hdfs.path.normpath(odir) + '_%d' % tstamp
                    if options.verbose:
                        print("  Non-empty output directory exists, saving it in %s" % ocache)
                    hdfs.move(odir, ocache)
                    # hdfs.mkdir(odir)
                # if it's empty, remove it
                else:
                    hdfs.rmr(odir)
        if options.verbose:
            print("Checking %s" % schema)
        if not hdfs.path.isfile(schema):
            print("ERROR: %s does not exist" % schema)
            sys.exit(1)
    else:
        idir = '%s%s' % (hdir, 'data')
        odir = '%s%s' % (hdir, 'mrout')
        schema = '%s%s' % (hdir, options.schema)
        if options.verbose:
            msg = 'pydoop module is not present on this system'
            msg += ', will use input as is without checking'
            print('WARNING:', msg)
    for name in [options.mrpy, options.pydoop, options.avro]:
        if options.verbose:
            print("Checking %s" % name)
        if not os.path.isfile(name):
            print("ERROR: %s does not exist" % name)
            sys.exit(1)
    # module = os.path.basename(os.path.splitext(options.mrpy)[0])
    code = create_mrpy(options.mrpy, options.verbose)
    cmd = """#!/bin/bash
input={input}
output={output}
schema={schema}
ifile=/tmp/mr_{user}_{tstamp}.py
cat << EOF > $ifile
{code}
EOF
module=mr_{user}_{tstamp}
arch_pydoop={pydoop}
arch_avro={avro}
echo "Input URI : $input"
echo "Output URI: $output"
echo "Schema: $schema"
echo "MR script : $ifile"
echo "Module name : $module"
echo "Pydoop archive: $arch_pydoop"
echo "Avro archive : $arch_avro"
echo "-----------------"
echo "Submitting MR job"
pydoop submit \\
    --upload-archive-to-cache $arch_pydoop \\
    --upload-archive-to-cache $arch_avro \\
    -D avro.schema=$schema \\
    --do-not-use-java-record-reader \\
    --log-level {loglevel} \\
    --job-name WMArchive \\
    --num-reducers 1 \\
    --upload-file-to-cache $ifile \\
    --mrv2 $module $input $output
""".format(input=idir, output=odir, user=user, tstamp=tstamp,
           code=code, schema=schema, loglevel=options.loglevel,
           pydoop=os.path.abspath(options.pydoop),
           avro=os.path.abspath(options.avro))
    fobj = NamedTemporaryFile(delete=False)
    fobj.write(cmd)
    fobj.close()
    fstat = os.stat(fobj.name)
    os.chmod(fobj.name, fstat.st_mode | stat.S_IEXEC)
    if options.execute:
        run(fobj.name, options.verbose)
    else:
        if options.verbose:
            print("------- Generated script --------")
        print(open(fobj.name, 'r').read())
        if options.verbose:
            print("---------------------------------")
    # clean up temporary file
    os.unlink(fobj.name)