def run(self):
    c = Client(self.host, self.port)
    listing = c.ls([self.log_path], recurse=True)

    for f in listing:
        path = f['path']

        if not path.endswith('.jhist'):
            continue

        ts = arrow.get(f['modification_time'] / 1000)

        if ts <= self.checktime:
            continue

        job_id = job_pattern.match(path.split('/')[-1]).group(0)

        if job_id in self.jobs and self.jobs[job_id] >= ts.timestamp * 1000:
            log.debug('Skipping processed job: ' + job_id)
            continue

        config_path = path[:path.rfind('/')] + '/' + job_id + '_conf.xml'

        event = {
            'inviso.type': 'mr2',
            'job.id': job_id,
            'application.id': job_id.replace('job_', 'application_'),
            'job.type': 'mr2',
            'file.type': ['history', 'config'],
            'jobflow': self.jobflow,
            'cluster.id': self.cluster_id,
            'cluster': self.cluster_name,
            'history.uri': 'hdfs://%s:%s%s' % (self.host, self.port, path),
            'config.uri': 'hdfs://%s:%s%s' % (self.host, self.port, config_path),
            'host': self.host,
            'port': self.port,
            'timestamp': str(ts),
            'epoch': f['modification_time'],
            'mapreduce.version': 'mr2'
        }

        log.info('Publishing event: (%s) %s %s' % (event['cluster'], event['job.id'], ts))
        self.publisher.publish([event])
def get_df_pats():
    df_paths = []
    HDFS_CLIENT = Client(CONF.HDFS_HOST, 9000, use_trash=False)
    for file_entry in HDFS_CLIENT.ls(['/user/root']):
        if 'df_joined_df' in file_entry['path']:
            continue
        df_paths.append(file_entry['path'])
    return df_paths
def scan_files(env):
    hdfs = env['hdfs']
    host, port = hdfs.split(':')
    client = Client(host, int(port), use_trash=False, effective_user='******')
    input_files = []
    for item in client.ls([env['input']]):
        if item['file_type'] == 'd':
            input_files.append(item['path'])
    return input_files
def get_DB(Hive_Warehouse):
    DB = {}
    DB_re = []
    client = Client('yhbd01', 8020, use_trash=False)
    list_hive = list(client.ls([Hive_Warehouse]))
    for x in list_hive:
        DB = x
        DB_re.append(DB['path'])
    return DB_re
def test():
    """ """
    client = Client("192.168.99.100", 9000)
    for f in client.ls(['/files']):
        print f
        for line in client.cat([f.get('path')]):
            for l in line:
                print l
class HDFSClient:
    __client = None

    def __init__(self):
        self.client = Client("localhost", 9000)

    @staticmethod
    def get_instance():
        # lazily create and cache a single HDFSClient instance
        if HDFSClient.__client is None:
            HDFSClient.__client = HDFSClient()
        return HDFSClient.__client

    def test(self):
        for x in self.client.ls(['/rush/input/']):
            print (x)
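A minimal usage sketch of the singleton above, assuming the NameNode is really reachable at localhost:9000 and /rush/input/ exists:

# Hypothetical usage: both calls return the same cached instance.
hdfs = HDFSClient.get_instance()
assert hdfs is HDFSClient.get_instance()
hdfs.test()  # prints the entries under /rush/input/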
def get_DB(Hive_Warehouse):
    DB = {}
    DB_re = []
    client = Client('yhbd01', 8020, use_trash=False)
    list_hive = list(
        client.ls([Hive_Warehouse],
                  include_toplevel=False,
                  include_children=True,
                  recurse=True))
    for x in list_hive:
        DB = x
        DB_re.append(DB['path'])
    # print 'Hive table path scan complete!'
    return DB_re
def metrics():
    print "Received metrics request..."
    metric_prefix = "hdfs_directory_stats"
    metrics = {"the_number_one": "1"}
    c = Client("namenode", 8020)
    filepaths = map(lambda entry: entry['path'], c.ls([sys.argv[1]]))
    lines = reduce(lambda a, b: a + b, [1 for f in c.cat(filepaths) for _ in f])
    metrics['lines_of_text_in_directory'] = lines
    template_kwargs = {
        'metrics': metrics,
        'dir': sys.argv[1],
        'metric_prefix': metric_prefix
    }
    return Response(render_template("metrics", **template_kwargs), mimetype='text/plain')
def scan_event_files(env):
    hdfs = env['hdfs']
    host, port = hdfs.split(':')
    client = Client(host, int(port), use_trash=False, effective_user='******')
    event_files = []
    basename = '_'.join(os.path.basename(env['first_clip']).split('_')[:-1])
    event_dir = os.path.join(env['event_dir'], basename)
    if not client.test(event_dir, exists=True, directory=True):
        return event_files
    for item in client.ls([event_dir]):
        if item['file_type'] == 'f':
            event_files.append(os.path.basename(item['path']))
    return event_files
def run(self):
    c = Client(self.host, self.port)
    listing = c.ls([self.log_path], recurse=True)
    events = []

    for f in listing:
        path = f['path']

        if not path.endswith('.jhist'):
            continue

        ts = arrow.get(f['modification_time'] / 1000)

        if ts <= self.checktime:
            continue

        job_id = job_pattern.match(path.split('/')[-1]).group(0)

        if job_id in self.jobs and self.jobs[job_id] >= ts.timestamp * 1000:
            log.debug('Skipping processed job: ' + job_id)
            continue

        config_path = path[:path.rfind('/')] + '/' + job_id + '_conf.xml'

        event = {
            'inviso.type': 'mr2',
            'job.id': job_id,
            'application.id': job_id.replace('job_', 'application_'),
            'job.type': 'mr2',
            'file.type': ['history', 'config'],
            'jobflow': self.jobflow,
            'cluster.id': self.cluster_id,
            'cluster': self.cluster_name,
            'history.uri': 'hdfs://%s:%s%s' % (self.host, self.port, path),
            'config.uri': 'hdfs://%s:%s%s' % (self.host, self.port, config_path),
            'host': self.host,
            'port': self.port,
            'timestamp': str(ts),
            'epoch': f['modification_time'],
            'mapreduce.version': 'mr2'
        }

        log.info('Publishing event: (%s) %s %s' % (event['cluster'], event['job.id'], ts))
        events.append(event)

    for chunk in [events[i:i + self.chunk_size] for i in xrange(0, len(events), self.chunk_size)]:
        self.publisher.publish(chunk)
def getTrainedModel(hdfsServer, modelFile):
    hdfsPort = int(os.environ.get('HDFS_NAME_PORT', 8020))
    modelSavePath = "/user/" + os.getenv('LOGNAME') + "/data/model/" + modelFile + '/'

    # Load the saved model data
    hdfs_client = Client(hdfsServer, hdfsPort)
    filesInfo = hdfs_client.ls([modelSavePath])

    # Copy HDFS files to local temp directory
    # First clean up and recreate the temp folder
    copyDir = tempfile.gettempdir() + "/" + modelFile
    shutil.rmtree(copyDir, ignore_errors=True)
    os.makedirs(copyDir)

    res = hdfs_client.copyToLocal([f['path'] for f in filesInfo], copyDir)
    for r in res:
        if not r['result']:
            print "Error: %s" % r

    modelFilePath = copyDir + '/' + modelFile
    print "Load model from %s" % modelFilePath
    return joblib.load(modelFilePath)
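A hedged usage sketch for the loader above; the server name and model file name are hypothetical placeholders, and joblib plus the pickled model's own dependencies must be installed locally for the final load to succeed:

# Hypothetical call: copies /user/<LOGNAME>/data/model/churn_model/ from the
# NameNode "namenode.example.com" to a local temp dir and unpickles it.
model = getTrainedModel("namenode.example.com", "churn_model")
print model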
class HDFSStat(object):

    cluster = 'hostname'
    port = 8020
    default_path = '/user/hive/warehouse'

    @staticmethod
    def build_path(table):
        nm = table.split('.')[0]
        tb = table.split('.')[1]
        return HDFSStat.default_path + '/' + nm + '.db/' + tb

    def __init__(self):
        self.client = Client(HDFSStat.cluster, HDFSStat.port, use_trash=False)

    def latest_partition(self, table_name, table_path=None):
        t_path = HDFSStat.build_path(table_name) if table_path is None else table_path
        latest_dir = list(self.client.ls([t_path])).pop()
        return path.basename(latest_dir['path']).split('=')[1]

    def poke_partition(self, table_name, partition_name, partition, table_path=None):
        t_path = HDFSStat.build_path(table_name) if table_path is None else table_path
        partition_path = t_path + '/' + partition_name + '=' + partition
        return self.client.test(partition_path, exists=True, directory=True, zero_length=False)
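A brief usage sketch for HDFSStat, assuming cluster/port point at a real NameNode and a Hive table laid out as <db>.db/<table>/<partition_name>=<value> under the warehouse path; the table and partition names below are made up for illustration:

# Hypothetical example: find the newest dt= partition of sales.orders and
# check whether a specific partition directory exists and is non-empty.
stat = HDFSStat()
latest = stat.latest_partition('sales.orders')            # e.g. '2020-01-31'
exists = stat.poke_partition('sales.orders', 'dt', '2020-01-31')
print latest, exists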
#!/usr/bin/env python
from snakebite.client import Client
import time

host = '10.118.205.8'
port = 9000
client = Client(host=host, port=port, use_trash=False, effective_user='******')
path = '/tmp'

result = []
for x in client.ls([path]):
    result.append(x)

ordered = sorted(result, key=lambda x: x['path'])
for f in ordered:
    if f['file_type'] == 'd':
        print f['path']
    else:
        print f
from snakebite.client import Client

client = Client('localhost', 9000)
for x in client.ls(['/']):
    print x
def main(args):
    xml = minidom.parse(
        path.join(os.environ["HADOOP_HOME"], "etc", "hadoop", "hdfs-site.xml"))
    element = [
        x for x in xml.getElementsByTagName("property")
        if (x.getElementsByTagName("name")[0].childNodes[0].nodeValue ==
            "dfs.namenode.http-address")
    ][0]
    namenode = (element.getElementsByTagName("value")
                [0].childNodes[0].nodeValue.split(":")[0])
    fs = HDFS(namenode, 8020)
    path_prefix = "/amplab/text"

    for size in args.sizes:
        timings = {}

        MPI.COMM_WORLD.Barrier()
        if c_rank == 0:
            tic()

        file_list = None
        if c_rank == 0:
            file_list = [
                entry["path"] for entry in fs.ls(
                    [path.join(path_prefix, size, "uservisits")])
            ]
            file_list = [file_list[i::c_size] for i in range(c_size)]
        file_list = MPI.COMM_WORLD.scatter(file_list, root=0)

        MPI.COMM_WORLD.Barrier()
        if c_rank == 0:
            timings["open-and-register"] = toc()

        MPI.COMM_WORLD.Barrier()
        if c_rank == 0:
            tic()
        os_results = reduce_data(row_iterator(file_list, fs), 4, "os")
        MPI.COMM_WORLD.Barrier()
        if c_rank == 0:
            timings["q-stats-by-os"] = toc()
        if c_rank == 0:
            os_results.index = os_results.pop("os")

        MPI.COMM_WORLD.Barrier()
        if c_rank == 0:
            tic()
        browser_results = reduce_data(row_iterator(file_list, fs), 6, "browser")
        MPI.COMM_WORLD.Barrier()
        if c_rank == 0:
            timings["q-stats-by-browser"] = toc()
        if c_rank == 0:
            browser_results.index = browser_results.pop("browser")

        if c_rank == 0:
            top_dir = path.join("results", size, "mpi", str(args.nodes))
            mkdir_p(top_dir)
            with open(path.join(top_dir, "timings"), "w") as f:
                for entry in timings.items():
                    f.write("%s, %.18e\n" % entry)
                f.flush()
            browser_results.to_pickle(path.join(top_dir, "browser"))
            os_results.to_pickle(path.join(top_dir, "os"))

    return 0
def __init__(self, sc, doclist, ngram_range=[1, 1], vocab=None, stop_words=None,
             nmin=None, nmax=None, num_partitions=None, features_max=None,
             tokenizer=alpha_tokenizer, hashing=False, load_path=None,
             hdfs_namenode=None):
    self._sc = sc
    self._ngram_range = ngram_range
    self._vocab = vocab
    self._stop_words = stop_words
    self._nmin = nmin
    self._nmax = nmax
    self._num_partitions = num_partitions
    self._doclist = doclist
    self._features_max = features_max if features_max is not None else 2**31
    self._tokenizer = tokenizer

    # initialize the RDDs
    self._doc_rdd = None
    self._ngram_rdd = None
    self._vocab_rdd = None
    self._docvec_rdd = None
    self._vocab_map_rdd = None

    # dictionary of RDDs
    self.rdds = {}

    # initialize other properties
    self._nfeatures = None
    self._hashing = hashing

    # make the vocabulary a set if it isn't one already
    if type(vocab) is not set and vocab is not None:
        try:
            self._vocab = set(vocab)
        except TypeError:
            raise TypeError("Vocabulary must be an iterable like a list, set, etc.")

    if load_path is not None:
        if load_path[:4] != 'hdfs':
            for rdd_name in os.listdir(load_path):
                if rdd_name[-3:] == 'rdd':
                    self.rdds[rdd_name] = sc.pickleFile(load_path + '/' + rdd_name)
        # we're dealing with HDFS
        else:
            try:
                from snakebite.client import Client
            except ImportError:
                raise ImportError("package snakebite is required for working with HDFS: pip install snakebite")

            if hdfs_namenode is None:
                # get the hadoop configuration files from the user's environment and extract the namenode
                import xml.etree.ElementTree
                hadoop_conf = '%s/core-site.xml' % os.environ['HADOOP_CONF_DIR']
                tree = xml.etree.ElementTree.parse(hadoop_conf)
                for prop in tree.findall('property'):
                    if prop.find('name').text == 'fs.defaultFS':
                        dummy, hdfs_namenode, hdfs_port = prop.find('value').text.split(':')
                        hdfs_namenode = hdfs_namenode[2:]
                        break

            client = Client(hdfs_namenode, int(hdfs_port))

            for rdd_path_dict in client.ls([load_path[7:]]):
                rdd_name = rdd_path_dict['path'].split('/')[-1]
                if rdd_name[-3:] == 'rdd':
                    self.rdds[rdd_name] = sc.pickleFile(load_path + '/' + rdd_name)

        print 'Loaded %d RDDs: ' % (len(self.rdds))
        for rdd in self.rdds.keys():
            print rdd

    # make the vital properties dictionary for pickling
    self.properties = {'ngram_range': ngram_range,
                       'stop_words': stop_words,
                       'nmin': nmin,
                       'nmax': nmax,
                       'num_partitions': num_partitions,
                       'doclist': doclist,
                       'features_max': features_max,
                       'hashing': hashing,
                       }
def get_locations(filename, name_host, name_port, **kwargs):
    client = Client(name_host, name_port, use_trash=False)
    files = list(client.ls([filename]))
    return [pair for file in files for pair in find(file, client, **kwargs)]
class Loader:
    """
    The idea of the loader is to provide a convenient interface to create a
    new table based on some input files
    """

    def __init__(self, path, name_node, hive_server, user="******",
                 hive_db="default", password=None, nn_port=8020, hive_port=10000):
        # HDFS Connection
        self._client = Client(name_node, nn_port)
        self._db = hive_db

        # Hive Connection
        self._hive = pyhs2.connect(host=hive_server,
                                   port=hive_port,
                                   authMechanism="PLAIN",
                                   database=hive_db,
                                   user=user,
                                   password=password)
        self._path = path

    def load(self):
        # Check data to see which kind it is
        files = self._client.ls([self._path])
        files = [f for f in files if f['file_type'] == 'f']
        if len(files) == 0:
            raise Exception("Cannot load empty directory")

        # Pick the first file and assume that it has the same content as the others
        data = self.head(files[0]['path'])
        res = self.check_separator(data)
        if res is None:
            # We can't load the data and better abort here
            print("can't load data, cannot find a separator")
            return

        sep = res[0]
        num_cols = res[1]

        # Build table statement
        table_statement, table_name = self._create_table(self._path, sep, num_cols)
        cursor = self._hive.cursor()
        cursor.execute(table_statement)
        return self._db, table_name

    def _create_table(self, path, sep, count):
        buf = """CREATE EXTERNAL TABLE pyxplorer_data (
        %s
        )ROW FORMAT DELIMITED FIELDS TERMINATED BY '%s'
        STORED AS TEXTFILE LOCATION '%s'
        """ % (",".join(["col_%d string" % x for x in range(count)]), sep, path)
        return buf, "pyxplorer_data"

    def check_separator(self, data):
        """
        This method evaluates a list of separators on the input data to check
        which one is correct. This is done by first splitting the input by
        newline and then checking if the split by separator is equal for each
        input row except the last that might be incomplete due to the limited
        input data

        :param data: input data to check
        :return:
        """
        sep_list = [r'\t', r';', r',', r'|', r'\s+']
        data_copy = data
        for sep in sep_list:
            # Check if the count matches each line
            splitted = data_copy.split("\n")
            parts = [len(re.split(sep, line)) for line in splitted]

            # If we did not split anything, continue
            if sum(parts) == len(splitted):
                continue

            diff = 0
            for i in range(len(parts[1:-1])):
                diff += abs(parts[i] - parts[i + 1])

            if diff == 0:
                return sep, parts[0]

        # If we reach this point we did not find a separator
        return None

    def head(self, file_path):
        """
        Only read the first packets that come, try to max out at 1024kb

        :return: up to 1024kb of the first block of the file
        """
        processor = lambda path, node, tail_only=True, append=False: self._handle_head(
            path, node)

        # Find items and go
        for item in self._client._find_items([file_path], processor,
                                             include_toplevel=True,
                                             include_children=False,
                                             recurse=False):
            if item:
                return item

    def _handle_head(self, path, node, upper=1024 * 1024):
        data = ''
        for load in self._client._read_file(path, node, tail_only=False, check_crc=False):
            data += load
            if (len(data) > upper):
                return data
        return data
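A hedged example of driving the Loader; the host names, HDFS directory, and credentials are placeholders, and pyhs2 must be able to reach the HiveServer2 instance for the table creation step to run:

# Hypothetical usage: infer the delimiter of the files under /data/raw/events
# and register them as an external Hive table in the default database.
loader = Loader('/data/raw/events', 'namenode.example.com', 'hive.example.com',
                user='analyst', hive_db='default')
result = loader.load()   # returns (db, table_name), or None if no separator was found
print result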
# Note: snakebite only supports Python 2, so this does not work on Python 3
from snakebite.client import Client

client = Client('119.23.182.3', 9000)
for x in client.ls(['/data', '/lookfit/test/logs/user-service/20201205/']):
    print(x)
#!/usr/bin/env python
from snakebite.client import Client

# this line creates the client connection to the HDFS NameNode
# NameNode hostname = localhost, NameNode port = 9000
# these parameters are set in hadoop/conf/core-site.xml under fs.defaultFS
client = Client('localhost', 9000)

# list the contents of an HDFS directory
# note that many methods in snakebite return generators
for x in client.ls(['/user/cbohara']):
    print x
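Since ls() (like many snakebite calls) returns a generator, it is consumed lazily; a small sketch of materializing it, reusing the client created above:

# ls() yields dicts lazily; wrap it in list() if you need to iterate twice
# or know the number of entries up front.
entries = list(client.ls(['/user/cbohara']))
print "%d entries" % len(entries)
for e in entries:
    print e['path'], e['length']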
def display():
    client = Client("study", 9000, use_trash=False)
    for x in client.ls(['/data/gz']):
        print x
def get_locations(filename, name_host, name_port, data_root='/data/dfs/dn'):
    client = Client(name_host, name_port, use_trash=False)
    files = list(client.ls([filename]))
    return [pair for file in files for pair in find(file, client, data_root)]
from snakebite.client import Client

client = Client('localhost', 54310)
for x in client.ls(['/input.txt']):
    print x
    return (citydb.city(ip).country.name or u'Unknown').encode()


if __name__ == '__main__':
    if len(sys.argv) != 3:
        print >> sys.stderr, "Usage: forgeInternationalAccess <date> <hour>"
        exit(-1)

    spark = SparkContext(appName='ForgeGeoAccess')
    spark.addPyFile('hdfs://digiledap/user/spark/share/lib/accessLogParser.py')
    spark.addFile('hdfs://digiledap/user/spark/share/lib/GeoLite2-City.mmdb')

    from accessLogParser import *
    from snakebite.client import Client

    hdfsHandle = Client('hmaster01')
    hosts = spark.parallelize(hdfsHandle.ls(['/flume/events/apache_access_combined/']))\
        .filter(lambda dirs: dirs['file_type'] == 'd')\
        .map(lambda directory: 'hdfs://digiledap%s' % directory['path'])\
        .collect()

    rdds = {
        item.split('/')[-1]: spark.textFile('%s/%s/%s' % (item, sys.argv[1], sys.argv[2]))
        for item in hosts
    }

    results = {
        key: rdds[key].map(lambda log: Parser.create(Parser.COMBINED).parse(log))
                      .map(lambda log: (((log['timestamp'] - timedelta(minutes=log['timestamp'].minute % 5))
                                         .replace(second=0),
                                         _getCountryByIP(log['remote_ip'].compressed)), 1))
                      .reduceByKey(add)
                      .map(lambda x: (key, x[0][0], x[0][1], x[1]))
        for key in rdds
    }
    view_map = view_events_arr.map(lambda line: (line[2].split('_')[0], 1))

    # OUTPUT
    ads_bid_count_by_company = bid_map.reduceByKey(lambda a, b: a + b)
    ads_view_count_by_company = view_map.reduceByKey(lambda a, b: a + b)
    print "======== Result =========\n"
    print ads_bid_count_by_company.take(2), ads_view_count_by_company.take(20)
    print "======== Result =========\n"
    ads_bid_count_by_company.saveAsTextFile("hdfs://ec2-52-72-23-2.compute-1.amazonaws.com:9000/user/ubuntu/testdan.txt")
    sc.stop()


if __name__ == "__main__":
    client = Client('ec2-52-72-23-2.compute-1.amazonaws.com', 9000, use_trash=False)
    last_modification_time = int(sys.argv[1])  # compare as a number, not a string
    list_of_new_files = [entry for entry in client.ls(['/'])
                         if entry['modification_time'] > last_modification_time]

    # CONFIGURE SPARK
    conf = SparkConf().setAppName(APP_NAME)
    conf = conf.setMaster("local[*]")
    sc = SparkContext(conf=conf)

    # FILE TO PROCESS
    filename = sys.argv[1]

    # CALLING MAIN
    main(sc, filename)
from snakebite.client import Client

client = Client('localhost', 54310)
for x in client.ls(['/user/hduser/']):
    print x
def health_check():
    c = Client("namenode", 8020)
    print "Checking for %s directory..." % sys.argv[1]
    for top_level in c.ls([sys.argv[1]]):
        print "DIR CHILD=%s" % top_level['path']
    print "Ok!"
from snakebite.client import Client

client = Client('localhost', 8020)  # port is the RPC port of the namenode

# get these parameters from /etc/hadoop/conf/core-site.xml under fs.defaultFS
# many of the methods in snakebite return generators
for i in client.ls(['/user/cloudera/behrouz']):  # takes a list of paths!!
    print i
print '*' * 40

# creating directories:
# create two directories behrouz, behrouz1/b1 on HDFS:
for p in client.mkdir(['/behrouz', 'behrouz1/b1'], create_parent=True):
    print p
print '*' * 40

# deleting files and directories: deletes any subdirectories and files a directory contains
# recursively deleting the directories!
for p in client.delete(['/behrouz', 'behrouz1/b1'], recurse=True):
    print p
print '*' * 40

# retrieving data from HDFS:
# copying files from HDFS to the local file system:
for f in client.copyToLocal(['/user/cloudera/wordCount.out'], '/home/cloudera/'):
    print f
print '*' * 40

#######
# reading the contents of a file
for l in client.text(['/user/cloudera/testfile.txt']):
    print l
# the text method automatically decompresses and displays gzip and bzip2 files.
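If the raw bytes are needed instead of decompressed text, cat() can be used the same way; a small sketch reusing the same client and file, consistent with how cat() is used elsewhere in this collection:

# cat() streams the raw file contents without decompressing;
# each outer item is itself a generator of data chunks.
for fileblock in client.cat(['/user/cloudera/testfile.txt']):
    for chunk in fileblock:
        print chunk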
if __name__ == '__main__':
    # hdfs_host = '100.127.6.35'
    hdfs_host = '100.127.13.16'
    # hdfs_port = 9820
    hdfs_port = 8020
    client = Client(host=hdfs_host, port=hdfs_port, use_trash=False, effective_user='******')

    if len(sys.argv) < 2:
        print 'inf_verification.py path'
        sys.exit(0)

    input_dir = sys.argv[1]
    input_files = []
    for clip in client.ls([input_dir]):
        if clip['file_type'] == 'd':
            input_files.append(clip['path'])

    for folder in sorted(input_files):
        for inf in client.cat([getInf(folder)]):
            for content in inf:
                start = None
                end = None
                for aline in content.split('\n'):
                    if aline.startswith('startTime'):
                        start = aline.strip()
                    elif aline.startswith('endTime'):
                        end = aline.strip()
                print '{}\t{}\t{}'.format(os.path.basename(folder), start, end)
if (run_mode == "swift" or out_mode == "swift"): swiftConf = sc._jsc.hadoopConfiguration() for key, value in SWIFT_DEFAULT_CONFIGS.items(): swiftConf.set(key, value) swift_client = swift.Connection(user=swift_user, key=swift_key, authurl=swift_authurl) # read list of files src_files = [] if run_mode == "hdfs": # spotify's snakebite as hdfs client src_files = [ hdfs_url + files['path'] for files in hdfs_client.ls([source_files]) ] # deleting output directory if exists if (hdfs_client.test(target_dir, exists=True, directory=True)): hdfs_client.delete(target_dir) hdfs_client.rmdir(target_dir) elif run_mode == "swift": # read list of files from swift src_files = [] src_file_regex = re.compile(source_files) for data in swift_client.get_container(source_dir)[1]: if src_file_regex.match(data['name']): src_files.append(data['name']) src_files.sort(key=lambda x: os.path.basename(x))
class HDFS_topic(object):

    def __init__(self, topic, user, server, port, web_port, base, hdfs_tmp):
        self.topic = topic
        self.username = user
        self.server = server
        self.port = port
        self.base = base
        self.path = ["%s/%s" % (base, topic)]
        self.hdfs_tmp = hdfs_tmp
        try:
            self.client = Client(server, port, effective_user=user)
            self.hdfsclient = hdfs.client.InsecureClient(
                "http://%s:%d" % (server, web_port), user=user)
            self.daylist = self.check()
        except:
            print "Base path %s does not contain valid structure" % (base)
            raise

    #
    # Check basic hdfs access and that the directory format is appropriate;
    # also builds the datelist structure
    #
    def check(self):
        self.content = self.client.ls(self.path)
        ret = []
        for item in self.content:
            (head, tail) = os.path.split(item['path'])
            try:
                parse(tail, yearfirst=True, dayfirst=True)
                if item['file_type'] == 'd':
                    ret.append(tail)
                else:
                    print("WARNING: %s is not a directory, skipping\n" % (item['path']))
            except:
                print("WARNING: %s is not in date format, skipping\n" % (tail))
        if len(ret) > 0:
            ret.sort(key=lambda x: datetime.strptime(x, "%Y-%m-%d"))
            return ret
        else:
            return False

    #
    # Given a date, check if that date is on the dirlist and return the matching dir entry
    #
    def day_in_topic(self, date):
        for item in self.daylist:
            if parse(date) == parse(item):
                return item
        return False

    #
    # Checks and validates the date_from and date_to arguments
    #
    def check_date_range(self, date_from, date_to):
        if date_from:
            try:
                parse(date_from)
            except:
                raise ValueError("FATAL: start date (%s) invalid date format" % (date_from))
            if (parse(date_from) < parse(self.daylist[0])) or (parse(date_from) > parse(self.daylist[-1])):
                raise ValueError("FATAL: start date (%s) not in range (%s ---> %s)"
                                 % (date_from, self.daylist[0], self.daylist[-1]))
            else:
                ret_from = parse(date_from).strftime("%Y-%m-%d")
                while not self.day_in_topic(ret_from):
                    print "WARNING: start date %s not in topic %s, trying next day" % (ret_from, self.topic)
                    ret_from = datetime.strftime((parse(ret_from) + timedelta(days=1)), "%Y-%m-%d")
                ret_from = self.day_in_topic(ret_from)
        else:
            ret_from = self.daylist[0]

        if date_to:
            try:
                parse(date_to)
            except:
                raise ValueError("FATAL: end date (%s) invalid date format" % (date_to))
            if (parse(date_to) < parse(self.daylist[0])) or (parse(date_to) > parse(self.daylist[-1])):
                raise ValueError("FATAL: end date (%s) not in range (%s ---> %s)"
                                 % (date_to, self.daylist[0], self.daylist[-1]))
            else:
                ret_to = parse(date_to).strftime("%Y-%m-%d")
        else:
            ret_to = self.daylist[-1]

        if (parse(ret_from) > parse(ret_to)):
            raise ValueError("FATAL: start date (%s) must be <= end date (%s)" % (ret_from, ret_to))
        return (ret_from, ret_to)

    #
    # Traverses the list of valid directories and merges each day
    #
    def merge(self, date_from="", date_to=""):
        day = ""
        try:
            (day, date_to) = self.check_date_range(date_from, date_to)
        except Exception as err:
            raise ValueError(err)
        print "INFO: Trying to merge %s from %s to %s\n" % (self.topic, day, date_to)
        while (parse(day) <= parse(date_to)):
            if self.day_in_topic(day):
                self.merge_day(day)
            else:
                print "WARNING: %s is not on %s, skipping\n" % (day, self.path)
            day = datetime.strftime((parse(day) + timedelta(days=1)), "%Y-%m-%d")
            while not self.day_in_topic(day) and parse(day) <= parse(date_to):
                print "WARNING: %s not found in %s, trying next day" % (day, self.topic)
                day = datetime.strftime((parse(day) + timedelta(days=1)), "%Y-%m-%d")
            day = self.day_in_topic(day)
            if not day:
                return
        return True

    #
    # Given a date, if there are files that are not .snappy, download and remove them,
    # then getmerge, and upload everything
    #
    def merge_day(self, date):
        print "INFO: processing ", date
        daytmp = "%s/snappymerge-%s-tmp" % (self.hdfs_tmp, date)
        daypath = ["%s/%s/%s/" % (self.base, self.topic, date)]
        # mergedfile="./%s-merged.snappy" % (date)
        mergedfile = "./%s-merged.snappy" % (datetime.strftime(datetime.now(), "%Y-%d-%m.%f"))
        day_files = [x['path'] for x in self.client.ls(daypath)]
        print "INFO: DAYPATH: ", daypath
        try:
            os.remove(mergedfile)
        except:
            pass
        if len([x for x in day_files if x.endswith('.snappy')]) <= 1:
            print "WARNING: %s does not have enough files to getmerge, skipping" % (date)
            return
        if [file for file in day_files if not file.endswith('.snappy')]:
            print "WARNING: %s contains a non snappy file (%s), moving *snappy to %s getmerge there\n" % (daypath, file, daytmp)
            self.merge_with_move(daypath[0], daytmp, day_files, mergedfile)
        else:
            print "INFO: MERGING ", daypath[0]
            result = self.client.getmerge(daypath[0], mergedfile)
            print [x for x in result if not x['result']]
            print "INFO: DELETING original files in ", daypath[0]
            for file in day_files:
                print "INFO: Deleting original file ", file
                self.hdfsclient.delete(file)
            print "INFO: UPLOADING merged (%s) to %s" % (mergedfile, daypath[0])
            self.hdfsclient.upload(daypath[0], mergedfile, overwrite=True)
            os.remove(mergedfile)
        return

    #
    # When there are files that do not have the .snappy suffix, merge with move:
    # first move everything to an hdfs temp dir, merge there, and upload
    #
    def merge_with_move(self, day_path, day_tmp, dayfiles, merged_file):
        self.hdfsclient.makedirs(day_tmp)
        print "INFO: MOVING files to ", day_tmp
        snap = [x for x in dayfiles if x.endswith(".snappy")]
        result = self.client.rename(snap, day_tmp)
        print [x['path'] for x in result if not x['result']]
        print "INFO: MERGING files in ", day_tmp
        result = self.client.getmerge(day_tmp, merged_file)
        print [x['path'] for x in result if not x['result']]
        print "INFO: UPLOADING merged (%s) to %s" % (merged_file, day_path)
        self.hdfsclient.upload(day_path, merged_file, overwrite=True)
        os.remove(merged_file)
        print "INFO: Deleting files on ", day_tmp
        self.hdfsclient.delete(day_tmp, recursive=True)


if __name__ == '__main__':
    import argparse

    count = 0
    parser = argparse.ArgumentParser(description="Merge daily historical snappy files into one to save hdfs space")
    parser.add_argument('topic', help="Topic name relative to --base")
    parser.add_argument('--hdfs_user', help="HDFS user name (default: current user)", default=None)
    parser.add_argument('--hdfs_server',
                        help="HDFS server name or ip (default: aquhmstsys022001.c022.digitalriverws.net)",
                        default="aquhmstsys022001.c022.digitalriverws.net")
    parser.add_argument('--hdfs_port', help="HDFS server port number (default:8020)", type=int, default=8020)
    parser.add_argument('--hdfs_tmp',
                        help="HDFS temporary dir to store files to be merged (default:/user/hduser/tmp)",
                        default="/user/hduser/tmp")
    parser.add_argument('--web_port', help="HDFS server WEB port number (default:50070)", type=int, default=50070)
    parser.add_argument('--base',
                        help="Alternate hdfs base path for topic (default:/user/aqueduct/flume)",
                        default="/user/aqueduct/flume")
    parser.add_argument('--start', help="Start Date inclusive (default: from beginning)")
    parser.add_argument('--end', help="End Date inclusive (default: to end)")
    args = parser.parse_args()

    topic = HDFS_topic(topic=args.topic, user=args.hdfs_user, server=args.hdfs_server, port=args.hdfs_port,
                       hdfs_tmp=args.hdfs_tmp, web_port=args.web_port, base=args.base)
    try:
        topic.merge(args.start, args.end)
    except Exception as err:
        print err
        exit
if (run_mode == "swift" or out_mode == "swift"): swiftConf = sc._jsc.hadoopConfiguration() for key, value in SWIFT_DEFAULT_CONFIGS.items(): swiftConf.set(key, value) swift_client = swift.Connection( user = swift_user, key = swift_key, authurl = swift_authurl) # read list of files src_files = [] if run_mode == "hdfs": # spotify's snakebite as hdfs client src_files = [ hdfs_url + files['path'] for files in hdfs_client.ls([source_files]) ] # deleting output directory if exists if (hdfs_client.test(target_dir, exists = True, directory = True)): hdfs_client.delete(target_dir) hdfs_client.rmdir(target_dir) elif run_mode == "swift": # read list of files from swift src_files = [] source_files = '|'.join([ '(pagecounts-' + (datetime.now() - timedelta(hours=i)).strftime("%Y%m%d-%H") + '(.*))' for i in range(48, 71) ]) src_file_regex = re.compile(source_files) for data in swift_client.get_container(source_dir)[1]: if src_file_regex.match(data['name']): src_files.append(data['name']) src_files.sort(key = lambda x: os.path.basename(x))
def test_request(self):
    from snakebite.client import Client

    client = Client("10.0.137.24", 8022, use_trash=False)
    for x in client.ls(['/user']):
        print x
The library wasn't available, so install it:
pip install snakebite

Find out the address and port to use for requests:
hdfs getconf -confKey fs.defaultFS

Then start a Python shell and work in it (alternatively, you can prepare a script and run it).
'''

from snakebite.client import Client

client = Client('manager.novalocal', 8020)

# Let's see what we have in the working directory
for x in client.ls(['/student9_7']):
    print(x)

'''
{'group': u'supergroup', 'permission': 420, 'file_type': 'f', 'access_time': 1605967318187L, 'block_replication': 3, 'modification_time': 1605967318265L, 'length': 1705L, 'blocksize': 134217728L, 'owner': u'student9_7', 'path': '/student9_7/cur_readme'}
{'group': u'supergroup', 'permission': 420, 'file_type': 'f', 'access_time': 1605953216696L, 'block_replication': 3, 'modification_time': 1605953220706L, 'length': 7104L, 'blocksize': 134217728L, 'owner': u'student9_7', 'path': '/student9_7/googlobots.txt'}
{'group': u'supergroup', 'permission': 420, 'file_type': 'f', 'access_time': 1605966686950L, 'block_replication': 3, 'modification_time': 1605966688013L, 'length': 1705L, 'blocksize': 134217728L, 'owner': u'student9_7', 'path': '/student9_7/readme'}
{'group': u'supergroup', 'permission': 420, 'file_type': 'f', 'access_time': 1605964109596L, 'block_replication': 2, 'modification_time': 1605946691680L, 'length': 19L, 'blocksize': 134217728L, 'owner': u'student9_7', 'path': '/student9_7/test'}
{'group': u'supergroup', 'permission': 420, 'file_type': 'f', 'access_time': 1605964267111L, 'block_replication': 3, 'modification_time': 1605964267975L, 'length': 19L, 'blocksize': 134217728L, 'owner': u'student9_7', 'path': '/student9_7/test2'}
{'group': u'supergroup', 'permission': 493, 'file_type': 'd', 'access_time': 0L, 'block_replication': 0, 'modification_time': 1605950057832L, 'length': 0L, 'blocksize': 0L, 'owner': u'student9_7', 'path': '/student9_7/testdir'}
'''

# Create a couple of directories
for p in client.mkdir(['/student9_7/py_dir_01', '/student9_7/py_dir_02'], create_parent=True):
    print(p)
'''
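As a hedged illustration of the lookup step above (assuming the hdfs CLI is on PATH and fs.defaultFS looks like hdfs://host:port), the NameNode host and port could also be derived programmatically before constructing the Client; the helper below is a hypothetical addition, not part of the original notes:

import subprocess
from snakebite.client import Client

def namenode_from_defaultfs():
    # e.g. "hdfs://manager.novalocal:8020" -> ("manager.novalocal", 8020)
    out = subprocess.check_output(
        ['hdfs', 'getconf', '-confKey', 'fs.defaultFS']).strip()
    host_port = out.split('://', 1)[1]
    host, port = host_port.split(':')
    return host, int(port)

host, port = namenode_from_defaultfs()
client = Client(host, port)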