import os

from snakebite.client import Client


def scan_event_files(env):
    """List event files in HDFS belonging to the same clip group as env['first_clip']."""
    hdfs = env['hdfs']
    host, port = hdfs.split(':')
    client = Client(host, int(port), use_trash=False, effective_user='******')
    event_files = []
    # the event directory is named after the clip basename without its trailing index
    basename = '_'.join(os.path.basename(env['first_clip']).split('_')[:-1])
    event_dir = os.path.join(env['event_dir'], basename)
    if not client.test(event_dir, exists=True, directory=True):
        return event_files
    for item in client.ls([event_dir]):
        if item['file_type'] == 'f':
            event_files.append(os.path.basename(item['path']))
    return event_files
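# Usage sketch for scan_event_files (the values below are assumptions for illustration,
# not taken from the original project): with this env, the function strips the trailing
# "_0001.mp4" from the clip name and looks for files under /data/events/camera_front
# on the namenode at namenode.example.com:8020.
example_env = {
    'hdfs': 'namenode.example.com:8020',               # assumed "host:port" string
    'first_clip': '/incoming/camera_front_0001.mp4',   # assumed clip path
    'event_dir': '/data/events',                       # assumed event root directory
}
# event_files = scan_event_files(example_env)  # requires a reachable HDFS cluster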
from os import path

from snakebite.client import Client


class HDFSStat(object):

    cluster = 'hostname'
    port = 8020
    default_path = '/user/hive/warehouse'

    @staticmethod
    def build_path(table):
        # map "db.table" to "<default_path>/db.db/table"
        nm = table.split('.')[0]
        tb = table.split('.')[1]
        return HDFSStat.default_path + '/' + nm + '.db/' + tb

    def __init__(self):
        self.client = Client(HDFSStat.cluster, HDFSStat.port, use_trash=False)

    def latest_partition(self, table_name, table_path=None):
        t_path = HDFSStat.build_path(table_name) if table_path is None else table_path
        latest_dir = list(self.client.ls([t_path])).pop()
        return path.basename(latest_dir['path']).split('=')[1]

    def poke_partition(self, table_name, partition_name, partition, table_path=None):
        t_path = HDFSStat.build_path(table_name) if table_path is None else table_path
        partition_path = t_path + '/' + partition_name + '=' + partition
        return self.client.test(partition_path, exists=True, directory=True, zero_length=False)
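# Usage sketch for HDFSStat (table name, partition column and value are assumptions for
# illustration; a live namenode at HDFSStat.cluster:HDFSStat.port is required):
#
#   stat = HDFSStat()
#   HDFSStat.build_path('analytics.page_views')              # '/user/hive/warehouse/analytics.db/page_views'
#   stat.latest_partition('analytics.page_views')            # e.g. '2018-08-02'
#   stat.poke_partition('analytics.page_views', 'ds', '2018-08-02')  # True if the partition dir exists and is non-empty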
def compose_hdfs_commands(year, month, day, args, config):

    # set up the hdfs client to be used in order to check the files
    namenode = config.get("HDFS", "namenode")
    client = Client(namenode.hostname, namenode.port, use_trash=False)

    # hdfs sync path for the tenant
    hdfs_user = config.get("HDFS", "user")
    tenant = args.tenant
    hdfs_sync = config.get("HDFS", "path_sync")
    hdfs_sync = hdfs_sync.fill(namenode=namenode.geturl(), hdfs_user=hdfs_user, tenant=tenant).geturl()

    hdfs_metric = config.get("HDFS", "path_metric")
    hdfs_metric = hdfs_metric.fill(namenode=namenode.geturl(), hdfs_user=hdfs_user, tenant=tenant).geturl()

    # dictionary holding all the commands with their respective arguments' name
    hdfs_commands = dict()

    # file location of previous day's metric data (local or hdfs)
    hdfs_commands["--pdata"] = hdfs_check_path(
        hdfs_metric + "/" + str(datetime.date(year, month, day) - datetime.timedelta(1)), client)

    # file location of target day's metric data (local or hdfs)
    hdfs_commands["--mdata"] = hdfs_check_path(hdfs_metric + "/" + args.date, client)

    # file location of report configuration json file (local or hdfs)
    hdfs_commands["--conf"] = hdfs_check_path(
        hdfs_sync + "/" + args.tenant + "_" + args.report + "_cfg.json", client)

    # file location of metric profile (local or hdfs)
    hdfs_commands["--mps"] = date_rollback(
        hdfs_sync + "/" + args.report + "/" + "metric_profile_" + "{{date}}" + ".avro",
        year, month, day, config, client)

    # file location of operations profile (local or hdfs)
    hdfs_commands["--ops"] = hdfs_check_path(hdfs_sync + "/" + args.tenant + "_ops.json", client)

    # file location of aggregations profile (local or hdfs)
    hdfs_commands["--apr"] = hdfs_check_path(
        hdfs_sync + "/" + args.tenant + "_" + args.report + "_ap.json", client)

    if args.thresholds:
        # file location of thresholds rules file (local or hdfs)
        hdfs_commands["--thr"] = hdfs_check_path(
            os.path.join(hdfs_sync, "".join([args.tenant, "_", args.report, "_thresholds.json"])), client)

    # file location of endpoint group topology file (local or hdfs)
    hdfs_commands["-egp"] = date_rollback(
        hdfs_sync + "/" + args.report + "/" + "group_endpoints_" + "{{date}}" + ".avro",
        year, month, day, config, client)

    # file location of group of groups topology file (local or hdfs)
    hdfs_commands["-ggp"] = date_rollback(
        hdfs_sync + "/" + args.report + "/" + "group_groups_" + "{{date}}" + ".avro",
        year, month, day, config, client)

    # file location of weights file (local or hdfs)
    hdfs_commands["--weights"] = date_rollback(
        hdfs_sync + "/" + args.report + "/weights_" + "{{date}}" + ".avro",
        year, month, day, config, client)

    # file location of downtimes file (local or hdfs)
    hdfs_commands["--downtimes"] = hdfs_check_path(
        hdfs_sync + "/" + args.report + "/downtimes_" + str(datetime.date(year, month, day)) + ".avro", client)

    # file location of recomputations file (local or hdfs)
    # first check if there is a recomputations file for the given date
    # recomputation lies in the hdfs in the form of
    # /sync/recomp_TENANTNAME_ReportName_2018-08-02.json
    if client.test(urlparse(hdfs_sync + "/recomp_" + args.tenant + "_" + args.report + "_" + args.date + ".json").path, exists=True):
        hdfs_commands["--rec"] = hdfs_sync + "/recomp_" + args.tenant + "_" + args.report + "_" + args.date + ".json"
    else:
        hdfs_commands["--rec"] = hdfs_check_path(hdfs_sync + "/recomp.json", client)

    return hdfs_commands
swift_client = swift.Connection(user=swift_user, key=swift_key, authurl=swift_authurl)

# read list of files
src_files = []
if run_mode == "hdfs":
    # spotify's snakebite as hdfs client
    src_files = [hdfs_url + files['path'] for files in hdfs_client.ls([source_files])]

    # deleting output directory if exists
    if hdfs_client.test(target_dir, exists=True, directory=True):
        hdfs_client.delete(target_dir)
        hdfs_client.rmdir(target_dir)

elif run_mode == "swift":
    # read list of files from swift
    src_files = []
    src_file_regex = re.compile(source_files)
    for data in swift_client.get_container(source_dir)[1]:
        if src_file_regex.match(data['name']):
            src_files.append(data['name'])
    src_files.sort(key=lambda x: os.path.basename(x))

else:
    # read list of files from local
    src_files = filter(os.path.isfile,
def is_exist(dirPath, master=public.SPARK_MASTER, port=public.SPARK_MASTER_PORT):
    client = Client(master, port, use_trash=False)
    return client.test(dirPath, exists=True, directory=True)
def exists(self):
    client = Client(self._host, self._port, effective_user=self._user, use_trash=False)
    return client.test(self._partial, exists=True)
swiftConf.set(key, value)

swift_client = swift.Connection(user=swift_user, key=swift_key, authurl=swift_authurl)

# read list of files
src_files = []
if run_mode == "hdfs":
    # spotify's snakebite as hdfs client
    src_files = [hdfs_url + files['path'] for files in hdfs_client.ls([source_files])]

    # deleting output directory if exists
    if hdfs_client.test(target_dir, exists=True, directory=True):
        hdfs_client.delete(target_dir)
        hdfs_client.rmdir(target_dir)

elif run_mode == "swift":
    # read list of files from swift
    src_files = []
    # match pagecount dumps from 48 to 70 hours ago, one regex alternative per hour
    source_files = '|'.join([
        '(pagecounts-' + (datetime.now() - timedelta(hours=i)).strftime("%Y%m%d-%H") + '(.*))'
        for i in range(48, 71)
    ])
    src_file_regex = re.compile(source_files)
    for data in swift_client.get_container(source_dir)[1]:
        if src_file_regex.match(data['name']):
            src_files.append(data['name'])
    src_files.sort(key=lambda x: os.path.basename(x))

else:
    # read list of files from local
#!/usr/bin/env python
from snakebite.client import Client

client = Client("trevally.amer.nevint.com", 9000, use_trash=False)


def list_recursive(path):
    # walk the directory tree, printing every regular file
    for x in client.ls([path]):
        if x['file_type'] == 'd':
            list_recursive(x['path'])
        else:
            print(x['path'])


target = '/data/hub/vehicle/MKZ-Grey'
if client.test(target, directory=True):
    list_recursive(target)