Example #1
import os

from snakebite.client import Client


def scan_event_files(env):
    # env['hdfs'] holds the namenode address as a "host:port" string
    hdfs = env['hdfs']
    host, port = hdfs.split(':')
    client = Client(host, int(port), use_trash=False, effective_user='******')
    event_files = []

    # event files live under <event_dir>/<first_clip basename minus its last "_" segment>
    basename = '_'.join(os.path.basename(env['first_clip']).split('_')[:-1])
    event_dir = os.path.join(env['event_dir'], basename)

    if not client.test(event_dir, exists=True, directory=True):
        return event_files

    # collect only plain files ('f'); directories are reported as 'd'
    for item in client.ls([event_dir]):
        if item['file_type'] == 'f':
            event_files.append(os.path.basename(item['path']))
    return event_files
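
A minimal usage sketch, assuming env carries the namenode address as a "host:port" string; every hostname, path, and file name below is hypothetical:

env = {
    'hdfs': 'namenode.example.com:8020',             # assumed "host:port" string
    'first_clip': '/clips/cam01_20180802_0001.mp4',  # last "_" segment is stripped
    'event_dir': '/events',
}
print(scan_event_files(env))  # lists files under /events/cam01_20180802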
Example #2
from os import path

from snakebite.client import Client


class HDFSStat(object):

    cluster = 'hostname'
    port = 8020
    default_path = '/user/hive/warehouse'

    @staticmethod
    def build_path(table):
        # table is given as "<database>.<table>"
        nm, tb = table.split('.', 1)
        return HDFSStat.default_path + '/' + nm + '.db/' + tb

    def __init__(self):
        self.client = Client(HDFSStat.cluster, HDFSStat.port, use_trash=False)

    def latest_partition(self, table_name, table_path=None):
        t_path = HDFSStat.build_path(table_name) if table_path is None else table_path
        # ls() lists the partition directories; the last entry is the latest
        latest_dir = list(self.client.ls([t_path])).pop()
        # partition dirs are named "<key>=<value>"; return the value part
        return path.basename(latest_dir['path']).split('=')[1]

    def poke_partition(self, table_name, partition_name, partition, table_path=None):
        t_path = HDFSStat.build_path(table_name) if table_path is None else table_path
        partition_path = t_path + '/' + partition_name + '=' + partition
        return self.client.test(partition_path, exists=True, directory=True, zero_length=False)
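
Hypothetical usage of the class above; the table name and partition key are made up for illustration:

stat = HDFSStat()
latest = stat.latest_partition('analytics.events')   # e.g. '2018-08-02'
present = stat.poke_partition('analytics.events', 'ds', latest)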
Example #3
def compose_hdfs_commands(year, month, day, args, config):

    # set up the hdfs client used to check the files
    namenode = config.get("HDFS", "namenode")
    client = Client(namenode.hostname, namenode.port, use_trash=False)

    # hdfs sync path for the tenant
    hdfs_user = config.get("HDFS", "user")
    tenant = args.tenant
    hdfs_sync = config.get("HDFS", "path_sync")
    hdfs_sync = hdfs_sync.fill(namenode=namenode.geturl(), hdfs_user=hdfs_user, tenant=tenant).geturl()

    # hdfs metric path for the tenant
    hdfs_metric = config.get("HDFS", "path_metric")
    hdfs_metric = hdfs_metric.fill(namenode=namenode.geturl(), hdfs_user=hdfs_user, tenant=tenant).geturl()

    # dictionary holding all the commands with their respective arguments' name
    hdfs_commands = dict()

    # file location of previous day's metric data (local or hdfs)
    hdfs_commands["--pdata"] = hdfs_check_path(
        hdfs_metric + "/" + str(datetime.date(year, month, day) - datetime.timedelta(1)), client)

    # file location of target day's metric data (local or hdfs)
    hdfs_commands["--mdata"] = hdfs_check_path(hdfs_metric + "/" + args.date, client)

    # file location of report configuration json file (local or hdfs)
    hdfs_commands["--conf"] = hdfs_check_path(hdfs_sync + "/" + args.tenant+"_"+args.report+"_cfg.json", client)

    # file location of metric profile (local or hdfs)
    hdfs_commands["--mps"] = date_rollback(
        hdfs_sync + "/" + args.report + "/" + "metric_profile_" + "{{date}}" + ".avro", year, month, day, config,
        client)

    # file location of operations profile (local or hdfs)
    hdfs_commands["--ops"] = hdfs_check_path(hdfs_sync+"/"+args.tenant+"_ops.json",  client)

    # file location of aggregations profile (local or hdfs)
    hdfs_commands["--apr"] = hdfs_check_path(hdfs_sync+"/"+args.tenant+"_"+args.report+"_ap.json", client)

    if args.thresholds:
        # file location of thresholds rules file (local or hdfs)
        hdfs_commands["--thr"] = hdfs_check_path(
            os.path.join(hdfs_sync, "".join([args.tenant, "_", args.report, "_thresholds.json"])), client)

    # file location of endpoint group topology file (local or hdfs)
    hdfs_commands["-egp"] = date_rollback(
        hdfs_sync + "/" + args.report + "/" + "group_endpoints_" + "{{date}}" + ".avro", year, month, day, config,
        client)

    # file location of group of groups topology file (local or hdfs)
    hdfs_commands["-ggp"] = date_rollback(hdfs_sync + "/" + args.report + "/" + "group_groups_" + "{{date}}" + ".avro",
                                          year, month, day, config, client)

    # file location of weights file (local or hdfs)
    hdfs_commands["--weights"] = date_rollback(hdfs_sync + "/" + args.report + "/weights_" + "{{date}}" + ".avro", year,
                                               month, day, config, client)

    # file location of downtimes file (local or hdfs)
    hdfs_commands["--downtimes"] = hdfs_check_path(
        hdfs_sync + "/" + args.report + "/downtimes_" + str(datetime.date(year, month, day)) + ".avro", client)

    # file location of recomputations file (local or hdfs)
    # first check if there is a recomputations file for the given date;
    # recomputations live in hdfs in the form of
    # /sync/recomp_TENANTNAME_ReportName_2018-08-02.json
    recomp_path = hdfs_sync + "/recomp_" + args.tenant + "_" + args.report + "_" + args.date + ".json"
    if client.test(urlparse(recomp_path).path, exists=True):
        hdfs_commands["--rec"] = recomp_path
    else:
        hdfs_commands["--rec"] = hdfs_check_path(hdfs_sync + "/recomp.json", client)

    return hdfs_commands
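
This function leans on two helpers defined elsewhere in the project, hdfs_check_path and date_rollback. As a rough sketch of the contract hdfs_check_path appears to satisfy (hand the path back if it exists on HDFS), one might write something like the following; the real helper may fall back to local paths or exit instead:

def hdfs_check_path(uri, client):
    # urlparse is already used by the snippet above, so it is assumed imported;
    # client.test() checks existence of the bare path component
    if client.test(urlparse(uri).path, exists=True):
        return uri
    raise IOError("path not found on hdfs: %s" % uri)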
Example #4
    swift_client = swift.Connection(user=swift_user,
                                    key=swift_key,
                                    authurl=swift_authurl)

# read list of files
src_files = []

if run_mode == "hdfs":
    # spotify's snakebite as hdfs client
    src_files = [
        hdfs_url + files['path'] for files in hdfs_client.ls([source_files])
    ]

    # delete the output directory if it exists; snakebite methods take a
    # list of paths and return lazy generators, so wrap them in list() to
    # force execution
    if hdfs_client.test(target_dir, exists=True, directory=True):
        list(hdfs_client.delete([target_dir]))
        list(hdfs_client.rmdir([target_dir]))

elif run_mode == "swift":
    # read list of files from swift
    src_file_regex = re.compile(source_files)
    for data in swift_client.get_container(source_dir)[1]:
        if src_file_regex.match(data['name']):
            src_files.append(data['name'])

    src_files.sort(key=lambda x: os.path.basename(x))

else:
    # read list of files from local
    src_files = filter(os.path.isfile,
Example #5
def is_exist(dirPath, master=public.SPARK_MASTER, port=public.SPARK_MASTER_PORT):
    # True only if dirPath exists on HDFS and is a directory
    client = Client(master, port, use_trash=False)
    return client.test(dirPath, exists=True, directory=True)
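
A hypothetical call, assuming public is the project's own settings module that holds the Spark master host and port:

if is_exist('/user/hive/warehouse/analytics.db/events'):
    print('directory present')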
Example #6
    def exists(self):
        client = Client(self._host, self._port, effective_user=self._user, use_trash=False)
        return client.test(self._partial, exists=True)
Example #7
    def exists(self):
        client = Client(self._host,
                        self._port,
                        effective_user=self._user,
                        use_trash=False)
        return client.test(self._partial, exists=True)
Example #8
    swiftConf.set(key, value)

  swift_client = swift.Connection(
      user=swift_user,
      key=swift_key,
      authurl=swift_authurl)

# read list of files
src_files = []

if run_mode == "hdfs":
  # spotify's snakebite as hdfs client
  src_files = [ hdfs_url + files['path'] for files in hdfs_client.ls([source_files]) ]

  # delete the output directory if it exists; snakebite methods take a
  # list of paths and return lazy generators, so wrap them in list() to
  # force execution
  if hdfs_client.test(target_dir, exists=True, directory=True):
    list(hdfs_client.delete([target_dir]))
    list(hdfs_client.rmdir([target_dir]))

elif run_mode == "swift":  
  # read list of files from swift  src_files = []
  source_files = '|'.join([ '(pagecounts-' + (datetime.now() - timedelta(hours=i)).strftime("%Y%m%d-%H") + '(.*))' for i in range(48, 71) ])
  src_file_regex = re.compile(source_files)
  for data in swift_client.get_container(source_dir)[1]:
    if src_file_regex.match(data['name']):
      src_files.append(data['name'])
  
  src_files.sort(key=lambda x: os.path.basename(x))

else:
  # read list of files from local
Example #9
#!/usr/bin/env python

from snakebite.client import Client
client = Client("trevally.amer.nevint.com", 9000, use_trash=False)


def list_recursive(path):
    # depth-first walk: recurse into directories, print plain files
    for x in client.ls([path]):
        if x['file_type'] == 'd':
            list_recursive(x['path'])
        else:
            print(x['path'])

target = '/data/hub/vehicle/MKZ-Grey'

if client.test(target, directory=True):
    list_recursive(target)
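
If the installed snakebite version supports it, ls() can also recurse on its own, avoiding the explicit recursion above; a sketch:

for x in client.ls([target], recurse=True):
    if x['file_type'] == 'f':
        print(x['path'])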