def _hdfs_initialize():
    global _hdfs_client
    if _hdfs_client is None:
        _hdfs_client = hdfs.InsecureClient(_hdfs_url, root=_hdfs_root, user=_hdfs_user_name)
        _hdfs_client.set_permission(_hdfs_root, 777)
def hdfs_client(self):
    import hdfs

    logger.debug("HDFSConfig: %s", self.hdfscli_config)
    try:
        return hdfs.config.Config(self.hdfscli_config).get_client(self.alias)
    except hdfs.util.HdfsError as exc:
        exc_msg = str(exc)
        errors = (
            "No alias specified",
            "Invalid configuration file",
            f"Alias {self.alias} not found",
        )
        if not any(err in exc_msg for err in errors):
            raise

        http_url = f"http://{self.host}:{self.port}"
        logger.debug("URL: %s", http_url)

        if self.token is not None:
            client = hdfs.TokenClient(http_url, token=self.token, root="/")
        else:
            client = hdfs.InsecureClient(http_url, user=self.user, root="/")
        return client
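For reference, `hdfs.config.Config` reads an HdfsCLI-style INI file; below is a minimal sketch of resolving a client from one. The path, alias, URL, and user are placeholders, not values from the snippet above.

# Hedged sketch of the HdfsCLI config file that hdfs.config.Config expects, e.g.:
#
#   [global]
#   default.alias = dev
#
#   [dev.alias]
#   url = http://namenode.example.com:50070
#   user = hdfs
#
import hdfs

client = hdfs.config.Config('/etc/hdfscli.cfg').get_client('dev')
print(client.status('/'))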
def __init__(self, host=None, port=None, user=None):
    host = self.get_config('namenode_host') if host is None else host
    port = self.get_config('namenode_port') if port is None else port
    user = self.get_config('user') if user is None else user
    url = 'http://' + host + ':' + port
    self.webhdfs = webhdfs.InsecureClient(url=url, user=user)
def __init__(self, config):
    self.paths = config['paths']
    self.mode = int(config['mode'], 8)
    self.user = config['user']
    self.group = config['group']
    # FIXME: Handle rotated files keeping correctly
    # self.keep_files = int(config['rotate'])
    self.compress = config['compress']
    self.copy = config['copy']
    self.copytohdfs = config['copytohdfs']
    self.hdfs_config = config['hdfs']
    self.hdfs_client = None
    if self.hdfs_config:
        self.hdfs_client = hdfs.InsecureClient(**self.hdfs_config)
    self.dateformat = config['dateformat']
    self.now = datetime.datetime.now()
    self.timestamp = self.now.strftime(self.dateformat)
    self.destext = config['destext']
    self.fnformat = config['fnformat']
    if not self.fnformat:
        raise ValueError("'fnformat' cannot be empty")
    self.sharedscripts = config['sharedscripts']
    self.prerotates = config['prerotate']
    self.postrotates = config['postrotate']
    self.queuepath = config['queuepath']
    self.queue_chunksize = 1000
    self.queue_block_timeout = 30
    self.queue = Queue(self.queuepath, self.queue_chunksize)
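Since the client above is built with `hdfs.InsecureClient(**self.hdfs_config)`, the 'hdfs' section of the config is presumably a plain mapping of InsecureClient keyword arguments; a hedged sketch with placeholder values:

# Sketch of a config['hdfs'] mapping compatible with InsecureClient(**...);
# all values are placeholders, not taken from the original code.
import hdfs

hdfs_section = {
    'url': 'http://namenode.example.com:50070',  # WebHDFS endpoint (required)
    'user': 'logrotate',                         # optional WebHDFS user
    'root': '/logs',                             # optional root that paths are resolved against
}
client = hdfs.InsecureClient(**hdfs_section)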
def lec(adresseHdfs, repDataHdfs):
    # Read Avro records from HDFS and print them, decoding against this schema.
    schema = {
        "namespace": "ffo.hashtag",
        "type": "record",
        "name": "Node",
        "fields": [
            {"name": "datehashtag", "type": "string"},
            {"name": "timestamp", "type": "int"},
            {"name": "hashtags", "type": {"type": "array", "items": "string"}, "default": []},
        ]
    }
    hdfs_client = hdfs.InsecureClient(adresseHdfs)
    with hdfs_client.read(repDataHdfs) as of:
        reader = fastavro.reader(of, schema)
        for node in reader:
            print(node)
def __init__(
        self, jobs, jobname=datetime.now().strftime("%Y%m%d%H%M%S"),
        warcs=None, viral=None, logs=None, start_date=None,
        dummy_run=False, hash_cache_file=None, client=None):
    """Sets up fields."""
    if client is None:
        self.client = hdfs.InsecureClient(cfg.get('hdfs', 'url'), user=cfg.get('hdfs', 'user'))
    else:
        self.client = client
    self.dummy = dummy_run
    if self.dummy:
        logger.info("This is a dummy-run - no real ARKs will be minted.")
    self.overwrite = False
    self.jobs = jobs
    self.jobname = jobname
    self.warcs = warcs
    self.viral = viral
    self.logs = logs
    self.hash_cache = {}
    self.parse_hash_cache(hash_cache_file)
    self.startdate = start_date
    self.BAGIT_CONTACT_NAME = "Andrew N. Jackson"
    self.BAGIT_CONTACT_EMAIL = "*****@*****.**"
    self.BAGIT_DESCRIPTION = "LD Crawl: "
    self.ARK_URL = "http://pii.ad.bl.uk/pii/vdc?arks="
    self.ARK_PREFIX = "ark:/81055/vdc_100022535899.0x"
    # And create:
    logger.info("Processing job files...")
    self.processJobs()
    logger.info("Generating METS...")
    self.createMets()
def __init__(self, queue, hdfs_base_url, path):
    self.queue = queue
    self.client = hdfs.InsecureClient(hdfs_base_url)
    self.base_path = path
    self.thread = threading.Thread(target=self.run, args=())
    self.thread.daemon = True
    self.thread.start()
def __init__(self, pipeline_options):
    """Initializes a connection to HDFS.

    Connection configuration is done by passing pipeline options.
    See :class:`~apache_beam.options.pipeline_options.HadoopFileSystemOptions`.
    """
    super(HadoopFileSystem, self).__init__(pipeline_options)
    logging.getLogger('hdfs.client').setLevel(logging.WARN)
    if pipeline_options is None:
        raise ValueError('pipeline_options is not set')
    if isinstance(pipeline_options, PipelineOptions):
        hdfs_options = pipeline_options.view_as(HadoopFileSystemOptions)
        hdfs_host = hdfs_options.hdfs_host
        hdfs_port = hdfs_options.hdfs_port
        hdfs_user = hdfs_options.hdfs_user
    else:
        hdfs_host = pipeline_options.get('hdfs_host')
        hdfs_port = pipeline_options.get('hdfs_port')
        hdfs_user = pipeline_options.get('hdfs_user')
    if hdfs_host is None:
        raise ValueError('hdfs_host is not set')
    if hdfs_port is None:
        raise ValueError('hdfs_port is not set')
    if hdfs_user is None:
        raise ValueError('hdfs_user is not set')
    self._hdfs_client = hdfs.InsecureClient(
        'http://%s:%s' % (hdfs_host, str(hdfs_port)), user=hdfs_user)
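Going by the non-PipelineOptions branch above, the filesystem can also be constructed from a plain dict; a minimal sketch with placeholder values (the import path assumes this is Beam's hadoopfilesystem module):

# Construction sketch based on the dict branch above; host, port and user are
# placeholders, and the import path is an assumption.
from apache_beam.io.hadoopfilesystem import HadoopFileSystem

fs = HadoopFileSystem({
    'hdfs_host': 'namenode.example.com',
    'hdfs_port': 50070,
    'hdfs_user': 'hadoop',
})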
def __init__(self):
    # For WebHDFS reads and writes without Kerberos, /etc/hosts must map the
    # same hostnames/IPs as the Hadoop cluster.
    self.HDFSClient = hdfs.InsecureClient(WebHDFSAddr, root='/', user="******")
    self.localInputPathTemp = localInputPathTemp
    self.localOutputPathTemp = localOutputPathTemp
    self.NginxAddr = NginxAddr
def client(self):
    # A naive benchmark showed that 1000 existence checks took 2.5 secs
    # when not recreating the client, and 4.0 secs when recreating it, so
    # it's not urgent to memoize it. Note that there *might* be issues with
    # process forking and whatnot (as with the snakebite client) if we
    # memoize it too trivially.
    import hdfs
    return hdfs.InsecureClient(url=self.url, user=self.user)
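If the client were memoized despite the forking caveat in the comment above, a minimal sketch could look like this; the cache attribute name is an assumption, not part of the original code.

# Minimal memoization sketch for the property above; _hdfs_client_cache is an
# assumed attribute name.
def client(self):
    import hdfs
    if getattr(self, '_hdfs_client_cache', None) is None:
        self._hdfs_client_cache = hdfs.InsecureClient(url=self.url, user=self.user)
    return self._hdfs_client_cache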
def get_all_identifiers(sip):
    """Parses the SIP in HDFS and retrieves FILE/ARK tuples."""
    # client = hdfs.InsecureClient(cfg.get('hdfs', 'url'), user=cfg.get('hdfs', 'user'))
    client = hdfs.InsecureClient(HDFS_URL, HDFS_USER)
    tar = "%s/%s.tar.gz" % (SIP_ROOT, sip)
    status = client.status(tar, strict=False)
    if status:
        # Catch empty packages:
        if status['length'] == 0:
            logger.warning("Empty (zero byte) SIP package: %s" % tar)
            yield None
        else:
            with client.read(tar) as reader:
                t = reader.read()
            # Open the package:
            tar = tarfile.open(mode="r:gz", fileobj=StringIO(t))
            foundMets = False
            for i in tar.getmembers():
                logger.debug("Examining %s" % i.name)
                if i.name.endswith(".xml"):
                    foundMets = True
                    xml = tar.extractfile(i).read()
                    try:
                        tree = etree.fromstring(xml)
                        files = {}
                        n_files = 0
                        for mfile in tree.xpath("//mets:file", namespaces=NS):
                            # logger.debug("Found mets:file = %s " % etree.tostring(mfile))
                            admid = mfile.attrib["ADMID"]
                            logger.info("Found mets:file admid = %s " % admid)
                            path = mfile.xpath("mets:FLocat", namespaces=NS)[0].attrib["%shref" % XLINK]
                            files[admid] = {
                                "path": path,
                                "mimetype": mfile.attrib["MIMETYPE"],
                                "size": mfile.attrib["SIZE"],
                                "checksum_type": mfile.attrib["CHECKSUMTYPE"],
                                "checksum": mfile.attrib["CHECKSUM"],
                            }
                            n_files = n_files + 1
                        if len(files.keys()) != n_files:
                            logger.error("ERROR, more files than IDs")
                        n_amdsecs = 0
                        for amdsec in tree.xpath("//mets:amdSec", namespaces=NS):
                            # logger.debug("Found mets:amdSec = %s " % etree.tostring(amdsec))
                            admid = amdsec.attrib["ID"]
                            logger.info("Found mets:amdSec id = %s " % admid)
                            oiv = amdsec.xpath(
                                "mets:digiprovMD/mets:mdWrap/mets:xmlData/premis:object/"
                                "premis:objectIdentifier/premis:objectIdentifierValue",
                                namespaces=NS)
                            if oiv and len(oiv) == 1:
                                files[admid]['ark'] = oiv[0].text
                                n_amdsecs = n_amdsecs + 1
                                logger.debug("Yielding %s" % files[admid])
                                yield files[admid]
                            else:
                                logger.info("Skipping amdSec ID=%s" % admid)
                        if n_files != n_amdsecs:
                            logger.error("ERROR finding all amdSec elements")
                    except IndexError as i:
                        logger.error("Problem parsing METS for SIP: %s" % sip)
                        logger.exception(i)
            if not foundMets:
                logger.error("No METS XML file found!")
    else:
        logger.warning("Could not find SIP: hdfs://%s" % tar)
def __get_connection(self):
    try:
        self.__hdfs_client = hdfs.InsecureClient(
            "http://" + self.__active_nn + ":" + self.__port,
            user=self.__hdfs_user)
        logger.debug("established connection with hdfs!")
    except Exception as error:
        logger.error("cannot establish connection with hdfs!")
        raise error
def main(number, workers):
    spark = SparkSession \
        .builder \
        .appName("Locomotion-Extract-TDMS") \
        .getOrCreate()
    sc = spark.sparkContext
    hdfs.InsecureClient(SERVER).makedirs('/generated/output' + number)
    # Recursive matching of the '**' pattern needs recursive=True (Python 3.5+).
    files = glob.glob('generated/input' + number + '/**/*.tdms', recursive=True)
    g = sc.parallelize(files, workers).map(lambda f: transform_and_store(f, number)).collect()
    spark.stop()
def do_hdfs_upload(myid, url, safe_url, keyhash, fname, fdata, ftag):
    # Enforce the file name to be under the public-hash prefix or the sealed prefix.
    if not fname.startswith("/" + keyhash) and not fname.startswith(sealed_prefix):
        fname = "/%s/%s" % (keyhash, fname)
    # Create the hdfs client on demand now. No need to optimize for the PoC.
    hdfs_client = hdfs.InsecureClient(url, user=ftag)
    return hdfs_client.write(fname, fdata, overwrite=True, permission=666)
def do_hdfs_upload(url, keyhash, fname, fdata, ftag):
    # Enforce the file name to be under the public-hash prefix or the sealed prefix.
    if not fname.startswith("/" + keyhash) and not fname.startswith(sealed_prefix):
        fname = "/%s/%s" % (keyhash, fname)
    dirname = os.path.dirname(fname)
    prepare_user_dir(url, dirname, keyhash)
    # Create the hdfs client on demand now. No need to optimize for the PoC.
    hdfs_client = hdfs.InsecureClient(url, user=ftag)
    hdfs_client.write(fname, fdata, overwrite=True, permission=666)
    return "file written"
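A possible call sketch for do_hdfs_upload() above; every value is a placeholder, and sealed_prefix plus prepare_user_dir must already exist in the module:

# Call sketch; all values are placeholders, not taken from the original code.
result = do_hdfs_upload(
    'http://namenode.example.com:50070',
    keyhash='0123abcd',
    fname='report.csv',
    fdata=b'col1,col2\n1,2\n',
    ftag='uploader',
)
print(result)  # "file written"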
def create(self):
    """
    Creates a webhdfs client instance.

    The concrete implementation depends on the client_type parameter: if it is
    'kerberos', a KerberosClient is created, otherwise an InsecureClient.

    :return: hdfs client
    """
    if self.client_type == 'kerberos':
        from hdfs.ext.kerberos import KerberosClient
        return KerberosClient(url=self.url)
    else:
        return hdfs.InsecureClient(url=self.url, user=self.user)
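A possible usage sketch, assuming a factory object exposing the create() method above with url, user, and client_type already configured; the paths below are placeholders:

# Usage sketch; 'factory' and the paths are assumptions for illustration only.
client = factory.create()              # KerberosClient or InsecureClient
client.makedirs('/tmp/example')
with client.write('/tmp/example/hello.txt', encoding='utf-8') as writer:
    writer.write(u'hello via webhdfs')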
def client_and_path(path):
    _, domain, folder_path, file_pattern = parse_file_uri(path)

    if ':' not in domain:
        port = 50070
    else:
        domain, port = domain.split(':')
        port = int(port)
    cache_id = domain + '__' + str(port)

    if cache_id not in Hdfs._conn:
        Hdfs._conn[cache_id] = hdfs.InsecureClient(  # pylint: disable=no-member
            'http://{0}:{1}'.format(domain, port))
    return Hdfs._conn[cache_id], folder_path + file_pattern
def find_identifiers(output_file):
    with open(output_file, 'w') as f:
        client = hdfs.InsecureClient(cfg.get('hdfs', 'url'), user=cfg.get('hdfs', 'user'))
        for (path, dirs, files) in client.walk(SIP_ROOT):
            logger.info("Looking at path " + path)
            for file in files:
                logger.info("Looking at file " + file)
                if file.endswith('.tar.gz'):
                    sip = "%s/%s" % (path, file)
                    sip = sip[len(SIP_ROOT) + 1:]
                    sip = sip[:-7]
                    logger.info("Scanning %s..." % sip)
                    for waid in get_all_identifiers(sip):
                        f.write("%s %s\n" % (sip, waid))
def calculateHash(path):
    logger.info("Starting to generate hash for %s" % path)
    client = hdfs.InsecureClient(cfg.get('hdfs', 'url'), user=cfg.get('hdfs', 'user'))
    sha = hashlib.sha512()
    with client.read(path) as file:
        while True:
            data = file.read(10485760)
            if not data:
                file.close()
                break
            sha.update(data)
    logger.info("Finished generating hash for %s" % path)
    return sha.hexdigest()
def calculateHash(path, client=None):
    if client is None:
        client = hdfs.InsecureClient('http://hdfs.gtw.wa.bl.uk:14000', user='******')
    logger.info("Starting to generate hash for %s" % path)
    sha = hashlib.sha512()
    with client.read(path) as file:
        while True:
            data = file.read(10485760)
            if not data:
                file.close()
                break
            sha.update(data)
    logger.info("Finished generating hash for %s" % path)
    return sha.hexdigest()
def __init__(self, date, warcs, viral, logs, identifiers, hash_cache=None, client=None):
    if client is None:
        self.client = hdfs.InsecureClient(cfg.get('hdfs', 'url'), user=cfg.get('hdfs', 'user'))
    else:
        self.client = client
    self.warcs = []
    self.viral = []
    self.date = date
    self.wq = Queue()
    self.vq = Queue()
    self.hash_cache = hash_cache

    for i in range(NUM_THREADS):
        worker = Thread(target=create_warcs, args=(self.wq, self.warcs, self))
        worker.setDaemon(True)
        worker.start()
    for warc in warcs:
        self.wq.put(warc)
    self.wq.join()

    for i in range(NUM_THREADS):
        worker = Thread(target=create_warcs, args=(self.vq, self.viral, self))
        worker.setDaemon(True)
        worker.start()
    for warc in viral:
        self.vq.put(warc)
    self.vq.join()

    self.logs = []
    for log in logs:
        self.logs.append(ZipContainer(path=log, parent=self))
    self.identifiers = identifiers
    self.createDomainMets()
    self.createCrawlerMets()
def client_and_path(path):
    _, domain, folder_path, file_pattern = parse_file_uri(path)

    if ':' not in domain:
        port = 50070
    else:
        domain, port = domain.split(':')
        port = int(port)
    cache_id = domain + '__' + str(port)

    if cache_id not in Hdfs._conn:
        if hdfs is None:
            raise FileSystemNotSupported(
                'hdfs not supported. Install the python package "hdfs".')
        Hdfs._conn[cache_id] = hdfs.InsecureClient(  # pylint: disable=no-member
            f'http://{domain}:{port}')
    return Hdfs._conn[cache_id], folder_path + file_pattern
def client_and_path(path):
    # obtain key
    t = Tokenizer(path)
    t.next('://')  # skip scheme
    domain = t.next('/')
    path = t.next()

    if ':' not in domain:
        port = 50070
    else:
        domain, port = domain.split(':')
        port = int(port)
    cache_id = domain + '__' + str(port)

    if cache_id not in Hdfs._conn:
        Hdfs._conn[cache_id] = hdfs.InsecureClient('http://{0}:{1}'.format(domain, port))
    return (Hdfs._conn[cache_id], '/' + path)
def main():
    setup_logging()
    parser = argparse.ArgumentParser(description="Create METS files.")
    parser.add_argument("jobs", metavar="J", type=str, nargs="+", help="Heritrix job name")
    parser.add_argument("-d", dest="dummy", action="store_true",
                        help="Do a dummy run, e.g. don't mint real ARKs.")
    parser.add_argument("-w", dest="warcs", help="File containing list of WARC paths.")
    parser.add_argument("-v", dest="viral", help="File containing list of viral WARC paths.")
    parser.add_argument("-l", dest="logs", help="File containing list of log paths.")
    parser.add_argument("-o", dest="output_root", help="Where to put the resulting SIP")
    parser.add_argument("-I", dest="hash_cache", help="File containing a hash look-up table.")
    parser.add_argument("-D", dest="start_date",
                        help="Start date, in '2015-08-27T14:39:19.000000' format (can be truncated).")
    args = parser.parse_args()

    client = hdfs.InsecureClient("http://hdfs.gtw.wa.bl.uk:14000/", user="******")
    sip = SipCreator(
        args.jobs,
        start_date=args.start_date,
        warcs=args.warcs,
        viral=args.viral,
        logs=args.logs,
        dummy_run=args.dummy,
        hash_cache_file=args.hash_cache,
        client=client)
    sip_dir = "%s/%s" % (args.output_root, sip.jobname)
    sip.create_sip(sip_dir)
    sip.copy_sip_to_hdfs(sip_dir, sip_dir)
def __init__(self):
    # Kaizen
    access_key = "4c3da79d02bb4a2e8f04495bff5203b2"
    secret_key = "b7bd5b4abcd34ca8a94e93e8b76527f4"
    s3a_endpoint_url = "https://kzn-swift.massopen.cloud"
    is_secure = False
    self.s3client = boto3.client(
        service_name='s3',
        aws_access_key_id=access_key,
        aws_secret_access_key=secret_key,
        endpoint_url=s3a_endpoint_url)
    self.s3conn = boto.connect_s3(
        aws_access_key_id=access_key,
        aws_secret_access_key=secret_key,
        host=s3a_endpoint_url,
        calling_format=boto.s3.connection.OrdinaryCallingFormat())

    hdfs_endpoint_url = 'http://kariz-1:50070'
    hdfs_user = '******'
    self.hdfsclient = hdfs.InsecureClient(hdfs_endpoint_url, user=hdfs_user)

    self.tpch_metadata, self.tpcds_metadata = inputs.prepare_tpc_metadata()
    self.tpch_runtime, self.tpcds_runtime = inputs.prepare_tpc_runtimes()
def hdfs_usage(request, root=None):
    host = settings.HDFS_STROAGE['hosts']
    if not root:
        root = settings.HDFS_STROAGE['HDFS_ROOT']
    log.info('http://%s/%s', host, root)
    client = hdfs.InsecureClient('http://%s' % host)
    tree = []
    try:
        for appcode in client.list(root):
            url = 'http://%s/webhdfs/v1%s/%s?op=GETCONTENTSUMMARY' % (host, root, appcode)
            data = requests.get(url).json()
            if data['ContentSummary']['directoryCount']:
                tree.append({
                    'name': appcode,
                    'value': round(data['ContentSummary']['length'] / 1000000.0, 2),
                })
    except HdfsError as e:
        log.warn("hdfs_usage error: %s", e)
        return HttpResponseNotFound(dumps(tree), content_type='application/json')
    return HttpResponse(dumps(tree), content_type='application/json')
def is_hdfs_running(host, admin_user):
    """Confirm that HDFS is available.

    There is a pathological case where the HBase master can start up briefly
    if HDFS is not available, and then quit immediately, but that can be long
    enough to give a false positive that the HBase master is running.

    Args:
        host: HDFS host:port
        admin_user: Admin username

    Returns:
        Boolean
    """
    try:
        hdfs_client = hdfs.InsecureClient('http://' + host, user=admin_user)
        LOGGER.info("Contents of HDFS root: {0}".format(hdfs_client.list('/')))
        return True
    except (requests.exceptions.ConnectionError, hdfs.util.HdfsError) as e:
        msg = 'Could not confirm HDFS is running at http://{0} - {1}'.format(host, e)
        LOGGER.error(msg)
        return False
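A possible usage sketch for the check above; the host:port and user values are placeholders:

# Usage sketch; host and user are placeholders.
if not is_hdfs_running('namenode.example.com:50070', 'hdfs'):
    LOGGER.error('Refusing to continue: HDFS is unavailable')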
def __init__(self, config):
    self.config = config
    self.dateformat = config['dateformat']
    self.keep_files = int(config['rotate'])
    self.now = datetime.datetime.now()
    self.dateext = self.now.strftime(self.dateformat)
    self.mode = config['mode']
    self.compress = config['compress']
    self.user = config['user']
    self.group = config['group']
    self.sharedscripts = config['sharedscripts']
    self.destext = config['destext']
    self.copy = config['copy']
    self.copytohdfs = config['copytohdfs']
    self.prerotates = config['prerotate']
    self.postrotates = config['postrotate']
    self.hdfs_config = config['hdfs']
    self.queuepath = config['queuepath']
    self.queue_chunksize = 1000
    self.queue_block_timeout = 30
    self.queue = Queue(self.queuepath, self.queue_chunksize)
    self.client = None
    if self.hdfs_config:
        self.client = hdfs.InsecureClient(**self.hdfs_config)
def __init__(self, job_id, launch_id):
    """Takes the checkpoint info and sets up data needed to build the SIP."""
    self.hdfs = hdfs.InsecureClient(cfg.get('hdfs', 'url'), user=cfg.get('hdfs', 'user'))
    # Set up paths:
    self.WARC_ROOT = "%s/output/warcs" % HERITRIX_HDFS_ROOT
    self.VIRAL_ROOT = "%s/output/viral" % HERITRIX_HDFS_ROOT
    self.IMAGE_ROOT = "%s/output/images" % HERITRIX_HDFS_ROOT
    self.LOG_ROOT = "%s/output/logs" % HERITRIX_HDFS_ROOT
    self.LOCAL_LOG_ROOT = "%s/output/logs" % HERITRIX_ROOT
    self.LOCAL_JOBS_ROOT = "%s/jobs" % HERITRIX_ROOT
    #
    self.job_id = job_id
    self.launch_id = launch_id
    self.job_launch_id = "%s/%s" % (job_id, launch_id)
    self.verify_job_launch_id()
    self.crawl_log = self.get_crawl_log()
    self.start_date = CrawlJobOutput.file_start_date([self.crawl_log])
    # Find the WARCs referenced from the crawl log:
    self.parse_crawl_log()
    # TODO: Get sha512 and ARK identifiers for WARCs now, and store in launch folder and thus the zip?
    # Bundle logs and configuration data into a zip and upload it to HDFS:
    self.upload_logs_as_zip()
#!/usr/bin/python
import sys

import hdfs

current_word = None
current_count = 0
word = None
unique_count = 0
duplicate_count = 0

client = hdfs.InsecureClient("http://localhost:50070", user="******")

# Use uniqs.txt and dups.txt to output corresponding tweets
with client.write('uniqs.txt', encoding='utf-8') as uniques, \
        client.write('dups.txt', encoding='utf-8') as duplicates:
    for line in sys.stdin:
        line = line.strip()
        # Get the word and the count
        word, count = line.split('\t', 1)
        try:
            count = int(count)
        except ValueError:
            continue
        # If the current word is this word, increment the counter with its value
        if current_word == word:
            current_count += count
        else:
            # If unique, write to the uniqs file; if duplicate, write to the dups file
            if current_word: