Example #1
def _hdfs_initialize():
    global _hdfs_client
    if _hdfs_client is None:
        _hdfs_client = hdfs.InsecureClient(_hdfs_url,
                                           root=_hdfs_root,
                                           user=_hdfs_user_name)
        _hdfs_client.set_permission(_hdfs_root, 777)
Example #2
    def hdfs_client(self):
        import hdfs

        logger.debug("HDFSConfig: %s", self.hdfscli_config)

        try:
            return hdfs.config.Config(self.hdfscli_config).get_client(
                self.alias)
        except hdfs.util.HdfsError as exc:
            exc_msg = str(exc)
            errors = (
                "No alias specified",
                "Invalid configuration file",
                f"Alias {self.alias} not found",
            )
            if not any(err in exc_msg for err in errors):
                raise

            http_url = f"http://{self.host}:{self.port}"
            logger.debug("URL: %s", http_url)

            if self.token is not None:
                client = hdfs.TokenClient(http_url, token=self.token, root="/")
            else:
                client = hdfs.InsecureClient(http_url,
                                             user=self.user,
                                             root="/")

        return client
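
For reference, the alias lookup above reads an hdfscli configuration file (by default ~/.hdfscli.cfg). A minimal sketch of what that file and the lookup might look like; the alias name, URL, and user below are assumptions:

# Hypothetical ~/.hdfscli.cfg contents (alias name, URL, and user are assumptions):
#
#   [global]
#   default.alias = dev
#
#   [dev.alias]
#   url = http://namenode.example.com:50070
#   user = hdfs

import hdfs

# With no explicit path, Config() falls back to the default config location.
client = hdfs.config.Config().get_client('dev')
print(client.list('/'))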
Example #3
    def __init__(self, host=None, port=None, user=None):
        host = self.get_config('namenode_host') if host is None else host
        port = self.get_config('namenode_port') if port is None else port
        user = self.get_config('user') if user is None else user

        url = 'http://' + host + ':' + port
        self.webhdfs = webhdfs.InsecureClient(url=url, user=user)
Example #4
    def __init__(self, config):
        self.paths = config['paths']

        self.mode = int(config['mode'], 8)
        self.user = config['user']
        self.group = config['group']

        # FIXME: Handle keeping rotated files correctly
        # self.keep_files = int(config['rotate'])
        self.compress = config['compress']

        self.copy = config['copy']
        self.copytohdfs = config['copytohdfs']
        self.hdfs_config = config['hdfs']
        self.hdfs_client = None
        if self.hdfs_config:
            self.hdfs_client = hdfs.InsecureClient(**self.hdfs_config)

        self.dateformat = config['dateformat']
        self.now = datetime.datetime.now()
        self.timestamp = self.now.strftime(self.dateformat)
        self.destext = config['destext']

        self.fnformat = config['fnformat']
        if not self.fnformat:
            raise ValueError("'fnformat' cannot be empty")

        self.sharedscripts = config['sharedscripts']
        self.prerotates = config['prerotate']
        self.postrotates = config['postrotate']

        self.queuepath = config['queuepath']
        self.queue_chunksize = 1000
        self.queue_block_timeout = 30
        self.queue = Queue(self.queuepath, self.queue_chunksize)
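
Because the client is created with hdfs.InsecureClient(**self.hdfs_config), the 'hdfs' section of the config has to map onto that constructor's keyword arguments. A minimal sketch of such a dict; the values are assumptions:

import hdfs

# Keys mirror hdfs.InsecureClient's parameters; the values are illustrative.
hdfs_config = {
    'url': 'http://namenode.example.com:50070',  # WebHDFS endpoint
    'user': 'logrotate',                         # user name sent with requests
    'root': '/var/log/rotated',                  # optional prefix for relative paths
}

client = hdfs.InsecureClient(**hdfs_config)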
Example #5
def lec(adresseHdfs, repDataHdfs):
    schema = {
        "namespace": "ffo.hashtag",
        "type": "record",
        "name": "Node",
        "fields": [
            {"name": "datehashtag", "type": "string"},
            {"name": "timestamp", "type": "int"},
            {
                "name": "hashtags",
                "type": {"type": "array", "items": "string"},
                "default": [],
            },
        ],
    }

    # The client only needs the WebHDFS address; fastavro reads the schema
    # from the Avro file itself.
    hdfs_client = hdfs.InsecureClient(adresseHdfs)
    with hdfs_client.read(repDataHdfs) as of:
        reader = fastavro.reader(of)
        for node in reader:
            print(node)
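
Writing records that match this schema back to HDFS follows the same pattern in reverse: client.write() also works as a context manager, and fastavro can serialize into it. A minimal sketch, assuming an illustrative WebHDFS address, output path, and sample record (none of these come from the example above):

import fastavro
import hdfs

adresse_hdfs = "http://namenode.example.com:50070"  # assumed WebHDFS address
hdfs_client = hdfs.InsecureClient(adresse_hdfs)

parsed = fastavro.parse_schema(schema)  # the schema dict defined in lec() above
records = [
    {"datehashtag": "2023-01-01", "timestamp": 1672531200, "hashtags": ["#hdfs"]},
]

# overwrite=True replaces any existing file at the destination path.
with hdfs_client.write("/ffo/hashtags.avro", overwrite=True) as out:
    fastavro.writer(out, parsed, records)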
Example #6
 def __init__( self, jobs, jobname=datetime.now().strftime( "%Y%m%d%H%M%S" ), warcs=None, viral=None, logs=None, start_date=None, dummy_run=False, hash_cache_file=None, client=None ):
     """Sets up fields."""
     if client is None:
         self.client = hdfs.InsecureClient(cfg.get('hdfs','url'), user=cfg.get('hdfs','user'))
     else:
         self.client = client
     self.dummy = dummy_run
     if self.dummy:
         logger.info("This is a dummy-run - no real ARKs will be minted.")
     self.overwrite = False
     self.jobs = jobs
     self.jobname = jobname
     self.warcs = warcs
     self.viral = viral
     self.logs = logs
     self.hash_cache = {}
     self.parse_hash_cache(hash_cache_file)
     self.startdate = start_date
     self.BAGIT_CONTACT_NAME="Andrew N. Jackson"
     self.BAGIT_CONTACT_EMAIL="*****@*****.**"
     self.BAGIT_DESCRIPTION="LD Crawl: "
     self.ARK_URL="http://pii.ad.bl.uk/pii/vdc?arks="
     self.ARK_PREFIX="ark:/81055/vdc_100022535899.0x"
     # And create:
     logger.info("Processing job files...")
     self.processJobs()
     logger.info("Generating METS...")
     self.createMets()
Example #7
 def __init__(self, queue, hdfs_base_url, path):
     self.queue = queue
     self.client = hdfs.InsecureClient(hdfs_base_url)
     self.base_path = path
     self.thread = threading.Thread(target=self.run, args=())
     self.thread.daemon = True
     self.thread.start()
Example #8
    def __init__(self, pipeline_options):
        """Initializes a connection to HDFS.

    Connection configuration is done by passing pipeline options.
    See :class:`~apache_beam.options.pipeline_options.HadoopFileSystemOptions`.
    """
        super(HadoopFileSystem, self).__init__(pipeline_options)
        logging.getLogger('hdfs.client').setLevel(logging.WARN)
        if pipeline_options is None:
            raise ValueError('pipeline_options is not set')
        if isinstance(pipeline_options, PipelineOptions):
            hdfs_options = pipeline_options.view_as(HadoopFileSystemOptions)
            hdfs_host = hdfs_options.hdfs_host
            hdfs_port = hdfs_options.hdfs_port
            hdfs_user = hdfs_options.hdfs_user
        else:
            hdfs_host = pipeline_options.get('hdfs_host')
            hdfs_port = pipeline_options.get('hdfs_port')
            hdfs_user = pipeline_options.get('hdfs_user')

        if hdfs_host is None:
            raise ValueError('hdfs_host is not set')
        if hdfs_port is None:
            raise ValueError('hdfs_port is not set')
        if hdfs_user is None:
            raise ValueError('hdfs_user is not set')
        self._hdfs_client = hdfs.InsecureClient('http://%s:%s' %
                                                (hdfs_host, str(hdfs_port)),
                                                user=hdfs_user)
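
Since the constructor reads hdfs_host, hdfs_port, and hdfs_user from HadoopFileSystemOptions, a caller can supply them as ordinary pipeline flags; the plain-dict branch above accepts the same three keys. A minimal sketch, with assumed host, port, and user values:

from apache_beam.options.pipeline_options import PipelineOptions

# Flag names come from HadoopFileSystemOptions; the values are assumptions.
options = PipelineOptions([
    '--hdfs_host=namenode.example.com',
    '--hdfs_port=50070',
    '--hdfs_user=beam',
])
fs = HadoopFileSystem(options)  # the class whose __init__ is shown above

# Equivalently, per the else-branch above, a plain dict with the same keys works:
fs = HadoopFileSystem({
    'hdfs_host': 'namenode.example.com',
    'hdfs_port': 50070,
    'hdfs_user': 'beam',
})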
Example #9
 def __init__(self):
     # For WebHDFS reads and writes (leaving Kerberos aside), /etc/hosts must be configured with the same IP mappings as the Hadoop cluster
     self.HDFSClient = hdfs.InsecureClient(WebHDFSAddr,
                                           root='/',
                                           user="******")
     self.localInputPathTemp = localInputPathTemp
     self.localOutputPathTemp = localOutputPathTemp
     self.NginxAddr = NginxAddr
Example #10
 def client(self):
     # A naive benchmark showed that 1000 existence checks took 2.5 secs
     # when not recreating the client, and 4.0 secs when recreating it. So
     # not urgent to memoize it. Note that it *might* be issues with process
     # forking and whatnot (as the one in the snakebite client) if we
     # memoize it too trivially.
     import hdfs
     return hdfs.InsecureClient(url=self.url, user=self.user)
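
If recreating the client ever became a bottleneck, the fork caveat in the comment can be respected by memoizing per process id instead of caching naively. A hedged sketch of that variant (the attribute names are my own):

import os

def client(self):
    import hdfs
    # Cache one client per process: after a fork the pid changes, so the
    # child rebuilds its own client instead of reusing the parent's sockets.
    pid = os.getpid()
    if getattr(self, '_hdfs_client_pid', None) != pid:
        self._hdfs_client = hdfs.InsecureClient(url=self.url, user=self.user)
        self._hdfs_client_pid = pid
    return self._hdfs_client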
Example #11
def get_all_identifiers(sip):
    """Parses the SIP in HDFS and retrieves FILE/ARK tuples."""
#    client = hdfs.InsecureClient(cfg.get('hdfs', 'url'), user=cfg.get('hdfs', 'user'))
    client = hdfs.InsecureClient(HDFS_URL, HDFS_USER)
    tar = "%s/%s.tar.gz" % (SIP_ROOT, sip)
    status = client.status(tar,strict=False)
    if status:
        # Catch empty packages:
        if status['length'] == 0:
            logger.warning("Empty (zero byte) SIP package: %s" % tar)
            yield None
        else:
            with client.read(tar) as reader:
                t = reader.read()
                # Open the package:
                tar = tarfile.open(mode="r:gz", fileobj=StringIO(t))
                foundMets = False
                for i in tar.getmembers():
                    logger.debug("Examining %s" % i.name)
                    if i.name.endswith(".xml"):
                        foundMets = True
                        xml = tar.extractfile(i).read()
                        try:
                            tree = etree.fromstring(xml)
                            files = {}
                            n_files = 0
                            for mfile in tree.xpath("//mets:file", namespaces=NS):
                                #logger.debug("Found mets:file = %s " % etree.tostring(mfile))
                                admid = mfile.attrib["ADMID"]
                                logger.info("Found mets:file admid = %s " % admid)
                                path = mfile.xpath("mets:FLocat", namespaces=NS)[0].attrib["%shref" % XLINK]
                                files[admid] = { "path": path, "mimetype": mfile.attrib["MIMETYPE"], "size": mfile.attrib["SIZE"],
                                        "checksum_type": mfile.attrib["CHECKSUMTYPE"], "checksum": mfile.attrib["CHECKSUM"] }
                                n_files = n_files + 1
                            if len(files.keys()) != n_files:
                                logger.error("ERROR, more files than IDs")
                            n_amdsecs = 0
                            for amdsec in tree.xpath("//mets:amdSec", namespaces=NS):
                                #logger.debug("Found mets:amdSec = %s " % etree.tostring(amdsec))
                                admid = amdsec.attrib["ID"]
                                logger.info("Found mets:amdSec id = %s " % admid)
                                oiv = amdsec.xpath("mets:digiprovMD/mets:mdWrap/mets:xmlData/premis:object/premis:objectIdentifier/premis:objectIdentifierValue", namespaces=NS)
                                if oiv and len(oiv) == 1:
                                    files[admid]['ark'] = oiv[0].text
                                    n_amdsecs = n_amdsecs + 1
                                    logger.debug("Yielding %s" % files[admid] )
                                    yield files[admid]
                                else:
                                    logger.info("Skipping amdSec ID=%s" % admid)
                            if n_files != n_amdsecs:
                                logger.error("ERROR finding all amdSec elements")
                        except IndexError as i:
                            logger.error("Problem parsing METS for SIP: %s" % sip)
                            logger.exception(i)
                if not foundMets:
                    logger.error("No METS XML file found!")
    else:
        logger.warning("Could not find SIP: hdfs://%s" % tar)
Example #12
 def __get_connection(self):
     try:
         self.__hdfs_client = hdfs.InsecureClient(
             "http://" + self.__active_nn + ":" + self.__port,
             user=self.__hdfs_user)
         logger.debug("established connection with hdfs!")
     except Exception as error:
         logger.error("cannot establish connection with hdfs!")
         raise error
Example #13
def main(number,workers):
    spark = SparkSession\
        .builder\
        .appName("Locomotion-Extract-TDMS")\
        .getOrCreate()
    sc = spark.sparkContext
    hdfs.InsecureClient(SERVER).makedirs('/generated/output'+number)
    files = glob.glob('generated/input'+number+'/**/*.tdms')
    g = sc.parallelize(files, workers).map(lambda f : transform_and_store(f,number)).collect()
    spark.stop()
Example #14
def do_hdfs_upload(myid, url, safe_url, keyhash, fname, fdata, ftag):
    # enforce the file name to be prefixed with the public hash or the sealed prefix

    if (not fname.startswith("/" + keyhash)
            and not fname.startswith(sealed_prefix)):
        fname = "/%s/%s" % (keyhash, fname)

    # create hdfs client on demand now. No need to optimize for PoC
    hdfs_client = hdfs.InsecureClient(url, user=ftag)
    return hdfs_client.write(fname, fdata, overwrite=True, permission=666)
Example #15
def do_hdfs_upload(url, keyhash, fname, fdata, ftag):
    # enforce the file name to be prefixed with the public hash or the sealed prefix
    if (not fname.startswith("/" + keyhash)
            and not fname.startswith(sealed_prefix)):
        fname = "/%s/%s" % (keyhash, fname)
    dirname = os.path.dirname(fname)
    prepare_user_dir(url, dirname, keyhash)

    # create hdfs client on demand now. No need to optimize for PoC
    hdfs_client = hdfs.InsecureClient(url, user=ftag)
    hdfs_client.write(fname, fdata, overwrite=True, permission=666)
    return "file written"
Example #16
    def create(self):
        """
        Creates webhdfs client instance.
        Concrete implementation depends on a client_type parameter,
        if it's kerberos, then KerberosClient is created, otherwise InsecureClient.

        :return hdfs client:
        """
        if self.client_type == 'kerberos':
            from hdfs.ext.kerberos import KerberosClient
            return KerberosClient(url=self.url)
        else:
            return hdfs.InsecureClient(url=self.url, user=self.user)
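
A self-contained sketch of how such a factory might be held and used; the class name, constructor, and values below are assumptions, and only the branching mirrors the snippet above:

import hdfs

class WebHdfsClientFactory:
    """Hypothetical holder for the url, user, and client_type the method reads."""

    def __init__(self, url, user=None, client_type='insecure'):
        self.url = url
        self.user = user
        self.client_type = client_type

    def create(self):
        # Same branching as above: Kerberos extension client or insecure client.
        if self.client_type == 'kerberos':
            from hdfs.ext.kerberos import KerberosClient
            return KerberosClient(url=self.url)
        return hdfs.InsecureClient(url=self.url, user=self.user)

# Values are assumptions:
client = WebHdfsClientFactory('http://namenode.example.com:50070', user='hdfs').create()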
Example #17
    def client_and_path(path):
        _, domain, folder_path, file_pattern = parse_file_uri(path)

        if ':' not in domain:
            port = 50070
        else:
            domain, port = domain.split(':')
            port = int(port)
        cache_id = domain + '__' + str(port)

        if cache_id not in Hdfs._conn:
            Hdfs._conn[cache_id] = hdfs.InsecureClient(  # pylint: disable=no-member
                'http://{0}:{1}'.format(domain, port))
        return Hdfs._conn[cache_id], folder_path + file_pattern
Example #18
def find_identifiers(output_file):
    with open(output_file, 'w') as f:
        client = hdfs.InsecureClient(cfg.get('hdfs', 'url'), user=cfg.get('hdfs', 'user'))
        for (path, dirs, files) in client.walk(SIP_ROOT):
            logger.info("Looking at path "+path)
            for file in files:
                logger.info("Looking at file " + file)
                if file.endswith('.tar.gz'):
                    sip = "%s/%s" % (path, file)
                    sip = sip[len(SIP_ROOT) + 1:]
                    sip = sip[:-7]
                    logger.info("Scanning %s..." % sip)
                    for waid in get_all_identifiers(sip):
                        f.write("%s %s\n" % (sip, waid) )
Example #19
def calculateHash(path):
    logger.info("Starting to generate hash for %s" % path)
    client = hdfs.InsecureClient(cfg.get('hdfs', 'url'),
                                 user=cfg.get('hdfs', 'user'))
    sha = hashlib.sha512()
    with client.read(path) as file:
        while True:
            data = file.read(10485760)
            if not data:
                file.close()
                break
            sha.update(data)
    logger.info("Finished generating hash for %s" % path)
    return sha.hexdigest()
Example #20
def calculateHash(path, client=None):
    if client is None:
        client = hdfs.InsecureClient('http://hdfs.gtw.wa.bl.uk:14000',
                                     user='******')
    logger.info("Starting to generate hash for %s" % path)
    sha = hashlib.sha512()
    with client.read(path) as file:
        while True:
            data = file.read(10485760)
            if not data:
                file.close()
                break
            sha.update(data)
    logger.info("Finished generating hash for %s" % path)
    return sha.hexdigest()
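
The manual read loop above can also lean on the client itself: HdfsCLI's read() accepts a chunk_size argument, in which case the context manager yields byte chunks instead of a file object. A hedged equivalent of the two hashing helpers, assuming a client built as in the examples above:

import hashlib

def calculate_hash_chunked(path, client, chunk_size=10485760):
    """sha512 of an HDFS file, letting the client do the 10 MB chunking."""
    sha = hashlib.sha512()
    with client.read(path, chunk_size=chunk_size) as chunks:
        for chunk in chunks:
            sha.update(chunk)
    return sha.hexdigest()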
Example #21
    def __init__(self,
                 date,
                 warcs,
                 viral,
                 logs,
                 identifiers,
                 hash_cache=None,
                 client=None):
        if client is None:
            self.client = hdfs.InsecureClient(cfg.get('hdfs', 'url'),
                                              user=cfg.get('hdfs', 'user'))
        else:
            self.client = client
        self.warcs = []
        self.viral = []
        self.date = date
        self.wq = Queue()
        self.vq = Queue()
        self.hash_cache = hash_cache

        for i in range(NUM_THREADS):
            worker = Thread(target=create_warcs,
                            args=(self.wq, self.warcs, self))
            worker.setDaemon(True)
            worker.start()

        for warc in warcs:
            self.wq.put(warc)
        self.wq.join()

        for i in range(NUM_THREADS):
            worker = Thread(target=create_warcs,
                            args=(self.vq, self.viral, self))
            worker.setDaemon(True)
            worker.start()

        for warc in viral:
            self.vq.put(warc)
        self.vq.join()

        self.logs = []
        for log in logs:
            self.logs.append(ZipContainer(path=log, parent=self))
        self.identifiers = identifiers
        self.createDomainMets()
        self.createCrawlerMets()
Example #22
    def client_and_path(path):
        _, domain, folder_path, file_pattern = parse_file_uri(path)

        if ':' not in domain:
            port = 50070
        else:
            domain, port = domain.split(':')
            port = int(port)
        cache_id = domain + '__' + str(port)

        if cache_id not in Hdfs._conn:
            if hdfs is None:
                raise FileSystemNotSupported(
                    'hdfs not supported. Install the python package "hdfs".')
            Hdfs._conn[cache_id] = hdfs.InsecureClient(  # pylint: disable=no-member
                f'http://{domain}:{port}')
        return Hdfs._conn[cache_id], folder_path + file_pattern
Example #23
    def client_and_path(path):
        # obtain key
        t = Tokenizer(path)
        t.next('://')  # skip scheme
        domain = t.next('/')
        path = t.next()

        if ':' not in domain:
            port = 50070
        else:
            domain, port = domain.split(':')
            port = int(port)
        cache_id = domain + '__' + str(port)

        if cache_id not in Hdfs._conn:
            Hdfs._conn[cache_id] = hdfs.InsecureClient('http://{0}:{1}'.format(
                domain, port))
        return (Hdfs._conn[cache_id], '/' + path)
Example #24
def main():
    setup_logging()

    parser = argparse.ArgumentParser( description="Create METS files." )
    parser.add_argument( "jobs", metavar="J", type=str, nargs="+", help="Heritrix job name" )
    parser.add_argument( "-d", dest="dummy", action="store_true", help="Do a dummy run, e.g. don't mint real ARKs.")
    parser.add_argument( "-w", dest="warcs", help="File containing list of WARC paths." )
    parser.add_argument( "-v", dest="viral", help="File containing list of viral WARC paths." )
    parser.add_argument( "-l", dest="logs", help="File containing list of log paths." )
    parser.add_argument( "-o", dest="output_root", help="Where to put the resulting SIP" )
    parser.add_argument( "-I", dest="hash_cache", help="File containing a hash look-up table.")
    parser.add_argument( "-D", dest="start_date", help="Start date, in '2015-08-27T14:39:19.000000' format (can be truncated).")
    args = parser.parse_args()

    client = hdfs.InsecureClient("http://hdfs.gtw.wa.bl.uk:14000/", user="******")

    sip = SipCreator( args.jobs, start_date=args.start_date, warcs=args.warcs, viral=args.viral, logs=args.logs, dummy_run=args.dummy, hash_cache_file=args.hash_cache, client=client )
    sip_dir = "%s/%s" % ( args.output_root, sip.jobname )
    sip.create_sip(sip_dir)
    sip.copy_sip_to_hdfs(sip_dir, sip_dir)
Example #25
 def __init__(self):
     #Kaizen
     access_key = "4c3da79d02bb4a2e8f04495bff5203b2"
     secret_key = "b7bd5b4abcd34ca8a94e93e8b76527f4"
     s3a_endpoint_url = "https://kzn-swift.massopen.cloud"
     is_secure = False
     self.s3client = boto3.client(service_name='s3',
                                  aws_access_key_id=access_key,
                                  aws_secret_access_key=secret_key,
                                  endpoint_url=s3a_endpoint_url)
     self.s3conn = boto.connect_s3(
         aws_access_key_id=access_key,
         aws_secret_access_key=secret_key,
         host=s3a_endpoint_url,
         calling_format=boto.s3.connection.OrdinaryCallingFormat())
     hdfs_endpoint_url = 'http://kariz-1:50070'
     hdfs_user = '******'
     self.hdfsclient = hdfs.InsecureClient(hdfs_endpoint_url,
                                           user=hdfs_user)
     self.tpch_metadata, self.tpcds_metadata = inputs.prepare_tpc_metadata()
     self.tpch_runtime, self.tpcds_runtime = inputs.prepare_tpc_runtimes()
Example #26
def hdfs_usage(request, root=None):
    host = settings.HDFS_STROAGE['hosts']
    if not root:
        root = settings.HDFS_STROAGE['HDFS_ROOT']

    log.info('http://%s/%s', host, root)
    client = hdfs.InsecureClient('http://%s' % host)
    tree = []
    try:
        for appcode in client.list(root):
            url = 'http://%s/webhdfs/v1%s/%s?op=GETCONTENTSUMMARY' % (host, root, appcode)
            data = requests.get(url).json()
            if data['ContentSummary']['directoryCount']:
                tree.append({
                    'name': appcode,
                    'value': round(data['ContentSummary']['length'] / 1000000.0, 2)
                })
    except HdfsError as e:
        log.warn("hdfs_usage error: %s", e)
        return HttpResponseNotFound(dumps(tree), content_type='application/json')
    return HttpResponse(dumps(tree), content_type='application/json')
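
The same content summaries are available without building WebHDFS URLs by hand: HdfsCLI exposes them through Client.content(). A hedged sketch of the loop above rewritten that way, with host and root taken from settings as in the view:

import hdfs

def hdfs_usage_sizes(host, root):
    """Per-appcode sizes in MB, mirroring the view above via Client.content()."""
    client = hdfs.InsecureClient('http://%s' % host)
    tree = []
    for appcode in client.list(root):
        # content() issues the same GETCONTENTSUMMARY request built manually above.
        summary = client.content('%s/%s' % (root, appcode))
        if summary['directoryCount']:
            tree.append({
                'name': appcode,
                'value': round(summary['length'] / 1000000.0, 2),
            })
    return tree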
Example #27
def is_hdfs_running(host, admin_user):
    """Confirm that HDFS is available.

    There is a pathological case where the HBase master can start up briefly if HDFS is not
    available, and then quit immediately, but that can be long enough to give a false positive
    that the HBase master is running.

    Args:
        host: HDFS host:port
        admin_user: Admin username

    Returns:
        Boolean
    """
    try:
        hdfs_client = hdfs.InsecureClient('http://' + host, user=admin_user)
        LOGGER.info("Contents of HDFS root: {0}".format(hdfs_client.list('/')))
        return True
    except (requests.exceptions.ConnectionError, hdfs.util.HdfsError) as e:
        msg = 'Could not confirm HDFS is running at http://{0} - {1}'.format(host, e)
        LOGGER.error(msg)
        return False
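
A short usage sketch of the check above; the namenode address and admin user are assumptions:

import sys

if not is_hdfs_running('namenode.example.com:50070', 'hdfs'):
    sys.exit('HDFS is not reachable over WebHDFS; aborting HBase checks.')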
Example #28
 def __init__(self, config):
     self.config = config
     self.dateformat = config['dateformat']
     self.keep_files = int(config['rotate'])
     self.now = datetime.datetime.now()
     self.dateext = self.now.strftime(self.dateformat)
     self.mode = config['mode']
     self.compress = config['compress']
     self.user = config['user']
     self.group = config['group']
     self.sharedscripts = config['sharedscripts']
     self.destext = config['destext']
     self.copy = config['copy']
     self.copytohdfs = config['copytohdfs']
     self.prerotates = config['prerotate']
     self.postrotates = config['postrotate']
     self.hdfs_config = config['hdfs']
     self.queuepath = config['queuepath']
     self.queue_chunksize = 1000
     self.queue_block_timeout = 30
     self.queue = Queue(self.queuepath, self.queue_chunksize)
     self.client = None
     if self.hdfs_config:
         self.client = hdfs.InsecureClient(**self.hdfs_config)
Example #29
    def __init__(self, job_id, launch_id):
        """Takes the checkpoint info and sets up data needed to build the SIP."""
        self.hdfs = hdfs.InsecureClient(cfg.get('hdfs', 'url'),
                                        user=cfg.get('hdfs', 'user'))
        # Set up paths:
        self.WARC_ROOT = "%s/output/warcs" % HERITRIX_HDFS_ROOT
        self.VIRAL_ROOT = "%s/output/viral" % HERITRIX_HDFS_ROOT
        self.IMAGE_ROOT = "%s/output/images" % HERITRIX_HDFS_ROOT
        self.LOG_ROOT = "%s/output/logs" % HERITRIX_HDFS_ROOT
        self.LOCAL_LOG_ROOT = "%s/output/logs" % HERITRIX_ROOT
        self.LOCAL_JOBS_ROOT = "%s/jobs" % HERITRIX_ROOT

        #
        self.job_id = job_id
        self.launch_id = launch_id
        self.job_launch_id = "%s/%s" % (job_id, launch_id)
        self.verify_job_launch_id()
        self.crawl_log = self.get_crawl_log()
        self.start_date = CrawlJobOutput.file_start_date([self.crawl_log])
        # Find the WARCs referenced from the crawl log:
        self.parse_crawl_log()
        # TODO Get sha512 and ARK identifiers for WARCs now, and store in launch folder and thus the zip?
        # Bundle logs and configuration data into a zip and upload it to HDFS
        self.upload_logs_as_zip()
Example #30
#!/usr/bin/python

import sys
import hdfs

current_word = None
current_count = 0
word = None
unique_count = 0
duplicate_count = 0

client = hdfs.InsecureClient("http://localhost:50070", user="******")
# Use uniqs.txt and dups.txt to output corresponding tweets
with client.write('uniqs.txt', encoding='utf-8') as uniques, \
        client.write('dups.txt', encoding='utf-8') as duplicates:
    for line in sys.stdin:
        line = line.strip()
        # Get the word and the count
        word, count = line.split('\t', 1)

        try:
            count = int(count)
        except ValueError:
            continue

        # If the current word is this word increment counter with its value
        if current_word == word:
            current_count += count
        else:
            # if unique write to uniq file if dup write to dup file
            if current_word: