# NB: the 'jobname' default below is evaluated once, when the function is defined, not per call.
def __init__(self, jobs, jobname=datetime.now().strftime("%Y%m%d%H%M%S"), warcs=None, viral=None,
             logs=None, start_date=None, dummy_run=False, hash_cache_file=None, client=None):
    """Sets up fields."""
    if client is None:
        self.client = hdfs.InsecureClient(cfg.get('hdfs', 'url'), user=cfg.get('hdfs', 'user'))
    else:
        self.client = client
    self.dummy = dummy_run
    if self.dummy:
        logger.info("This is a dummy-run - no real ARKs will be minted.")
    self.overwrite = False
    self.jobs = jobs
    self.jobname = jobname
    self.warcs = warcs
    self.viral = viral
    self.logs = logs
    self.hash_cache = {}
    self.parse_hash_cache(hash_cache_file)
    self.startdate = start_date
    self.BAGIT_CONTACT_NAME = "Andrew N. Jackson"
    self.BAGIT_CONTACT_EMAIL = "*****@*****.**"
    self.BAGIT_DESCRIPTION = "LD Crawl: "
    self.ARK_URL = "http://pii.ad.bl.uk/pii/vdc?arks="
    self.ARK_PREFIX = "ark:/81055/vdc_100022535899.0x"
    # And create:
    logger.info("Processing job files...")
    self.processJobs()
    logger.info("Generating METS...")
    self.createMets()
def get_all_identifiers(sip):
    """Parses the SIP in HDFS and retrieves FILE/ARK tuples."""
    client = hdfs.InsecureClient(cfg.get('hdfs', 'url'), user=cfg.get('hdfs', 'user'))
    tar = "%s/%s.tar.gz" % (SIP_ROOT, sip)
    status = client.status(tar, strict=False)
    if status:
        # Catch empty packages:
        if status['length'] == 0:
            logger.warning("Empty (zero byte) SIP package: %s" % tar)
            yield None
        else:
            with client.read(tar) as reader:
                t = reader.read()
            # Open the package:
            tar = tarfile.open(mode="r:gz", fileobj=StringIO(t))
            foundMets = False
            for i in tar.getmembers():
                logger.debug("Examining %s" % i.name)
                if i.name.endswith(".xml"):
                    foundMets = True
                    xml = tar.extractfile(i).read()
                    try:
                        tree = etree.fromstring(xml)
                        files = {}
                        n_files = 0
                        for mfile in tree.xpath("//mets:file", namespaces=NS):
                            #logger.debug("Found mets:file = %s " % etree.tostring(mfile))
                            admid = mfile.attrib["ADMID"]
                            logger.info("Found mets:file admid = %s " % admid)
                            path = mfile.xpath("mets:FLocat", namespaces=NS)[0].attrib["%shref" % XLINK]
                            files[admid] = {
                                "path": path,
                                "mimetype": mfile.attrib["MIMETYPE"],
                                "size": mfile.attrib["SIZE"],
                                "checksum_type": mfile.attrib["CHECKSUMTYPE"],
                                "checksum": mfile.attrib["CHECKSUM"],
                            }
                            n_files = n_files + 1
                        if len(files.keys()) != n_files:
                            logger.error("ERROR, more files than IDs")
                        n_amdsecs = 0
                        for amdsec in tree.xpath("//mets:amdSec", namespaces=NS):
                            #logger.debug("Found mets:amdSec = %s " % etree.tostring(amdsec))
                            admid = amdsec.attrib["ID"]
                            logger.info("Found mets:amdSec id = %s " % admid)
                            oiv = amdsec.xpath("mets:digiprovMD/mets:mdWrap/mets:xmlData/premis:object/premis:objectIdentifier/premis:objectIdentifierValue", namespaces=NS)
                            if oiv and len(oiv) == 1:
                                files[admid]['ark'] = oiv[0].text
                                n_amdsecs = n_amdsecs + 1
                                logger.debug("Yielding %s" % files[admid])
                                yield files[admid]
                            else:
                                logger.info("Skipping amdSec ID=%s" % admid)
                        if n_files != n_amdsecs:
                            logger.error("ERROR finding all amdSec elements")
                    except IndexError as i:
                        logger.error("Problem parsing METS for SIP: %s" % sip)
                        logger.exception(i)
            if not foundMets:
                logger.error("No METS XML file found!")
    else:
        logger.warning("Could not find SIP: hdfs://%s" % tar)
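# A hedged usage sketch (not in the original source): iterate the dicts that
# get_all_identifiers() yields for one SIP and print the path/ARK pairs. The
# SIP name "weekly/20160101120000" is a made-up example, and the generator can
# yield None for empty packages, so that case is skipped.
for waid in get_all_identifiers("weekly/20160101120000"):
    if waid is None:
        continue
    print("%s -> %s" % (waid.get("path"), waid.get("ark", "<no ARK>")))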
def uri_of_doc(self, **kwargs):
    try:
        logger.info("Got doc to send to W3ACT for: %s" % kwargs)
        # Set up connection to W3ACT:
        w = w3act(cfg.get('act', 'url'), cfg.get('act', 'username'), cfg.get('act', 'password'))
        # And post this document up:
        send_document_to_w3act(kwargs, cfg.get('wayback', 'endpoint'), w)
    except BaseException as e:
        logger.exception(e)
        # Celery's Task.retry() takes the exception via 'exc':
        raise self.retry(countdown=10, exc=e)
def calculateHash(path):
    logger.info("Starting to generate hash for %s" % path)
    client = hdfs.InsecureClient(cfg.get('hdfs', 'url'), user=cfg.get('hdfs', 'user'))
    sha = hashlib.sha512()
    with client.read(path) as reader:
        # Read in 10 MiB chunks so large files are not pulled into memory at once:
        while True:
            data = reader.read(10485760)
            if not data:
                break
            sha.update(data)
    logger.info("Finished generating hash for %s" % path)
    return sha.hexdigest()
def find_identifiers(output_file):
    with open(output_file, 'w') as f:
        client = hdfs.InsecureClient(cfg.get('hdfs', 'url'), user=cfg.get('hdfs', 'user'))
        for (path, dirs, files) in client.walk(SIP_ROOT):
            logger.info("Looking at path " + path)
            for file in files:
                logger.info("Looking at file " + file)
                if file.endswith('.tar.gz'):
                    sip = "%s/%s" % (path, file)
                    sip = sip[len(SIP_ROOT) + 1:]
                    sip = sip[:-7]
                    logger.info("Scanning %s..." % sip)
                    for waid in get_all_identifiers(sip):
                        f.write("%s %s\n" % (sip, waid))
# NB: the 'start' default below is evaluated once, at import time, not per call.
def stop_start_job(self, frequency, start=datetime.utcnow(), restart=True):
    """
    Restarts the job for a particular frequency.
    """
    try:
        logger.info("Stopping/starting %s at %s" % (frequency, start))
        # Set up connection to W3ACT:
        w = w3act(cfg.get('act', 'url'), cfg.get('act', 'username'), cfg.get('act', 'password'))
        # Set up connection to H3:
        h = hapyx.HapyX("https://%s:%s" % (cfg.get('h3', 'host'), cfg.get('h3', 'port')),
                        username=cfg.get('h3', 'username'), password=cfg.get('h3', 'password'))
        # Stop job if currently running:
        if frequency in h.list_jobs() and h.status(frequency) != "":
            # Stop the running job, notify RabbitMQ and clean up the directory:
            launch_id = h.get_launch_id(frequency)
            job = W3actJob.from_directory(w, "%s/%s" % (HERITRIX_JOBS, frequency), heritrix=h)
            job.stop()
            remove_action_files(frequency)
            crawl.status.update_job_status.delay(job.name, "%s/%s" % (job.name, launch_id), "STOPPED")
            # Pass on to the next step in the chain:
            logger.info("Requesting assembly of output for: %s/%s" % (frequency, launch_id))
            assemble_job_output.delay(frequency, launch_id)
        else:
            job = None
        # Start job if requested:
        if restart:
            targets = w.get_ld_export(frequency)
            # logger.info("Found %s Targets in export." % len(export))
            # targets = [t for t in export if (t["startDate"] is None or t["startDate"] < start) and (t["endDateISO"] is None or t["crawlEndDateISO"] > start)]
            logger.debug("Found %s Targets in date range." % len(targets))
            job = W3actJob(w, targets, frequency, heritrix=h)
            logger.info("Starting job %s..." % job.name)
            job.start()
            launch_id = h.get_launch_id(frequency)
            crawl.status.update_job_status.delay(job.name, "%s/%s" % (job.name, launch_id), "LAUNCHED")
            logger.info("Launched job %s/%s with %s seeds." % (job.name, launch_id, len(job.seeds)))
            return "Launched job %s/%s with %s seeds." % (job.name, launch_id, len(job.seeds))
        else:
            if job:
                logger.info("Stopped job %s/%s without restarting..." % (job.name, launch_id))
                return "Stopped job %s/%s without restarting..." % (job.name, launch_id)
            else:
                logger.warning("No running '%s' job to stop!" % frequency)
                return "No running '%s' job to stop!" % frequency
    except BaseException as e:
        logger.exception(e)
        # Celery's Task.retry() takes the exception via 'exc':
        raise self.retry(countdown=10, exc=e)
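# A hedged usage sketch, not from the original source: the 'self' parameter and
# self.retry() above imply stop_start_job is registered as a bound Celery task
# (e.g. @app.task(bind=True)). If so, callers can queue it asynchronously; the
# "daily" and "weekly" frequency names below are only illustrative.
stop_start_job.delay("daily")                  # stop and relaunch the 'daily' crawl
stop_start_job.delay("weekly", restart=False)  # stop the 'weekly' crawl without restarting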
def __init__(self, date, warcs, viral, logs, identifiers, hash_cache=None, client=None):
    if client is None:
        self.client = hdfs.InsecureClient(cfg.get('hdfs', 'url'), user=cfg.get('hdfs', 'user'))
    else:
        self.client = client
    self.warcs = []
    self.viral = []
    self.date = date
    self.wq = Queue()
    self.vq = Queue()
    self.hash_cache = hash_cache
    # Build entries for the WARC files on a small pool of worker threads, then wait for the queue to drain:
    for i in range(NUM_THREADS):
        worker = Thread(target=create_warcs, args=(self.wq, self.warcs, self))
        worker.setDaemon(True)
        worker.start()
    for warc in warcs:
        self.wq.put(warc)
    self.wq.join()
    # Repeat for the 'viral' WARCs:
    for i in range(NUM_THREADS):
        worker = Thread(target=create_warcs, args=(self.vq, self.viral, self))
        worker.setDaemon(True)
        worker.start()
    for warc in viral:
        self.vq.put(warc)
    self.vq.join()
    self.logs = []
    for log in logs:
        self.logs.append(ZipContainer(path=log, parent=self))
    self.identifiers = identifiers
    self.createDomainMets()
    self.createCrawlerMets()
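# create_warcs(), used as the Thread target above, is not shown in this section.
# A minimal sketch of what such a queue worker could look like, assuming it wraps
# each HDFS path using the calculateHash()/getLength() helpers shown elsewhere in
# this section and appends a simple dict (the real code presumably builds richer
# objects); an illustration only, not the original implementation:
def create_warcs(q, warcs, parent):
    while True:
        path = q.get()
        try:
            sha = parent.hash_cache.get(path) if parent.hash_cache else calculateHash(path)
            warcs.append({"path": path, "size": getLength(path), "hash": sha})
        except Exception as e:
            logger.exception(e)
        finally:
            q.task_done()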
def uri_to_index(self, **kwargs):
    try:
        logger.debug("Got URI to index: %s" % kwargs)
        send_uri_to_tinycdxserver(cfg.get('tinycdxserver', 'endpoint'), kwargs)
    except BaseException as e:
        logger.exception(e)
        # Celery's Task.retry() takes the exception via 'exc':
        raise self.retry(countdown=10, exc=e)
def __init__(self, job_id, launch_id):
    """Takes the checkpoint info and sets up data needed to build the SIP."""
    self.hdfs = hdfs.InsecureClient(cfg.get('hdfs', 'url'), user=cfg.get('hdfs', 'user'))
    # Set up paths:
    self.WARC_ROOT = "%s/output/warcs" % HERITRIX_HDFS_ROOT
    self.VIRAL_ROOT = "%s/output/viral" % HERITRIX_HDFS_ROOT
    self.IMAGE_ROOT = "%s/output/images" % HERITRIX_HDFS_ROOT
    self.LOG_ROOT = "%s/output/logs" % HERITRIX_HDFS_ROOT
    self.LOCAL_LOG_ROOT = "%s/output/logs" % HERITRIX_ROOT
    self.LOCAL_JOBS_ROOT = "%s/jobs" % HERITRIX_ROOT
    #
    self.job_id = job_id
    self.launch_id = launch_id
    self.job_launch_id = "%s/%s" % (job_id, launch_id)
    self.verify_job_launch_id()
    self.crawl_log = self.get_crawl_log()
    self.start_date = CrawlJobOutput.file_start_date([self.crawl_log])
    # Find the WARCs referenced from the crawl log:
    self.parse_crawl_log()
    # TODO Get sha512 and ARK identifiers for WARCs now, and store in launch folder and thus the zip?
    # Bundle logs and configuration data into a zip and upload it to HDFS
    self.upload_logs_as_zip()
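# upload_logs_as_zip() is referenced above but not shown. A rough sketch of one
# way it could work, assuming it zips everything under LOCAL_LOG_ROOT/<job>/<launch>
# and pushes the bundle to LOG_ROOT on HDFS via the python 'hdfs' client's upload();
# the directory layout and file naming here are guesses, not the original code:
import os
import zipfile

def upload_logs_as_zip(self):
    local_dir = "%s/%s" % (self.LOCAL_LOG_ROOT, self.job_launch_id)
    zip_path = "%s/%s.zip" % (local_dir, self.launch_id)
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        for root, dirs, files in os.walk(local_dir):
            for name in files:
                full = os.path.join(root, name)
                if full == zip_path:
                    continue  # don't add the zip to itself
                zf.write(full, arcname=os.path.relpath(full, local_dir))
    # Push the bundle up to HDFS alongside the other crawl outputs:
    self.hdfs.upload("%s/%s/%s.zip" % (self.LOG_ROOT, self.job_launch_id, self.launch_id),
                     zip_path)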
from __future__ import absolute_import

import os
import bagit
import tarfile
import hdfs
import shutil

# import the Celery app context
from crawl.celery import app
from crawl.celery import cfg

# Set up drop/watched folder configuration
DLS_DROP = cfg.get('dls', 'drop_folder')
DLS_WATCH = cfg.get('dls', 'watch_folder')

# import the Celery log getter and use it
from celery.utils.log import get_task_logger
logger = get_task_logger(__name__)


#
class SubmitSip():

    def __init__(self, job_id, launch_id, sip_tgz):
        self.hdfs = hdfs.InsecureClient(cfg.get('hdfs', 'url'), user=cfg.get('hdfs', 'user'))
        self.submit_sip(launch_id, sip_tgz)

    def submit_sip(self, job_id, sip_tgz):
        """
        Download, unpack, check and submit the specified SIP tar.gz file (from HDFS)
        """
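# The body of submit_sip() is truncated in the snippet above. Purely as an
# illustrative sketch of the flow its imports (hdfs, tarfile, bagit, shutil) and
# the DLS_DROP/DLS_WATCH settings suggest - download the package from HDFS,
# unpack it, verify the bag, then hand it to DLS via the drop folder - and not
# the original implementation; names and paths here are assumptions:
def _submit_sip_sketch(client, sip_tgz, work_dir="/tmp/sip-submit"):
    local_tgz = os.path.join(work_dir, os.path.basename(sip_tgz))
    if not os.path.isdir(work_dir):
        os.makedirs(work_dir)
    client.download(sip_tgz, local_tgz)
    with tarfile.open(local_tgz, "r:gz") as tar:
        tar.extractall(work_dir)
    bag_dir = local_tgz[:-len(".tar.gz")]  # assumes the bag unpacks beside the archive
    if not bagit.Bag(bag_dir).is_valid():
        raise Exception("Invalid BagIt bag: %s" % bag_dir)
    # Hand the verified package over to DLS via the drop folder:
    shutil.copy(local_tgz, DLS_DROP)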
def getLength(path):
    client = hdfs.InsecureClient(cfg.get('hdfs', 'url'), user=cfg.get('hdfs', 'user'))
    status = client.status(path)
    return status['length']