def run(self):
    """Post a harvested document to W3ACT and record the outcome.

    Runs the metadata extractor (DocumentMDEx) over the document, which may
    reject it outright.  Accepted documents are POSTed to W3ACT; a non-200
    response is fatal.  Either way, the resulting document JSON is written to
    this task's output target, and a Monitrix-recording task is yielded when
    an Elasticsearch host is configured (this method is a generator, per
    Luigi's dynamic-dependency convention).
    """
    # Look up the Target and extract any additional metadata:
    targets = json.load(self.input()['targets'].open('r'))
    doc = DocumentMDEx(targets, self.doc.get_wrapped().copy(), self.source).mdex()
    # Documents may be rejected at this point:
    if doc is None:
        logger.critical("The document %s has been REJECTED!" % self.doc['document_url'])
        # Fall back to the original (unenriched) document record:
        doc = self.doc.get_wrapped().copy()
        doc['status'] = 'REJECTED'
    else:
        # Inform W3ACT it's available:
        doc['status'] = 'ACCEPTED'
        logger.debug("Sending doc: %s" % doc)
        w = w3act(act().url, act().username, act().password)
        r = w.post_document(doc)
        # NOTE(review): only 200 is treated as success - confirm W3ACT never
        # replies 201/204 for document creation.
        if r.status_code == 200:
            logger.info("Document POSTed to W3ACT: %s" % doc['document_url'])
        else:
            logger.error("Failed with %s %s\n%s" % (r.status_code, r.reason, r.text))
            raise Exception("Failed with %s %s\n%s" % (r.status_code, r.reason, r.text))
    #yield AvailableInWayback(doc['document_url'], doc['wayback_timestamp'], check_available=True)
    # And write out to the status file:
    with self.output().open('w') as out_file:
        out_file.write('{}'.format(json.dumps(doc, indent=4)))
    # Also post to Monitrix if configured to do so:
    if systems().elasticsearch_host:
        yield RecordDocumentInMonitrix(self.job, self.launch_id, doc, self.source)
def run(self):
    """Enrich a harvested document via DocumentMDEx, register it with W3ACT,
    and record the outcome in the status file (and Monitrix, if configured).

    This is a generator: it may yield a follow-up Luigi task at the end.
    """
    # Look up the Target and extract any additional metadata:
    targets = json.load(self.input()['targets'].open('r'))
    doc = DocumentMDEx(targets, self.doc.get_wrapped().copy(), self.source).mdex()
    if doc is not None:
        # Inform W3ACT the document is available:
        doc['status'] = 'ACCEPTED'
        logger.debug("Sending doc: %s" % doc)
        w = w3act(act().url, act().username, act().password)
        r = w.post_document(doc)
        if r.status_code == 200:
            logger.info("Document POSTed to W3ACT: %s" % doc['document_url'])
        else:
            msg = "Failed with %s %s\n%s" % (r.status_code, r.reason, r.text)
            logger.error(msg)
            raise Exception(msg)
    else:
        # The extractor rejected this document; keep the unenriched record:
        logger.critical("The document %s has been REJECTED!" % self.doc['document_url'])
        doc = self.doc.get_wrapped().copy()
        doc['status'] = 'REJECTED'
    #yield AvailableInWayback(doc['document_url'], doc['wayback_timestamp'], check_available=True)
    # Write the (possibly updated) document out as the task's status file:
    with self.output().open('w') as out_file:
        out_file.write('{}'.format(json.dumps(doc, indent=4)))
    # Also post to Monitrix if configured to do so:
    if systems().elasticsearch_host:
        yield RecordDocumentInMonitrix(self.job, self.launch_id, doc, self.source)
def run(self):
    """Fetch the LD Target export for this task's frequency from W3ACT and
    persist it to the output target as pretty-printed JSON."""
    # Connect to W3ACT and pull the export:
    client = w3act(act().url, act().username, act().password)
    targets = client.get_ld_export(self.frequency)
    # Serialise once, then write it out:
    serialised = '{}'.format(json.dumps(targets, indent=4))
    with self.output().open('w') as out:
        out.write(serialised)
def run(self):
    """Export the Targets for this frequency from W3ACT and persist them.

    Writes the full LD export (pretty-printed JSON) to this task's output
    target.
    """
    # Set up connection to W3ACT:
    w = w3act(act().url, act().username, act().password)
    # Grab those targets:
    targets = w.get_ld_export(self.frequency)
    # Persist to disk:
    with self.output().open('w') as f:
        f.write('{}'.format(json.dumps(targets, indent=4)))
def main():
    """Command-line entry point for interrogating the W3ACT API.

    Parses connection options plus an ``action`` verb, then dispatches to
    the matching w3act client call.  Extra positional arguments (``subargs``)
    are interpreted per-action.
    """
    parser = argparse.ArgumentParser('Interrogate the W3ACT API.')
    # NOTE(review): the original source appears garbled/redacted here - the
    # URL option's default was fused with the user-email option.  It is
    # reconstructed below as two separate options (args.w3act_user is used
    # later, so a '-u' option must exist); confirm the real defaults.
    parser.add_argument('-w', '--w3act-url', dest='w3act_url', type=str,
                        default="http://*****:*****@bl.uk",
                        help="W3ACT endpoint to use [default: %(default)s]")
    parser.add_argument('-u', '--w3act-user', dest='w3act_user', type=str,
                        default="wa-sysadm@bl.uk",
                        help="W3ACT user email to login with [default: %(default)s]")
    parser.add_argument('-p', '--w3act-pw', dest='w3act_pw', type=str,
                        default="sysAdmin",
                        help="W3ACT user password [default: %(default)s]")
    parser.add_argument('action', metavar='action',
                        help="The action to perform (one of 'add-target', 'list-targets', 'get-target').")
    args, subargs = parser.parse_known_args()

    def report(r):
        # Print the outcome of an HTTP call in a consistent form.
        # (Single-argument print() is valid on both Python 2 and 3.)
        print(r.status_code)
        print(r.text)

    # Connect:
    act = w3act(args.w3act_url, args.w3act_user, args.w3act_pw)
    if args.action == "list-targets":
        # Renamed from 'json' so the stdlib json module is not shadowed:
        targets_json = act.get_json("api/targets")
        print(targets_json)
    elif args.action == 'add-target':
        report(act.post_target(subargs[0], subargs[1]))
    elif args.action == 'update-target-schedule':
        report(act.update_target_schedule(int(subargs[0]), subargs[1], subargs[2]))
    elif args.action == 'set-selector':
        report(act.update_target_selector(int(subargs[0])))
    elif args.action == 'watch-target':
        report(act.watch_target(int(subargs[0])))
    elif args.action == 'unwatch-target':
        report(act.unwatch_target(int(subargs[0])))
    elif args.action == 'add-document':
        # Build a minimal document record from the positional arguments:
        doc = {}
        wtid = subargs[0]
        doc['target_id'] = int(wtid)
        doc['wayback_timestamp'] = subargs[1]
        doc['document_url'] = subargs[2]
        doc['landing_page_url'] = subargs[3]
        doc['filename'] = os.path.basename(urlparse(doc['document_url']).path)
        doc['size'] = ""
        logger.debug("Sending doc: %s" % doc)
        report(act.post_document(doc))
def uri_of_doc(self, **kwargs):
    """Celery task: post a crawled document (passed as keyword metadata) to W3ACT.

    On any failure the task is retried after a 10 second countdown.
    """
    try:
        logger.info("Got doc to send to W3ACT for: %s" % kwargs)
        # Set up connection to W3ACT:
        w = w3act(cfg.get('act','url'),cfg.get('act','username'),cfg.get('act','password'))
        # And post this document up:
        send_document_to_w3act(kwargs,cfg.get('wayback','endpoint'),w)
    except BaseException as e:
        logger.exception(e)
        # BUG FIX: Celery's Task.retry() takes the causing exception as
        # 'exc', not 'exe' - the typo silently dropped the cause.
        raise self.retry(countdown=10, exc=e)
def uri_of_doc(self, **kwargs):
    """Celery task: post a crawled document (passed as keyword metadata) to W3ACT.

    On any failure the task is retried after a 10 second countdown.
    """
    try:
        logger.info("Got doc to send to W3ACT for: %s" % kwargs)
        # Set up connection to W3ACT:
        w = w3act(cfg.get('act', 'url'), cfg.get('act', 'username'), cfg.get('act', 'password'))
        # And post this document up:
        send_document_to_w3act(kwargs, cfg.get('wayback', 'endpoint'), w)
    except BaseException as e:
        logger.exception(e)
        # BUG FIX: Celery's Task.retry() takes the causing exception as
        # 'exc', not 'exe' - the typo silently dropped the cause.
        raise self.retry(countdown=10, exc=e)
def stop_start_job(self, frequency, start=None, restart=True):
    """
    Stops the Heritrix job for a particular frequency and, if requested,
    restarts it with a fresh set of Targets from W3ACT.

    :param frequency: crawl frequency, which is also the H3 job name.
    :param start: reference datetime for the launch (only logged at present;
        a commented-out Target date filter used it previously).  Defaults to
        the current UTC time at call time.
    :param restart: when True, relaunch the job after stopping it.

    Retries the task after a 10 second countdown on any failure.
    """
    try:
        # BUG FIX: the default was previously 'start=datetime.utcnow()',
        # which is evaluated once at import time, so a long-lived worker
        # would reuse a stale timestamp.  Resolve the default at call time:
        if start is None:
            start = datetime.utcnow()
        logger.info("Stopping/starting %s at %s" % (frequency, start))
        # Set up connection to W3ACT:
        w = w3act(cfg.get('act','url'),cfg.get('act','username'),cfg.get('act','password'))
        # Set up connection to H3:
        h = hapyx.HapyX("https://%s:%s" % (cfg.get('h3','host'), cfg.get('h3','port')), username=cfg.get('h3','username'), password=cfg.get('h3','password'))
        # Stop job if currently running:
        if frequency in h.list_jobs() and h.status(frequency) != "":
            # Stops a running job, notifies RabbitMQ and cleans up the directory.
            launch_id = h.get_launch_id(frequency)
            job = W3actJob.from_directory(w, "%s/%s" % (HERITRIX_JOBS, frequency), heritrix=h)
            job.stop()
            remove_action_files(frequency)
            crawl.status.update_job_status.delay(job.name, "%s/%s" % (job.name, launch_id), "STOPPED")
            # Pass on to the next step in the chain:
            logger.info("Requesting assembly of output for: %s/%s" % (frequency, launch_id))
            assemble_job_output.delay(frequency,launch_id)
        else:
            job = None
        # Start job if requested:
        if restart:
            targets = w.get_ld_export(frequency)
            logger.debug("Found %s Targets in date range." % len(targets))
            job = W3actJob(w, targets, frequency, heritrix=h)
            logger.info("Starting job %s..." % job.name)
            job.start()
            launch_id = h.get_launch_id(frequency)
            crawl.status.update_job_status.delay(job.name, "%s/%s" % (job.name, launch_id), "LAUNCHED")
            logger.info("Launched job %s/%s with %s seeds." % (job.name, launch_id, len(job.seeds)))
            return "Launched job %s/%s with %s seeds." % (job.name, launch_id, len(job.seeds))
        else:
            if job:
                logger.info("Stopped job %s/%s without restarting..." % (job.name, launch_id))
                return "Stopped job %s/%s without restarting..." % (job.name, launch_id)
            else:
                logger.warning("No running '%s' job to stop!" % frequency)
                return "No running '%s' job to stop!" % frequency
    except BaseException as e:
        logger.exception(e)
        # BUG FIX: Celery's Task.retry() takes the cause as 'exc', not 'exe'.
        raise self.retry(countdown=10, exc=e)
def surts_from_w3act(allSurts):
    """Populate *allSurts* with a SURT for every licenced (OA) seed in W3ACT.

    Mutates *allSurts* in place.  Also records each seed URL, one per line,
    to w3actURLsFile for reference, and logs a summary count at the end.
    """
    # Pull the licenced URLs from W3ACT:
    client = w3act(args.act_url, args.act_username, args.act_password)
    export = client.get_oa_export("all")
    total = 0
    # Keep a plain-text record of the seeds while generating their SURTs:
    with open(w3actURLsFile, 'w') as record:
        for target in export:
            for seed in target["seeds"]:
                record.write("%s\n" % seed)
                surt = generate_surt(seed)
                allSurts.add(surt)
                total += 1
                logger.debug("ACT seed [%s] surt [%s]" % (seed, surt))
    logger.info("%s surts from ACT generated" % total)
def stop_start_job(self, frequency, start=None, restart=True):
    """
    Stops the Heritrix job for a particular frequency and, if requested,
    restarts it with a fresh set of Targets from W3ACT.

    :param frequency: crawl frequency, which is also the H3 job name.
    :param start: reference datetime for the launch (only logged at present;
        a commented-out Target date filter used it previously).  Defaults to
        the current UTC time at call time.
    :param restart: when True, relaunch the job after stopping it.

    Retries the task after a 10 second countdown on any failure.
    """
    try:
        # BUG FIX: the default was previously 'start=datetime.utcnow()',
        # which is evaluated once at import time, so a long-lived worker
        # would reuse a stale timestamp.  Resolve the default at call time:
        if start is None:
            start = datetime.utcnow()
        logger.info("Stopping/starting %s at %s" % (frequency, start))
        # Set up connection to W3ACT:
        w = w3act(cfg.get('act', 'url'), cfg.get('act', 'username'), cfg.get('act', 'password'))
        # Set up connection to H3:
        h = hapyx.HapyX("https://%s:%s" % (cfg.get('h3', 'host'), cfg.get('h3', 'port')),
                        username=cfg.get('h3', 'username'), password=cfg.get('h3', 'password'))
        # Stop job if currently running:
        if frequency in h.list_jobs() and h.status(frequency) != "":
            # Stops a running job, notifies RabbitMQ and cleans up the directory.
            launch_id = h.get_launch_id(frequency)
            job = W3actJob.from_directory(w, "%s/%s" % (HERITRIX_JOBS, frequency), heritrix=h)
            job.stop()
            remove_action_files(frequency)
            crawl.status.update_job_status.delay(
                job.name, "%s/%s" % (job.name, launch_id), "STOPPED")
            # Pass on to the next step in the chain:
            logger.info("Requesting assembly of output for: %s/%s" % (frequency, launch_id))
            assemble_job_output.delay(frequency, launch_id)
        else:
            job = None
        # Start job if requested:
        if restart:
            targets = w.get_ld_export(frequency)
            logger.debug("Found %s Targets in date range." % len(targets))
            job = W3actJob(w, targets, frequency, heritrix=h)
            logger.info("Starting job %s..." % job.name)
            job.start()
            launch_id = h.get_launch_id(frequency)
            crawl.status.update_job_status.delay(
                job.name, "%s/%s" % (job.name, launch_id), "LAUNCHED")
            logger.info("Launched job %s/%s with %s seeds." % (job.name, launch_id, len(job.seeds)))
            return "Launched job %s/%s with %s seeds." % (job.name, launch_id, len(job.seeds))
        else:
            if job:
                logger.info("Stopped job %s/%s without restarting..." % (job.name, launch_id))
                return "Stopped job %s/%s without restarting..." % (job.name, launch_id)
            else:
                logger.warning("No running '%s' job to stop!" % frequency)
                return "No running '%s' job to stop!" % frequency
    except BaseException as e:
        logger.exception(e)
        # BUG FIX: Celery's Task.retry() takes the cause as 'exc', not 'exe'.
        raise self.retry(countdown=10, exc=e)
type=str, default="http://*****:*****@bl.uk", help="W3ACT user email to login with [default: %(default)s]" ) parser.add_argument('-p', '--w3act-pw', dest='w3act_pw', type=str, default="sysAdmin", help="W3ACT user password [default: %(default)s]" ) parser.add_argument('-W', '--wb-url', dest='wb_url', type=str, default="http://localhost:8080/wayback", help="Wayback endpoint to check URL availability [default: %(default)s]" ) args = parser.parse_args() # Set up connection to ACT: act = w3act.w3act(args.w3act_url,args.w3act_user,args.w3act_pw) # Non-matching Target test run_doc_mdex_test('https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/567676/east_dulwich_community_nursery_association.pdf', 'https://www.gov.uk/government/publications/east-dulwich-community-nursery-association-inquiry-report', 'https://www.gov.uk/government/publications?departments[]=department-for-transport', None,"East Dulwich Community Nursery Association") # Title-only extraction tests: run_doc_mdex_test_extraction( "https://www.euromod.ac.uk/sites/default/files/working-papers/em2-01.pdf", "https://www.euromod.ac.uk/publications/date/2001/type/EUROMOD%20Working%20Paper%20Series", "https://www.euromod.ac.uk/", "Towards a multi purpose framework for tax benefit microsimulation") run_doc_mdex_test_extraction( "https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/128968/competency-guidance.pdf",
# Script: export all Open Access seeds from W3ACT as SURT prefixes, one per
# line, for use as a Wayback access-control list.
import argparse
import logging

from crawl.w3act.w3act import w3act
from crawl.h3.utils import url_to_surt

LOGGING_FORMAT = "[%(asctime)s] %(levelname)s: %(message)s"
logging.basicConfig(format=LOGGING_FORMAT, level=logging.DEBUG)
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        'Grab Open Access targets and output to a file in SURT form.')
    parser.add_argument('--act-url', dest='act_url', type=str,
                        default="https://www.webarchive.org.uk/act/",
                        help="ACT endpoint to use. [default: %(default)s]")
    parser.add_argument('--act-username', dest='act_username', type=str,
                        help="ACT username to use. [default: %(default)s]")
    parser.add_argument('--act-password', dest='act_password', type=str,
                        help="ACT password to use. [default: %(default)s]")
    parser.add_argument('output_file', metavar='output file',
                        default="/wayback/ldhosts.txt",
                        help="Output file to create, e.g. '/wayback/ldhosts.txt''.")
    args = parser.parse_args()

    # Pull the Open Access export and turn every seed into a SURT prefix:
    w = w3act(args.act_url, args.act_username, args.act_password)
    items = w.get_oa_export("all")
    surts = ["http://(%s" % url_to_surt(u) for t in items for u in t["seeds"]]
    # BUG FIX: the file was opened in binary mode ("wb") while a str was
    # written, which raises TypeError on Python 3; text mode works on both
    # Python 2 and 3 for this newline-joined ASCII output.
    with open(args.output_file, "w") as o:
        o.write("\n".join(surts))