def create_profile(self, HERITRIX_JOBS):
    """Build the CXML content for this H3 job and store it on ``self.cxml``.

    The template at ``HERITRIX_PROFILE`` is parsed, its XIncludes are
    expanded, and the serialised document has each ``REPLACE_*`` placeholder
    substituted with a job-specific value.

    :param HERITRIX_JOBS: value substituted for ``REPLACE_HERITRIX_JOBS``.
    """
    log = logging.getLogger('luigi-interface')

    template = etree.parse(HERITRIX_PROFILE)
    template.xinclude()
    # NOTE(review): with an explicit encoding, lxml returns a byte string; on
    # Python 3 the str-based replace() calls below would then raise
    # TypeError - confirm the target interpreter.
    cxml = etree.tostring(
        template,
        pretty_print=True,
        xml_declaration=True,
        encoding="UTF-8",
    )

    log.error("HERITRIX_PROFILE %s" % HERITRIX_PROFILE)
    log.error("self.name %s" % self.name)

    # Placeholders are applied in this order. Per-job ClamD overrides win
    # over the defaults when the job name has an entry.
    substitutions = (
        ("REPLACE_JOB_NAME", self.name),
        ("REPLACE_CLAMD_HOST", CLAMD_HOSTS.get(self.name, CLAMD_DEFAULT_HOST)),
        ("REPLACE_CLAMD_PORT", CLAMD_PORTS.get(self.name, CLAMD_DEFAULT_PORT)),
        ("REPLACE_JOB_ROOT", self.name),
        ("REPLACE_HERITRIX_JOBS", HERITRIX_JOBS),
        ("REPLACE_AMQP_HOST", systems().amqp_host),
        ("REPLACE_WRENDER_ENDPOINT", systems().wrender),
    )
    for placeholder, value in substitutions:
        cxml = cxml.replace(placeholder, value)

    self.cxml = cxml
def unpause_dc():
    """Unpause the four 2016 domain-crawl jobs, then redirect to 'status'.

    Server and service definitions are loaded as JSON from
    ``systems().servers`` and ``systems().services``. For each job, the
    server entry it references supplies the URL and credentials used to
    build the HapyX client.

    :return: the redirect response to the ``status`` endpoint.
    """
    server_index = json.load(systems().servers)
    service_index = json.load(systems().services)
    job_index = service_index['jobs']

    for job_id in ('dc0-2016', 'dc1-2016', 'dc2-2016', 'dc3-2016'):
        job = job_index[job_id]
        host = server_index[job['server']]
        client = hapyx.HapyX(
            host['url'],
            username=host['user'],
            password=host['pass'],
        )
        client.unpause_job(job['name'])

    return redirect(url_for('status'))
def lookup_in_cdx(qurl):
    """
    Checks if a resource is in the CDX index.

    Sends a ``urlquery`` request for ``qurl`` to the CDX server and, on an
    HTTP 200 response, parses the XML body and reads the first ``<result>``
    element.

    :param qurl: the URL to look up; it is percent-quoted before being
        placed in the query string.
    :return: a ``(file, compressedoffset)`` pair taken from the first
        ``<result>``, or ``(None, None)`` when the response is not 200, has
        no results, or cannot be parsed.
    """
    query = "%s?q=type:urlquery+url:%s" % (systems().cdxserver, quote(qurl))
    r = requests.get(query)
    # Diagnostics go through the application logger rather than stdout.
    app.logger.debug("CDX query URL: %s", r.url)
    app.logger.debug("Availability response: %d", r.status_code)
    app.logger.debug("CDX response body: %s", r.text)

    if r.status_code != 200:
        return None, None

    # NOTE: no capture-date matching is done; the first result wins.
    try:
        dom = xml.dom.minidom.parseString(r.text)
        for result in dom.getElementsByTagName('result'):
            warc_file = result.getElementsByTagName(
                'file')[0].firstChild.nodeValue
            compressed_offset = result.getElementsByTagName(
                'compressedoffset')[0].firstChild.nodeValue
            return warc_file, compressed_offset
    except Exception:
        # Deliberately broad: a malformed response is treated as "not
        # found", but is logged once together with its traceback.
        app.logger.exception("Lookup failed for %s!", qurl)

    return None, None
def lookup_in_cdx(qurl):
    """
    Checks if a resource is in the CDX index.

    Sends a ``urlquery`` request for ``qurl`` to the CDX server and, on an
    HTTP 200 response, parses the XML body and reads the first ``<result>``
    element.

    :param qurl: the URL to look up; it is percent-quoted before being
        placed in the query string.
    :return: a ``(file, compressedoffset)`` pair taken from the first
        ``<result>``, or ``(None, None)`` when the response is not 200, has
        no results, or cannot be parsed.
    """
    query = "%s?q=type:urlquery+url:%s" % (systems().cdxserver, quote(qurl))
    r = requests.get(query)
    # Diagnostics go through the application logger rather than stdout.
    app.logger.debug("CDX query URL: %s", r.url)
    app.logger.debug("Availability response: %d", r.status_code)
    app.logger.debug("CDX response body: %s", r.text)

    if r.status_code != 200:
        return None, None

    # NOTE: no capture-date matching is done; the first result wins.
    try:
        dom = xml.dom.minidom.parseString(r.text)
        for result in dom.getElementsByTagName('result'):
            warc_file = result.getElementsByTagName(
                'file')[0].firstChild.nodeValue
            compressed_offset = result.getElementsByTagName(
                'compressedoffset')[0].firstChild.nodeValue
            return warc_file, compressed_offset
    except Exception:
        # Deliberately broad: a malformed response is treated as "not
        # found", but is logged once together with its traceback.
        app.logger.exception("Lookup failed for %s!", qurl)

    return None, None
def get_rendered_original():
    """
    Grabs a rendered resource.

    Only reason Wayback can't do this is that it does not like the extended
    URIs i.e. 'screenshot:http://' and replaces them with
    'http://screenshot:http://'

    Reads the ``url`` and ``type`` (default ``'screenshot'``) request
    arguments, looks up ``"<type>:<url>"`` in the CDX index, fetches the
    matching bytes over WebHDFS, and returns the parsed record's stream.

    :return: a ``send_file`` response carrying the record payload with the
        record's content type.
    :raises: aborts with 400 when ``url`` is missing or empty, and with 404
        when the CDX lookup finds nothing.
    """
    url = request.args.get('url')
    app.logger.debug("Got URL: %s", url)
    # Without a URL the lookup key would be the literal "<type>:None".
    if not url:
        abort(400)

    # Named render_type so the builtin ``type`` is not shadowed.
    render_type = request.args.get('type', 'screenshot')
    app.logger.debug("Got type: %s", render_type)

    # Query URL
    qurl = "%s:%s" % (render_type, url)
    # Query CDX Server for the item
    (warc_filename, warc_offset) = lookup_in_cdx(qurl)

    # If not found, say so:
    if warc_filename is None:
        abort(404)

    # Grab the payload from the WARC and return it.
    r = requests.get("%s%s%s?op=OPEN&user.name=%s&offset=%s" % (
        systems().webhdfs, h3().hdfs_root_folder, warc_filename,
        webhdfs().user, warc_offset))
    app.logger.info("Loading from: %s", r.url)
    # NOTE(review): the request is not streamed, so the body has presumably
    # been read already and this flag likely has no effect - confirm.
    r.raw.decode_content = False
    rl = ArcWarcRecordLoader()
    record = rl.parse_record_stream(
        DecompressingBufferedReader(stream=io.BytesIO(r.content)))
    # Diagnostics go through the application logger rather than stdout.
    app.logger.debug("Record: %s (length=%s, stream limit=%s)",
                     record, record.length, record.stream.limit)

    return send_file(record.stream, mimetype=record.content_type)
def get_rendered_original():
    """
    Grabs a rendered resource.

    Only reason Wayback can't do this is that it does not like the extended
    URIs i.e. 'screenshot:http://' and replaces them with
    'http://screenshot:http://'

    Reads the ``url`` and ``type`` (default ``'screenshot'``) request
    arguments, looks up ``"<type>:<url>"`` in the CDX index, fetches the
    matching bytes over WebHDFS, and returns the parsed record's stream.

    :return: a ``send_file`` response carrying the record payload with the
        record's content type.
    :raises: aborts with 400 when ``url`` is missing or empty, and with 404
        when the CDX lookup finds nothing.
    """
    url = request.args.get('url')
    app.logger.debug("Got URL: %s", url)
    # Without a URL the lookup key would be the literal "<type>:None".
    if not url:
        abort(400)

    # Named render_type so the builtin ``type`` is not shadowed.
    render_type = request.args.get('type', 'screenshot')
    app.logger.debug("Got type: %s", render_type)

    # Query URL
    qurl = "%s:%s" % (render_type, url)
    # Query CDX Server for the item
    (warc_filename, warc_offset) = lookup_in_cdx(qurl)

    # If not found, say so:
    if warc_filename is None:
        abort(404)

    # Grab the payload from the WARC and return it.
    r = requests.get("%s%s%s?op=OPEN&user.name=%s&offset=%s" % (
        systems().webhdfs, h3().hdfs_root_folder, warc_filename,
        webhdfs().user, warc_offset))
    app.logger.info("Loading from: %s", r.url)
    # NOTE(review): the request is not streamed, so the body has presumably
    # been read already and this flag likely has no effect - confirm.
    r.raw.decode_content = False
    rl = ArcWarcRecordLoader()
    record = rl.parse_record_stream(
        DecompressingBufferedReader(stream=io.BytesIO(r.content)))
    # Diagnostics go through the application logger rather than stdout.
    app.logger.debug("Record: %s (length=%s, stream limit=%s)",
                     record, record.length, record.stream.limit)

    return send_file(record.stream, mimetype=record.content_type)
requests.packages.urllib3.disable_warnings() mandatory_fields = ["field_url", "field_depth", "field_scope", "url"] depth_sheets = {"capped_large": "higherLimit", "deep": "noLimit"} scope_sheets = {"resource": "resourceScope", "plus1": "plus1Scope", "subdomains": "subdomainsScope"} W3ACT_FIELDS=["id", "title", "schedules", "depth", "scope", "ignoreRobotsTxt"] HERITRIX_CONFIG_ROOT=os.path.realpath(os.path.join(os.path.dirname(__file__),"../profiles")) HERITRIX_PROFILE="%s/profile-pulse.cxml" % HERITRIX_CONFIG_ROOT HERITRIX_EXCLUDE="%s/exclude.txt" % HERITRIX_CONFIG_ROOT HERITRIX_SHORTENERS="%s/url.shorteners.txt" % HERITRIX_CONFIG_ROOT HERITRIX_SURTS="%s/surts.txt" % HERITRIX_CONFIG_ROOT CLAMD_DEFAULT_HOST = systems().clamd_host CLAMD_DEFAULT_PORT = systems().clamd_port CLAMD_PORTS = { } #"daily": "3310", "weekly": "3310", "monthly": "3310", "quarterly": "3310", "sixmonthly": "3310", "annual": "3310" } CLAMD_HOSTS = { } def to_surt(url): parsed = urlparse(url).netloc authority = parsed.split(".") authority.reverse() return "http://(%s," % ",".join(authority) def get_surt_association_script(surt, sheet):
def doc_wb_url(self):
    """Return the Wayback URL for this document.

    The URL is ``<wayback>/<timestamp>/<url>``, built from
    ``systems().wayback`` and the ``wayback_timestamp`` and
    ``document_url`` entries of ``self.doc``.
    """
    prefix = systems().wayback
    timestamp = self.doc['wayback_timestamp']
    target = self.doc['document_url']
    return "%s/%s/%s" % (prefix, timestamp, target)
def lp_wb_url(self):
    """Return the Wayback URL for this document's landing page.

    The URL is ``<wayback>/<timestamp>/<url>``, built from
    ``systems().wayback`` and the ``wayback_timestamp`` and
    ``landing_page_url`` entries of ``self.doc``.
    """
    prefix = systems().wayback
    timestamp = self.doc['wayback_timestamp']
    target = self.doc['landing_page_url']
    return "%s/%s/%s" % (prefix, timestamp, target)
"plus1": "plus1Scope", "subdomains": "subdomainsScope" } W3ACT_FIELDS = [ "id", "title", "schedules", "depth", "scope", "ignoreRobotsTxt" ] HERITRIX_CONFIG_ROOT = os.path.realpath( os.path.join(os.path.dirname(__file__), "../profiles")) HERITRIX_PROFILE = "%s/profile-pulse.cxml" % HERITRIX_CONFIG_ROOT HERITRIX_EXCLUDE = "%s/exclude.txt" % HERITRIX_CONFIG_ROOT HERITRIX_SHORTENERS = "%s/url.shorteners.txt" % HERITRIX_CONFIG_ROOT HERITRIX_SURTS = "%s/surts.txt" % HERITRIX_CONFIG_ROOT CLAMD_DEFAULT_HOST = systems().clamd_host CLAMD_DEFAULT_PORT = systems().clamd_port CLAMD_PORTS = {} #"daily": "3310", "weekly": "3310", "monthly": "3310", "quarterly": "3310", "sixmonthly": "3310", "annual": "3310" } CLAMD_HOSTS = {} def to_surt(url): parsed = urlparse(url).netloc authority = parsed.split(".") authority.reverse() return "http://(%s," % ",".join(authority) def get_surt_association_script(surt, sheet):
def doc_wb_url(self):
    """Return the Wayback URL for this document.

    The URL is ``<wayback>/<timestamp>/<url>``, built from
    ``systems().wayback`` and the ``wayback_timestamp`` and
    ``document_url`` entries of ``self.doc``.
    """
    prefix = systems().wayback
    timestamp = self.doc['wayback_timestamp']
    target = self.doc['document_url']
    return "%s/%s/%s" % (prefix, timestamp, target)