Exemplo n.º 1
0
 def create_profile(self, HERITRIX_JOBS):
     """Build the CXML content for a H3 job and store it on ``self.cxml``.

     The profile at ``HERITRIX_PROFILE`` is parsed, its XIncludes are
     resolved, and the serialised document then has each ``REPLACE_*``
     placeholder substituted in turn.

     :param HERITRIX_JOBS: value substituted for ``REPLACE_HERITRIX_JOBS``.
     """
     tree = etree.parse(HERITRIX_PROFILE)
     tree.xinclude()
     cxml = etree.tostring(
         tree,
         pretty_print=True,
         xml_declaration=True,
         encoding="UTF-8",
     )

     log = logging.getLogger('luigi-interface')
     log.error("HERITRIX_PROFILE %s" % HERITRIX_PROFILE)
     log.error("self.name %s" % self.name)

     cxml = cxml.replace("REPLACE_JOB_NAME", self.name)
     # Per-job ClamD settings win; otherwise fall back to the defaults.
     cxml = cxml.replace("REPLACE_CLAMD_HOST",
                         CLAMD_HOSTS.get(self.name, CLAMD_DEFAULT_HOST))
     cxml = cxml.replace("REPLACE_CLAMD_PORT",
                         CLAMD_PORTS.get(self.name, CLAMD_DEFAULT_PORT))
     cxml = cxml.replace("REPLACE_JOB_ROOT", self.name)
     cxml = cxml.replace("REPLACE_HERITRIX_JOBS", HERITRIX_JOBS)
     cxml = cxml.replace("REPLACE_AMQP_HOST", systems().amqp_host)
     cxml = cxml.replace("REPLACE_WRENDER_ENDPOINT", systems().wrender)
     self.cxml = cxml
Exemplo n.º 2
0
def unpause_dc():
    """Unpause the four 2016 domain-crawl jobs, then redirect to 'status'.

    Each job's server entry and job name are looked up in the JSON documents
    loaded from ``systems().servers`` and ``systems().services``.
    """
    # NOTE(review): json.load() needs a file-like object - confirm that
    # systems().servers / systems().services are not plain path strings.
    server_index = json.load(systems().servers)
    service_index = json.load(systems().services)
    for job_key in ('dc0-2016', 'dc1-2016', 'dc2-2016', 'dc3-2016'):
        job_entry = service_index['jobs'][job_key]
        server = server_index[job_entry['server']]
        client = hapyx.HapyX(server['url'],
                             username=server['user'],
                             password=server['pass'])
        client.unpause_job(job_entry['name'])
    return redirect(url_for('status'))
Exemplo n.º 3
0
def unpause_dc():
    """Send an unpause request for every 2016 domain-crawl job.

    The server credentials and the job name for each entry come from the
    JSON loaded via ``systems().servers`` and ``systems().services``.

    :return: a redirect to the 'status' endpoint.
    """
    dc_jobs = ['dc0-2016', 'dc1-2016', 'dc2-2016', 'dc3-2016']
    servers = json.load(systems().servers)
    services = json.load(systems().services)
    for dc_job in dc_jobs:
        details = services['jobs'][dc_job]
        host = servers[details['server']]
        api = hapyx.HapyX(host['url'], username=host['user'], password=host['pass'])
        api.unpause_job(details['name'])
    return redirect(url_for('status'))
Exemplo n.º 4
0
def lookup_in_cdx(qurl):
    """
    Look a resource up in the CDX index.

    :param qurl: URL to query for; it is quoted before being placed in the
        query string.
    :return: ``(file, compressedoffset)`` from the first ``result`` element
        of the response, or ``(None, None)`` when the status is not 200, no
        result is present, or the lookup raises.
    """
    query = "%s?q=type:urlquery+url:%s" % (systems().cdxserver, quote(qurl))
    response = requests.get(query)
    print(response.url)
    app.logger.debug("Availability response: %d" % response.status_code)
    print(response.status_code, response.text)

    # Anything other than a 200 is treated as "not known".
    if response.status_code != 200:
        return None, None

    try:
        dom = xml.dom.minidom.parseString(response.text)
        # Only the first result is used; capture dates are not compared.
        for result in dom.getElementsByTagName('result'):
            file_node = result.getElementsByTagName('file')[0]
            offset_node = result.getElementsByTagName('compressedoffset')[0]
            warc_file = file_node.firstChild.nodeValue
            compressedoffset = offset_node.firstChild.nodeValue
            return warc_file, compressedoffset
    except Exception as e:
        app.logger.error("Lookup failed for %s!" % qurl)
        app.logger.exception(e)
    return None, None
Exemplo n.º 5
0
def lookup_in_cdx(qurl):
    """
    Query the CDX server for a resource and report where it is stored.

    :param qurl: URL to look up (quoted before it is sent).
    :return: a ``(file, compressedoffset)`` pair read from the first
        ``result`` element, or ``(None, None)`` if the server does not answer
        with 200, returns no result, or the response cannot be processed.
    """
    not_found = (None, None)

    query = "%s?q=type:urlquery+url:%s" % (systems().cdxserver, quote(qurl))
    resp = requests.get(query)
    print(resp.url)
    app.logger.debug("Availability response: %d" % resp.status_code)
    print(resp.status_code, resp.text)

    if resp.status_code == 200:
        try:
            document = xml.dom.minidom.parseString(resp.text)
            # The first hit wins; no timestamp matching is performed.
            for hit in document.getElementsByTagName('result'):
                filename = hit.getElementsByTagName('file')[0].firstChild.nodeValue
                offset = hit.getElementsByTagName('compressedoffset')[0].firstChild.nodeValue
                return filename, offset
        except Exception as e:
            app.logger.error("Lookup failed for %s!" % qurl)
            app.logger.exception(e)
    return not_found
Exemplo n.º 6
0
def get_rendered_original():
    """
    Serve a rendered resource read directly from its WARC record.

    Wayback cannot do this only because it does not like the extended URIs,
    i.e. it replaces 'screenshot:http://' with 'http://screenshot:http://'.

    Reads the ``url`` and ``type`` request arguments (``type`` defaults to
    'screenshot') and aborts with 404 when the CDX lookup finds nothing.
    """
    target_url = request.args.get('url')
    app.logger.debug("Got URL: %s" % target_url)
    render_type = request.args.get('type', 'screenshot')
    app.logger.debug("Got type: %s" % render_type)

    # The CDX index is keyed on "<type>:<url>".
    qurl = "%s:%s" % (render_type, target_url)
    warc_filename, warc_offset = lookup_in_cdx(qurl)
    if warc_filename is None:
        abort(404)

    # Fetch the record from WebHDFS, starting at the offset the CDX gave us.
    hdfs_url = "%s%s%s?op=OPEN&user.name=%s&offset=%s" % (
        systems().webhdfs,
        h3().hdfs_root_folder,
        warc_filename,
        webhdfs().user,
        warc_offset,
    )
    response = requests.get(hdfs_url)
    app.logger.info("Loading from: %s" % response.url)
    response.raw.decode_content = False

    loader = ArcWarcRecordLoader()
    reader = DecompressingBufferedReader(stream=io.BytesIO(response.content))
    record = loader.parse_record_stream(reader)
    print(record)
    print(record.length)
    print(record.stream.limit)

    return send_file(record.stream, mimetype=record.content_type)
Exemplo n.º 7
0
 def create_profile(self, HERITRIX_JOBS):
     """Generate the CXML for a H3 job, leaving the result in ``self.cxml``.

     ``HERITRIX_PROFILE`` is parsed with XIncludes expanded, serialised, and
     every ``REPLACE_*`` placeholder is then swapped for its job value.

     :param HERITRIX_JOBS: replacement for ``REPLACE_HERITRIX_JOBS``.
     """
     profile = etree.parse(HERITRIX_PROFILE)
     profile.xinclude()
     cxml = etree.tostring(profile,
                           pretty_print=True,
                           xml_declaration=True,
                           encoding="UTF-8")

     log = logging.getLogger('luigi-interface')
     log.error("HERITRIX_PROFILE %s" % HERITRIX_PROFILE)
     log.error("self.name %s" % self.name)

     # Placeholder -> value, applied in this order. A job-specific ClamD
     # host/port takes precedence over the defaults.
     substitutions = (
         ("REPLACE_JOB_NAME", self.name),
         ("REPLACE_CLAMD_HOST", CLAMD_HOSTS.get(self.name, CLAMD_DEFAULT_HOST)),
         ("REPLACE_CLAMD_PORT", CLAMD_PORTS.get(self.name, CLAMD_DEFAULT_PORT)),
         ("REPLACE_JOB_ROOT", self.name),
         ("REPLACE_HERITRIX_JOBS", HERITRIX_JOBS),
         ("REPLACE_AMQP_HOST", systems().amqp_host),
         ("REPLACE_WRENDER_ENDPOINT", systems().wrender),
     )
     for placeholder, value in substitutions:
         cxml = cxml.replace(placeholder, value)
     self.cxml = cxml
Exemplo n.º 8
0
def get_rendered_original():
    """
    Return a rendered resource by pulling its record out of the WARC.

    This exists only because Wayback does not like the extended URIs: it
    turns 'screenshot:http://' into 'http://screenshot:http://'.

    Request arguments: ``url`` and ``type`` (default 'screenshot'). A 404 is
    raised when the item is not found in the CDX index.
    """
    args = request.args
    url = args.get('url')
    app.logger.debug("Got URL: %s" % url)
    kind = args.get('type', 'screenshot')
    app.logger.debug("Got type: %s" % kind)

    # Ask the CDX server where "<type>:<url>" is stored.
    location = lookup_in_cdx("%s:%s" % (kind, url))
    warc_filename, warc_offset = location
    if warc_filename is None:
        abort(404)

    # Open the WARC over WebHDFS at the recorded offset.
    open_url = "%s%s%s?op=OPEN&user.name=%s&offset=%s" % (
        systems().webhdfs, h3().hdfs_root_folder, warc_filename,
        webhdfs().user, warc_offset)
    r = requests.get(open_url)
    app.logger.info("Loading from: %s" % r.url)
    r.raw.decode_content = False

    record_loader = ArcWarcRecordLoader()
    buffered = DecompressingBufferedReader(stream=io.BytesIO(r.content))
    record = record_loader.parse_record_stream(buffered)
    for detail in (record, record.length, record.stream.limit):
        print(detail)

    return send_file(record.stream, mimetype=record.content_type)
Exemplo n.º 9
0
# Turn off urllib3 warnings (through the copy exposed by requests).
requests.packages.urllib3.disable_warnings()

mandatory_fields = ["field_url", "field_depth", "field_scope", "url"]
depth_sheets = {
    "capped_large": "higherLimit",
    "deep": "noLimit",
}
scope_sheets = {
    "resource": "resourceScope",
    "plus1": "plus1Scope",
    "subdomains": "subdomainsScope",
}

W3ACT_FIELDS = ["id", "title", "schedules", "depth", "scope", "ignoreRobotsTxt"]

# Profile files are read from the "profiles" directory that sits one level
# above the directory containing this file.
HERITRIX_CONFIG_ROOT = os.path.realpath(
    os.path.join(os.path.dirname(__file__), "../profiles"))
HERITRIX_PROFILE = "%s/profile-pulse.cxml" % HERITRIX_CONFIG_ROOT
HERITRIX_EXCLUDE = "%s/exclude.txt" % HERITRIX_CONFIG_ROOT
HERITRIX_SHORTENERS = "%s/url.shorteners.txt" % HERITRIX_CONFIG_ROOT
HERITRIX_SURTS = "%s/surts.txt" % HERITRIX_CONFIG_ROOT

# Defaults come from the systems() configuration.
CLAMD_DEFAULT_HOST = systems().clamd_host
CLAMD_DEFAULT_PORT = systems().clamd_port

# Per-job overrides, keyed by job name; both are empty at present.
# Previously listed port entries:
# "daily": "3310", "weekly": "3310", "monthly": "3310", "quarterly": "3310", "sixmonthly": "3310", "annual": "3310"
CLAMD_PORTS = {}
CLAMD_HOSTS = {}


def to_surt(url):
    """Return a SURT-style prefix built from the network location of *url*.

    The netloc is split on '.', the labels are reversed, and the result is
    wrapped as ``http://(<labels joined by ','>,``. The whole netloc is used
    as-is, so a port or user-info part stays attached to its label.
    """
    labels = urlparse(url).netloc.split(".")
    return "http://(%s," % ",".join(reversed(labels))


def get_surt_association_script(surt, sheet):
Exemplo n.º 10
0
 def doc_wb_url(self):
     """Return the Wayback URL of the document.

     The result is ``systems().wayback``, ``self.doc['wayback_timestamp']``
     and ``self.doc['document_url']`` joined with '/'.
     """
     prefix = systems().wayback
     timestamp = self.doc['wayback_timestamp']
     target = self.doc['document_url']
     return "%s/%s/%s" % (prefix, timestamp, target)
Exemplo n.º 11
0
 def lp_wb_url(self):
     """Return the Wayback URL of the landing page.

     The result is ``systems().wayback``, ``self.doc['wayback_timestamp']``
     and ``self.doc['landing_page_url']`` joined with '/'.
     """
     prefix = systems().wayback
     timestamp = self.doc['wayback_timestamp']
     target = self.doc['landing_page_url']
     return "%s/%s/%s" % (prefix, timestamp, target)
Exemplo n.º 12
0
    "plus1": "plus1Scope",
    "subdomains": "subdomainsScope"
}

W3ACT_FIELDS = ["id", "title", "schedules", "depth", "scope", "ignoreRobotsTxt"]

# Profile files are read from the "profiles" directory that sits one level
# above the directory containing this file.
HERITRIX_CONFIG_ROOT = os.path.realpath(
    os.path.join(
        os.path.dirname(__file__),
        "../profiles",
    )
)
HERITRIX_PROFILE = "%s/profile-pulse.cxml" % HERITRIX_CONFIG_ROOT
HERITRIX_EXCLUDE = "%s/exclude.txt" % HERITRIX_CONFIG_ROOT
HERITRIX_SHORTENERS = "%s/url.shorteners.txt" % HERITRIX_CONFIG_ROOT
HERITRIX_SURTS = "%s/surts.txt" % HERITRIX_CONFIG_ROOT

# Defaults come from the systems() configuration.
CLAMD_DEFAULT_HOST = systems().clamd_host
CLAMD_DEFAULT_PORT = systems().clamd_port

# Per-job overrides, keyed by job name; both are empty at present.
# Previously listed port entries:
# "daily": "3310", "weekly": "3310", "monthly": "3310", "quarterly": "3310", "sixmonthly": "3310", "annual": "3310"
CLAMD_PORTS = {}
CLAMD_HOSTS = {}


def to_surt(url):
    """Build a SURT-style prefix from the netloc of *url*.

    The netloc's dot-separated labels are emitted in reverse order, joined
    with commas, inside ``http://(`` ... ``,``. Nothing is stripped from the
    netloc first, so any port or user-info stays part of its label.
    """
    netloc = urlparse(url).netloc
    reversed_labels = netloc.split(".")[::-1]
    return "http://(%s," % ",".join(reversed_labels)


def get_surt_association_script(surt, sheet):
Exemplo n.º 13
0
 def doc_wb_url(self):
     """Return the document's Wayback URL.

     Formats ``systems().wayback``, ``self.doc['wayback_timestamp']`` and
     ``self.doc['document_url']`` as ``<wayback>/<timestamp>/<url>``.
     """
     parts = (
         systems().wayback,
         self.doc['wayback_timestamp'],
         self.doc['document_url'],
     )
     return "%s/%s/%s" % parts