Example 1
    def run(self):
        # initialise
        d_counts = {}
        d_counts['total'] = {key: 0 for key in (
            '.uk', '.scot', '.wales', '.cymru', '.london', 'not_uk',
            'uk_domain', 'uk_geoip', 'uk_postal_address',
            'via_correspondence', 'prof_judgement')}
        i_wct_uids = 0
        a_orgs = []
        a_schedules = []

        # enable and start logging
        logger = logging.getLogger()
        logger.debug('Script initialized')

        # get counts
        self.process_frequent_exports(d_counts, a_schedules)
        ## i_wct_uids = get_ukwa_licensed_content(w3act_exporter, logger)
        #i_new_instances = self.calculate_instances()
        i_new_instances = 0
        #i_new_sips = self.calculate_sips()
        i_new_sips = 0

        # calculate organisations and schedules
        a_orgs = Counter(a_orgs)
        a_schedules = Counter(a_schedules)

        # output results
        self.output_results(d_counts, a_orgs, i_wct_uids, a_schedules,
                            i_new_sips, i_new_instances, self.a_ldls, logger)
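For reference, a minimal stand-alone sketch (standard library only; the frequency names are illustrative) of what the Counter(...) calls above produce from the accumulated schedule list:

    from collections import Counter

    # one entry is appended per target, keyed by its crawl frequency
    a_schedules = ['daily', 'weekly', 'daily', 'monthly', 'daily']
    print(Counter(a_schedules))  # e.g. Counter({'daily': 3, 'weekly': 1, 'monthly': 1})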
Example 2
 def check_hash(path, file_hash):
     # A SHA-512 hex digest is exactly 128 hexadecimal characters.
     logger.debug("Checking file %s hash %s" % (path, file_hash))
     if len(file_hash) != 128:
         raise Exception("%s hash not 128 character length [%s]" %
                         (path, len(file_hash)))
     if not all(c in string.hexdigits for c in file_hash):
         raise Exception("%s hash not all hex [%s]" % (path, file_hash))
Example 3
    def run(self):
        logger.debug("file %s to hash" % (self.path))

        t = luigi.LocalTarget(self.path)
        with t.open('r') as reader:
            file_hash = hashlib.sha512(reader.read()).hexdigest()

        # test hash
        CalculateLocalHash.check_hash(self.path, file_hash)

        with self.output().open('w') as f:
            f.write(file_hash)
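Note that hashlib only accepts bytes-like input, so under Python 3 a text-mode read would need encoding before hashing. A minimal stand-alone sketch of the same hashing step using plain binary file I/O (the file name is illustrative):

    import hashlib

    with open("example.bin", "rb") as f:  # binary mode yields bytes
        file_hash = hashlib.sha512(f.read()).hexdigest()
    print(len(file_hash))  # 128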
Example 4
 def mdex_gov_uk_publications(self):
     # Start by grabbing the Link-rel-up header to refine the landing page url:
     # e.g. https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/497662/accidents-involving-illegal-alcohol-levels-2014.pdf
     # Link: <https://www.gov.uk/government/statistics/reported-road-casualties-in-great-britain-estimates-involving-illegal-alcohol-levels-2014>; rel="up"
     r = requests.head(url=self.doc_wb_url())
     if 'up' in r.links:
         lpu = r.links['up']
         self.doc["landing_page_url"] = lpu['url']
     # Grab the landing page URL as HTML
     logger.debug("Downloading and parsing: %s" %
                  self.doc['landing_page_url'])
     r = requests.get(self.lp_wb_url())
     h = html.fromstring(r.content)
     # Extract the metadata:
     logger.debug('xpath/title %s' % h.xpath('//header//h1/text()'))
     self.doc['title'] = self._get0(h.xpath('//header//h1/text()'))
     self.doc['publication_date'] = self._get0(
         h.xpath("//aside[contains(@class, 'meta')]//time/@datetime"))[0:10]
     if self.doc['publication_date'] == '':
         self.doc.pop('publication_date')
     self.doc['publishers'] = h.xpath(
         "//aside[contains(@class, 'meta')]//a[contains(@class, 'organisation-link')]/text()"
     )
     # Look through landing page for links, find metadata section corresponding to the document:
     for a in h.xpath("//a"):
         if self.doc["document_url"] in urljoin(
                 self.doc["landing_page_url"], a.attrib["href"]):
             if ("class" in a.getparent().getparent().attrib) and \
                             a.getparent().getparent().attrib["class"] == "attachment-details":
                 div = a.getparent().getparent()
                 # Process title, allowing document title metadata to override:
                 lp_title = self._get0(
                     div.xpath("./h2[@class='title']/a/text()"))
                 if len(lp_title) > 0:
                     self.doc['title'] = lp_title
                 # Process references
                 refs = div.xpath("./p/span[@class='references']")
                 # We also need to look out for Command and Act papers and match them by modifying the publisher list
                 for ref in refs:
                     isbn = self._get0(
                         ref.xpath("./span[@class='isbn']/text()"))
                     if len(isbn) > 0:
                         self.doc['isbn'] = isbn
                     if len(
                             ref.xpath(
                                 "./span[starts-with(text(), 'HC') or starts-with(text(), 'Cm') or starts-with(text(), 'CM')]"
                             )) > 0:
                         self.doc['publishers'] = ["Command and Act Papers"]
     if not self.doc['title']:
         raise Exception(
             'Title extraction failed! Metadata extraction for this target should be reviewed.'
         )
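For illustration, a minimal sketch (header value invented to mirror the comment at the top of this method) of how requests parses a Link response header with rel="up" into the r.links mapping used above:

    import requests.utils

    link_header = '<https://www.gov.uk/government/statistics/some-report>; rel="up"'
    links = {l['rel']: l for l in requests.utils.parse_header_links(link_header)}
    print(links['up']['url'])  # https://www.gov.uk/government/statistics/some-report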
Example 5
 def calculate_instances(self):
     i_new_instances = 0
     o_targets = self.client.list(
         "/data/wayback/cdx-index/")["FileStatuses"]["FileStatus"]
     for o_target in o_targets:
         o_instances = self.client.list("/data/wayback/cdx-index/%s/" \
             % o_target["pathSuffix"])["FileStatuses"]["FileStatus"]
         for o_instance in o_instances:
             i_mod = datetime.fromtimestamp(o_instance["modificationTime"] /
                                            1000)
             # count instances modified within the last month
             if i_mod > (datetime.now() + relativedelta(months=-1)):
                 i_new_instances += 1
     logger.debug('New instances = ' + str(i_new_instances))
     return i_new_instances
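A minimal sketch (assuming python-dateutil is installed) of the cutoff arithmetic: adding relativedelta(months=-1) moves now() one month into the past, giving the "modified within the last month" threshold used here and in calculate_sips below:

    from datetime import datetime
    from dateutil.relativedelta import relativedelta

    now = datetime.now()
    one_month_ago = now + relativedelta(months=-1)  # equivalent to now - relativedelta(months=1)
    assert one_month_ago < now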
Example 6
    def run(self):
        logger.debug("file %s to hash" % (self.path))

        # get hash for local or hdfs file
        t = self.input()
        client = luigi.contrib.hdfs.get_autoconfig_client(threading.local())
        # Having to side-step the first client as it seems to be buggy/use an old API - note also confused put()
        with client.client.read(str(t.path)) as reader:
            file_hash = hashlib.sha512(reader.read()).hexdigest()

        # test hash
        CalculateLocalHash.check_hash(self.path, file_hash)

        with self.output().open('w') as f:
            f.write(file_hash)
Example 7
 def calculate_sips(self):
     i_new_sips = 0
     o_dirs = self.client.list("/heritrix/sips/")["FileStatuses"]["FileStatus"]
     logger.info(o_dirs)
     for o_dir in o_dirs:
         logger.info(o_dir)
         o_sips = self.client.list(
             "/heritrix/sips/%s/" %
             o_dir["pathSuffix"])["FileStatuses"]["FileStatus"]
         for o_sip in o_sips:
             i_mod = datetime.fromtimestamp(o_sip["modificationTime"] /
                                            1000)
             if i_mod > (datetime.now() + relativedelta(months=-1)):
                 i_new_sips += 1
     logger.debug('New SIPs = ' + str(i_new_sips))
     return i_new_sips
Example 9
 def get_ukwa_licensed_content(self, w3act_exporter, logger):
     i_wct_uids = 0
     logger.debug('Getting W3ACT export get_by_all')
     try:
         export_all = w3act_exporter.get_by_export("all")
         i_wct_uids = len(export_all)
         logger.debug('Size of get_by_all export ' + str(i_wct_uids))
     except Exception as e:
         logger.debug('get_by_all export failed: %s' % e)
         i_wct_uids = 'ERROR: stats.py script failed to export get_by_all from W3ACT'
     return i_wct_uids
Example 10
    def parse_crawl_log(self, logs):
        """
        Parses the crawl log to check the WARCs are present.
        :return:
        """
        # Set up remote connection:
        rf = luigi.contrib.ssh.RemoteFileSystem(self.host)
        warcfiles = set()
        remote_log = luigi.contrib.ssh.RemoteTarget(logs[0], self.host)
        with remote_log.open('r') as f:
            for line in f:
                parts = re.split(" +", line, maxsplit=11)
                # Skip failed downloads:
                if parts[1] == '-' or parts[1] == '' or int(parts[1]) <= 0:
                    if parts[1] == '':
                        logger.info(
                            "Skipping line with empty status! '%s' from log file '%s'"
                            % (line, logs[0]))
                    continue
                # Skip locally-resolved DNS records
                if parts[1] == "1001":
                    logger.debug(
                        "Skipping finding WARC for locally-defined hostname: %s"
                        % parts[3])
                    continue
                # Attempt to parse JSON
                try:
                    (annotations, line_json) = re.split("{",
                                                        parts[11],
                                                        maxsplit=1)
                    line_json = "{%s" % line_json
                    # logger.debug("LOG JSON: %s" % line_json)
                    # logger.debug("LOG ANNOTATIONS: %s" % annotations)
                    jmd = json.loads(line_json)
                except Exception as e:
                    logger.info("LOG LINE: %s" % line)
                    logger.info("LOG LINE part[11]: %s" % parts[11])
                    logger.exception(e)
                    raise e
                if 'warcFilename' in jmd:
                    warcfiles.add(jmd['warcFilename'])
                elif 'warcPrefix' in jmd:
                    for wren in remote_ls(LOCAL_WREN_FOLDER,
                                          "%s*.warc.gz*" % jmd['warcPrefix'],
                                          rf):
                        if wren.endswith('.open'):
                            wren = wren[:-5]
                        warcfiles.add(os.path.basename(wren))
                    # Also check in case file has already been moved into output/warcs/{job}/{launch}:
                    for wren in remote_ls(self.warc_file_path(),
                                          "%s*.warc.gz*" % jmd['warcPrefix'],
                                          rf):
                        warcfiles.add(os.path.basename(wren))
                    # FIXME Also look on HDFS for matching files?
                else:
                    logger.warning("No WARC file entry found for line: %s" %
                                   line)

        warcs = []
        viral = []
        for warcfile in warcfiles:
            if self._file_exists("%s/%s" % (self.viral_file_path(), warcfile),
                                 rf):
                logger.info("Found Viral WARC %s/%s" %
                            (self.viral_file_path(), warcfile))
                viral.append("%s/%s" % (self.viral_file_path(), warcfile))
            elif self._file_exists("%s/%s" % (LOCAL_WREN_FOLDER, warcfile),
                                   rf):
                logger.info("Found WREN WARC %s" % warcfile)
                warcs.append("%s/%s" % (LOCAL_WREN_FOLDER, warcfile))
            elif self._file_exists("%s/%s" % (self.warc_file_path(), warcfile),
                                   rf):
                logger.info("Found WARC %s/%s" %
                            (self.warc_file_path(), warcfile))
                warcs.append("%s/%s" % (self.warc_file_path(), warcfile))
            else:
                raise Exception("Cannot file warc file %s" % warcfile)

        return warcs, viral
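For illustration, a minimal sketch (the log fragment is invented) of how the try block above separates trailing annotations from the JSON extra-info blob in the final crawl-log field:

    import json
    import re

    last_field = 'duplicate:digest,unwritten {"warcFilename":"WEB-20200101-000000.warc.gz"}'
    annotations, line_json = re.split("{", last_field, maxsplit=1)
    jmd = json.loads("{%s" % line_json)
    print(annotations.strip())   # duplicate:digest,unwritten
    print(jmd["warcFilename"])   # WEB-20200101-000000.warc.gz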
Example 11
    def process_frequent_exports(self, d_counts, a_schedules):
        # for each crawl frequency
        for frequency in self.input():
            logger.debug('Reading W3ACT export for ' + frequency)
            freq_export = json.load(self.input()[frequency].open())

            d_counts[frequency] = {key: 0 for key in (
                '.uk', '.scot', '.wales', '.cymru', '.london', 'not_uk',
                'uk_domain', 'uk_geoip', 'uk_postal_address',
                'via_correspondence', 'prof_judgement')}

            # for each frequency with collected data, count URL country codes
            if freq_export is None:
                logger.debug("None returned for " + frequency)
            else:
                for node in freq_export:
                    #logger.info(node)
                    a_schedules.append(frequency)  # This doesn't really make sense I think?
                    for url in [u["url"] for u in node["fieldUrls"]]:
                        netloc = urlparse(url).netloc
                        if netloc.endswith(".uk"):
                            d_counts[frequency]['.uk'] += 1
                            d_counts[frequency]['uk_domain'] += 1
                        elif netloc.endswith(".london"):
                            d_counts[frequency]['.london'] += 1
                            d_counts[frequency]['uk_domain'] += 1
                        elif netloc.endswith(".wales"):
                            d_counts[frequency]['.wales'] += 1
                            d_counts[frequency]['uk_domain'] += 1
                        elif netloc.endswith(".cymru"):
                            d_counts[frequency]['.cymru'] += 1
                            d_counts[frequency]['uk_domain'] += 1
                        elif netloc.endswith(".scot"):
                            d_counts[frequency]['.scot'] += 1
                            d_counts[frequency]['uk_domain'] += 1
                        else:
                            d_counts[frequency]['not_uk'] += 1

                    if node["field_uk_hosting"]:
                        d_counts[frequency]['uk_geoip'] += 1
                    if node["field_uk_postal_address"]:
                        d_counts[frequency]['uk_postal_address'] += 1
                    if node["field_via_correspondence"]:
                        d_counts[frequency]['via_correspondence'] += 1
                    if node["field_professional_judgement"]:
                        d_counts[frequency]['prof_judgement'] += 1

            # log frequency counts
            for subset in sorted(d_counts[frequency]):
                logger.debug("\t" + subset + " = " +
                             str(d_counts[frequency][subset]))

            # accumulate total values
            for subset in d_counts[frequency]:
                d_counts['total'][subset] += d_counts[frequency][subset]

        # log count totals
        for subset in sorted(d_counts['total']):
            logger.info(subset + " = " + str(d_counts['total'][subset]))