Example #1
def scrape(scraper, tree):
    scraper.dataset.title = tree.xpath("//h1/text()")[0].strip()
    scraper.dataset.issued = parse(tree.xpath(
        "//p[contains(concat(' ', @class, ' '), ' date-pub ')]/span[@class='date-display-single']/text()")[0],
                                   dayfirst=True).date()
    scraper.dataset.publisher = GOV['department-of-health-northern-ireland']
    for doc_link in tree.xpath(
            "//div[contains(concat(' ', @class, ' '), ' publicationDocs ')]"
            "//div[contains(concat(' ', @class, ' '), ' nigovfile ')]/a"):
        dist = Distribution(scraper)
        dist.downloadURL = doc_link.get('href')
        dist.title = doc_link.xpath("text()")[0].strip()
        type_size = doc_link.xpath("span[@class='meta']/text()")[0].strip()
        match = type_size_re.match(type_size)
        if match:
            if match.group(1) == 'PDF':
                dist.mediaType = PDF
            else:
                dist.mediaType, _ = mimetypes.guess_type(dist.downloadURL)
            size = float(match.group(2))
            if match.group(3) == 'KB':  # https://en.wikipedia.org/wiki/Kilobyte kB = 1000 while KB = 1024
                dist.byteSize = int(size * 1024)
            elif match.group(3) == 'MB':  # https://en.wikipedia.org/wiki/Megabyte MB = 10^6 bytes
                dist.byteSize = int(size * 1000000)
            elif match.group(3) == 'GB':  # https://en.wikipedia.org/wiki/Gigabyte GB = 10^9 bytes
                dist.byteSize = int(size * 1000000000)
        scraper.distributions.append(dist)
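Note: the KB/MB/GB branching above recurs across these scrapers. A minimal sketch of a shared helper that follows the same conventions (hypothetical, not part of gss-utils):

# Hypothetical helper, not part of gss-utils: convert a size figure plus a unit
# label into bytes, following the same conventions as the branches above
# (KB treated as 1024 bytes; kB, MB and GB as decimal multiples).
_UNIT_MULTIPLIERS = {'KB': 1024, 'kB': 1000, 'MB': 10**6, 'GB': 10**9}

def size_to_bytes(size, unit):
    multiplier = _UNIT_MULTIPLIERS.get(unit)
    if multiplier is None:
        return None
    return int(float(size) * multiplier)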
Example #2
def extract_distributions(distributions, link_tree, scraper):
    div_attach = next(iter(link_tree.xpath("div[@class='attachment-details']")), None)
    if div_attach is not None:
        div_metadata = next(iter(div_attach.xpath("p[@class='metadata']")), None)
        if div_metadata is not None:
            span_type = next(iter(div_metadata.xpath("span[@class='type']")), None)
            if span_type is not None:
                span_size = next(iter(div_metadata.xpath("span[@class='file-size']/text()")), None)
                if span_size is not None:
                    dist = Distribution(scraper)
                    # https://en.wikipedia.org/wiki/Kilobyte kB = 1000 while KB = 1024
                    # https://en.wikipedia.org/wiki/Megabyte MB = 10^6 bytes
                    if span_size.endswith('KB'):
                        dist.byteSize = int(float(span_size[:-2]) * 1024)
                    elif span_size.endswith('kB'):
                        dist.byteSize = int(float(span_size[:-2]) * 1000)
                    elif span_size.endswith('MB'):
                        dist.byteSize = int(float(span_size[:-2]) * 1000000)
                    anchor = next(iter(div_attach.xpath("h2/a")), None)
                    if anchor is not None:
                        url = anchor.get('href')
                        if url is not None:
                            dist.downloadURL = urljoin('https://www.gov.uk/', url)
                        if hasattr(anchor, 'text'):
                            dist.title = anchor.text.strip()
                    dist.mediaType, encoding = mimetypes.guess_type(dist.downloadURL)
                    abbr_type = next(iter(span_type.xpath("abbr/text()")), None)
                    if abbr_type is not None:
                        if abbr_type.upper() == 'PDF':
                            dist.mediaType = PDF
                    distributions.append(dist)
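A minimal usage sketch for extract_distributions; the HTML fragment is invented for illustration and a configured gss-utils scraper is assumed to exist (Example #6 below shows the real call site):

# Sketch only: parse a GOV.UK-style attachment section and collect distributions.
from lxml import html

section = html.fromstring(
    "<section class='attachment'>"
    "  <div class='attachment-details'>"
    "    <h2><a href='/government/uploads/tables.ods'>Example tables</a></h2>"
    "    <p class='metadata'>"
    "      <span class='type'><abbr title='OpenDocument Spreadsheet'>ODS</abbr></span>"
    "      <span class='file-size'>245KB</span>"
    "    </p>"
    "  </div>"
    "</section>")
distributions = []
extract_distributions(distributions, section, scraper)  # `scraper` assumed set up elsewhere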
Example #3
def scrape(scraper, tree):
    # It's not clear whether the pages are collections of datasets or datasets with distributions.
    # Assume the latter for simplicity for now.
    scraper.dataset.publisher = GOV['welsh-government']
    # OGLv3 license is quoted for the whole site on https://gov.wales/copyright-statement
    scraper.dataset.rights = "https://gov.wales/copyright-statement"
    scraper.dataset.license = 'http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/'
    scraper.dataset.title = tree.xpath('//h1//text()')[0].strip()
    scraper.dataset.description = tree.xpath(
        "//div[contains(concat(' ', @class, ' '), ' hero-block__summary ')]/div/p/text()"
    )[0].strip()
    meta = tree.xpath("//div[@class='header-meta']")[0]
    published = meta.xpath(
        "div[contains(concat(' ', @class, ' '), ' first-published ')]/" + \
        "div[contains(concat(' ', @class, ' '), ' item ')]/text()")[0].strip()
    scraper.dataset.issued = parse(published, dayfirst=True)
    updated = meta.xpath(
        "div[contains(concat(' ', @class, ' '), ' last-updated ')]/" + \
        "div[contains(concat(' ', @class, ' '), ' item ')]//time/@datetime")[0].strip()
    scraper.dataset.modified = isoparse(updated)

    @lru_cache()
    def fetch_page(url):
        page = scraper.session.get(url)
        return html.fromstring(page.text)

    for article in tree.xpath("//div[@role='article']"):
        title_div = article.xpath("div[@class = 'index-list__title']")[0]
        meta_div = article.xpath("div[@class = 'index-list__meta']")[0]
        release_page = fetch_page(title_div.xpath('a/@href')[0])
        for details in release_page.xpath(
                "//div[@id = 'release--data']//div[@class = 'document__details']"
        ):
            distribution = Distribution(scraper)
            distribution.downloadURL = details.xpath("h3/a/@href")[0]
            distribution.title = details.xpath("h3/a/div/text()")[0].strip()
            distribution.issued = isoparse(
                details.xpath(
                    "//div[contains(concat(' ', @class, ' '), ' meta__released ')]//time/@datetime"
                )[0])
            distribution.modified = isoparse(
                details.xpath(
                    "//div[contains(concat(' ', @class, ' '), ' meta__update_history ')]//time/@datetime"
                )[0])
            dist_meta = details.xpath("h3/a/span/text()")[0].strip()
            meta_match = FILE_TYPE_AND_SIZE_RE.match(dist_meta)
            if meta_match:
                distribution.mediaType = {'ODS': ODS}.get(meta_match.group(1))
                size_qualifier = meta_match.group(3)
                size = float(meta_match.group(2))
                if size_qualifier == "KB":
                    distribution.byteSize = int(size * 1024)
                elif size_qualifier == "kB":
                    distribution.byteSize = int(size * 1000)
            else:
                distribution.mediaType, _ = mimetypes.guess_type(
                    distribution.downloadURL)
            scraper.distributions.append(distribution)
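FILE_TYPE_AND_SIZE_RE is a module-level constant not shown in this snippet. Purely as an illustration of the group layout the code above relies on (group 1 the file type, group 2 the size figure, group 3 the unit), a pattern along these lines would fit; the real definition may differ:

import re

# Illustrative only; matches strings like "ODS, 1.2 MB" or "CSV 345 KB".
FILE_TYPE_AND_SIZE_RE = re.compile(r'([A-Z]+)\s*,?\s*([0-9]+(?:\.[0-9]+)?)\s*(kB|KB|MB)')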
Example #4
def content_api_publication(scraper, metadata):
    ds = Dataset(scraper.uri)
    if 'title' in metadata:
        ds.title = metadata['title']
    if 'description' in metadata:
        ds.comment = metadata['description']
    if 'details' in metadata:
        # TODO, depends on outcome of https://github.com/GSS-Cogs/gss-utils/issues/308
        ds.description = html2text.html2text(metadata["details"]["body"])
    if 'api_url' in metadata:
        doc_info = scraper.session.get(metadata['api_url']).json()
    else:
        doc_info = metadata
    if 'first_published_at' in doc_info:
        ds.issued = datetime.fromisoformat(doc_info['first_published_at'])
    if 'public_updated_at' in doc_info:
        ds.modified = datetime.fromisoformat(doc_info['public_updated_at'])
    if 'description' in doc_info:
        ds.comment = doc_info['description']
    if 'details' in doc_info:
        # TODO, depends on outcome of https://github.com/GSS-Cogs/gss-utils/issues/308
        ds.description = html2text.html2text(doc_info["details"]["body"])
    if 'links' in doc_info and 'organisations' in doc_info['links']:
        orgs = doc_info['links']['organisations']
        if len(orgs) == 0:
            logging.warning("No publishing organisations listed.")
        elif len(orgs) >= 1:
            if len(orgs) > 1:
                logging.warning(
                    'More than one organisation listed, taking the first.')
            ds.publisher = orgs[0]["web_url"]
    if 'details' in doc_info and 'attachments' in doc_info['details']:
        distributions = []
        for attachment in doc_info['details']['attachments']:
            dist = Distribution(scraper)
            if 'url' in attachment:
                dist.downloadURL = urljoin('https://www.gov.uk/',
                                           attachment['url'])
            if 'title' in attachment:
                dist.title = attachment['title']
            if 'file_size' in attachment:
                dist.byteSize = attachment['file_size']
            if 'content_type' in attachment:
                dist.mediaType = attachment['content_type']
            distributions.append(dist)
        ds.distribution = distributions
    elif 'details' in doc_info and 'documents' in doc_info['details']:
        distributions = []
        for link in doc_info['details']['documents']:
            link_tree = html.fromstring(link)
            extract_distributions(distributions, link_tree, scraper)
        ds.distribution = distributions
    return ds
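A sketch of how content_api_publication might be driven; it assumes a configured gss-utils scraper for a GOV.UK publication page is available as `scraper` (GOV.UK mirrors page content as JSON under /api/content/<path>):

# Sketch only; `scraper` (with its requests session and page uri) is assumed
# to have been set up elsewhere.
api_url = scraper.uri.replace('https://www.gov.uk/', 'https://www.gov.uk/api/content/')
metadata = scraper.session.get(api_url).json()
ds = content_api_publication(scraper, metadata)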
Example #5
File: nisra.py  Project: jwestw/gss-utils
def scrape(scraper, tree):
    scraper.dataset.title = tree.xpath("//h1/text()")[0].strip()
    scraper.dataset.issued = parse(
        tree.xpath(
            "//span[text() = 'Date published: ']/following-sibling::span/text()"
        )[0].strip(),
        dayfirst=True).date()
    scraper.dataset.keyword = ', '.join(
        tree.xpath(
            "//div[text()='Statistics: ']/following-sibling::ul/li/a/text()"))
    scraper.dataset.description = scraper.to_markdown(
        tree.xpath(
            "//div[contains(concat(' ', @class, ' '), ' publicationDetails ')]/div[@class='summary']/div/*"
        ))
    scraper.dataset.publisher = str(
        GOV["northern-ireland-statistics-and-research-agency"])
    for anchor in tree.xpath(
            "//div[contains(concat(' ', @class, ' '), ' publicationDocs ')]/div[@class='summary']/div//a"
    ):
        dist = Distribution(scraper)
        dist.title = anchor.xpath('text()')[0].strip()
        dist.downloadURL = anchor.get('href')
        type_size_re = re.compile(r'(.*?)\s*\(([^)]+)\)')
        m = type_size_re.match(anchor.xpath('span/text()')[0].strip())
        if m:
            if m.group(1) == 'Excel':
                dist.mediaType = Excel
            else:
                dist.mediaType, encoding = mimetypes.guess_type(
                    dist.downloadURL)
            size = m.group(2)
            if size.strip() != '':
                if size.upper().endswith(' KB'):  # https://en.wikipedia.org/wiki/Kilobyte kB = 1000 while KB = 1024
                    dist.byteSize = int(float(size[:-3]) * 1024)
                elif size.upper().endswith(' MB'):  # https://en.wikipedia.org/wiki/Megabyte MB = 10^6 bytes
                    dist.byteSize = int(float(size[:-3]) * 1000000)
        scraper.distributions.append(dist)
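As a worked example of the type/size pattern above, a span such as 'Excel (1.5 MB)' splits into the file type and the bracketed size string:

import re

m = re.compile(r'(.*?)\s*\(([^)]+)\)').match('Excel (1.5 MB)')
assert m.group(1) == 'Excel' and m.group(2) == '1.5 MB'
# '1.5 MB'[:-3] == '1.5', so byteSize becomes int(1.5 * 1000000) == 1500000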
Example #6
def content_api_sds(scraper, metadata):
    # publications are in the details/body HTML
    # they look to be a collection of datasets

    if 'title' in metadata:
        scraper.catalog.title = metadata['title']
    if 'description' in metadata:
        scraper.catalog.description = metadata['description']
    if 'first_published_at' in metadata:
        scraper.catalog.issued = datetime.fromisoformat(metadata['first_published_at'])
    if 'public_updated_at' in metadata:
        scraper.catalog.modified = datetime.fromisoformat(metadata['public_updated_at'])
    if 'links' in metadata and 'organisations' in metadata['links']:
        orgs = metadata['links']['organisations']
        if len(orgs) == 0:
            logging.warning("No publishing organisations listed.")
        elif len(orgs) >= 1:
            if len(orgs) > 1:
                logging.warning('More than one organisation listed, taking the first.')
            scraper.catalog.publisher = orgs[0]["web_url"]
    scraper.catalog.dataset = []
    if 'details' in metadata and 'body' in metadata['details']:
        body_tree = html.fromstring(metadata['details']['body'])
        # look for the same HTML as is used in content_api_publication, but here
        # it is joined into one blob
        sections = body_tree.xpath("//section[contains(concat(' ', @class, ' '), ' attachment ')]")
        if len(sections) > 0:
            ds = Dataset(scraper.uri)
            ds.title = scraper.catalog.title
            ds.description = scraper.catalog.description
            ds.publisher = scraper.catalog.publisher
            ds.issued = scraper.catalog.issued
            ds.modified = scraper.catalog.modified
            email_anchor = next(iter(body_tree.xpath("//a[@class='email']")), None)
            if email_anchor is not None:
                ds.contactPoint = email_anchor.get('href')
            ds.distribution = []
            for link_tree in sections:
                extract_distributions(ds.distribution, link_tree, scraper)
            scraper.catalog.dataset.append(ds)
            scraper.select_dataset(latest=True)
        else:
            for heading in body_tree.xpath("//h2[following-sibling::p/descendant::span[@class='attachment-inline']]"):
                id = heading.get('id')
                ds = Dataset(scraper.uri)
                ds.title = heading.text
                ds.description = scraper.catalog.description
                ds.publisher = scraper.catalog.publisher
                ds.issued = scraper.catalog.issued
                ds.modified = scraper.catalog.modified
                email_anchor = next(iter(body_tree.xpath("//a[@class='email']")), None)
                if email_anchor is not None:
                    ds.contactPoint = email_anchor.get('href')
                ds.distribution = []
                for attachment in body_tree.xpath(f"//h2[@id='{id}']/" + \
                                                  f"following-sibling::p[preceding-sibling::h2[1][@id='{id}']]/" + \
                                                  "span[@class='attachment-inline']"):
                    dist = Distribution(scraper)
                    dist.title = next(iter(attachment.xpath("a/text()")), None)
                    dist.downloadURL = next(iter(attachment.xpath("a/@href")), None)
                    dist.mediaType, _ = mimetypes.guess_type(dist.downloadURL)
                    abbr = next(iter(attachment.xpath("descendant::abbr/text()")), None)
                    if abbr is not None:
                        if abbr.upper() == 'PDF':
                            dist.mediaType = PDF
                        elif abbr.upper() == 'ODS':
                            dist.mediaType = ODS
                    size = next(iter(attachment.xpath("descendant::span[@class='file-size']/text()")), None)
                    if size is not None:
                        if size.endswith('KB'):
                            dist.byteSize = int(float(size[:-2]) * 1024)
                        elif size.endswith('kB'):
                            dist.byteSize = int(float(size[:-2]) * 1000)
                        elif size.endswith('MB'):
                            dist.byteSize = int(float(size[:-2]) * 1000000)
                    ds.distribution.append(dist)
                scraper.catalog.dataset.append(ds)
Example #7
def scrape(scraper, tree):
    page_type = tree.xpath(
        "//span[contains(concat(' ', @class, ' '), ' article-header__label ')]/text()"
    )[0]

    if page_type.strip() == 'Series / Collection':
        scraper.catalog.title = tree.xpath("//h1/text()")[0]
        scraper.catalog.uri = scraper.uri + '#catalog'
        scraper.catalog.publisher = GOV['nhs-digital']
        scraper.catalog.license = 'http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/'
        scraper.catalog.rights = 'https://digital.nhs.uk/about-nhs-digital/terms-and-conditions'
        scraper.catalog.comment = ' '.join(
            tree.xpath(
                "//div[@id='section-summary']/div[@itemprop='description']/*/text()"
            ))
        scraper.catalog.dataset = []

        articles = tree.xpath("//article[@class='cta']")
        for article in articles:
            dataset = Dataset(scraper.uri)
            dataset.distribution = []
            dataset.publisher = scraper.catalog.publisher
            dataset.license = scraper.catalog.license
            article_link = article.xpath('descendant::a')[0]
            dataset.title = article_link.get('title')
            href = article_link.get('href')
            dataset.landingPage = urljoin(scraper.uri, href)
            article_tree = html.fromstring(
                scraper.session.get(dataset.landingPage).text)
            article_type = article_tree.xpath(
                "//span[contains(concat(' ', @class, ' '), ' article-header__label ')]/text()"
            )[0]

            assert article_type.startswith(
                'Publication'), 'Expecting publication'

            details_node = article_tree.xpath("//dl[@class='detail-list']")[0]
            details = {}

            for node in details_node:
                if node.tag == 'dt' and node.get('class') == 'detail-list__key':
                    key = node.text.strip().lower()
                    if key.endswith(':'):
                        key = key[:-1].strip()
                elif node.tag == 'dd' and node.get('class') == 'detail-list__value':
                    value = node.text.strip()
                    if key not in details:
                        details[key] = [value]
                    else:
                        details[key].append(value)
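
            # At this point `details` maps lower-cased <dt> labels (trailing colons
            # removed) to lists of <dd> values, e.g. {'publication date': ['1 May 2020']}
            # (example value invented for illustration).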

            if 'publication date' in details:
                dataset.issued = parse(details['publication date'][0],
                                       dayfirst=True)

            # Todo: spatiotemporal coverage and resolution/granularity
            # Todo: national statistics / official statistics badges

            resources = article_tree.xpath(
                "//ul[@data-uipath='ps.publication.resources-attachments']/li/a"
            )

            for link in resources:
                dist = Distribution(scraper)
                dist.title = link.get('title')

                if hasattr(dataset, 'issued'):
                    dist.issued = dataset.issued

                dist.downloadURL = urljoin(dataset.landingPage,
                                           link.get('href'))
                file_data = link.xpath("div[@class='block-link__body']")[0]
                dist.mediaType = str(file_data.xpath("meta/@content")[0])
                size = file_data.xpath(
                    "span/span[@class='fileSize']/span[@itemprop='contentSize']/text()"
                )[0]
                size_match = re.match(r'([0-9]+(\.[0-9]*)?)\s*(kB|MB|GB)',
                                      size)

                if size_match and size_match.group(3) == 'kB':
                    # https://en.wikipedia.org/wiki/Kilobyte kB = 1000 while KB = 1024
                    dist.byteSize = int(float(size_match.group(1)) * 1000)
                elif size_match and size_match.group(3) == 'MB':
                    # https://en.wikipedia.org/wiki/Megabyte MB = 10^6 bytes
                    dist.byteSize = int(float(size_match.group(1)) * 1000000)
                elif size_match and size_match.group(3) == 'GB':
                    # https://en.wikipedia.org/wiki/Gigabyte GB = 10^9 bytes, GiB = 2^30 bytes
                    dist.byteSize = int(
                        float(size_match.group(1)) * 1000000000)
                dataset.distribution.append(dist)

            scraper.catalog.dataset.append(dataset)
Example #8
def scrape(scraper, tree):
    size_re = re.compile(r'\[([0-9]+)(kb|Mb)\]')
    scraper.catalog.title = tree.xpath('//h2/text()')[0].strip()
    scraper.catalog.uri = scraper.uri + "#catalog"
    scraper.catalog.rights = 'http://www.isdscotland.org/Copyright.asp'
    scraper.catalog.publisher = GOV['information-services-division-scotland']
    title2dataset = {}

    @lru_cache()
    def fetch_page(url):
        page = scraper.session.get(url)
        return html.fromstring(page.text)

    for record in tree.xpath(
            "//div[contains(concat(' ', @class, ' '), ' pubtitlel ')]"):
        dataset_title = record.text.strip()
        if dataset_title not in title2dataset:
            dataset = Dataset(scraper.uri)
            dataset.title = dataset_title
            dataset.publisher = scraper.catalog.publisher
            dataset.rights = scraper.catalog.rights
            dataset.distribution = []
            title2dataset[dataset_title] = dataset
        else:
            dataset = title2dataset[dataset_title]

        datatables_urls = record.xpath(
            "following-sibling::table/descendant::tr[td["
            "contains(text(), 'Data Tables')]]/td["
            "contains(concat(' ', @class, ' '), 'pubcontentr')]/a/@href")
        if len(datatables_urls) == 0:
            continue
        doc_url, frag = urldefrag(urljoin(scraper.uri, datatables_urls[0]))
        # pages appear to have a redundant query parameter duplicating the fragment id
        doc_url_bits = urlparse(doc_url)
        if doc_url_bits.query is not None and doc_url_bits.query == f'id={frag}':
            doc_url = doc_url_bits._replace(query=None).geturl()
        doc_tree = fetch_page(doc_url)
        anchors = doc_tree.xpath(f"//a[@id='{frag}' or @name='{frag}']")
        if len(anchors) == 0:
            logging.warning(f"Broken link to dataset {datatables_urls[0]}")
            continue

        # publication date is in paragraph before!
        # this is actually the issued date of the distribution
        published = anchors[0].xpath(
            "../preceding-sibling::p[1]/child::*/text()")
        dist_issued = None
        if len(published) > 0 and published[0].startswith('Published '):
            dist_issued = parse(published[0][len('Published '):],
                                dayfirst=True)
            # we'll use the latest publication date for the dataset
            if not (hasattr(dataset, 'issued')
                    and dist_issued <= dataset.issued):
                dataset.issued = dist_issued
        dist_rows = anchors[0].xpath(
            "../following-sibling::table[1]/descendant::tr")
        for row in dist_rows:
            distribution = Distribution(scraper)
            cells = row.xpath('td')
            if len(cells) == 4:
                title_node, download_node, type_node, size_node = cells
            elif len(cells) == 3:
                title_node, download_node, type_node = cells
                size_node = None
            else:
                break
            distribution.title = title_node.text
            if dist_issued is not None:
                distribution.issued = dist_issued
            distribution.downloadURL = download_node[0].get('href')
            type_image = type_node[0].get('src').lower()
            if 'excel' in type_image:
                distribution.mediaType = Excel
            elif 'swf' in type_image:
                distribution.mediaType = 'application/vnd.adobe.flash.movie'
            else:
                distribution.mediaType, encoding = mimetypes.guess_type(
                    distribution.downloadURL)
            if size_node is not None and size_node.text is not None:
                size_match = size_re.match(size_node.text)
                if size_match:
                    if size_match.group(2) == 'Mb':  # should be MB; https://en.wikipedia.org/wiki/Megabyte MB = 10^6 bytes
                        distribution.byteSize = int(size_match.group(1)) * 1000000
                    elif size_match.group(2) == 'kb':  # should be kB or KB; https://en.wikipedia.org/wiki/Kilobyte kB = 1000 while KB = 1024
                        distribution.byteSize = int(size_match.group(1)) * 1000
            dataset.distribution.append(distribution)

    scraper.catalog.dataset = list(title2dataset.values())
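To see what the fragment handling above does, here is a small illustration with an invented URL:

from urllib.parse import urldefrag, urlparse

# Invented URL for illustration: the fragment names an anchor on the page and
# the query string merely repeats it, so the scraper strips the query before
# fetching the page and then looks for the anchor with that id/name.
doc_url, frag = urldefrag('https://example.org/pubs/data-tables.asp?id=1234#1234')
assert (doc_url, frag) == ('https://example.org/pubs/data-tables.asp?id=1234', '1234')
doc_url_bits = urlparse(doc_url)
if doc_url_bits.query == f'id={frag}':
    doc_url = doc_url_bits._replace(query=None).geturl()
assert doc_url == 'https://example.org/pubs/data-tables.asp'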