def scrape(scraper, tree):
    scraper.dataset.title = tree.xpath("//h1/text()")[0].strip()
    scraper.dataset.issued = parse(tree.xpath(
        "//p[contains(concat(' ', @class, ' '), ' date-pub ')]"
        "/span[@class='date-display-single']/text()")[0], dayfirst=True).date()
    scraper.dataset.publisher = GOV['department-of-health-northern-ireland']
    for doc_link in tree.xpath(
            "//div[contains(concat(' ', @class, ' '), ' publicationDocs ')]"
            "//div[contains(concat(' ', @class, ' '), ' nigovfile ')]/a"):
        dist = Distribution(scraper)
        dist.downloadURL = doc_link.get('href')
        dist.title = doc_link.xpath("text()")[0].strip()
        type_size = doc_link.xpath("span[@class='meta']/text()")[0].strip()
        match = type_size_re.match(type_size)
        if match:
            if match.group(1) == 'PDF':
                dist.mediaType = PDF
            else:
                dist.mediaType, _ = mimetypes.guess_type(dist.downloadURL)
            size = float(match.group(2))
            if match.group(3) == 'KB':
                # https://en.wikipedia.org/wiki/Kilobyte kB = 1000 while KB = 1024
                dist.byteSize = int(size * 1024)
            elif match.group(3) == 'MB':
                # https://en.wikipedia.org/wiki/Megabyte MB = 10^6 bytes
                dist.byteSize = int(size * 1000000)
            elif match.group(3) == 'GB':
                # https://en.wikipedia.org/wiki/Gigabyte GB = 10^9 bytes
                dist.byteSize = int(size * 1000000000)
        scraper.distributions.append(dist)
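
# `type_size_re` is defined elsewhere in this module; the branches above rely
# on group 1 = file type, group 2 = size and group 3 = unit. A hypothetical
# pattern with that group layout, matching metadata such as 'PDF (1.2 MB)',
# shown for illustration only:
_example_type_size_re = re.compile(r'(\w+)\s*\(([0-9.]+)\s*(KB|MB|GB)\)')
# _example_type_size_re.match('PDF (1.2 MB)').groups() == ('PDF', '1.2', 'MB')
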
def extract_distributions(distributions, link_tree, scraper):
    div_attach = next(iter(link_tree.xpath("div[@class='attachment-details']")), None)
    if div_attach is not None:
        div_metadata = next(iter(div_attach.xpath("p[@class='metadata']")), None)
        if div_metadata is not None:
            span_type = next(iter(div_metadata.xpath("span[@class='type']")), None)
            if span_type is not None:
                span_size = next(iter(div_metadata.xpath("span[@class='file-size']/text()")), None)
                if span_size is not None:
                    dist = Distribution(scraper)
                    # https://en.wikipedia.org/wiki/Kilobyte kB = 1000 while KB = 1024
                    # https://en.wikipedia.org/wiki/Megabyte MB = 10^6 bytes
                    if span_size.endswith('KB'):
                        dist.byteSize = int(float(span_size[:-2]) * 1024)
                    elif span_size.endswith('kB'):
                        dist.byteSize = int(float(span_size[:-2]) * 1000)
                    elif span_size.endswith('MB'):
                        dist.byteSize = int(float(span_size[:-2]) * 1000000)
                    anchor = next(iter(div_attach.xpath("h2/a")), None)
                    if anchor is not None:
                        url = anchor.get('href')
                        if url is not None:
                            dist.downloadURL = urljoin('https://www.gov.uk/', url)
                        if hasattr(anchor, 'text'):
                            dist.title = anchor.text.strip()
                    dist.mediaType, encoding = mimetypes.guess_type(dist.downloadURL)
                    abbr_type = next(iter(span_type.xpath("abbr/text()")), None)
                    if abbr_type is not None and abbr_type.upper() == 'PDF':
                        dist.mediaType = PDF
                    distributions.append(dist)
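
# A minimal usage sketch: run one GOV.UK attachment fragment through
# extract_distributions. The markup is hypothetical but mirrors the XPaths
# above; a real `scraper` instance is still required.
def _example_extract(scraper):
    fragment = html.fromstring(
        "<section class='attachment'>"
        "<div class='attachment-details'>"
        "<h2><a href='/government/uploads/example.pdf'>Example doc</a></h2>"
        "<p class='metadata'><span class='type'><abbr>PDF</abbr></span> "
        "<span class='file-size'>1.2MB</span></p>"
        "</div></section>")
    distributions = []
    extract_distributions(distributions, fragment, scraper)
    return distributions  # one Distribution with title, URL, size and type set
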
def scrape(scraper, tree):
    # It's not clear whether the pages are collections of datasets or datasets
    # with distributions. Assume the latter for simplicity for now.
    scraper.dataset.publisher = GOV['welsh-government']
    # The OGLv3 licence is quoted for the whole site on https://gov.wales/copyright-statement
    scraper.dataset.rights = "https://gov.wales/copyright-statement"
    scraper.dataset.license = 'http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/'
    scraper.dataset.title = tree.xpath('//h1//text()')[0].strip()
    scraper.dataset.description = tree.xpath(
        "//div[contains(concat(' ', @class, ' '), ' hero-block__summary ')]/div/p/text()"
    )[0].strip()
    meta = tree.xpath("//div[@class='header-meta']")[0]
    published = meta.xpath(
        "div[contains(concat(' ', @class, ' '), ' first-published ')]/"
        "div[contains(concat(' ', @class, ' '), ' item ')]/text()")[0].strip()
    scraper.dataset.issued = parse(published, dayfirst=True)
    updated = meta.xpath(
        "div[contains(concat(' ', @class, ' '), ' last-updated ')]/"
        "div[contains(concat(' ', @class, ' '), ' item ')]//time/@datetime")[0].strip()
    scraper.dataset.modified = isoparse(updated)

    @lru_cache()
    def fetch_page(url):
        page = scraper.session.get(url)
        return html.fromstring(page.text)

    for article in tree.xpath("//div[@role='article']"):
        title_div = article.xpath("div[@class = 'index-list__title']")[0]
        meta_div = article.xpath("div[@class = 'index-list__meta']")[0]
        release_page = fetch_page(title_div.xpath('a/@href')[0])
        for details in release_page.xpath(
                "//div[@id = 'release--data']//div[@class = 'document__details']"):
            distribution = Distribution(scraper)
            distribution.downloadURL = details.xpath("h3/a/@href")[0]
            distribution.title = details.xpath("h3/a/div/text()")[0].strip()
            # Search relative to this document's details block; a bare '//'
            # here would search the whole page and always return the dates of
            # the first document.
            distribution.issued = isoparse(details.xpath(
                ".//div[contains(concat(' ', @class, ' '), ' meta__released ')]//time/@datetime"
            )[0])
            distribution.modified = isoparse(details.xpath(
                ".//div[contains(concat(' ', @class, ' '), ' meta__update_history ')]//time/@datetime"
            )[0])
            dist_meta = details.xpath("h3/a/span/text()")[0].strip()
            meta_match = FILE_TYPE_AND_SIZE_RE.match(dist_meta)
            if meta_match:
                distribution.mediaType = {'ODS': ODS}.get(meta_match.group(1))
                size_qualifier = meta_match.group(3)
                size = float(meta_match.group(2))
                if size_qualifier == "KB":
                    distribution.byteSize = int(size * 1024)
                elif size_qualifier == "kB":
                    distribution.byteSize = int(size * 1000)
            else:
                distribution.mediaType, _ = mimetypes.guess_type(
                    distribution.downloadURL)
            scraper.distributions.append(distribution)
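
# fetch_page above is wrapped in lru_cache inside scrape(), so each release
# page is fetched once per scrape even when several index articles link to the
# same page; the cache (and the captured session) are discarded when scrape()
# returns. A minimal standalone sketch of the same pattern, with a stubbed
# session call:
def _example_cached_fetch():
    from functools import lru_cache
    calls = []

    @lru_cache()
    def fetch(url):
        calls.append(url)  # stand-in for scraper.session.get(url)
        return '<html>' + url + '</html>'

    fetch('https://gov.wales/example')
    fetch('https://gov.wales/example')  # second call is served from the cache
    return len(calls)  # == 1
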
def content_api_publication(scraper, metadata):
    ds = Dataset(scraper.uri)
    if 'title' in metadata:
        ds.title = metadata['title']
    if 'description' in metadata:
        ds.comment = metadata['description']
    if 'details' in metadata:
        # TODO, depends on outcome of https://github.com/GSS-Cogs/gss-utils/issues/308
        ds.description = html2text.html2text(metadata["details"]["body"])
    if 'api_url' in metadata:
        doc_info = scraper.session.get(metadata['api_url']).json()
    else:
        doc_info = metadata
    if 'first_published_at' in doc_info:
        ds.issued = datetime.fromisoformat(doc_info['first_published_at'])
    if 'public_updated_at' in doc_info:
        ds.modified = datetime.fromisoformat(doc_info['public_updated_at'])
    if 'description' in doc_info:
        ds.comment = doc_info['description']
    if 'details' in doc_info and 'body' in doc_info['details']:
        # TODO, depends on outcome of https://github.com/GSS-Cogs/gss-utils/issues/308
        ds.description = html2text.html2text(doc_info["details"]["body"])
    if 'links' in doc_info and 'organisations' in doc_info['links']:
        orgs = doc_info['links']['organisations']
        if len(orgs) == 0:
            logging.warning("No publishing organisations listed.")
        else:
            if len(orgs) > 1:
                logging.warning('More than one organisation listed, taking the first.')
            ds.publisher = orgs[0]["web_url"]
    if 'details' in doc_info and 'attachments' in doc_info['details']:
        distributions = []
        for attachment in doc_info['details']['attachments']:
            dist = Distribution(scraper)
            if 'url' in attachment:
                dist.downloadURL = urljoin('https://www.gov.uk/', attachment['url'])
            if 'title' in attachment:
                dist.title = attachment['title']
            if 'file_size' in attachment:
                dist.byteSize = attachment['file_size']
            if 'content_type' in attachment:
                dist.mediaType = attachment['content_type']
            distributions.append(dist)
        ds.distribution = distributions
    elif 'details' in doc_info and 'documents' in doc_info['details']:
        distributions = []
        for link in doc_info['details']['documents']:
            link_tree = html.fromstring(link)
            extract_distributions(distributions, link_tree, scraper)
        ds.distribution = distributions
    return ds
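
# A minimal sketch of the GOV.UK Content API document shape consumed above
# (field names as read by this function; the values are hypothetical):
_example_publication_metadata = {
    'title': 'Example statistics release',
    'description': 'A short summary.',
    'first_published_at': '2021-03-01T09:30:00+00:00',
    'public_updated_at': '2021-04-01T09:30:00+00:00',
    'links': {'organisations': [
        {'web_url': 'https://www.gov.uk/government/organisations/example'}]},
    'details': {
        'body': '<p>Full details of the release.</p>',
        'attachments': [{
            'url': '/government/uploads/example.ods',
            'title': 'Example tables',
            'file_size': 71234,
            'content_type': 'application/vnd.oasis.opendocument.spreadsheet'}]},
}
# ds = content_api_publication(scraper, _example_publication_metadata)
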
def scrape(scraper, tree):
    scraper.dataset.title = tree.xpath("//h1/text()")[0].strip()
    # dayfirst=True is an argument to dateutil's parse(), not to xpath();
    # it makes ambiguous day/month dates parse UK-style.
    scraper.dataset.issued = parse(tree.xpath(
        "//span[text() = 'Date published: ']/following-sibling::span/text()"
    )[0].strip(), dayfirst=True).date()
    scraper.dataset.keyword = ', '.join(tree.xpath(
        "//div[text()='Statistics: ']/following-sibling::ul/li/a/text()"))
    scraper.dataset.description = scraper.to_markdown(tree.xpath(
        "//div[contains(concat(' ', @class, ' '), ' publicationDetails ')]"
        "/div[@class='summary']/div/*"))
    scraper.dataset.publisher = str(
        GOV["northern-ireland-statistics-and-research-agency"])
    type_size_re = re.compile(r'(.*?)\s*\(([^)]+)\)')
    for anchor in tree.xpath(
            "//div[contains(concat(' ', @class, ' '), ' publicationDocs ')]"
            "/div[@class='summary']/div//a"):
        dist = Distribution(scraper)
        dist.title = anchor.xpath('text()')[0].strip()
        dist.downloadURL = anchor.get('href')
        m = type_size_re.match(anchor.xpath('span/text()')[0].strip())
        if m:
            if m.group(1) == 'Excel':
                dist.mediaType = Excel
            else:
                dist.mediaType, encoding = mimetypes.guess_type(dist.downloadURL)
            size = m.group(2)
            if size.strip() != '':
                if size.upper().endswith(' KB'):
                    # https://en.wikipedia.org/wiki/Kilobyte kB = 1000 while KB = 1024
                    dist.byteSize = int(float(size[:-3]) * 1024)
                elif size.upper().endswith(' MB'):
                    # https://en.wikipedia.org/wiki/Megabyte MB = 10^6 bytes
                    dist.byteSize = int(float(size[:-3]) * 1000000)
        scraper.distributions.append(dist)

def content_api_sds(scraper, metadata):
    # Publications are in the details/body HTML; they look to be a collection
    # of datasets.
    if 'title' in metadata:
        scraper.catalog.title = metadata['title']
    if 'description' in metadata:
        scraper.catalog.description = metadata['description']
    if 'first_published_at' in metadata:
        scraper.catalog.issued = datetime.fromisoformat(metadata['first_published_at'])
    if 'public_updated_at' in metadata:
        scraper.catalog.modified = datetime.fromisoformat(metadata['public_updated_at'])
    if 'links' in metadata and 'organisations' in metadata['links']:
        orgs = metadata['links']['organisations']
        if len(orgs) == 0:
            logging.warning("No publishing organisations listed.")
        else:
            if len(orgs) > 1:
                logging.warning('More than one organisation listed, taking the first.')
            scraper.catalog.publisher = orgs[0]["web_url"]
    scraper.catalog.dataset = []
    if 'details' in metadata and 'body' in metadata['details']:
        body_tree = html.fromstring(metadata['details']['body'])
        # Look for the same HTML as is used in content_api_publication, here
        # joined into one blob.
        sections = body_tree.xpath(
            "//section[contains(concat(' ', @class, ' '), ' attachment ')]")
        if len(sections) > 0:
            ds = Dataset(scraper.uri)
            ds.title = scraper.catalog.title
            ds.description = scraper.catalog.description
            ds.publisher = scraper.catalog.publisher
            ds.issued = scraper.catalog.issued
            ds.modified = scraper.catalog.modified
            email_anchor = next(iter(body_tree.xpath("//a[@class='email']")), None)
            if email_anchor is not None:
                ds.contactPoint = email_anchor.get('href')
            ds.distribution = []
            for link_tree in sections:
                extract_distributions(ds.distribution, link_tree, scraper)
            scraper.catalog.dataset.append(ds)
            scraper.select_dataset(latest=True)
        else:
            for heading in body_tree.xpath(
                    "//h2[following-sibling::p/descendant::span[@class='attachment-inline']]"):
                heading_id = heading.get('id')
                ds = Dataset(scraper.uri)
                ds.title = heading.text
                ds.description = scraper.catalog.description
                ds.publisher = scraper.catalog.publisher
                ds.issued = scraper.catalog.issued
                ds.modified = scraper.catalog.modified
                email_anchor = next(iter(body_tree.xpath("//a[@class='email']")), None)
                if email_anchor is not None:
                    ds.contactPoint = email_anchor.get('href')
                ds.distribution = []
                for attachment in body_tree.xpath(
                        f"//h2[@id='{heading_id}']/"
                        f"following-sibling::p[preceding-sibling::h2[1][@id='{heading_id}']]/"
                        "span[@class='attachment-inline']"):
                    dist = Distribution(scraper)
                    dist.title = next(iter(attachment.xpath("a/text()")), None)
                    dist.downloadURL = next(iter(attachment.xpath("a/@href")), None)
                    dist.mediaType, _ = mimetypes.guess_type(dist.downloadURL)
                    abbr = next(iter(attachment.xpath("descendant::abbr/text()")), None)
                    if abbr is not None:
                        if abbr.upper() == 'PDF':
                            dist.mediaType = PDF
                        elif abbr.upper() == 'ODS':
                            dist.mediaType = ODS
                    size = next(iter(attachment.xpath(
                        "descendant::span[@class='file-size']/text()")), None)
                    if size is not None:
                        # https://en.wikipedia.org/wiki/Kilobyte kB = 1000 while KB = 1024
                        # https://en.wikipedia.org/wiki/Megabyte MB = 10^6 bytes
                        if size.endswith('KB'):
                            dist.byteSize = int(float(size[:-2]) * 1024)
                        elif size.endswith('kB'):
                            dist.byteSize = int(float(size[:-2]) * 1000)
                        elif size.endswith('MB'):
                            dist.byteSize = int(float(size[:-2]) * 1000000)
                    ds.distribution.append(dist)
                scraper.catalog.dataset.append(ds)
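
# The attachment-inline branch above groups each <p> with its nearest
# preceding <h2>: "following-sibling::p[preceding-sibling::h2[1][@id='...']]"
# keeps only the paragraphs between that heading and the next one. A minimal
# sketch of the same expression over hypothetical markup:
def _example_heading_groups():
    body = html.fromstring(
        "<div><h2 id='a'>A</h2><p>a1</p><p>a2</p>"
        "<h2 id='b'>B</h2><p>b1</p></div>")
    return [p.text for p in body.xpath(
        "//h2[@id='a']/following-sibling::p[preceding-sibling::h2[1][@id='a']]")]
    # == ['a1', 'a2']; 'b1' is excluded as its nearest preceding h2 is 'b'
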
def scrape(scraper, tree):
    page_type = tree.xpath(
        "//span[contains(concat(' ', @class, ' '), ' article-header__label ')]/text()")[0]
    if page_type.strip() == 'Series / Collection':
        scraper.catalog.title = tree.xpath("//h1/text()")[0]
        scraper.catalog.uri = scraper.uri + '#catalog'
        scraper.catalog.publisher = GOV['nhs-digital']
        scraper.catalog.license = 'http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/'
        scraper.catalog.rights = 'https://digital.nhs.uk/about-nhs-digital/terms-and-conditions'
        scraper.catalog.comment = ' '.join(tree.xpath(
            "//div[@id='section-summary']/div[@itemprop='description']/*/text()"))
        scraper.catalog.dataset = []
        articles = tree.xpath("//article[@class='cta']")
        for article in articles:
            dataset = Dataset(scraper.uri)
            dataset.distribution = []
            dataset.publisher = scraper.catalog.publisher
            dataset.license = scraper.catalog.license
            article_link = article.xpath('descendant::a')[0]
            dataset.title = article_link.get('title')
            href = article_link.get('href')
            dataset.landingPage = urljoin(scraper.uri, href)
            article_tree = html.fromstring(
                scraper.session.get(dataset.landingPage).text)
            article_type = article_tree.xpath(
                "//span[contains(concat(' ', @class, ' '), ' article-header__label ')]/text()")[0]
            assert article_type.startswith('Publication'), 'Expecting publication'
            details_node = article_tree.xpath("//dl[@class='detail-list']")[0]
            details = {}
            for node in details_node:
                if node.tag == 'dt' and node.get('class') == 'detail-list__key':
                    key = node.text.strip().lower()
                    if key.endswith(':'):
                        key = key[:-1].strip()
                elif node.tag == 'dd' and node.get('class') == 'detail-list__value':
                    value = node.text.strip()
                    if key not in details:
                        details[key] = [value]
                    else:
                        details[key].append(value)
            if 'publication date' in details:
                dataset.issued = parse(details['publication date'][0], dayfirst=True)
            # TODO: spatiotemporal coverage and resolution/granularity
            # TODO: national statistics / official statistics badges
            resources = article_tree.xpath(
                "//ul[@data-uipath='ps.publication.resources-attachments']/li/a")
            for link in resources:
                dist = Distribution(scraper)
                dist.title = link.get('title')
                if hasattr(dataset, 'issued'):
                    dist.issued = dataset.issued
                dist.downloadURL = urljoin(dataset.landingPage, link.get('href'))
                file_data = link.xpath("div[@class='block-link__body']")[0]
                dist.mediaType = str(file_data.xpath("meta/@content")[0])
                size = file_data.xpath(
                    "span/span[@class='fileSize']/span[@itemprop='contentSize']/text()")[0]
                size_match = re.match(r'([0-9]+(\.[0-9]*)?)\s*(kB|MB|GB)', size)
                if size_match and size_match.group(3) == 'kB':
                    # https://en.wikipedia.org/wiki/Kilobyte kB = 1000 while KB = 1024
                    dist.byteSize = int(float(size_match.group(1)) * 1000)
                elif size_match and size_match.group(3) == 'MB':
                    # https://en.wikipedia.org/wiki/Megabyte MB = 10^6 bytes
                    dist.byteSize = int(float(size_match.group(1)) * 1000000)
                elif size_match and size_match.group(3) == 'GB':
                    # https://en.wikipedia.org/wiki/Gigabyte GB = 10^9 bytes, GiB = 2^30 bytes
                    dist.byteSize = int(float(size_match.group(1)) * 1000000000)
                dataset.distribution.append(dist)
            scraper.catalog.dataset.append(dataset)
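
# The dt/dd walk above assumes each 'detail-list__key' <dt> is followed by one
# or more 'detail-list__value' <dd> elements. The same pairing over a minimal,
# hypothetical fragment:
def _example_detail_list():
    details_node = html.fromstring(
        "<dl class='detail-list'>"
        "<dt class='detail-list__key'>Publication date:</dt>"
        "<dd class='detail-list__value'>28 June 2018</dd>"
        "</dl>")
    details = {}
    key = None
    for node in details_node:
        if node.tag == 'dt' and node.get('class') == 'detail-list__key':
            key = node.text.strip().lower().rstrip(':').strip()
        elif node.tag == 'dd' and node.get('class') == 'detail-list__value' and key:
            details.setdefault(key, []).append(node.text.strip())
    return details  # == {'publication date': ['28 June 2018']}
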
def scrape(scraper, tree):
    size_re = re.compile(r'\[([0-9]+)(kb|Mb)\]')
    scraper.catalog.title = tree.xpath('//h2/text()')[0].strip()
    scraper.catalog.uri = scraper.uri + "#catalog"
    scraper.catalog.rights = 'http://www.isdscotland.org/Copyright.asp'
    scraper.catalog.publisher = GOV['information-services-division-scotland']
    title2dataset = {}

    @lru_cache()
    def fetch_page(url):
        page = scraper.session.get(url)
        return html.fromstring(page.text)

    for record in tree.xpath(
            "//div[contains(concat(' ', @class, ' '), ' pubtitlel ')]"):
        dataset_title = record.text.strip()
        if dataset_title not in title2dataset:
            dataset = Dataset(scraper.uri)
            dataset.title = dataset_title
            dataset.publisher = scraper.catalog.publisher
            dataset.rights = scraper.catalog.rights
            dataset.distribution = []
            title2dataset[dataset_title] = dataset
        else:
            dataset = title2dataset[dataset_title]
        datatables_urls = record.xpath(
            "following-sibling::table/descendant::tr[td["
            "contains(text(), 'Data Tables')]]/td["
            "contains(concat(' ', @class, ' '), 'pubcontentr')]/a/@href")
        if len(datatables_urls) == 0:
            continue
        doc_url, frag = urldefrag(urljoin(scraper.uri, datatables_urls[0]))
        # Pages appear to have a redundant query parameter, the same as the
        # fragment id.
        doc_url_bits = urlparse(doc_url)
        if doc_url_bits.query is not None and doc_url_bits.query == f'id={frag}':
            doc_url = doc_url_bits._replace(query=None).geturl()
        doc_tree = fetch_page(doc_url)
        anchors = doc_tree.xpath(f"//a[@id='{frag}' or @name='{frag}']")
        if len(anchors) == 0:
            logging.warning(f"Broken link to dataset {datatables_urls[0]}")
            continue
        # The publication date is in the preceding paragraph; it is actually
        # the issued date of the distribution.
        published = anchors[0].xpath(
            "../preceding-sibling::p[1]/child::*/text()")
        dist_issued = None
        if len(published) > 0 and published[0].startswith('Published '):
            dist_issued = parse(published[0][len('Published '):], dayfirst=True)
            # Use the latest publication date for the dataset.
            if not (hasattr(dataset, 'issued') and dist_issued <= dataset.issued):
                dataset.issued = dist_issued
        dist_rows = anchors[0].xpath(
            "../following-sibling::table[1]/descendant::tr")
        for row in dist_rows:
            distribution = Distribution(scraper)
            cells = row.xpath('td')
            if len(cells) == 4:
                title_node, download_node, type_node, size_node = cells
            elif len(cells) == 3:
                title_node, download_node, type_node = cells
                size_node = None
            else:
                break
            distribution.title = title_node.text
            if dist_issued is not None:
                distribution.issued = dist_issued
            distribution.downloadURL = download_node[0].get('href')
            type_image = type_node[0].get('src').lower()
            if 'excel' in type_image:
                distribution.mediaType = Excel
            elif 'swf' in type_image:
                distribution.mediaType = 'application/vnd.adobe.flash.movie'
            else:
                distribution.mediaType, encoding = mimetypes.guess_type(
                    distribution.downloadURL)
            if size_node is not None and size_node.text is not None:
                size_match = size_re.match(size_node.text)
                if size_match:
                    if size_match.group(2) == 'Mb':
                        # Should be 'MB': https://en.wikipedia.org/wiki/Megabyte MB = 10^6 bytes
                        distribution.byteSize = int(size_match.group(1)) * 1000000
                    elif size_match.group(2) == 'kb':
                        # Should be 'kB' or 'KB': https://en.wikipedia.org/wiki/Kilobyte kB = 1000 while KB = 1024
                        distribution.byteSize = int(size_match.group(1)) * 1000
            dataset.distribution.append(distribution)
    scraper.catalog.dataset = list(title2dataset.values())
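
# The KB/kB/MB/GB arithmetic above is repeated across the scrapers in this
# module. A hypothetical consolidation (not currently part of the module) that
# keeps the same convention -- 'KB' read as 1024 bytes, 'kB' as 1000, and
# decimal MB/GB:
_SIZE_MULTIPLIERS = {'KB': 1024, 'kB': 1000, 'MB': 10 ** 6, 'GB': 10 ** 9}

def _parse_file_size(size, unit):
    """Return a byte count, e.g. _parse_file_size('1.2', 'MB') == 1200000."""
    try:
        return int(float(size) * _SIZE_MULTIPLIERS[unit])
    except (KeyError, ValueError):
        return None  # unknown unit or malformed number
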