def handler_dataset_landing_page_fallback(scraper, this_dataset_page, tree): """ At time of writing there's an issue with the latest version of datasets 404'ing on the versions page. this function will create what the latest version should be, using the information on the base dataset landing page. """ logging.warning( "Using fallback logic to scrape latest distribution from dataset landing page (rather " "than previous page). This scrape will only have a single distribution of xls." ) this_distribution = Distribution(scraper) release_date = this_dataset_page["description"]["releaseDate"] this_distribution.issued = parse(release_date.strip()).date() # gonna have to go via html ... download_url = tree.xpath("//a[text()='xls']/@href") this_distribution.downloadURL = download_url media_type = Excel this_distribution.mediaType = media_type this_distribution.title = scraper.dataset.title this_distribution.description = scraper.dataset.description this_distribution.contactPoint = scraper.dataset.contactPoint logging.debug( "Created distribution for download '{}'.".format(download_url)) scraper.distributions.append(this_distribution)
def scrape(scraper, tree): """ This is a scraper intended to use the ONS cmd (customise my data) functionality. :param scraper: the Scraper object :param landing_page: lxml tree :return: """ dataset_document = request_json_data(scraper, scraper.uri) scraper.dataset.title = dataset_document["id"] scraper.dataset.description = dataset_document["description"] # Need to get issued from the assciated publication publication_document = request_json_data( scraper, dataset_document["publications"][0]["href"] + "/data") scraper.dataset.issued = parse( publication_document["description"]["releaseDate"]) # Only take next release it its a date try: next_release = parse(dataset_document["next_release"]) scraper.dataset.updateDueOn = next_release except: pass # it's fine, "unknown" etc # Theoretically you can have more than one contact, but I'm just taking the first scraper.dataset.contactPoint = "mailto:" + dataset_document["contacts"][0][ "email"].strip() scraper.dataset.publisher = 'https://www.gov.uk/government/organisations/office-for-national-statistics' scraper.dataset.license = "http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/" edition_documents = request_json_data(scraper, scraper.uri + "/editions") for edition_document in edition_documents["items"]: edition_name = edition_document["edition"] version_documents = request_json_data( scraper, edition_document["links"]["versions"]["href"]) for version_document in version_documents["items"]: version_name = str(version_document["version"]) this_distribution = Distribution(scraper) this_distribution.issued = version_document["release_date"] this_distribution.downloadURL = version_document["downloads"][ "csv"]["href"] this_distribution.mediaType = CSV this_distribution.title = scraper.dataset.title + ", {}, version {}".format( edition_name, version_name) this_distribution.description = scraper.dataset.description this_distribution.contactPoint = scraper.dataset.contactPoint logging.debug("Created distribution for download '{}'.".format( this_distribution.downloadURL)) scraper.distributions.append(this_distribution)