def handle(self, *args, **kwargs):
    self.verbose = kwargs['verbosity'] > 1

    date = datetime.date(kwargs['year'], kwargs['month'], 1)
    datestamp = date.strftime('%Y_%m')

    source_url = self.url_for_date(date)
    if source_url is None:
        raise CommandError('Could not find any data for %s' % datestamp)

    target_dir = os.path.join(
        settings.PIPELINE_DATA_BASEDIR,
        'patient_list_size',
        datestamp,
    )
    target_file = os.path.join(target_dir, 'patient_list_size_new.csv')
    mkdir_p(target_dir)

    if self.verbose:
        print('Getting data for {}'.format(datestamp))

    self.get_data(target_file, source_url)

    if self.verbose:
        print('Done')
def handle(self, *args, **kwargs): self.verbose = kwargs["verbosity"] > 1 date = datetime.date(kwargs["year"], kwargs["month"], 1) datestamp = date.strftime("%Y_%m") url = date.strftime( "https://digital.nhs.uk/data-and-information/publications/statistical/patients-registered-at-a-gp-practice/%B-%Y" ).lower() rsp = requests.get(url) if rsp.status_code != 200: raise CommandError("Could not find any data for %s" % datestamp) filename = "gp-reg-pat-prac-quin-age.csv" tree = html.fromstring(rsp.content) source_url = tree.xpath( "//a[contains(@href, '{}')]/@href".format(filename))[0] target_dir = os.path.join(settings.PIPELINE_DATA_BASEDIR, "patient_list_size", datestamp) target_file = os.path.join(target_dir, "patient_list_size_new.csv") mkdir_p(target_dir) if self.verbose: print("Getting data for {}".format(datestamp)) self.curl_and_return(source_url, target_file) if self.verbose: print("Done")
def setUpTestData(cls):
    for bnf_code, name in [
        ("0203020C0AAAAAA", "Adenosine_I/V Inf 3mg/ml 2ml Vl"),
        ("1003020U0AAAIAI", "Diclofenac Sod_Gel 2.32%"),
        ("1003020U0BBADAI", "Voltarol 12 Hour Emulgel P_Gel 2.32%"),
        ("1305020C0AAFVFV", "Coal Tar 10%/Salic Acid 5%/Aq_Crm"),
        ("1106000X0AAAIAI", "Piloc HCl_Eye Dps 6%"),
        ("090402000BBHCA0", "Nutrison Pack_Stnd"),
    ]:
        Presentation.objects.create(bnf_code=bnf_code, name=name)

    shutil.copytree(
        "dmd2/tests/data/dmd/1",
        "pipeline/test-data/data/dmd/2019_07_01/nhsbsa_dmd_7.4.0_20190701000001",
    )
    mkdir_p("pipeline/test-data/data/snomed_mapping/2019_07_01")
    shutil.copyfile(
        "dmd2/tests/data/bnf_code_mapping/mapping.xlsx",
        "pipeline/test-data/data/snomed_mapping/2019_07_01/mapping.xlsx",
    )

    # Import the data.  See dmd2/tests/data/README.txt for details of what
    # objects will be created.
    with patch("gcutils.bigquery.Client.upload_model"):
        call_command("import_dmd2")

    # Copy another, later, dataset into the data directory, for tests that
    # call the command again.
    shutil.copytree(
        "dmd2/tests/data/dmd/2",
        "pipeline/test-data/data/dmd/2019_07_08/nhsbsa_dmd_7.4.0_20190708000001",
    )
def handle(self, *args, **kwargs):
    self.verbose = kwargs['verbosity'] > 1

    date = datetime.date(kwargs['year'], kwargs['month'], 1)
    datestamp = date.strftime('%Y_%m')

    url = date.strftime('http://digital.nhs.uk/pubs/numpatgp%b%y').lower()
    rsp = requests.get(url)
    if rsp.status_code != 200:
        raise CommandError('Could not find any data for %s' % datestamp)

    filename = date.strftime('gp-reg-pat-prac-quin-age-%b-%y').lower()
    tree = html.fromstring(rsp.content)
    source_url = tree.xpath(
        "//a[contains(@href, '{}')]/@href".format(filename))[0]

    target_dir = os.path.join(
        settings.PIPELINE_DATA_BASEDIR,
        'patient_list_size',
        datestamp,
    )
    target_file = os.path.join(target_dir, 'patient_list_size_new.csv')
    mkdir_p(target_dir)

    if self.verbose:
        print('Getting data for {}'.format(datestamp))

    self.curl_and_return(source_url, target_file)

    if self.verbose:
        print('Done')
def handle(self, year, month, **kwargs):
    rsp = requests.get(
        "https://opendata.nhsbsa.net/api/3/action/package_show?id=english-prescribing-data-epd"
    )
    resources = rsp.json()["result"]["resources"]
    urls = [
        r["url"]
        for r in resources
        if r["name"] == "EPD_{year}{month:02d}".format(year=year, month=month)
    ]
    assert len(urls) == 1, urls

    rsp = requests.get(urls[0], stream=True)
    assert rsp.ok

    dir_path = os.path.join(
        settings.PIPELINE_DATA_BASEDIR,
        "prescribing_v2",
        "{year}_{month:02d}".format(year=year, month=month),
    )
    mkdir_p(dir_path)
    filename = "epd_{year}{month:02d}.csv".format(year=year, month=month)

    with open(os.path.join(dir_path, filename), "wb") as f:
        for block in rsp.iter_content(32 * 1024):
            f.write(block)
def write_logs(self):
    """Record summary and details of oddities we've found in the data.

    We log (summary and details) the following things:

    * dm+d objects only present in mapping
    * VMPs with inferred BNF codes
    * VMPs without BNF codes
    * BNF codes with multiple dm+d objects
    * BNF codes with multiple dm+d objects where a name cannot be inferred
    * VMPPs that have a different BNF code to their VMP
    * AMPPs that have a different BNF code to their AMP

    We also log summaries of the number of objects imported.
    """
    mkdir_p(self.logs_path)

    for key in self.log_keys:
        with open(os.path.join(self.logs_path, key + ".csv"), "w") as f:
            writer = csv.writer(f)
            writer.writerows(self.logs[key])

    with open(os.path.join(self.logs_path, "summary.csv"), "w") as f:
        writer = csv.writer(f)
        for model in [VMP, AMP, VMPP, AMPP]:
            writer.writerow([model.__name__, model.objects.count()])
        for key in self.log_keys:
            writer.writerow([key, len(self.logs[key])])
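# write_logs assumes self.log_keys and self.logs have been populated during
# the import. A hypothetical illustration of the shape it expects (names and
# values invented for this sketch, not taken from the actual importer):
#
#   self.log_keys = ["vmp_without_bnf_code", "bnf_code_with_multiple_dmd_objs"]
#   self.logs = {
#       "vmp_without_bnf_code": [[123456789, "Some VMP name"]],
#       "bnf_code_with_multiple_dmd_objs": [],
#   }
#
# Each value is a list of rows, so csv.writer.writerows() can serialise it
# directly.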
def handle(self, *args, **kwargs): path = os.path.join(settings.PIPELINE_DATA_BASEDIR, "bnf_codes") year_and_month = datetime.date.today().strftime("%Y_%m") dir_path = os.path.join(path, year_and_month) mkdir_p(dir_path) zip_path = os.path.join(dir_path, "download.zip") base_url = "https://applications.nhsbsa.nhs.uk/infosystems/data/" session = requests.Session() session.cookies["JSESSIONID"] = kwargs["jsessionid"] url = base_url + "showDataSelector.do" params = {"reportId": "126"} rsp = session.get(url, params=params) tree = html.fromstring(rsp.content) options = tree.xpath('//select[@id="bnfVersion"]/option') year_to_bnf_version = {} for option in options: datestamp, version = option.text.split(" : ") date = datetime.datetime.strptime(datestamp, "%d-%m-%Y") year_to_bnf_version[date.year] = version year = max(year_to_bnf_version) version = year_to_bnf_version[year] url = base_url + "requestSelectedDownload.do" params = { "bnfVersion": version, "filePath": "", "dataView": "260", "format": "", "defaultReportIdDataSel": "", "reportId": "126", "action": "checkForAvailableDownload", } rsp = session.get(url, params=params) request_id = rsp.json()["requestNo"] url = base_url + "downloadAvailableReport.zip" params = {"requestId": request_id} rsp = session.post(url, params=params) with open(zip_path, "wb") as f: f.write(rsp.content) with zipfile.ZipFile(zip_path) as zf: zf.extractall(dir_path) csv_paths = glob.glob(os.path.join(dir_path, "*.csv")) assert len(csv_paths) == 1 os.rename(csv_paths[0], os.path.join(dir_path, "bnf_codes.csv"))
def handle(self, *args, **kwargs):
    # The page lists available downloads.  The data is stored in a JSON
    # object.
    url = "https://data.gov.uk/dataset/176ae264-2484-4afe-a297-d51798eb8228/gp-practice-prescribing-data-presentation-level"
    rsp = requests.get(url)
    doc = BeautifulSoup(rsp.content, "html.parser")
    tag = doc.find("script", type="application/ld+json")
    metadata = json.loads(list(tag.descendants)[0])

    filename_fragment = {
        "addresses": "ADDR%20BNFT",
        "chemicals": "CHEM%20SUBS",
    }[kwargs["dataset"]]
    pattern = r"T(\d{4})(\d{2})" + filename_fragment + ".CSV"

    urls = [
        record["contentUrl"]
        for record in metadata["distribution"]
        if filename_fragment in record["contentUrl"]
    ]

    # Iterate over the URLs, newest first, downloading as we go, and
    # stopping once we find a URL that we have already downloaded.
    for url in sorted(urls, key=lambda url: url.split("/")[-1], reverse=True):
        filename = url.split("/")[-1]
        tmp_filename = filename + ".tmp"
        # We ignore case here, as sometimes the filename is .csv and
        # sometimes .CSV.
        match = re.match(pattern, filename, re.I)
        year_and_month = "_".join(match.groups())
        dir_path = os.path.join(
            settings.PIPELINE_DATA_BASEDIR, "prescribing_metadata", year_and_month
        )

        if os.path.exists(os.path.join(dir_path, filename)):
            break
        # Older versions of the data have slightly different filenames.
        if os.path.exists(os.path.join(dir_path, filename.replace("%20", "+"))):
            break

        mkdir_p(dir_path)

        rsp = requests.get(url)
        assert rsp.ok

        # Since we check for the presence of the file to determine whether
        # this data has already been fetched, we write to a temporary file
        # and then rename it.
        with open(os.path.join(dir_path, tmp_filename), "w") as f:
            f.write(rsp.text)
        os.rename(
            os.path.join(dir_path, tmp_filename),
            os.path.join(dir_path, filename),
        )
def handle(self, *args, **kwargs):
    # The page lists available downloads.  The data is stored in a JSON
    # object.
    url = 'https://data.gov.uk/dataset/176ae264-2484-4afe-a297-d51798eb8228/gp-practice-prescribing-data-presentation-level'
    rsp = requests.get(url)
    doc = BeautifulSoup(rsp.content, 'html.parser')
    tag = doc.find('script', type='application/ld+json')
    metadata = json.loads(tag.text)

    filename_fragment = {
        'addresses': 'ADDR%20BNFT',
        'chemicals': 'CHEM%20SUBS',
    }[kwargs['dataset']]
    pattern = r'T(\d{4})(\d{2})' + filename_fragment + '.CSV'

    urls = [
        record['contentUrl']
        for record in metadata['distribution']
        if filename_fragment in record['contentUrl']
    ]

    # Iterate over the URLs, newest first, downloading as we go, and
    # stopping once we find a URL that we have already downloaded.
    for url in sorted(urls, key=lambda url: url.split('/')[-1], reverse=True):
        filename = url.split('/')[-1]
        tmp_filename = filename + '.tmp'
        # We ignore case here, as sometimes the filename is .csv and
        # sometimes .CSV.
        match = re.match(pattern, filename, re.I)
        year_and_month = '_'.join(match.groups())
        dir_path = os.path.join(
            settings.PIPELINE_DATA_BASEDIR, 'prescribing_metadata', year_and_month
        )

        if os.path.exists(os.path.join(dir_path, filename)):
            break
        # Older versions of the data have slightly different filenames.
        if os.path.exists(os.path.join(dir_path, filename.replace('%20', '+'))):
            break

        mkdir_p(dir_path)

        rsp = requests.get(url)
        assert rsp.ok

        # Since we check for the presence of the file to determine whether
        # this data has already been fetched, we write to a temporary file
        # and then rename it.
        with open(os.path.join(dir_path, tmp_filename), 'w') as f:
            f.write(rsp.content)
        os.rename(
            os.path.join(dir_path, tmp_filename),
            os.path.join(dir_path, filename),
        )
def handle(self, *args, **kwargs): base_url = "https://isd.digital.nhs.uk/" session = requests.Session() login_url = base_url + "trud3/security/j_spring_security_check" params = { "j_username": settings.TRUD_USERNAME, "j_password": settings.TRUD_PASSWORD, "commit": "LOG+IN", } rsp = session.post(login_url, params) index_url = ( base_url + "trud3/user/authenticated/group/0/pack/6/subpack/24/releases") rsp = session.get(index_url) doc = BeautifulSoup(rsp.text, "html.parser") latest_release_div = doc.find("div", class_="release") p = latest_release_div.find_all("p")[1] text = " ".join(p.text.splitlines()).strip() release_date = datetime.strptime( text, "Released on %A, %d %B %Y").strftime("%Y_%m_%d") download_href = latest_release_div.find( "a", class_="download-release")["href"] filename = download_href.split("/")[-1] dir_path = os.path.join(settings.PIPELINE_DATA_BASEDIR, "dmd", release_date) zip_path = os.path.join(dir_path, filename) unzip_dir_path = os.path.join(dir_path, os.path.splitext(filename)[0]) if os.path.exists(zip_path): return rsp = session.get(base_url + download_href, stream=True) mkdir_p(dir_path) with open(zip_path, "wb") as f: for block in rsp.iter_content(32 * 1024): f.write(block) with zipfile.ZipFile(zip_path) as zf: zf.extractall(unzip_dir_path) for nested_zip_path in glob.glob(os.path.join(unzip_dir_path, "*.zip")): with zipfile.ZipFile(nested_zip_path) as zf: zf.extractall(unzip_dir_path)
def handle(self, *args, **kwargs):
    today = datetime.date.today()
    year = today.year
    month = today.month
    num_missing_months = 0

    filename_fragment = {
        'addresses': 'ADDR+BNFT',
        'chemicals': 'CHEM+SUBS',
    }[kwargs['dataset']]

    while True:
        date = datetime.date(year, month, 1)
        year_and_month = date.strftime('%Y_%m')  # eg 2017_01
        dir_path = os.path.join(
            settings.PIPELINE_DATA_BASEDIR,
            'prescribing_metadata',
            year_and_month
        )
        filename = date.strftime('T%Y%m{}.CSV').format(filename_fragment)
        file_path = os.path.join(dir_path, filename)

        if os.path.exists(file_path):
            break

        mkdir_p(dir_path)

        # eg http://datagov.ic.nhs.uk/presentation/2017_08_August/T201708ADDR+BNFT.CSV
        base_url = 'http://datagov.ic.nhs.uk/presentation'
        path_fragment = date.strftime('%Y_%m_%B')
        url = '{}/{}/{}'.format(base_url, path_fragment, filename)

        rsp = requests.get(url)

        if rsp.ok:
            # Write in binary mode, since rsp.content is a byte string.
            with open(file_path, 'wb') as f:
                f.write(rsp.content)
        else:
            num_missing_months += 1
            if num_missing_months >= 6:
                raise CommandError('No data for six months!')

        if month == 1:
            year -= 1
            month = 12
        else:
            month -= 1
def handle(self, year, month, **kwargs):
    year_and_month = "{year}_{month:02d}".format(year=year, month=month)
    filename = "EPD_{year}{month:02d}.csv".format(year=year, month=month)
    dir_path = os.path.join(
        settings.PIPELINE_DATA_BASEDIR, "prescribing_v2", year_and_month
    )
    csv_path = os.path.join(dir_path, filename)
    mkdir_p(dir_path)

    url = "https://storage.googleapis.com/datopian-nhs/csv/" + filename
    rsp = requests.get(url, stream=True)
    assert rsp.ok

    with open(csv_path, "wb") as f:
        for block in rsp.iter_content(32 * 1024):
            f.write(block)
def download_csv(self, session, year_and_month, period_id):
    dir_path = os.path.join(self.path, year_and_month)
    zip_path = os.path.join(dir_path, 'download.zip')

    url = self.base_url + 'requestSelectedDownload.do'
    params = {
        'period': period_id,
        'filePath': '',
        'dataView': '255',
        'format': '',
        'periodType': 'MONTHLY',
        'defaultPeriod': '200',
        'defaultFilterType': 'MONTHLY',
        'organisation': '11',
        'dimensionHierarchyId': '1',
        'bnfChapter': '0',
        'defaultReportIdDataSel': '',
        'reportId': '124',
        'action': 'checkForAvailableDownload',
    }
    rsp = session.get(url, params=params)
    request_id = rsp.json()['requestNo']

    mkdir_p(dir_path)

    url = self.base_url + 'downloadAvailableReport.zip'
    params = {
        'requestId': request_id,
    }
    rsp = session.post(url, params=params, stream=True)
    total_size = int(rsp.headers['content-length'])
    progress_bar = tqdm(total=total_size, unit='B', unit_scale=True)

    with open(zip_path, 'wb') as f:
        for block in rsp.iter_content(32 * 1024):
            f.write(block)
            progress_bar.update(len(block))

    with zipfile.ZipFile(zip_path) as zf:
        zf.extractall(dir_path)
def download_csv(self, session, year_and_month, period_id): dir_path = os.path.join(self.path, year_and_month) zip_path = os.path.join(dir_path, "download.zip") url = self.base_url + "requestSelectedDownload.do" params = { "period": period_id, "filePath": "", "dataView": "255", "format": "", "periodType": "MONTHLY", "defaultPeriod": "200", "defaultFilterType": "MONTHLY", "organisation": "11", "dimensionHierarchyId": "1", "bnfChapter": "0", "defaultReportIdDataSel": "", "reportId": "124", "action": "checkForAvailableDownload", } rsp = session.get(url, params=params) request_id = rsp.json()["requestNo"] mkdir_p(dir_path) url = self.base_url + "downloadAvailableReport.zip" params = {"requestId": request_id} rsp = session.post(url, params=params, stream=True) total_size = int(rsp.headers["content-length"]) progress_bar = tqdm(total=total_size, unit="B", unit_scale=True) with open(zip_path, "wb") as f: for block in rsp.iter_content(32 * 1024): f.write(block) progress_bar.update(len(block)) with zipfile.ZipFile(zip_path) as zf: zf.extractall(dir_path)
def handle(self, *args, **options):
    if import_in_progress():
        notify_slack("Not checking numbers: import in progress")
        return

    previous_log_path = get_previous_log_path()

    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    log_path = os.path.join(settings.CHECK_NUMBERS_BASE_PATH, timestamp)
    mkdir_p(log_path)

    numbers = {}

    # Use a distinct name for the Firefox options so we don't shadow the
    # command's own `options` kwargs.
    firefox_options = Options()
    firefox_options.headless = True

    with webdriver.Firefox(options=firefox_options) as browser:
        browser.set_page_load_timeout(60)
        for name, path in paths_to_scrape():
            source = get_page_source(browser, path, name, log_path)
            numbers_list = extract_numbers(source)
            numbers[name] = {"path": path, "numbers": numbers_list}

    write_numbers(numbers, log_path)

    if previous_log_path is None:
        msg = "Not checking numbers: this is the first deploy since last import"
        notify_slack(msg)
        return

    previous_numbers = load_previous_numbers(previous_log_path)
    differences = compare_numbers(previous_numbers, numbers)

    if differences:
        msg = "The following pages have changed:\n\n"
        msg += "\n".join(differences)
        msg += "\n\nNext step: compare {} and {}".format(previous_log_path, log_path)
        notify_slack(msg)
def handle(self, *args, **kwargs): base_url = "https://www.nhsbsa.nhs.uk" rsp = requests.get( base_url + "/prescription-data/understanding-our-data/bnf-snomed-mapping") doc = BeautifulSoup(rsp.text, "html.parser") urls = set(a["href"] for a in doc.find_all("a", href=True) if a["href"].endswith(".zip")) if len(urls) != 1: raise RuntimeError( "Expected exactly one zipfile URL, found {}".format(len(urls))) href = list(urls)[0] filename = href.split("/")[-1] datestamp = filename.split(".")[0].split("%20")[-1] release_date = datestamp[:4] + "_" + datestamp[4:6] + "_" + datestamp[ 6:] dir_path = os.path.join(settings.PIPELINE_DATA_BASEDIR, "bnf_snomed_mapping", release_date) zip_path = os.path.join(dir_path, filename) if glob.glob(os.path.join(dir_path, "*.xlsx")): return mkdir_p(dir_path) rsp = requests.get(base_url + href, stream=True) rsp.raise_for_status() with open(zip_path, "wb") as f: for block in rsp.iter_content(32 * 1024): f.write(block) with zipfile.ZipFile(zip_path) as zf: zf.extractall(dir_path)
def handle(self, *args, **kwargs):
    path = os.path.join(settings.PIPELINE_DATA_BASEDIR, 'bnf_codes')
    year_and_month = datetime.date.today().strftime('%Y_%m')
    dir_path = os.path.join(path, year_and_month)
    mkdir_p(dir_path)
    zip_path = os.path.join(dir_path, 'download.zip')

    base_url = 'https://apps.nhsbsa.nhs.uk/infosystems/data/'

    session = requests.Session()
    session.cookies['JSESSIONID'] = kwargs['jsessionid']

    url = base_url + 'showDataSelector.do'
    params = {'reportId': '126'}
    rsp = session.get(url, params=params)

    tree = html.fromstring(rsp.content)
    options = tree.xpath('//select[@id="bnfVersion"]/option')
    year_to_bnf_version = {}
    for option in options:
        datestamp, version = option.text.split(' : ')
        date = datetime.datetime.strptime(datestamp, '%d-%m-%Y')
        year_to_bnf_version[date.year] = version
    year = max(year_to_bnf_version)
    version = year_to_bnf_version[year]

    url = base_url + 'requestSelectedDownload.do'
    params = {
        'bnfVersion': version,
        'filePath': '',
        'dataView': '260',
        'format': '',
        'defaultReportIdDataSel': '',
        'reportId': '126',
        'action': 'checkForAvailableDownload',
    }
    rsp = session.get(url, params=params)
    request_id = rsp.json()['requestNo']

    url = base_url + 'downloadAvailableReport.zip'
    params = {
        'requestId': request_id,
    }
    rsp = session.post(url, params=params, stream=True)
    total_size = int(rsp.headers['content-length'])

    # total_size is in bytes, so advance the progress bar by the size of
    # each block rather than counting blocks.
    progress_bar = tqdm(total=total_size, unit='B', unit_scale=True)
    with open(zip_path, 'wb') as f:
        for block in rsp.iter_content(32 * 1024):
            f.write(block)
            progress_bar.update(len(block))

    with zipfile.ZipFile(zip_path) as zf:
        zf.extractall(dir_path)

    csv_paths = glob.glob(os.path.join(dir_path, '*.csv'))
    assert len(csv_paths) == 1
    os.rename(csv_paths[0], os.path.join(dir_path, 'bnf_codes.csv'))
def handle(self, *args, **kwargs): if kwargs["jsessionid"] is None: # Note that this is mostly-duplicated above, but I can't see a nice # way of avoiding this. print(""" The files are on a site that requires you to log in. To download the files, you will need to visit the site in your browser and log in. This will set a cookie in your browser which you will need to pass to this command. Specifically, you should: * Visit https://isd.digital.nhs.uk/trud3/user/authenticated/group/0/pack/6/subpack/24/releases in your browser * Sign up or log in * Copy the value of the JSESSIONID cookie * In Chrome, this can be found in the Application tab of Developer Tools * Paste this value below: """).strip() jsessionid = input() else: jsessionid = kwargs["jsessionid"] year = kwargs["year"] month = kwargs["month"] year_and_month = datetime.date(year, month, 1).strftime("%Y_%m") dir_path = os.path.join(settings.PIPELINE_DATA_BASEDIR, "dmd", year_and_month) zip_path = os.path.join(dir_path, "download.zip") if os.path.exists(dir_path): print("Data already downloaded for", year_and_month) return mkdir_p(dir_path) session = requests.Session() session.cookies["JSESSIONID"] = jsessionid base_url = "https://isd.digital.nhs.uk/" rsp = session.get( base_url + "trud3/user/authenticated/group/0/pack/6/subpack/24/releases") tree = html.fromstring(rsp.content) divs = tree.find_class("release subscribed") div_dates = [extract_date(div) for div in divs] assert div_dates == sorted(div_dates, reverse=True) divs_for_month = [] for div in divs: date = extract_date(div) if date.year == year and date.month == month: divs_for_month.append(div) if not divs_for_month: raise CommandError div = divs_for_month[-1] href = div.find_class("download-release")[0].attrib["href"] rsp = session.get(base_url + href, stream=True) with open(zip_path, "wb") as f: for block in rsp.iter_content(32 * 1024): f.write(block)