def test_upload_model(self):
    """Round-trip the dmd-related models through BigQuery.

    Creates one row for each of the four models, uploads each model's
    table with Client.upload_model, then reads the rows back out of
    BigQuery and spot-checks a field on each.
    """
    client = Client('dmd')
    # The product and VMPP share dm+d identifiers (dmdid/vpid 327368008,
    # vppid 1206011000001108) so that the concession and tariff rows
    # below can reference them by id.
    DMDProduct.objects.create(
        dmdid=327368008,
        bnf_code='0803042A0AAABAB',
        vpid=327368008,
        name='Bicalutamide 150mg tablets',
    )
    DMDVmpp.objects.create(
        vppid=1206011000001108,
        nm='Bicalutamide 150mg tablets 28 tablet',
        vpid=327368008,
    )
    NCSOConcession.objects.create(
        drug='Bicalutamide 150mg tablets',
        pack_size=28,
        price_concession_pence=499,
        vmpp_id=1206011000001108,
        date='2017-11-01',
    )
    TariffPrice.objects.create(
        vmpp_id=1206011000001108,
        product_id=327368008,
        price_pence=422,
        tariff_category_id=11,
        date='2017-11-01',
    )

    client.upload_model(DMDProduct)
    client.upload_model(DMDVmpp)
    client.upload_model(NCSOConcession)
    client.upload_model(TariffPrice)

    # Each table should contain exactly the single row created above.
    table = client.get_table('product')
    rows = list(table.get_rows_as_dicts())
    self.assertEqual(len(rows), 1)
    self.assertEqual(rows[0]['name'], 'Bicalutamide 150mg tablets')

    table = client.get_table('vmpp')
    rows = list(table.get_rows_as_dicts())
    self.assertEqual(len(rows), 1)
    self.assertEqual(rows[0]['nm'], 'Bicalutamide 150mg tablets 28 tablet')

    table = client.get_table('ncsoconcession')
    rows = list(table.get_rows_as_dicts())
    self.assertEqual(len(rows), 1)
    self.assertEqual(rows[0]['drug'], 'Bicalutamide 150mg tablets')

    table = client.get_table('tariffprice')
    rows = list(table.get_rows_as_dicts())
    self.assertEqual(len(rows), 1)
    self.assertEqual(rows[0]['price_pence'], 422)
def fill_views(self):
    """Regenerate the BigQuery "vw__*" tables, import them into Postgres,
    and refresh the dependent materialized views.

    Each SQL file in self.view_paths is run in its own worker process
    (via query_and_export), which writes the query result to a BigQuery
    table and exports it to cloud storage for download.
    """
    client = Client('hscic')
    pool = Pool(processes=len(self.view_paths))
    tables = []

    # Substituted into each view's SQL as {this_month}: the month of the
    # most recently imported prescribing data.
    prescribing_date = ImportLog.objects.latest_in_category(
        'prescribing').current_at.strftime('%Y-%m-%d')

    for path in self.view_paths:
        # Table name derives from the SQL filename: "foo.sql" -> "vw__foo".
        table_name = "vw__%s" % os.path.basename(path).replace('.sql', '')
        table = client.get_table(table_name)
        tables.append(table)

        with open(path) as f:
            sql = f.read()

        substitutions = {'this_month': prescribing_date}
        args = [table.table_id, sql, substitutions]
        # NOTE(review): the AsyncResult is discarded, so a worker failure
        # only surfaces through query_and_export's own logging -- confirm
        # this best-effort behaviour is intentional.
        pool.apply_async(query_and_export, args)

    pool.close()
    pool.join()  # wait for all worker processes to exit

    for table in tables:
        self.download_and_import(table)
        self.log("-------------")

    with connection.cursor() as cursor:
        for view_id in self.materialized_views:
            # This is quite slow! up to 10 mins.
            cursor.execute("REFRESH MATERIALIZED VIEW %s" % view_id)
def fill_views(self):
    """Regenerate the BigQuery "vw__*" tables and import each one into
    Postgres.

    Each SQL file in self.view_paths is run in its own worker process
    (via query_and_export), which writes the query result to a BigQuery
    table and exports it to cloud storage for download.
    """
    client = Client('hscic')
    pool = Pool(processes=len(self.view_paths))
    tables = []

    # Substituted into each view's SQL as {this_month}: the month of the
    # most recently imported prescribing data.
    prescribing_date = ImportLog.objects.latest_in_category(
        'prescribing').current_at.strftime('%Y-%m-%d')

    for path in self.view_paths:
        # Table name derives from the SQL filename: "foo.sql" -> "vw__foo".
        table_name = "vw__%s" % os.path.basename(path).replace('.sql', '')
        table = client.get_table(table_name)
        tables.append(table)

        with open(path) as f:
            sql = f.read()

        substitutions = {'this_month': prescribing_date}
        args = [table.name, sql, substitutions]
        # NOTE(review): the AsyncResult is discarded, so a worker failure
        # only surfaces through query_and_export's own logging -- confirm
        # this best-effort behaviour is intentional.
        pool.apply_async(query_and_export, args)

    pool.close()
    pool.join()  # wait for all worker processes to exit

    for table in tables:
        self.download_and_import(table)
        self.log("-------------")
def write_zero_prescribing_codes_table(level):
    """Given a BNF `level` (`section`, `chapter`, `paragraph`, etc), write
    a table in bigquery listing all such levels that have zero prescribing.

    Returns a bigquery Table.
    """
    # Lazy %-style logger args: the message is only formatted when the
    # INFO level is actually enabled.
    logger.info("Scanning %s to see if it has zero prescribing", level)
    # NOTE: `level` is interpolated directly into the SQL; it must come
    # from the fixed set of BNF level names, never from user input.
    sql = """
    SELECT
      bnf.%s
    FROM
      {hscic}.normalised_prescribing_standard AS prescribing
    RIGHT JOIN
      {hscic}.bnf bnf
    ON
      prescribing.bnf_code = bnf.presentation_code
    WHERE (
      bnf.presentation_code NOT LIKE '2%%'  -- appliances, etc
    )
    GROUP BY
      bnf.%s
    HAVING
      COUNT(prescribing.bnf_code) = 0
    """ % (level, level)
    client = Client('tmp_eu')
    table = client.get_table('unused_codes_%s' % level)
    table.insert_rows_from_query(sql)
    return table
def query_and_export(table_name, sql, substitutions):
    """Populate BigQuery table `table_name` from `sql` (with the given
    template `substitutions`) and export the result to cloud storage.

    Intended to run inside a multiprocessing worker: any exception is
    logged with its traceback before being re-raised, because the pool
    otherwise reports only the bare error message.
    """
    try:
        client = Client('hscic')
        table = client.get_table(table_name)

        storage_prefix = 'hscic/views/{}-'.format(table_name)
        # Lazy %-style logger args throughout: messages are formatted
        # only if INFO logging is enabled.
        logger.info("Generating view %s and saving to %s",
                    table_name, storage_prefix)

        logger.info("Running SQL for %s: %s", table_name, sql)
        table.insert_rows_from_query(sql, substitutions=substitutions)

        exporter = TableExporter(table, storage_prefix)

        logger.info('Deleting existing data in storage at %s',
                    storage_prefix)
        exporter.delete_from_storage()

        logger.info('Exporting data to storage at %s', storage_prefix)
        exporter.export_to_storage()

        logger.info("View generation complete for %s", table_name)
    except Exception:
        # Log the formatted error, because the multiprocessing pool
        # this is called from only shows the error message (with no
        # traceback)
        logger.error(traceback.format_exc())
        raise
def backup_table(self, table_name):
    """Export the latest month of `table_name` to cloud storage, unless a
    backup for that month already exists under backups/<table>/<YYYY_MM>/.
    """
    client = Client("hscic")
    sql = "SELECT max(month) FROM {hscic}.%s" % table_name
    latest_date = client.query(sql).rows[0][0]
    latest_year_and_month = latest_date.strftime("%Y_%m")
    table = client.get_table(table_name)

    storage_client = StorageClient()
    bucket = storage_client.bucket()
    year_and_months = set()
    prefix_base = "backups/{}/".format(table_name)

    for blob in bucket.list_blobs(prefix=prefix_base):
        # Raw string for the regex (plain "\d" is an invalid escape
        # sequence in modern Python).
        match = re.search(r"/(\d{4}_\d{2})/", blob.name)
        if match is None:
            # Skip stray objects that don't follow the naming scheme
            # instead of crashing with AttributeError on None.
            continue
        year_and_months.add(match.group(1))

    if latest_year_and_month in year_and_months:
        print("{} table already backed up for {}".format(
            table_name, latest_year_and_month))
        return

    storage_prefix = "{}/{}/{}-".format(
        prefix_base, latest_year_and_month, table_name)
    exporter = TableExporter(table, storage_prefix)
    exporter.export_to_storage()
def upload_to_bq(self):
    """Upload every dmd model to BigQuery, then build the denormalised
    "<obj_type>_full" table for each of the four core object types."""
    client = Client("dmd")

    for dmd_model in apps.get_app_config("dmd").get_models():
        client.upload_model(dmd_model)

    for cls in [VMP, AMP, VMPP, AMPP]:
        full_table = client.get_table("{}_full".format(cls.obj_type))
        full_table.insert_rows_from_query(self.sql_for_cls(cls))
def update_bnf_table():
    """Update `bnf` table from cloud-stored CSV.

    Loads the most recent export found under hscic/bnf_codes/ (exports
    sort chronologically by name, so the lexicographic maximum is the
    newest one).
    """
    storage_client = StorageClient()
    bucket = storage_client.get_bucket()
    blobs = bucket.list_blobs(prefix="hscic/bnf_codes/")
    # max() picks the newest blob in one O(n) pass instead of sorting
    # the whole listing just to take its first element.
    blob = max(blobs, key=lambda blob: blob.name)

    bq_client = BQClient("hscic")
    table = bq_client.get_table("bnf")
    # skip_leading_rows=1: the CSV has a header row.
    table.insert_rows_from_storage(blob.name, skip_leading_rows=1)
def handle(self, *args, measure_ids, **kwargs):
    """Rebuild the public per-measure BigQuery tables.

    For each selected Measure, formats its numerator/denominator SQL
    into the shared _measure_template.sql and writes the result to a
    table named "measure_<id>" in the "public" dataset.  If
    `measure_ids` is given only those measures are rebuilt; otherwise
    all of them are.
    """
    base_path = os.path.join(settings.APPS_ROOT, "bq_public_tables")
    client = Client("public")

    with open(os.path.join(base_path, "_measure_template.sql")) as f:
        measure_template_sql = f.read()

    if measure_ids:
        measures = Measure.objects.filter(id__in=measure_ids)
    else:
        measures = Measure.objects.all()

    for measure in measures:
        table_name = "measure_" + measure.id
        print(table_name)
        table = client.get_table(table_name)

        # Numerator/denominator SQL fragments are assembled from the
        # columns/from/where stored on the Measure record.
        numerator_sql = """
        SELECT
          CAST(month AS DATE) AS month,
          practice AS practice_id,
          {numerator_columns}
        FROM {numerator_from}
        WHERE {numerator_where}
        GROUP BY month, practice_id
        """.format(
            numerator_columns=measure.numerator_columns,
            numerator_from=measure.numerator_from,
            numerator_where=measure.numerator_where,
        )

        denominator_sql = """
        SELECT
          CAST(month AS DATE) AS month,
          practice AS practice_id,
          {denominator_columns}
        FROM {denominator_from}
        WHERE {denominator_where}
        GROUP BY month, practice_id
        """.format(
            denominator_columns=measure.denominator_columns,
            denominator_from=measure.denominator_from,
            denominator_where=measure.denominator_where,
        )

        # partially_format fills just these two placeholders, leaving any
        # other template placeholders intact.
        sql = partially_format(
            measure_template_sql,
            numerator_sql=numerator_sql,
            denominator_sql=denominator_sql,
        )
        table.insert_rows_from_query(sql)
def handle(self, *args, **kwargs):
    """Rebuild each public BigQuery table from its .sql definition file."""
    client = Client("public")
    sql_dir = os.path.join(settings.APPS_ROOT, "bq_public_tables")

    for sql_path in glob.glob(os.path.join(sql_dir, "*.sql")):
        with open(sql_path) as f:
            query = f.read()
        # Table name is the SQL filename without its extension.
        name, _ = os.path.splitext(os.path.basename(sql_path))
        client.get_table(name).insert_rows_from_query(query)
def test_upload_model(self):
    """Upload the VMPP model to BigQuery and check one row's fields."""
    client = Client('dmd')
    client.upload_model(VMPP)
    table = client.get_table('vmpp')
    rows = list(table.get_rows_as_dicts())
    self.assertEqual(len(rows), 4)
    found = False
    for row in rows:
        if row['id'] != 1079211000001106:
            continue
        found = True
        for k, v in {
            'invalid': False,
            'nm': 'Chlortalidone 50mg tablets 28 tablet',
            'vmp': 317935006,
            'qtyval': Decimal('28.00'),
            'qty_uom': 428673006,
            'combpack': None,
            'bnf_code': '0202010F0AAAAAA',
        }.items():
            self.assertEqual(row[k], v)
    # Guard against a vacuous pass: previously, if the expected id was
    # missing from the uploaded rows, no field was ever checked.
    self.assertTrue(found)
def test_upload_model(self):
    """Upload the VMPP model to BigQuery and check one row's fields."""
    client = Client("dmd")
    client.upload_model(VMPP)
    table = client.get_table("vmpp")
    rows = list(table.get_rows_as_dicts())
    self.assertEqual(len(rows), 4)
    found = False
    for row in rows:
        if row["id"] != 1079211000001106:
            continue
        found = True
        for k, v in {
            "invalid": False,
            "nm": "Chlortalidone 50mg tablets 28 tablet",
            "vmp": 317935006,
            "qtyval": Decimal("28.00"),
            "qty_uom": 428673006,
            "combpack": None,
            "bnf_code": "0202010F0AAAAAA",
        }.items():
            self.assertEqual(row[k], v)
    # Guard against a vacuous pass: previously, if the expected id was
    # missing from the uploaded rows, no field was ever checked.
    self.assertTrue(found)
def get_table(self, table_name):
    """Return the named table from the "measures" BigQuery dataset."""
    return Client("measures").get_table(table_name)
def handle(self, *args, **options):
    """Import one month of raw HSCIC prescribing data.

    Expects `filename` to live in a directory named YYYY_MM.  Appends
    aggregated data to the main BigQuery `prescribing` table, writes a
    per-month "formatted" table, exports it to storage, downloads it,
    and sorts it locally ready for loading into Postgres.
    """
    path = options["filename"]
    head, filename = os.path.split(path)
    converted_path = "{}_formatted.CSV".format(os.path.splitext(path)[0])
    _, year_and_month = os.path.split(head)

    logger.info("path: %s", path)
    logger.info("converted_path: %s", converted_path)
    logger.info("year_and_month: %s", year_and_month)

    # Validate that the containing directory encodes a YYYY_MM date.
    date = year_and_month + "_01"
    try:
        datetime.datetime.strptime(date, "%Y_%m_%d")
    except ValueError:
        message = ("The file path must have a YYYY_MM "
                   "date component in the containing directory: ")
        message += path
        raise CommandError(message)

    hscic_dataset_client = Client("hscic")
    tmp_dataset_client = Client("tmp_eu")

    # Check that we haven't already processed data for this month
    sql = """SELECT COUNT(*)
    FROM {dataset}.prescribing
    WHERE month = TIMESTAMP('{date}')""".format(
        dataset=hscic_dataset_client.dataset_id,
        date=date.replace("_", "-"))
    try:
        results = hscic_dataset_client.query(sql)
        # A non-zero count means this month was already imported.
        assert results.rows[0][0] == 0
    except NotFound:
        # Table doesn't exist yet: nothing has been imported, carry on.
        pass

    # Create BQ table backed backed by uploaded source CSV file
    raw_data_table_name = "raw_prescribing_data_{}".format(year_and_month)
    gcs_path = "hscic/prescribing/{}/{}".format(year_and_month, filename)

    logger.info("raw_data_table_name: %s", raw_data_table_name)
    logger.info("gcs_path: %s", gcs_path)

    raw_data_table = tmp_dataset_client.create_storage_backed_table(
        raw_data_table_name, RAW_PRESCRIBING_SCHEMA, gcs_path)

    # Append aggregated data to prescribing table
    sql = """
     SELECT
      Area_Team_Code AS sha,
      LEFT(PCO_Code, 3) AS pct,
      Practice_Code AS practice,
      BNF_Code AS bnf_code,
      BNF_Description AS bnf_name,
      SUM(Items) AS items,
      SUM(NIC) AS net_cost,
      SUM(Actual_Cost) AS actual_cost,
      SUM(Quantity * Items) AS quantity,
      TIMESTAMP('%s') AS month,
     FROM %s
     WHERE Practice_Code NOT LIKE '%%998'  -- see issue #349
     GROUP BY
       bnf_code, bnf_name, pct,
       practice, sha
    """ % (
        date.replace("_", "-"),
        raw_data_table.qualified_name,
    )

    logger.info("sql: %s", sql)

    prescribing_table = hscic_dataset_client.get_table("prescribing")
    prescribing_table.insert_rows_from_query(
        sql, legacy=True, write_disposition="WRITE_APPEND")

    # Write aggregated data to new table, for download
    sql = """
     SELECT
      LEFT(PCO_Code, 3) AS pct_id,
      Practice_Code AS practice_code,
      BNF_Code AS presentation_code,
      SUM(Items) AS total_items,
      SUM(NIC) AS net_cost,
      SUM(Actual_Cost) AS actual_cost,
      SUM(Quantity * Items) AS quantity,
      '%s' AS processing_date,
     FROM %s
     WHERE Practice_Code NOT LIKE '%%998'  -- see issue #349
     GROUP BY
       presentation_code, pct_id, practice_code
    """ % (
        date,
        raw_data_table.qualified_name,
    )

    fmtd_data_table_name = "formatted_prescribing_%s" % year_and_month

    logger.info("sql: %s", sql)
    logger.info("fmtd_data_table_name: %s", fmtd_data_table_name)

    fmtd_data_table = tmp_dataset_client.get_table(fmtd_data_table_name)
    fmtd_data_table.insert_rows_from_query(sql, legacy=True)

    # Export new table to storage, and download
    exporter = TableExporter(fmtd_data_table, gcs_path + "_formatted-")
    exporter.export_to_storage(print_header=False)

    with tempfile.NamedTemporaryFile(dir=head) as f:
        exporter.download_from_storage_and_unzip(f)

        # Sort the output.
        #
        # Why? Because this is equivalent to CLUSTERing the table on
        # loading, but less resource-intensive than doing it in
        # Postgres. And the table is too big to sort within BigQuery.
        #
        # NOTE(review): shell=True with interpolated paths -- paths come
        # from the operator-supplied --filename option, so this looks
        # trusted, but confirm it is never fed untrusted input.
        subprocess.call(
            "ionice -c 2 nice -n 10 sort -k3,3 -k1,1 -k2,2 -t, %s > %s"
            % (f.name, converted_path),
            shell=True,
        )
def handle(self, *args, **kwargs):
    """Upload the core Postgres tables to BigQuery.

    Copies practices, presentations, practice statistics, organisation
    tables (PCNs, CCGs, STPs, regional teams) from Postgres, appends any
    new practice statistics to the all-years archive table, and snapshots
    the latest month of prescribing_v2 into a per-month table.
    """
    update_bnf_table()

    client = BQClient("hscic")

    table = client.get_table("practices")
    table.insert_rows_from_pg(models.Practice, schemas.PRACTICE_SCHEMA)

    table = client.get_table("presentation")
    table.insert_rows_from_pg(
        models.Presentation,
        schemas.PRESENTATION_SCHEMA,
        transformer=schemas.presentation_transform,
    )

    table = client.get_table("practice_statistics")
    # The BQ schema's first/last column names differ from the Postgres
    # model fields, so patch the column list before copying.
    columns = [field.name for field in schemas.PRACTICE_STATISTICS_SCHEMA]
    columns[0] = "date"
    columns[-1] = "practice_id"
    table.insert_rows_from_pg(
        models.PracticeStatistics,
        schema=schemas.PRACTICE_STATISTICS_SCHEMA,
        columns=columns,
        transformer=schemas.statistics_transform,
    )

    # Append only months newer than what's already in the archive table.
    sql = "SELECT MAX(month) FROM {hscic}.practice_statistics_all_years"
    results = client.query(sql)
    if results.rows[0][0] is None:
        # Empty archive: use a sentinel date so everything is appended.
        last_uploaded_practice_statistics_date = datetime.date(1900, 1, 1)
    else:
        last_uploaded_practice_statistics_date = results.rows[0][0].date()

    table = client.get_table("practice_statistics_all_years")
    sql = """SELECT * FROM {hscic}.practice_statistics
    WHERE month > TIMESTAMP('{date}')"""
    substitutions = {"date": last_uploaded_practice_statistics_date}
    table.insert_rows_from_query(sql,
                                 write_disposition="WRITE_APPEND",
                                 substitutions=substitutions)

    table = client.get_table("pcns")
    table.insert_rows_from_pg(models.PCN, schemas.PCN_SCHEMA)

    table = client.get_table("ccgs")
    table.insert_rows_from_pg(models.PCT,
                              schemas.CCG_SCHEMA,
                              transformer=schemas.ccgs_transform)

    table = client.get_table("stps")
    table.insert_rows_from_pg(models.STP, schemas.STP_SCHEMA)

    table = client.get_table("regional_teams")
    table.insert_rows_from_pg(models.RegionalTeam,
                              schemas.REGIONAL_TEAM_SCHEMA)

    # Snapshot the most recently imported month of prescribing data into
    # a table named prescribing_YYYY_MM.
    date = models.ImportLog.objects.latest_in_category(
        "prescribing").current_at
    table = client.get_table("prescribing_" + date.strftime("%Y_%m"))
    sql = """SELECT * FROM {hscic}.prescribing_v2
    WHERE month = TIMESTAMP('{date}')"""
    substitutions = {"date": date}
    table.insert_rows_from_query(sql, substitutions=substitutions)
def handle(self, *args, **kwargs):
    """Upload the core Postgres tables to BigQuery.

    First checks that PracticeStatistics and Prescription are at the
    same month, then copies practices, presentations, practice
    statistics, CCGs, STPs and regional teams, appends new practice
    statistics to the all-years archive, and snapshots the latest month
    of prescribing into a per-month table.
    """
    # Make sure that PracticeStatistics and Prescription tables both have
    # latest data.
    latest_practice_statistic_date = models.PracticeStatistics.objects.aggregate(
        Max("date"))["date__max"]
    latest_prescription_date = models.Prescription.objects.aggregate(
        Max("processing_date"))["processing_date__max"]

    if latest_practice_statistic_date != latest_prescription_date:
        msg = ("Latest PracticeStatistics object has date {}, "
               "while latest Prescription object has processing_date {}".
               format(latest_practice_statistic_date,
                      latest_prescription_date))
        raise CommandError(msg)

    date = latest_prescription_date

    update_bnf_table()

    client = BQClient("hscic")

    table = client.get_table("practices")
    table.insert_rows_from_pg(models.Practice, schemas.PRACTICE_SCHEMA)

    table = client.get_table("presentation")
    table.insert_rows_from_pg(
        models.Presentation,
        schemas.PRESENTATION_SCHEMA,
        transformer=schemas.presentation_transform,
    )

    table = client.get_table("practice_statistics")
    # The BQ schema's first/last column names differ from the Postgres
    # model fields, so patch the column list before copying.
    columns = [field.name for field in schemas.PRACTICE_STATISTICS_SCHEMA]
    columns[0] = "date"
    columns[-1] = "practice_id"
    table.insert_rows_from_pg(
        models.PracticeStatistics,
        schema=schemas.PRACTICE_STATISTICS_SCHEMA,
        columns=columns,
        transformer=schemas.statistics_transform,
    )

    # Append only months newer than what's already in the archive table.
    sql = "SELECT MAX(month) FROM {hscic}.practice_statistics_all_years"
    results = client.query(sql)
    if results.rows[0][0] is None:
        # Empty archive: use a sentinel date so everything is appended.
        last_uploaded_practice_statistics_date = datetime.date(1900, 1, 1)
    else:
        last_uploaded_practice_statistics_date = results.rows[0][0].date()

    table = client.get_table("practice_statistics_all_years")
    sql = """SELECT * FROM {hscic}.practice_statistics
    WHERE month > TIMESTAMP('{date}')"""
    substitutions = {"date": last_uploaded_practice_statistics_date}
    table.insert_rows_from_query(sql,
                                 write_disposition="WRITE_APPEND",
                                 substitutions=substitutions)

    table = client.get_table("ccgs")
    table.insert_rows_from_pg(models.PCT,
                              schemas.CCG_SCHEMA,
                              transformer=schemas.ccgs_transform)

    table = client.get_table("stps")
    table.insert_rows_from_pg(models.STP, schemas.STP_SCHEMA)

    table = client.get_table("regional_teams")
    table.insert_rows_from_pg(models.RegionalTeam,
                              schemas.REGIONAL_TEAM_SCHEMA)

    # Snapshot the checked month of prescribing data into a table named
    # prescribing_YYYY_MM.
    table = client.get_table("prescribing_" + date.strftime("%Y_%m"))
    sql = """SELECT * FROM {hscic}.prescribing
    WHERE month = TIMESTAMP('{date}')"""
    substitutions = {"date": date}
    table.insert_rows_from_query(sql, substitutions=substitutions)
def handle(self, *args, **options):
    """Import one month of raw HSCIC prescribing data (v2 format).

    Expects `filename` to live in a directory named YYYY_MM.  Appends
    aggregated data to the BigQuery `prescribing_v2` table and records
    the import in ImportLog.
    """
    path = options["filename"]
    head, filename = os.path.split(path)
    _, year_and_month = os.path.split(head)

    logger.info("path: %s", path)
    logger.info("year_and_month: %s", year_and_month)

    # Validate that the containing directory encodes a YYYY_MM date.
    date = year_and_month + "_01"
    try:
        datetime.datetime.strptime(date, "%Y_%m_%d")
    except ValueError:
        message = ("The file path must have a YYYY_MM "
                   "date component in the containing directory: ")
        message += path
        raise CommandError(message)

    hscic_dataset_client = Client("hscic")
    tmp_dataset_client = Client("tmp_eu")

    # Check that we haven't already processed data for this month
    sql = """SELECT COUNT(*)
    FROM {hscic}.prescribing_v2
    WHERE month = TIMESTAMP('{date}')"""
    try:
        results = hscic_dataset_client.query(
            sql, substitutions={"date": date.replace("_", "-")})
        # A non-zero count means this month was already imported.
        assert results.rows[0][0] == 0
    except NotFound:
        # Table doesn't exist yet: nothing has been imported, carry on.
        pass

    # Create BQ table backed backed by uploaded source CSV file
    raw_data_table_name = "raw_prescribing_data_{}".format(year_and_month)
    gcs_path = "hscic/prescribing_v2/{}/{}".format(year_and_month,
                                                   filename)

    logger.info("raw_data_table_name: %s", raw_data_table_name)
    logger.info("gcs_path: %s", gcs_path)

    raw_data_table = tmp_dataset_client.create_storage_backed_table(
        raw_data_table_name, RAW_PRESCRIBING_SCHEMA_V2, gcs_path)

    # Append aggregated data to prescribing table
    sql = """
     SELECT
      AREA_TEAM_CODE AS sha,
      LEFT(PCO_CODE, 3) AS pct,
      PRACTICE_CODE AS practice,
      BNF_CODE AS bnf_code,
      BNF_DESCRIPTION AS bnf_name,
      SUM(ITEMS) AS items,
      SUM(NIC) AS net_cost,
      SUM(ACTUAL_COST) AS actual_cost,
      SUM(TOTAL_QUANTITY) AS quantity,
      TIMESTAMP('%s') AS month,
     FROM %s
     WHERE PRACTICE_CODE NOT LIKE '%%998'  -- see issue #349
     GROUP BY
       bnf_code, bnf_name, pct,
       practice, sha
    """ % (
        date.replace("_", "-"),
        raw_data_table.qualified_name,
    )

    logger.info("sql: %s", sql)

    prescribing_table = hscic_dataset_client.get_table("prescribing_v2")
    prescribing_table.insert_rows_from_query(
        sql, legacy=True, write_disposition="WRITE_APPEND")

    # Record the import so downstream tasks can find the latest month.
    ImportLog.objects.create(current_at=date.replace("_", "-"),
                             filename=filename,
                             category="prescribing")
def handle(self, *args, **kwargs):
    """Upload the core Postgres tables to BigQuery (older column-list
    based variant).

    First checks that PracticeStatistics and Prescription are at the
    same month, then copies practices, presentations, practice
    statistics, CCGs and PPU savings, appends new practice statistics
    to the all-years archive, and snapshots the latest month of
    prescribing into a per-month table.
    """
    # Make sure that PracticeStatistics and Prescription tables both have
    # latest data.
    latest_practice_statistic_date = models.PracticeStatistics.objects\
        .aggregate(Max('date'))['date__max']
    latest_prescription_date = models.Prescription.objects\
        .aggregate(Max('processing_date'))['processing_date__max']

    if latest_practice_statistic_date != latest_prescription_date:
        msg = 'Latest PracticeStatistics object has date {}, '\
            'while latest Prescription object has processing_date {}'\
            .format(latest_practice_statistic_date,
                    latest_prescription_date)
        raise CommandError(msg)

    date = latest_prescription_date

    update_bnf_table()

    client = BQClient('hscic')

    table = client.get_table('practices')
    columns = [field.name for field in schemas.PRACTICE_SCHEMA]
    table.insert_rows_from_pg(models.Practice, columns)

    table = client.get_table('presentation')
    columns = [field.name for field in schemas.PRESENTATION_SCHEMA]
    table.insert_rows_from_pg(models.Presentation, columns,
                              schemas.presentation_transform)

    table = client.get_table('practice_statistics')
    # The BQ schema's first/last column names differ from the Postgres
    # model fields, so patch the column list before copying.
    columns = [field.name for field in schemas.PRACTICE_STATISTICS_SCHEMA]
    columns[0] = 'date'
    columns[-1] = 'practice_id'
    table.insert_rows_from_pg(models.PracticeStatistics, columns,
                              schemas.statistics_transform)

    # Append only months newer than what's already in the archive table.
    sql = 'SELECT MAX(month) FROM {hscic}.practice_statistics_all_years'
    results = client.query(sql)
    if results.rows[0][0] is None:
        # Empty archive: use a sentinel date so everything is appended.
        last_uploaded_practice_statistics_date = datetime.date(1900, 1, 1)
    else:
        last_uploaded_practice_statistics_date = results.rows[0][0].date()

    table = client.get_table('practice_statistics_all_years')
    sql = '''SELECT
        month, pct_id, practice, male_0_4, female_0_4, male_5_14,
        female_5_14, male_15_24, female_15_24, male_25_34, female_25_34,
        male_35_44, female_35_44, male_45_54, female_45_54, male_55_64,
        female_55_64, male_65_74, female_65_74, male_75_plus,
        female_75_plus, total_list_size
    FROM {hscic}.practice_statistics
    WHERE month > TIMESTAMP('{date}')'''
    substitutions = {'date': last_uploaded_practice_statistics_date}
    table.insert_rows_from_query(sql,
                                 write_disposition='WRITE_APPEND',
                                 substitutions=substitutions)

    table = client.get_table('ccgs')
    columns = [field.name for field in schemas.CCG_SCHEMA]
    table.insert_rows_from_pg(models.PCT, columns, schemas.ccgs_transform)

    table = client.get_table('ppu_savings')
    columns = [field.name for field in PPU_SAVING_SCHEMA]
    table.insert_rows_from_pg(PPUSaving, columns, ppu_savings_transform)

    # Snapshot the checked month of prescribing data into a table named
    # prescribing_YYYY_MM.
    table = client.get_table('prescribing_' + date.strftime('%Y_%m'))
    sql = '''SELECT * FROM {hscic}.prescribing
    WHERE month = TIMESTAMP('{date}')'''
    substitutions = {'date': date}
    table.insert_rows_from_query(sql, substitutions=substitutions)
def test_the_lot(self):
    """End-to-end exercise of the gcutils Client/Table/TableExporter API.

    Creates a table, then round-trips data through CSV upload, queries,
    dataframes, storage export/import, storage-backed tables, views and
    Postgres import, asserting the rows at each step.
    """
    client = Client('test')

    schema = build_schema(
        ('a', 'INTEGER'),
        ('b', 'STRING'),
    )

    headers = ['a', 'b']

    rows = [
        (1, 'apple'),
        (2, 'banana'),
        (3, 'coconut'),
    ]

    t1 = client.get_or_create_table('t1', schema)
    t1_qname = t1.qualified_name

    # Test Table.insert_rows_from_csv
    t1.insert_rows_from_csv('gcutils/tests/test_table.csv')
    self.assertEqual(sorted(t1.get_rows()), rows)

    # Test Table.insert_rows_from_query
    t2 = client.get_table('t2')
    sql = 'SELECT * FROM {} WHERE a > 1'.format(t1_qname)
    t2.insert_rows_from_query(sql)
    self.assertEqual(sorted(t2.get_rows()), rows[1:])

    # Test Client.query
    sql = 'SELECT * FROM {} WHERE a > 2'.format(t1_qname)
    results = client.query(sql)
    self.assertEqual(sorted(results.rows), rows[2:])

    # Test Client.query_into_dataframe
    sql = 'SELECT * FROM {} WHERE a > 2'.format(t1_qname)
    df = client.query_into_dataframe(sql)
    self.assertEqual(df.values.tolist(), [list(rows[2])])

    # Test TableExporter.export_to_storage and
    # TableExporter.download_from_storage_and_unzip
    t1_exporter = TableExporter(t1, self.storage_prefix + 'test_table-')
    t1_exporter.export_to_storage()

    with tempfile.NamedTemporaryFile(mode='r+') as f:
        t1_exporter.download_from_storage_and_unzip(f)
        f.seek(0)
        reader = csv.reader(f)
        # next(reader) instead of reader.next(): works on Python 2 and 3
        # (reader.next() is Python-2-only).
        data = [next(reader)] + sorted(reader)

    # Wrap map() in list(): under Python 3 map returns an iterator which
    # would never compare equal to the list rows produced by the reader.
    self.assertEqual(data,
                     [list(map(str, row)) for row in [headers] + rows])

    # Test Table.insert_rows_from_storage
    storage_path = self.storage_prefix + 'test_table.csv'
    self.upload_to_storage('gcutils/tests/test_table.csv', storage_path)
    t2.insert_rows_from_storage(storage_path)
    self.assertEqual(sorted(t2.get_rows()), rows)

    # Test Client.create_storage_backed_table
    storage_path = self.storage_prefix + 'test_table_headers.csv'
    self.upload_to_storage(
        'gcutils/tests/test_table_headers.csv',
        storage_path
    )
    schema = [
        {'name': 'a', 'type': 'integer'},
        {'name': 'b', 'type': 'string'},
    ]
    t3 = client.create_storage_backed_table(
        't3',
        schema,
        storage_path
    )
    results = client.query('SELECT * FROM {}'.format(t3.qualified_name))
    self.assertEqual(sorted(results.rows), rows)

    # A storage-backed table reflects subsequent changes to the
    # underlying storage object.
    self.upload_to_storage(
        'gcutils/tests/test_table_headers_2.csv',
        storage_path
    )
    results = client.query('SELECT * FROM {}'.format(t3.qualified_name))
    self.assertEqual(sorted(results.rows), rows + [(4, u'damson')])

    # Test Client.create_table_with_view
    sql = 'SELECT * FROM {{project}}.{} WHERE a > 1'.format(t1_qname)
    t4 = client.create_table_with_view('t4', sql, False)
    results = client.query('SELECT * FROM {}'.format(t4.qualified_name))
    self.assertEqual(sorted(results.rows), rows[1:])

    # Test Client.insert_rows_from_pg
    PCT.objects.create(code='ABC', name='CCG 1')
    PCT.objects.create(code='XYZ', name='CCG 2')

    def transformer(row):
        return [ord(row[0][0]), row[1]]

    t1.insert_rows_from_pg(PCT, ['code', 'name'], transformer)
    self.assertEqual(sorted(t1.get_rows()),
                     [(65, 'CCG 1'), (88, 'CCG 2')])

    # Test Table.delete_all_rows
    t1.delete_all_rows()
    self.assertEqual(list(t1.get_rows()), [])
def test_the_lot(self):
    """End-to-end exercise of the gcutils Client/Table/TableExporter API.

    Creates a table, then round-trips data through CSV upload, queries,
    dataframes, storage export/import, storage-backed tables, views,
    dataset copy/move and Postgres import, asserting the rows at each
    step.
    """
    client = Client("test")
    archive_client = Client("archive")

    # The table is created with (STRING, INTEGER) but loaded with the
    # (INTEGER, STRING) schema below, exercising schema override on load.
    orig_schema = build_schema(("a", "STRING"), ("b", "INTEGER"))

    schema = build_schema(("a", "INTEGER"), ("b", "STRING"))

    headers = ["a", "b"]

    rows = [(1, "apple"), (2, "banana"), (3, "coconut")]

    t1 = client.get_or_create_table("t1", orig_schema)
    t1_qname = t1.qualified_name

    # Test Table.insert_rows_from_csv
    t1.insert_rows_from_csv("gcutils/tests/test_table.csv", schema)
    self.assertEqual(sorted(t1.get_rows()), rows)

    # Test Table.insert_rows_from_query
    t2 = client.get_table("t2")
    sql = "SELECT * FROM {} WHERE a > 1".format(t1_qname)
    t2.insert_rows_from_query(sql)
    self.assertEqual(sorted(t2.get_rows()), rows[1:])

    # Test Client.query
    sql = "SELECT * FROM {} WHERE a > 2".format(t1_qname)
    results = client.query(sql)
    self.assertEqual(sorted(results.rows), rows[2:])

    # Test Client.query_into_dataframe
    sql = "SELECT * FROM {} WHERE a > 2".format(t1_qname)
    df = client.query_into_dataframe(sql)
    self.assertEqual(df.values.tolist(), [list(rows[2])])

    # Test TableExporter.export_to_storage and
    # TableExporter.download_from_storage_and_unzip
    t1_exporter = TableExporter(t1, self.storage_prefix + "test_table-")
    t1_exporter.export_to_storage()

    with tempfile.NamedTemporaryFile(mode="r+") as f:
        t1_exporter.download_from_storage_and_unzip(f)
        f.seek(0)
        reader = csv.reader(f)
        data = [next(reader)] + sorted(reader)

    self.assertEqual(data,
                     [list(map(str, row)) for row in [headers] + rows])

    # Test Table.insert_rows_from_storage
    storage_path = self.storage_prefix + "test_table.csv"
    self.upload_to_storage("gcutils/tests/test_table.csv", storage_path)
    t2.insert_rows_from_storage(storage_path)
    self.assertEqual(sorted(t2.get_rows()), rows)

    # Test Client.create_storage_backed_table
    storage_path = self.storage_prefix + "test_table_headers.csv"
    self.upload_to_storage("gcutils/tests/test_table_headers.csv",
                           storage_path)
    schema = build_schema(("a", "INTEGER"), ("b", "STRING"))
    t3 = client.create_storage_backed_table("t3", schema, storage_path)
    results = client.query("SELECT * FROM {}".format(t3.qualified_name))
    self.assertEqual(sorted(results.rows), rows)

    # A storage-backed table reflects subsequent changes to the
    # underlying storage object.
    self.upload_to_storage("gcutils/tests/test_table_headers_2.csv",
                           storage_path)
    results = client.query("SELECT * FROM {}".format(t3.qualified_name))
    self.assertEqual(sorted(results.rows), rows + [(4, "damson")])

    # Test Client.create_table_with_view
    sql = "SELECT * FROM {{project}}.{} WHERE a > 1".format(t1_qname)
    t4 = client.create_table_with_view("t4", sql, False)
    results = client.query("SELECT * FROM {}".format(t4.qualified_name))
    self.assertEqual(sorted(results.rows), rows[1:])

    # Test Table.copy_to_new_dataset: source keeps its rows.
    t1.copy_to_new_dataset("archive")
    t1_archived = archive_client.get_table("t1")
    self.assertEqual(sorted(t1_archived.get_rows()), rows)
    self.assertEqual(sorted(t1.get_rows()), rows)

    # Test Table.move_to_new_dataset: source table is gone afterwards.
    t2.move_to_new_dataset("archive")
    t2_archived = archive_client.get_table("t2")
    self.assertEqual(sorted(t2_archived.get_rows()), rows)
    with self.assertRaises(NotFound):
        list(t2.get_rows())

    # Test Client.insert_rows_from_pg
    PCT.objects.create(code="ABC", name="CCG 1")
    PCT.objects.create(code="XYZ", name="CCG 2")

    def transformer(row):
        return [ord(row[0][0]), row[1]]

    t1.insert_rows_from_pg(
        PCT,
        build_schema(("code", "INTEGER"), ("name", "STRING")),
        transformer=transformer,
    )
    self.assertEqual(sorted(t1.get_rows()),
                     [(65, "CCG 1"), (88, "CCG 2")])

    # Test Table.delete_all_rows
    t1.delete_all_rows()
    self.assertEqual(list(t1.get_rows()), [])
def make_merged_table_for_month(month):
    """Create a new BigQuery table that includes code substitutions, off
    which our savings can be computed.

    What are code substitutions? Because (for example) Tramadol tablets
    and capsules can almost always be substituted, we consider them the
    same chemical for the purposes of our analysis. Therefore, wherever
    Tramadol capsules appear in the source data, we treat them as
    Tramadol tablets (for example).

    The mapping of what we consider equivalent is stored in a Google
    Sheet, currently at
    https://docs.google.com/spreadsheets/d/1SvMGCKrmqsNkZYuGW18Sf0wTluXyV4bhyZQaVLcO41c/edit

    The process of updating this spreadsheet (which should be done
    periodically) is documented
    [here](https://github.com/ebmdatalab/price-per-dose/issues/11)
    """
    substitutions = []
    codes_seen = set()

    # Only rows marked as genuinely equivalent count as substitutions.
    df = pd.read_csv(SUBSTITUTIONS_SPREADSHEET)
    df = df[df['Really equivalent?'] == 'Y']

    for _, data in df.iterrows():
        # Column 1 holds the canonical code, column 10 the code that
        # should be rewritten to it.
        source_code = data[1].strip()
        code_to_merge = data[10].strip()
        # Record each pairing only when neither code has appeared yet,
        # so a code is never mapped twice.
        if source_code not in codes_seen and code_to_merge not in codes_seen:
            substitutions.append((code_to_merge, source_code))
        codes_seen.add(source_code)
        codes_seen.add(code_to_merge)

    prescribing_table = 'normalised_prescribing_standard'
    # Each substitution becomes a WHEN clause of a single CASE expression
    # rewriting bnf_code on the fly.
    when_clauses = ' '.join(
        "WHEN '%s' THEN '%s'" % (when_code, then_code)
        for (when_code, then_code) in substitutions
    )
    sql = """
  SELECT
    practice,
    pct,
    CASE bnf_code
      %s
      ELSE bnf_code
    END AS bnf_code,
    month,
    actual_cost,
    net_cost,
    quantity
  FROM {hscic}.%s
  WHERE month = TIMESTAMP('%s')
""" % (when_clauses, prescribing_table, month)

    target_table_name = ('prescribing_with_merged_codes_%s'
                         % month.strftime('%Y_%m'))
    client = Client('hscic')
    table = client.get_table(target_table_name)
    table.insert_rows_from_query(sql)
    return target_table_name
def handle(self, *args, **options):
    """Import one month of raw HSCIC prescribing data (inline-schema
    variant).

    Expects `filename` to live in a directory named YYYY_MM.  Appends
    aggregated data to the main BigQuery `prescribing` table, writes a
    per-month "formatted" table, exports it to storage, downloads it,
    and sorts it locally ready for loading into Postgres.
    """
    path = options['filename']
    head, filename = os.path.split(path)
    converted_path = '{}_formatted.CSV'.format(os.path.splitext(path)[0])
    _, year_and_month = os.path.split(head)

    logger.info('path: %s', path)
    logger.info('converted_path: %s', converted_path)
    logger.info('year_and_month: %s', year_and_month)

    # Validate that the containing directory encodes a YYYY_MM date.
    date = year_and_month + '_01'
    try:
        datetime.datetime.strptime(date, '%Y_%m_%d')
    except ValueError:
        message = ('The file path must have a YYYY_MM '
                   'date component in the containing directory: ')
        message += path
        raise CommandError(message)

    hscic_dataset_client = Client('hscic')
    tmp_dataset_client = Client('tmp_eu')

    # Check that we haven't already processed data for this month
    sql = '''SELECT COUNT(*)
    FROM {dataset}.prescribing
    WHERE month = TIMESTAMP('{date}')'''.format(
        dataset=hscic_dataset_client.dataset_id,
        date=date.replace('_', '-'),
    )
    try:
        results = hscic_dataset_client.query(sql)
        # A non-zero count means this month was already imported.
        assert results.rows[0][0] == 0
    except NotFound:
        # Table doesn't exist yet: nothing has been imported, carry on.
        pass

    # Create BQ table backed backed by uploaded source CSV file
    raw_data_table_name = 'raw_prescribing_data_{}'.format(year_and_month)
    gcs_path = 'hscic/prescribing/{}/{}'.format(year_and_month, filename)

    logger.info('raw_data_table_name: %s', raw_data_table_name)
    logger.info('gcs_path: %s', gcs_path)

    # Schema of the raw HSCIC CSV as uploaded; "required" marks the
    # columns the aggregation below depends on.
    schema = [
        {'name': 'Regional_Office_Name', 'type': 'string'},
        {'name': 'Regional_Office_Code', 'type': 'string'},
        {'name': 'Area_Team_Name', 'type': 'string'},
        {'name': 'Area_Team_Code', 'type': 'string', 'mode': 'required'},
        {'name': 'PCO_Name', 'type': 'string'},
        {'name': 'PCO_Code', 'type': 'string'},
        {'name': 'Practice_Name', 'type': 'string'},
        {'name': 'Practice_Code', 'type': 'string', 'mode': 'required'},
        {'name': 'BNF_Code', 'type': 'string', 'mode': 'required'},
        {'name': 'BNF_Description', 'type': 'string', 'mode': 'required'},
        {'name': 'Items', 'type': 'integer', 'mode': 'required'},
        {'name': 'Quantity', 'type': 'integer', 'mode': 'required'},
        {'name': 'ADQ_Usage', 'type': 'float'},
        {'name': 'NIC', 'type': 'float', 'mode': 'required'},
        {'name': 'Actual_Cost', 'type': 'float', 'mode': 'required'},
    ]
    raw_data_table = tmp_dataset_client.create_storage_backed_table(
        raw_data_table_name,
        schema,
        gcs_path
    )

    # Append aggregated data to prescribing table
    sql = '''
     SELECT
      Area_Team_Code AS sha,
      LEFT(PCO_Code, 3) AS pct,
      Practice_Code AS practice,
      BNF_Code AS bnf_code,
      BNF_Description AS bnf_name,
      SUM(Items) AS items,
      SUM(NIC) AS net_cost,
      SUM(Actual_Cost) AS actual_cost,
      SUM(Quantity * Items) AS quantity,
      TIMESTAMP('%s') AS month,
     FROM %s
     WHERE Practice_Code NOT LIKE '%%998'  -- see issue #349
     GROUP BY
       bnf_code, bnf_name, pct,
       practice, sha
    ''' % (date.replace('_', '-'), raw_data_table.qualified_name)

    logger.info('sql: %s', sql)

    prescribing_table = hscic_dataset_client.get_table('prescribing')
    prescribing_table.insert_rows_from_query(
        sql,
        legacy=True,
        write_disposition='WRITE_APPEND'
    )

    # Write aggregated data to new table, for download
    sql = '''
     SELECT
      LEFT(PCO_Code, 3) AS pct_id,
      Practice_Code AS practice_code,
      BNF_Code AS presentation_code,
      SUM(Items) AS total_items,
      SUM(NIC) AS net_cost,
      SUM(Actual_Cost) AS actual_cost,
      SUM(Quantity * Items) AS quantity,
      '%s' AS processing_date,
     FROM %s
     WHERE Practice_Code NOT LIKE '%%998'  -- see issue #349
     GROUP BY
       presentation_code, pct_id, practice_code
    ''' % (date, raw_data_table.qualified_name)

    fmtd_data_table_name = 'formatted_prescribing_%s' % year_and_month

    logger.info('sql: %s', sql)
    logger.info('fmtd_data_table_name: %s', fmtd_data_table_name)

    fmtd_data_table = tmp_dataset_client.get_table(fmtd_data_table_name)
    fmtd_data_table.insert_rows_from_query(sql, legacy=True)

    # Export new table to storage, and download
    exporter = TableExporter(fmtd_data_table, gcs_path + '_formatted-')
    exporter.export_to_storage(print_header=False)

    with tempfile.NamedTemporaryFile(dir=head) as f:
        exporter.download_from_storage_and_unzip(f)

        # Sort the output.
        #
        # Why? Because this is equivalent to CLUSTERing the table on
        # loading, but less resource-intensive than doing it in
        # Postgres. And the table is too big to sort within BigQuery.
        #
        # NOTE(review): shell=True with interpolated paths -- paths come
        # from the operator-supplied --filename option, so this looks
        # trusted, but confirm it is never fed untrusted input.
        subprocess.call(
            "ionice -c 2 nice -n 10 sort -k3,3 -k1,1 -k2,2 -t, %s > %s" % (
                f.name, converted_path),
            shell=True)