def backup_table(self, table_name):
    client = Client("hscic")
    sql = "SELECT max(month) FROM {hscic}.%s" % table_name
    latest_date = client.query(sql).rows[0][0]
    latest_year_and_month = latest_date.strftime("%Y_%m")
    table = client.get_table(table_name)

    storage_client = StorageClient()
    bucket = storage_client.bucket()
    year_and_months = set()

    prefix_base = "backups/{}/".format(table_name)

    for blob in bucket.list_blobs(prefix=prefix_base):
        match = re.search(r"/(\d{4}_\d{2})/", blob.name)
        year_and_months.add(match.groups()[0])

    if latest_year_and_month in year_and_months:
        print("{} table already backed up for {}".format(
            table_name, latest_year_and_month))
        return

    storage_prefix = "{}/{}/{}-".format(
        prefix_base, latest_year_and_month, table_name)
    exporter = TableExporter(table, storage_prefix)
    exporter.export_to_storage()
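# A minimal sketch (with hypothetical blob names) of how backup_table
# recovers the set of already-backed-up months from the storage paths, so
# it can skip a month that has already been exported.
import re

names = [
    "backups/prescribing/2019_03/prescribing-000000000000.csv.gz",
    "backups/prescribing/2019_04/prescribing-000000000000.csv.gz",
]
year_and_months = {
    re.search(r"/(\d{4}_\d{2})/", name).group(1) for name in names
}
assert year_and_months == {"2019_03", "2019_04"}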
def test_data_is_aggregated(self):
    # There are 11 rows in the input file; 2 are for the same
    # practice/presentation and should be collapsed, and 1 is for
    # an UNKNOWN SURGERY (see issue #349).
    raw_data_path = ("frontend/tests/fixtures/commands/"
                     "convert_hscic_prescribing/2016_01/"
                     "EPD_201601.csv")
    gcs_path = "hscic/prescribing_v2/2016_01/EPD_201601.csv"

    client = StorageClient()
    bucket = client.get_bucket()
    blob = bucket.blob(gcs_path)
    with open(raw_data_path, "rb") as f:
        blob.upload_from_file(f)

    call_command("convert_hscic_prescribing", filename=raw_data_path)

    # Test that data was added to the prescribing table
    client = BQClient()
    sql = """SELECT *
             FROM {hscic}.prescribing_v2
             WHERE month = TIMESTAMP('2016-01-01')"""
    rows = list(results_to_dicts(client.query(sql)))
    self.assertEqual(len(rows), 9)
    for row in rows:
        if (row["practice"] == "P92042"
                and row["bnf_code"] == "0202010B0AAABAB"):
            self.assertEqual(row["quantity"], 1288)
def tearDown(self):
    client = Client('test')
    client.delete_dataset()

    client = StorageClient()
    bucket = client.bucket()
    for blob in bucket.list_blobs(prefix=self.storage_prefix):
        blob.delete()
def setUpClass(cls):
    if 'SKIP_BQ_LOAD' not in os.environ:
        # Create local test data from fixtures, then upload this to a
        # test project in BigQuery
        call_command('loaddata',
                     'frontend/tests/fixtures/ccgs.json',
                     verbosity=0)
        call_command('loaddata',
                     'frontend/tests/fixtures/practices.json',
                     verbosity=0)
        call_command('loaddata',
                     'frontend/tests/fixtures/practice_listsizes.json',
                     verbosity=0)

        prescribing_fixture_path = os.path.join(
            'frontend', 'tests', 'fixtures', 'commands',
            'prescribing_bigquery_views_fixture.csv'
        )

        client = Client('hscic')

        for table_name in [
                'normalised_prescribing_standard',
                'normalised_prescribing_legacy']:
            table = client.get_or_create_table(
                table_name,
                PRESCRIBING_SCHEMA
            )
            table.insert_rows_from_csv(prescribing_fixture_path)

        table = client.get_or_create_table('ccgs', CCG_SCHEMA)
        columns = [field.name for field in CCG_SCHEMA]
        table.insert_rows_from_pg(PCT, columns, ccgs_transform)

        table = client.get_or_create_table(
            'practice_statistics',
            PRACTICE_STATISTICS_SCHEMA
        )
        columns = [field.name for field in PRACTICE_STATISTICS_SCHEMA]
        columns[0] = 'date'
        columns[-1] = 'practice_id'
        table.insert_rows_from_pg(
            PracticeStatistics,
            columns,
            statistics_transform
        )

        client = StorageClient()
        bucket = client.get_bucket()
        for blob in bucket.list_blobs(prefix='hscic/views/vw__'):
            blob.delete()

    ImportLog.objects.create(
        category='prescribing', current_at='2015-10-01')

    # Create view tables and indexes
    with open(
            'frontend/management/commands/replace_matviews.sql', 'r') as f:
        with connection.cursor() as c:
            c.execute(f.read())
def test_existing_files_deleted(self):
    # Create a dataset fragment which should end up being deleted
    client = StorageClient()
    bucket = client.get_bucket()
    blob_name = ('hscic/views/vw__presentation_summary_by_ccg'
                 '-000000009999.csv.gz')
    blob = bucket.blob(blob_name)
    blob.upload_from_string("test", content_type="application/gzip")

    # Run import command
    call_command('create_views')

    # Check the blob is no longer there
    client = StorageClient()
    bucket = client.get_bucket()
    prefix, suffix = blob_name.split('-')
    for blob in bucket.list_blobs(prefix=prefix):
        self.assertNotIn(suffix, blob.path)
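# A minimal sketch of the split performed by the check above: the part of
# the blob name before the '-' is the view prefix used to list blobs, and
# the part after is the shard suffix that should no longer appear.
prefix, suffix = ('hscic/views/vw__presentation_summary_by_ccg'
                  '-000000009999.csv.gz').split('-')
assert prefix == 'hscic/views/vw__presentation_summary_by_ccg'
assert suffix == '000000009999.csv.gz'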
def update_bnf_table():
    """Update `bnf` table from cloud-stored CSV"""
    storage_client = StorageClient()
    bucket = storage_client.get_bucket()
    blobs = bucket.list_blobs(prefix="hscic/bnf_codes/")

    # Blob names embed the date, so sorting by name in reverse order puts
    # the most recent CSV first
    blobs = sorted(blobs, key=lambda blob: blob.name, reverse=True)
    blob = blobs[0]

    bq_client = BQClient("hscic")
    table = bq_client.get_table("bnf")
    table.insert_rows_from_storage(blob.name, skip_leading_rows=1)
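# A minimal sketch (with hypothetical blob names) of why reverse
# lexicographic sorting selects the most recent export: the names embed an
# ISO-style date, so string order matches date order.
names = ["hscic/bnf_codes/2019-01-15.csv", "hscic/bnf_codes/2019-06-01.csv"]
assert sorted(names, reverse=True)[0] == "hscic/bnf_codes/2019-06-01.csv"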
def upload_task_input_files(task):
    storage_client = StorageClient()
    bucket = storage_client.get_bucket()

    for path in task.input_paths():
        assert path[0] == '/'
        assert settings.PIPELINE_DATA_BASEDIR[-1] == '/'
        name = 'hscic' + path.replace(settings.PIPELINE_DATA_BASEDIR, '/')
        blob = bucket.blob(name)
        if blob.exists():
            print("Skipping %s, already uploaded" % name)
            continue
        print("Uploading %s to %s" % (path, name))
        with open(path, 'rb') as f:
            blob.upload_from_file(f)
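# A minimal sketch of the local-path-to-blob-name mapping performed by
# upload_task_input_files, using a hypothetical PIPELINE_DATA_BASEDIR.
basedir = '/srv/openprescribing-data/data/'
path = basedir + 'prescribing/2017_09/prescribing.csv'
name = 'hscic' + path.replace(basedir, '/')
assert name == 'hscic/prescribing/2017_09/prescribing.csv'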
def test_data_is_aggregated(self):
    # There are 11 rows in the input file; 2 are for the same
    # practice/presentation and should be collapsed, and 1 is for
    # an UNKNOWN SURGERY (see issue #349).
    raw_data_path = ('frontend/tests/fixtures/commands/'
                     'convert_hscic_prescribing/2016_01/'
                     'Detailed_Prescribing_Information.csv')
    converted_data_path = ('frontend/tests/fixtures/commands/'
                           'convert_hscic_prescribing/2016_01/'
                           'Detailed_Prescribing_Information_formatted.CSV')
    gcs_path = ('hscic/prescribing/2016_01/'
                'Detailed_Prescribing_Information.csv')

    client = StorageClient()
    bucket = client.get_bucket()
    blob = bucket.blob(gcs_path)
    with open(raw_data_path, 'rb') as f:
        blob.upload_from_file(f)

    call_command('convert_hscic_prescribing', filename=raw_data_path)

    # Test that data was added to the prescribing table
    client = BQClient()
    sql = '''SELECT *
             FROM {hscic}.prescribing
             WHERE month = TIMESTAMP('2016-01-01')'''
    rows = list(results_to_dicts(client.query(sql)))
    self.assertEqual(len(rows), 9)
    for row in rows:
        if (row['practice'] == 'P92042'
                and row['bnf_code'] == '0202010B0AAABAB'):
            self.assertEqual(row['quantity'], 1288)

    # Test that the downloaded data is correct
    with open(converted_data_path) as f:
        rows = list(csv.reader(f))
    self.assertEqual(len(rows), 9)
    for row in rows:
        if row[1] == 'P92042' and row[2] == '0202010B0AAABAB':
            self.assertEqual(row[6], '1288')
def create_storage_backed_table(self, table_id, schema, gcs_path):
    gcs_client = StorageClient()
    bucket = gcs_client.bucket()
    if bucket.get_blob(gcs_path) is None:
        raise RuntimeError("Could not find blob at {}".format(gcs_path))

    gcs_uri = "gs://{}/{}".format(self.project, gcs_path)
    schema_as_dict = [{"name": s.name, "type": s.field_type.lower()}
                      for s in schema]
    resource = {
        "tableReference": {"tableId": table_id},
        "externalDataConfiguration": {
            "csvOptions": {"skipLeadingRows": "1"},
            "sourceFormat": "CSV",
            "sourceUris": [gcs_uri],
            "schema": {"fields": schema_as_dict},
        },
    }

    path = "/projects/{}/datasets/{}/tables".format(
        self.project, self.dataset_id)

    try:
        self.gcbq_client._connection.api_request(
            method="POST", path=path, data=resource)
    except NotFound as e:
        if not dataset_is_missing(e):
            raise
        self.create_dataset()
        self.gcbq_client._connection.api_request(
            method="POST", path=path, data=resource)

    return self.get_table(table_id)
def create_storage_backed_table(self, table_id, schema, gcs_path):
    gcs_client = StorageClient()
    bucket = gcs_client.bucket()
    if bucket.get_blob(gcs_path) is None:
        raise RuntimeError('Could not find blob at {}'.format(gcs_path))

    gcs_uri = 'gs://{}/{}'.format(self.project, gcs_path)
    schema_as_dict = [{'name': s.name, 'type': s.field_type.lower()}
                      for s in schema]
    resource = {
        'tableReference': {'tableId': table_id},
        'externalDataConfiguration': {
            'csvOptions': {'skipLeadingRows': '1'},
            'sourceFormat': 'CSV',
            'sourceUris': [gcs_uri],
            'schema': {'fields': schema_as_dict}
        }
    }

    path = '/projects/{}/datasets/{}/tables'.format(
        self.project, self.dataset_id
    )

    try:
        self.gcbq_client._connection.api_request(
            method='POST', path=path, data=resource
        )
    except NotFound as e:
        if not dataset_is_missing(e):
            raise
        self.create_dataset()
        self.gcbq_client._connection.api_request(
            method='POST', path=path, data=resource
        )

    return self.get_table(table_id)
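# A hypothetical call to create_storage_backed_table, assuming a Client
# bound to the 'hscic' dataset and a CSV already uploaded to the bucket.
# SchemaField is the standard google.cloud.bigquery schema type whose
# .name and .field_type attributes the method reads above; the table and
# blob names here are illustrative only.
from google.cloud.bigquery import SchemaField

client = Client('hscic')
table = client.create_storage_backed_table(
    'bnf_codes',
    [SchemaField('bnf_code', 'STRING'), SchemaField('name', 'STRING')],
    'hscic/bnf_codes/2019-06-01.csv',
)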
def run_end_to_end():
    print('BQ_NONCE: {}'.format(settings.BQ_NONCE))

    call_command('migrate')

    path = os.path.join(settings.APPS_ROOT, 'frontend', 'management',
                        'commands', 'measure_definitions')
    # No MeasureGlobals or MeasureValues are generated for the ghost branded
    # generics measure, because both numerator and denominator are computed
    # from a view (vw__ghost_generic_measure) which has no data.  Rather than
    # populate this view, it is simpler to pretend it doesn't exist.
    num_measures = len(os.listdir(path)) - 1

    shutil.rmtree(settings.PIPELINE_DATA_BASEDIR, ignore_errors=True)

    with open(settings.PIPELINE_IMPORT_LOG_PATH, 'w') as f:
        f.write('{}')

    for blob in StorageClient().bucket().list_blobs():
        blob.delete()

    for dataset_key in DATASETS:
        BQClient(dataset_key).create_dataset()

    client = BQClient('hscic')
    client.create_table('bnf', schemas.BNF_SCHEMA)
    client.create_table('ccgs', schemas.CCG_SCHEMA)
    client.create_table('ppu_savings', schemas.PPU_SAVING_SCHEMA)
    client.create_table(
        'practice_statistics',
        schemas.PRACTICE_STATISTICS_SCHEMA
    )
    client.create_table(
        'practice_statistics_all_years',
        schemas.PRACTICE_STATISTICS_SCHEMA
    )
    client.create_table('practices', schemas.PRACTICE_SCHEMA)
    client.create_table('prescribing', schemas.PRESCRIBING_SCHEMA)
    client.create_table('presentation', schemas.PRESENTATION_SCHEMA)
    client.create_table('tariff', schemas.TARIFF_SCHEMA)
    client.create_table('bdz_adq', schemas.BDZ_ADQ_SCHEMA)

    client = BQClient('measures')
    # This is enough of a schema to allow the practice_data_all_low_priority
    # table to be created, since it references these fields.  Once populated
    # by import_measures, the tables in the measures dataset will have several
    # more fields.  But we don't need to specify exactly what they are, as BQ
    # will work it out when the data is inserted with insert_rows_from_query.
    measures_schema = build_schema(
        ('month', 'DATE'),
        ('practice_id', 'STRING'),
        ('numerator', 'INTEGER'),
        ('denominator', 'INTEGER'),
    )
    path = os.path.join(settings.APPS_ROOT, 'frontend', 'management',
                        'commands', 'measure_definitions', '*.json')
    for path in glob.glob(path):
        measure_id = os.path.splitext(os.path.basename(path))[0]
        client.create_table('practice_data_' + measure_id, measures_schema)
        client.create_table('ccg_data_' + measure_id, measures_schema)
        client.create_table('global_data_' + measure_id, measures_schema)

    # Although there are no model instances, we call upload_model to create
    # the tables in BQ that might be required by certain measure views.
    client = BQClient('dmd')
    client.upload_model(TariffPrice)
    client.upload_model(VMPP)

    call_command('generate_presentation_replacements')

    path = os.path.join(settings.APPS_ROOT, 'frontend', 'management',
                        'commands', 'replace_matviews.sql')
    with open(path) as f:
        with connection.cursor() as c:
            c.execute(f.read())

    copy_tree(
        os.path.join(e2e_path, 'data-1'),
        os.path.join(e2e_path, 'data'),
    )

    runner.run_all(2017, 9, under_test=True)

    # We expect one MeasureGlobal per measure per month.
    assert_count_equal(num_measures, MeasureGlobal)

    # We expect one MeasureValue for each organisation per measure per month
    # (there are 4 practices, 2 CCGs, 2 STPs, and 2 regional teams).
    assert_count_equal(10 * num_measures, MeasureValue)

    # We expect one statistic per CCG per month
    assert_raw_count_equal(2, 'vw__ccgstatistics')

    # We expect one chemical summary per CCG per month
    assert_raw_count_equal(2, 'vw__chemical_summary_by_ccg',
                           "chemical_id = '1001030C0'")

    # We expect one chemical summary per practice per month
    assert_raw_count_equal(4, 'vw__chemical_summary_by_practice',
                           "chemical_id = '1001030C0'")

    # We expect one summary per practice per month
    assert_raw_count_equal(4, 'vw__practice_summary')

    # We expect one presentation summary per month
    assert_raw_count_equal(1, 'vw__presentation_summary',
                           "presentation_code = '1001030C0AAAAAA'")

    # We expect one presentation summary per CCG per month
    assert_raw_count_equal(2, 'vw__presentation_summary_by_ccg',
                           "presentation_code = '1001030C0AAAAAA'")

    copy_tree(
        os.path.join(e2e_path, 'data-2'),
        os.path.join(e2e_path, 'data'),
    )

    runner.run_all(2017, 10, under_test=True)

    # We expect one MeasureGlobal per measure per month
    assert_count_equal(2 * num_measures, MeasureGlobal)

    # We expect one MeasureValue for each organisation per measure per month
    assert_count_equal(20 * num_measures, MeasureValue)

    # We expect one statistic per CCG per month
    assert_raw_count_equal(4, 'vw__ccgstatistics')

    # We expect one chemical summary per CCG per month
    assert_raw_count_equal(4, 'vw__chemical_summary_by_ccg',
                           "chemical_id = '1001030C0'")

    # We expect one chemical summary per practice per month
    assert_raw_count_equal(8, 'vw__chemical_summary_by_practice',
                           "chemical_id = '1001030C0'")

    # We expect one summary per practice per month
    assert_raw_count_equal(8, 'vw__practice_summary')

    # We expect one presentation summary per month
    assert_raw_count_equal(2, 'vw__presentation_summary',
                           "presentation_code = '1001030C0AAAAAA'")

    # We expect one presentation summary per CCG per month
    assert_raw_count_equal(4, 'vw__presentation_summary_by_ccg',
                           "presentation_code = '1001030C0AAAAAA'")
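# Minimal sketches (hypothetical implementations) of the assertion helpers
# used in these end-to-end checks: assert_count_equal counts Django model
# instances, and assert_raw_count_equal counts rows in a materialised view,
# optionally filtered by a WHERE condition.  Test-only code: the view name
# and condition are trusted literals, so plain string interpolation is used.
from django.db import connection


def assert_count_equal(expected, model):
    actual = model.objects.count()
    assert actual == expected, '{}: expected {}, got {}'.format(
        model.__name__, expected, actual)


def assert_raw_count_equal(expected, view, condition='1 = 1'):
    with connection.cursor() as c:
        c.execute('SELECT COUNT(*) FROM {} WHERE {}'.format(view, condition))
        actual = c.fetchone()[0]
    assert actual == expected, '{}: expected {}, got {}'.format(
        view, expected, actual)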
def __init__(self, table, storage_prefix):
    self.table = table
    self.storage_prefix = storage_prefix
    storage_client = StorageClient()
    self.bucket = storage_client.bucket()
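# Hypothetical usage of TableExporter, mirroring backup_table above: export
# a BigQuery table to CSV shards under a storage prefix.  The table and
# prefix names are illustrative only.
table = Client("hscic").get_table("prescribing")
exporter = TableExporter(table, "backups/prescribing/2019_04/prescribing-")
exporter.export_to_storage()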
def setUpClass(cls):
    if 'SKIP_BQ_LOAD' not in os.environ:
        # Create local test data from fixtures, then upload this to a
        # test project in BigQuery
        call_command('loaddata',
                     'frontend/tests/fixtures/orgs.json',
                     verbosity=0)
        call_command('loaddata',
                     'frontend/tests/fixtures/practices.json',
                     verbosity=0)
        call_command('loaddata',
                     'frontend/tests/fixtures/practice_listsizes.json',
                     verbosity=0)

        client = Client('hscic')

        table = client.get_or_create_table('prescribing',
                                           PRESCRIBING_SCHEMA)
        prescribing_fixture_path = os.path.join(
            'frontend', 'tests', 'fixtures', 'commands',
            'prescribing_bigquery_views_fixture.csv')
        table.insert_rows_from_csv(prescribing_fixture_path)

        table = client.get_or_create_table('ccgs', CCG_SCHEMA)
        columns = [field.name for field in CCG_SCHEMA]
        table.insert_rows_from_pg(PCT, columns, ccgs_transform)

        table = client.get_or_create_table('practices', PRACTICE_SCHEMA)
        columns = [field.name for field in PRACTICE_SCHEMA]
        table.insert_rows_from_pg(Practice, columns)

        table = client.get_or_create_table('practice_statistics',
                                           PRACTICE_STATISTICS_SCHEMA)
        columns = [field.name for field in PRACTICE_STATISTICS_SCHEMA]
        columns[0] = 'date'
        columns[-1] = 'practice_id'
        table.insert_rows_from_pg(PracticeStatistics, columns,
                                  statistics_transform)

        sql = """
        SELECT
          prescribing.sha AS sha,
          practices.ccg_id AS pct,
          prescribing.practice AS practice,
          prescribing.bnf_code AS bnf_code,
          prescribing.bnf_name AS bnf_name,
          prescribing.items AS items,
          prescribing.net_cost AS net_cost,
          prescribing.actual_cost AS actual_cost,
          prescribing.quantity AS quantity,
          prescribing.month AS month
        FROM {project}.{hscic}.prescribing AS prescribing
        INNER JOIN {project}.{hscic}.practices AS practices
          ON practices.code = prescribing.practice
        """

        try:
            client.delete_table('normalised_prescribing_standard')
        except NotFound:
            pass

        client.create_table_with_view('normalised_prescribing_standard',
                                      sql, False)

        client = StorageClient()
        bucket = client.get_bucket()
        for blob in bucket.list_blobs(prefix='hscic/views/vw__'):
            blob.delete()
    else:
        assert 'BQ_NONCE' in os.environ, 'Set BQ_NONCE to reuse BQ data'

    ImportLog.objects.create(category='prescribing',
                             current_at='2015-10-01')

    # Create view tables and indexes
    with open('frontend/management/commands/replace_matviews.sql') as f:
        with connection.cursor() as c:
            c.execute(f.read())
def run_end_to_end():
    print('BQ_NONCE: {}'.format(settings.BQ_NONCE))

    num_measures = 57

    shutil.rmtree(settings.PIPELINE_DATA_BASEDIR, ignore_errors=True)

    with open(settings.PIPELINE_IMPORT_LOG_PATH, 'w') as f:
        f.write('{}')

    for blob in StorageClient().bucket().list_blobs():
        blob.delete()

    for dataset_key in DATASETS:
        BQClient(dataset_key).create_dataset()

    client = BQClient('hscic')
    client.create_table('bnf', schemas.BNF_SCHEMA)
    client.create_table('ccgs', schemas.CCG_SCHEMA)
    client.create_table('ppu_savings', schemas.PPU_SAVING_SCHEMA)
    client.create_table('practice_statistics',
                        schemas.PRACTICE_STATISTICS_SCHEMA)
    client.create_table('practice_statistics_all_years',
                        schemas.PRACTICE_STATISTICS_SCHEMA)
    client.create_table('practices', schemas.PRACTICE_SCHEMA)
    client.create_table('prescribing', schemas.PRESCRIBING_SCHEMA)
    client.create_table('presentation', schemas.PRESENTATION_SCHEMA)
    client.create_table('tariff', schemas.TARIFF_SCHEMA)

    client = BQClient('measures')
    # This is enough of a schema to allow the practice_data_all_low_priority
    # table to be created, since it references these fields.  Once populated
    # by import_measures, the tables in the measures dataset will have several
    # more fields.  But we don't need to specify exactly what they are, as BQ
    # will work it out when the data is inserted with insert_rows_from_query.
    measures_schema = build_schema(
        ('month', 'DATE'),
        ('practice_id', 'STRING'),
        ('numerator', 'INTEGER'),
        ('denominator', 'INTEGER'),
    )
    path = os.path.join(settings.SITE_ROOT, 'frontend', 'management',
                        'commands', 'measure_definitions', '*.json')
    for path in glob.glob(path):
        measure_id = os.path.splitext(os.path.basename(path))[0]
        client.create_table('practice_data_' + measure_id, measures_schema)
        client.create_table('ccg_data_' + measure_id, measures_schema)
        client.create_table('global_data_' + measure_id, measures_schema)

    call_command('generate_presentation_replacements')

    path = os.path.join(settings.SITE_ROOT, 'frontend', 'management',
                        'commands', 'replace_matviews.sql')
    with open(path) as f:
        with connection.cursor() as c:
            c.execute(f.read())

    copy_tree(
        os.path.join(e2e_path, 'data-1'),
        os.path.join(e2e_path, 'data'),
    )

    runner.run_all(2017, 9, under_test=True)

    # We expect one MeasureGlobal per measure per month.  If this assert
    # fails, check that num_measures is still correct.
    assert_count_equal(num_measures, MeasureGlobal)

    # We expect one MeasureValue for each CCG or practice per measure per
    # month
    assert_count_equal(6 * num_measures, MeasureValue)

    # We expect one statistic per CCG per month
    assert_raw_count_equal(2, 'vw__ccgstatistics')

    # We expect one chemical summary per CCG per month
    assert_raw_count_equal(2, 'vw__chemical_summary_by_ccg',
                           "chemical_id = '1001030C0'")

    # We expect one chemical summary per practice per month
    assert_raw_count_equal(4, 'vw__chemical_summary_by_practice',
                           "chemical_id = '1001030C0'")

    # We expect one summary per practice per month
    assert_raw_count_equal(4, 'vw__practice_summary')

    # We expect one presentation summary per month
    assert_raw_count_equal(1, 'vw__presentation_summary',
                           "presentation_code = '1001030C0AAAAAA'")

    # We expect one presentation summary per CCG per month
    assert_raw_count_equal(2, 'vw__presentation_summary_by_ccg',
                           "presentation_code = '1001030C0AAAAAA'")

    copy_tree(
        os.path.join(e2e_path, 'data-2'),
        os.path.join(e2e_path, 'data'),
    )

    runner.run_all(2017, 10, under_test=True)

    # We expect one MeasureGlobal per measure per month
    assert_count_equal(2 * num_measures, MeasureGlobal)

    # We expect one MeasureValue for each CCG or practice per measure per
    # month
    assert_count_equal(12 * num_measures, MeasureValue)

    # We expect one statistic per CCG per month
    assert_raw_count_equal(4, 'vw__ccgstatistics')

    # We expect one chemical summary per CCG per month
    assert_raw_count_equal(4, 'vw__chemical_summary_by_ccg',
                           "chemical_id = '1001030C0'")

    # We expect one chemical summary per practice per month
    assert_raw_count_equal(8, 'vw__chemical_summary_by_practice',
                           "chemical_id = '1001030C0'")

    # We expect one summary per practice per month
    assert_raw_count_equal(8, 'vw__practice_summary')

    # We expect one presentation summary per month
    assert_raw_count_equal(2, 'vw__presentation_summary',
                           "presentation_code = '1001030C0AAAAAA'")

    # We expect one presentation summary per CCG per month
    assert_raw_count_equal(4, 'vw__presentation_summary_by_ccg',
                           "presentation_code = '1001030C0AAAAAA'")
def run_end_to_end():
    print('BQ_NONCE: {}'.format(settings.BQ_NONCE))

    num_measures = 56

    shutil.rmtree(settings.PIPELINE_DATA_BASEDIR, ignore_errors=True)

    with open(settings.PIPELINE_IMPORT_LOG_PATH, 'w') as f:
        f.write('{}')

    for blob in StorageClient().bucket().list_blobs():
        blob.delete()

    for dataset_key in DATASETS:
        BQClient(dataset_key).create_dataset()

    client = BQClient('hscic')
    client.create_table('bnf', schemas.BNF_SCHEMA)
    client.create_table('ccgs', schemas.CCG_SCHEMA)
    client.create_table('ppu_savings', schemas.PPU_SAVING_SCHEMA)
    client.create_table(
        'practice_statistics',
        schemas.PRACTICE_STATISTICS_SCHEMA
    )
    client.create_table(
        'practice_statistics_all_years',
        schemas.PRACTICE_STATISTICS_SCHEMA
    )
    client.create_table('practices', schemas.PRACTICE_SCHEMA)
    client.create_table('prescribing', schemas.PRESCRIBING_SCHEMA)
    client.create_table('presentation', schemas.PRESENTATION_SCHEMA)
    client.create_table('tariff', schemas.TARIFF_SCHEMA)

    call_command('generate_presentation_replacements')

    path = os.path.join(settings.SITE_ROOT, 'frontend', 'management',
                        'commands', 'replace_matviews.sql')
    with open(path) as f:
        with connection.cursor() as c:
            c.execute(f.read())

    copy_tree(
        os.path.join(e2e_path, 'data-1'),
        os.path.join(e2e_path, 'data'),
    )

    runner.run_all(2017, 9, under_test=True)

    # We expect one MeasureGlobal per measure per month.  If this assert
    # fails, check that num_measures is still correct.
    assert_count_equal(num_measures, MeasureGlobal)

    # We expect one MeasureValue for each CCG or practice per measure per
    # month
    assert_count_equal(6 * num_measures, MeasureValue)

    # We expect one statistic per CCG per month
    assert_raw_count_equal(2, 'vw__ccgstatistics')

    # We expect one chemical summary per CCG per month
    assert_raw_count_equal(2, 'vw__chemical_summary_by_ccg',
                           "chemical_id = '1001030C0'")

    # We expect one chemical summary per practice per month
    assert_raw_count_equal(4, 'vw__chemical_summary_by_practice',
                           "chemical_id = '1001030C0'")

    # We expect one summary per practice per month
    assert_raw_count_equal(4, 'vw__practice_summary')

    # We expect one presentation summary per month
    assert_raw_count_equal(1, 'vw__presentation_summary',
                           "presentation_code = '1001030C0AAAAAA'")

    # We expect one presentation summary per CCG per month
    assert_raw_count_equal(2, 'vw__presentation_summary_by_ccg',
                           "presentation_code = '1001030C0AAAAAA'")

    copy_tree(
        os.path.join(e2e_path, 'data-2'),
        os.path.join(e2e_path, 'data'),
    )

    runner.run_all(2017, 10, under_test=True)

    # We expect one MeasureGlobal per measure per month
    assert_count_equal(2 * num_measures, MeasureGlobal)

    # We expect one MeasureValue for each CCG or practice per measure per
    # month
    assert_count_equal(12 * num_measures, MeasureValue)

    # We expect one statistic per CCG per month
    assert_raw_count_equal(4, 'vw__ccgstatistics')

    # We expect one chemical summary per CCG per month
    assert_raw_count_equal(4, 'vw__chemical_summary_by_ccg',
                           "chemical_id = '1001030C0'")

    # We expect one chemical summary per practice per month
    assert_raw_count_equal(8, 'vw__chemical_summary_by_practice',
                           "chemical_id = '1001030C0'")

    # We expect one summary per practice per month
    assert_raw_count_equal(8, 'vw__practice_summary')

    # We expect one presentation summary per month
    assert_raw_count_equal(2, 'vw__presentation_summary',
                           "presentation_code = '1001030C0AAAAAA'")

    # We expect one presentation summary per CCG per month
    assert_raw_count_equal(4, 'vw__presentation_summary_by_ccg',
                           "presentation_code = '1001030C0AAAAAA'")
def upload_to_storage(self, local_path, storage_path):
    client = StorageClient()
    bucket = client.bucket()
    blob = bucket.blob(storage_path)
    with open(local_path, 'rb') as f:
        blob.upload_from_file(f)
def run_end_to_end():
    print("BQ_NONCE: {}".format(settings.BQ_NONCE))

    call_command("migrate")

    # No MeasureGlobals or MeasureValues are generated for the ghost branded
    # generics measure, because both numerator and denominator are computed
    # from a view (vw__ghost_generic_measure) which has no data.  Rather than
    # populate this view, it is simpler to pretend it doesn't exist.
    num_measures = len(glob.glob(
        os.path.join(settings.MEASURE_DEFINITIONS_PATH, "*.json"))) - 1

    shutil.rmtree(settings.PIPELINE_DATA_BASEDIR, ignore_errors=True)

    with open(settings.PIPELINE_IMPORT_LOG_PATH, "w") as f:
        f.write("{}")

    for blob in StorageClient().bucket().list_blobs():
        blob.delete()

    for dataset_key in DATASETS:
        BQClient(dataset_key).create_dataset()

    client = BQClient("hscic")
    client.create_table("bnf", schemas.BNF_SCHEMA)
    client.create_table("ccgs", schemas.CCG_SCHEMA)
    client.create_table("stps", schemas.STP_SCHEMA)
    client.create_table("regional_teams", schemas.REGIONAL_TEAM_SCHEMA)
    client.create_table("ppu_savings", schemas.PPU_SAVING_SCHEMA)
    client.create_table("practice_statistics",
                        schemas.PRACTICE_STATISTICS_SCHEMA)
    client.create_table("practice_statistics_all_years",
                        schemas.PRACTICE_STATISTICS_SCHEMA)
    client.create_table("practices", schemas.PRACTICE_SCHEMA)
    client.create_table("prescribing", schemas.PRESCRIBING_SCHEMA)
    client.create_table("presentation", schemas.PRESENTATION_SCHEMA)
    client.create_table("tariff", schemas.TARIFF_SCHEMA)
    client.create_table("bdz_adq", schemas.BDZ_ADQ_SCHEMA)

    client = BQClient("measures")
    # This is enough of a schema to allow the practice_data_all_low_priority
    # table to be created, since it references these fields.  Once populated
    # by import_measures, the tables in the measures dataset will have several
    # more fields.  But we don't need to specify exactly what they are, as BQ
    # will work it out when the data is inserted with insert_rows_from_query.
    measures_schema = build_schema(
        ("month", "DATE"),
        ("practice_id", "STRING"),
        ("numerator", "INTEGER"),
        ("denominator", "INTEGER"),
    )

    for path in glob.glob(
            os.path.join(settings.MEASURE_DEFINITIONS_PATH, "*.json")):
        measure_id = os.path.splitext(os.path.basename(path))[0]
        client.create_table("practice_data_" + measure_id, measures_schema)
        client.create_table("ccg_data_" + measure_id, measures_schema)
        client.create_table("global_data_" + measure_id, measures_schema)

    # Although there are no model instances, we call upload_model to create
    # the dm+d tables in BQ that are required by certain measure views.
    client = BQClient("dmd")
    for model in apps.get_app_config("dmd2").get_models():
        client.upload_model(model)

    call_command("generate_presentation_replacements")

    copy_tree(os.path.join(e2e_path, "data-1"), os.path.join(e2e_path, "data"))

    runner.run_all(2017, 9, under_test=True)

    # We expect one MeasureGlobal per measure per month.
    assert_count_equal(num_measures, MeasureGlobal)

    # We expect one MeasureValue for each organisation per measure per month
    # (there are 4 practices, 2 CCGs, 2 STPs, and 2 regional teams).
    assert_count_equal(10 * num_measures, MeasureValue)

    copy_tree(os.path.join(e2e_path, "data-2"), os.path.join(e2e_path, "data"))

    runner.run_all(2017, 10, under_test=True)

    # We expect one MeasureGlobal per measure per month
    assert_count_equal(2 * num_measures, MeasureGlobal)

    # We expect one MeasureValue for each organisation per measure per month
    assert_count_equal(20 * num_measures, MeasureValue)