def backup_table(self, table_name):
    client = Client("hscic")
    sql = "SELECT max(month) FROM {hscic}.%s" % table_name
    latest_date = client.query(sql).rows[0][0]
    latest_year_and_month = latest_date.strftime("%Y_%m")
    table = client.get_table(table_name)

    storage_client = StorageClient()
    bucket = storage_client.bucket()
    year_and_months = set()

    prefix_base = "backups/{}/".format(table_name)

    for blob in bucket.list_blobs(prefix=prefix_base):
        match = re.search(r"/(\d{4}_\d{2})/", blob.name)
        year_and_months.add(match.groups()[0])

    if latest_year_and_month in year_and_months:
        print(
            "{} table already backed up for {}".format(
                table_name, latest_year_and_month
            )
        )
        return

    storage_prefix = "{}/{}/{}-".format(
        prefix_base, latest_year_and_month, table_name
    )
    exporter = TableExporter(table, storage_prefix)
    exporter.export_to_storage()
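# --- Illustrative sketch, not part of the original module ---
# backup_table() above decides whether a month is already backed up from the
# blob naming convention backups/<table>/<YYYY_MM>/... ; this snippet only
# demonstrates that regex against a made-up blob name.
import re

example_blob_name = "backups/prescribing/2016_01/prescribing-000000000000.csv.gz"  # hypothetical
match = re.search(r"/(\d{4}_\d{2})/", example_blob_name)
assert match.groups()[0] == "2016_01"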
def test_data_is_aggregated(self):
    # there are 11 rows in the input file; 2 are for the same
    # practice/presentation and should be collapsed, and 1 is for
    # an UNKNONWN SURGERY (see issue #349)
    raw_data_path = (
        "frontend/tests/fixtures/commands/"
        "convert_hscic_prescribing/2016_01/"
        "EPD_201601.csv"
    )
    gcs_path = "hscic/prescribing_v2/2016_01/EPD_201601.csv"

    client = StorageClient()
    bucket = client.get_bucket()
    blob = bucket.blob(gcs_path)

    with open(raw_data_path, "rb") as f:
        blob.upload_from_file(f)

    call_command("convert_hscic_prescribing", filename=raw_data_path)

    # Test that the data was added to the prescribing table
    client = BQClient()
    sql = """SELECT *
             FROM {hscic}.prescribing_v2
             WHERE month = TIMESTAMP('2016-01-01')"""
    rows = list(results_to_dicts(client.query(sql)))
    self.assertEqual(len(rows), 9)

    for row in rows:
        if (
            row["practice"] == "P92042"
            and row["bnf_code"] == "0202010B0AAABAB"
        ):
            self.assertEqual(row["quantity"], 1288)
def update_bnf_table():
    """Update `bnf` table from cloud-stored CSV"""
    storage_client = StorageClient()
    bucket = storage_client.get_bucket()
    blobs = bucket.list_blobs(prefix="hscic/bnf_codes/")
    blobs = sorted(blobs, key=lambda blob: blob.name, reverse=True)
    blob = blobs[0]

    bq_client = BQClient("hscic")
    table = bq_client.get_table("bnf")
    table.insert_rows_from_storage(blob.name, skip_leading_rows=1)
def upload_task_input_files(task):
    storage_client = StorageClient()
    bucket = storage_client.get_bucket()

    for path in task.input_paths():
        assert path[0] == '/'
        assert settings.PIPELINE_DATA_BASEDIR[-1] == '/'

        name = 'hscic' + path.replace(settings.PIPELINE_DATA_BASEDIR, '/')
        blob = bucket.blob(name)
        if blob.exists():
            print("Skipping %s, already uploaded" % name)
            continue
        print("Uploading %s to %s" % (path, name))
        with open(path) as f:
            blob.upload_from_file(f)
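# --- Illustrative sketch, not part of the original module ---
# upload_task_input_files() maps a local pipeline path onto a blob name by
# swapping PIPELINE_DATA_BASEDIR for an "hscic/" prefix. The base directory
# below is an invented value purely to show that substitution.
PIPELINE_DATA_BASEDIR = "/home/hscic/data/"  # hypothetical setting value
local_path = PIPELINE_DATA_BASEDIR + "prescribing/2016_01/EPD_201601.csv"
blob_name = "hscic" + local_path.replace(PIPELINE_DATA_BASEDIR, "/")
assert blob_name == "hscic/prescribing/2016_01/EPD_201601.csv"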
def test_existing_files_deleted(self):
    # Create a dataset fragment which should end up being deleted
    client = StorageClient()
    bucket = client.get_bucket()
    blob_name = (
        'hscic/views/vw__presentation_summary_by_ccg'
        '-000000009999.csv.gz'
    )
    blob = bucket.blob(blob_name)
    blob.upload_from_string("test", content_type="application/gzip")

    # Run import command
    call_command('create_views')

    # Check that the pre-existing fragment is no longer in the bucket
    client = StorageClient()
    bucket = client.get_bucket()
    prefix, suffix = blob_name.split('-')
    for blob in bucket.list_blobs(prefix=prefix):
        self.assertNotIn(suffix, blob.path)
def test_data_is_aggregated(self):
    # there are 11 rows in the input file; 2 are for the same
    # practice/presentation and should be collapsed, and 1 is for
    # an UNKNONWN SURGERY (see issue #349)
    raw_data_path = (
        'frontend/tests/fixtures/commands/'
        'convert_hscic_prescribing/2016_01/'
        'Detailed_Prescribing_Information.csv'
    )
    converted_data_path = (
        'frontend/tests/fixtures/commands/'
        'convert_hscic_prescribing/2016_01/'
        'Detailed_Prescribing_Information_formatted.CSV'
    )
    gcs_path = (
        'hscic/prescribing/2016_01/'
        'Detailed_Prescribing_Information.csv'
    )

    client = StorageClient()
    bucket = client.get_bucket()
    blob = bucket.blob(gcs_path)

    with open(raw_data_path) as f:
        blob.upload_from_file(f)

    call_command('convert_hscic_prescribing', filename=raw_data_path)

    # Test that the data was added to the prescribing table
    client = BQClient()
    sql = '''SELECT *
             FROM {hscic}.prescribing
             WHERE month = TIMESTAMP('2016-01-01')'''
    rows = list(results_to_dicts(client.query(sql)))
    self.assertEqual(len(rows), 9)

    for row in rows:
        if row['practice'] == 'P92042' and \
                row['bnf_code'] == '0202010B0AAABAB':
            self.assertEqual(row['quantity'], 1288)

    # Test that the downloaded data is correct
    with open(converted_data_path) as f:
        rows = list(csv.reader(f))

    self.assertEqual(len(rows), 9)

    for row in rows:
        if row[1] == 'P92042' and row[2] == '0202010B0AAABAB':
            self.assertEqual(row[6], '1288')
def create_storage_backed_table(self, table_id, schema, gcs_path):
    gcs_client = StorageClient()
    bucket = gcs_client.bucket()
    if bucket.get_blob(gcs_path) is None:
        raise RuntimeError("Could not find blob at {}".format(gcs_path))

    gcs_uri = "gs://{}/{}".format(self.project, gcs_path)
    schema_as_dict = [
        {"name": s.name, "type": s.field_type.lower()} for s in schema
    ]
    resource = {
        "tableReference": {"tableId": table_id},
        "externalDataConfiguration": {
            "csvOptions": {"skipLeadingRows": "1"},
            "sourceFormat": "CSV",
            "sourceUris": [gcs_uri],
            "schema": {"fields": schema_as_dict},
        },
    }

    path = "/projects/{}/datasets/{}/tables".format(
        self.project, self.dataset_id
    )

    try:
        self.gcbq_client._connection.api_request(
            method="POST", path=path, data=resource
        )
    except NotFound as e:
        if not dataset_is_missing(e):
            raise
        self.create_dataset()
        self.gcbq_client._connection.api_request(
            method="POST", path=path, data=resource
        )

    return self.get_table(table_id)
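# --- Illustrative sketch, not part of the original module ---
# A hedged usage example for create_storage_backed_table() above, assuming it
# is a method on the same Client("hscic") wrapper used elsewhere in this
# section. The table id, schema, and blob path are assumptions for
# illustration; SchemaField comes from google.cloud.bigquery and exposes the
# .name / .field_type attributes the method reads.
from google.cloud.bigquery import SchemaField

schema = [
    SchemaField("bnf_code", "STRING"),
    SchemaField("quantity", "INTEGER"),
]
client = Client("hscic")
table = client.create_storage_backed_table(
    "raw_prescribing",  # hypothetical table id
    schema,
    "hscic/prescribing_v2/2016_01/EPD_201601.csv",  # blob path taken from the test above
)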
def __init__(self, table, storage_prefix):
    self.table = table
    self.storage_prefix = storage_prefix
    storage_client = StorageClient()
    self.bucket = storage_client.bucket()
def upload_to_storage(self, local_path, storage_path):
    client = StorageClient()
    bucket = client.bucket()
    blob = bucket.blob(storage_path)
    with open(local_path) as f:
        blob.upload_from_file(f)
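# --- Illustrative sketch, not part of the original module ---
# Ties the pieces above together: export a BigQuery table to Cloud Storage via
# TableExporter, whose constructor and export_to_storage() call appear earlier
# in this section. The dataset and table names mirror the other snippets; the
# storage prefix is a hypothetical example.
client = Client("hscic")
table = client.get_table("prescribing")
exporter = TableExporter(table, "backups/prescribing/2016_01/prescribing-")
exporter.export_to_storage()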