def get_image(album_name, image_name, username):
    gallery_db = connect_to_db()
    albums = gallery_db.albums
    requested_album = albums.find_one({"name": album_name})
    if not requested_album:
        return redirect(url_for('static', filename='image_not_found.gif'))
    if not (username in requested_album["write"] or username in requested_album["read"]):
        return redirect(url_for('static', filename='image_not_found.gif'))
    if image_name not in requested_album["images"]:
        return redirect(url_for('static', filename='image_not_found.gif'))
    # Create the timer outside the try block so the except handler can
    # always stop it without risking a NameError.
    stats_download_timer = stats_client.timer("download timer")
    try:
        # Time the download for the stats backend.
        stats_download_timer.start()
        blob_service = BlobService(account_name=ACCOUNT_NAME, account_key=ACCOUNT_KEY)
        data = blob_service.get_blob_to_bytes(CONTAINER_NAME, image_name)
        response = make_response(data)
        response.headers["Content-Disposition"] = "filename=%s.jpg" % image_name
        response.headers['Content-Type'] = 'image/jpeg'
        stats_download_timer.stop()
        stats_client.incr("images downloaded", 1)
        return response
    except Exception:  # TODO: serve a different image in this case?
        stats_download_timer.stop()
        return redirect(url_for('static', filename='image_not_found.gif'))
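A minimal sketch of how get_image might be exposed as a Flask route. The route path, the `app` object, and reading the username from the session are assumptions for illustration, not part of the original snippet.

from flask import Flask, session

app = Flask(__name__)

# Hypothetical route and view name; get_image is the function defined above.
@app.route('/albums/<album_name>/images/<image_name>')
def serve_image(album_name, image_name):
    # Assumes the logged-in user's name is kept in the Flask session.
    return get_image(album_name, image_name, session.get('username'))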
class AzureTransfer(BaseTransfer):
    def __init__(self, account_name, account_key, container_name):
        BaseTransfer.__init__(self)
        self.account_name = account_name
        self.account_key = account_key
        self.container_name = container_name
        self.conn = BlobService(account_name=self.account_name, account_key=self.account_key)
        self.container = self.get_or_create_container(self.container_name)
        self.log.debug("AzureTransfer initialized")

    def get_metadata_for_key(self, key):
        key = fix_path(key)
        return self.list_path(key)[0]['metadata']

    def list_path(self, path):
        return_list = []
        path = fix_path(path)
        self.log.info("Asking for listing of: %r", path)
        for r in self.conn.list_blobs(self.container_name, prefix=path, delimiter="/",
                                      include="metadata"):
            entry = {"name": r.name, "size": r.properties.content_length,
                     "last_modified": dateutil.parser.parse(r.properties.last_modified),
                     "metadata": r.metadata}
            return_list.append(entry)
        return return_list

    def delete_key(self, key_name):
        key_name = fix_path(key_name)
        self.log.debug("Deleting key: %r", key_name)
        return self.conn.delete_blob(self.container_name, key_name)

    def get_contents_to_file(self, obj_key, filepath_to_store_to):
        obj_key = fix_path(obj_key)
        self.log.debug("Starting to fetch the contents of: %r to: %r",
                       obj_key, filepath_to_store_to)
        return self.conn.get_blob_to_path(self.container_name, obj_key, filepath_to_store_to)

    def get_contents_to_string(self, obj_key):
        obj_key = fix_path(obj_key)
        self.log.debug("Starting to fetch the contents of: %r", obj_key)
        return (self.conn.get_blob_to_bytes(self.container_name, obj_key),
                self.get_metadata_for_key(obj_key))

    def store_file_from_memory(self, key, memstring, metadata=None):
        # Azure requires all metadata keys and values to be strings at the point of sending;
        # guard against metadata being None.
        metadata_to_send = dict((str(k), str(v)) for k, v in (metadata or {}).items())
        self.conn.put_block_blob_from_bytes(self.container_name, key, memstring,
                                            x_ms_meta_name_values=metadata_to_send)

    def store_file_from_disk(self, key, filepath, metadata=None):
        # Azure requires all metadata keys and values to be strings at the point of sending;
        # guard against metadata being None.
        metadata_to_send = dict((str(k), str(v)) for k, v in (metadata or {}).items())
        self.conn.put_block_blob_from_path(self.container_name, key, filepath,
                                           x_ms_meta_name_values=metadata_to_send)

    def get_or_create_container(self, container_name):
        start_time = time.time()
        self.conn.create_container(container_name)
        self.log.debug("Got/Created container: %r successfully, took: %.3fs",
                       container_name, time.time() - start_time)
        return container_name
def download_chunk(path, storagename, container, key):
    blob_service = BlobService(account_name=storagename, account_key=key)
    attempts = 0
    while True:
        try:
            return blob_service.get_blob_to_bytes(container, path)
        except (azure.http.HTTPError, TimeoutError):
            # Retry transient failures up to three times before giving up.
            attempts += 1
            if attempts >= 3:
                raise
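A usage sketch for download_chunk; the account name, container, blob path, and key below are placeholders, not values from the source.

import azure.http  # legacy azure SDK module referenced by the except clause above

# Hypothetical values - substitute real account details.
data = download_chunk(
    path="backups/segment-0001",
    storagename="mystorageaccount",
    container="mycontainer",
    key="<account-key>",
)
print("downloaded %d bytes" % len(data))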
class PerformanceTests(unittest.TestCase):
    def setUp(self):
        self.workspace = Workspace(settings.workspace.id,
                                   settings.workspace.token,
                                   settings.workspace.endpoint)
        self.blob = BlobService(settings.storage.account_name,
                                settings.storage.account_key)

    def _write_blob_contents(self, filename, data):
        if settings.diagnostics.write_blob_contents:
            with open('original-blob-' + filename, 'wb') as data_file:
                data_file.write(data)

    def _write_serialized_frame(self, filename, data):
        if settings.diagnostics.write_serialized_frame:
            with open('serialized-frame-' + filename, 'wb') as data_file:
                data_file.write(data)

    def test_serialize_40mb_dataframe(self):
        # Arrange
        blob_name = settings.storage.medium_size_blob
        original_data = self.blob.get_blob_to_bytes(settings.storage.container, blob_name)
        original_dataframe = pd.read_csv(BytesIO(original_data), header=0, sep=",",
                                         encoding='utf-8-sig')
        self._write_blob_contents(blob_name, original_data)

        # Act
        start_time = datetime.now()
        writer = BytesIO()
        serialize_dataframe(writer, DataTypeIds.GenericCSV, original_dataframe)
        elapsed_time = datetime.now() - start_time
        result_data = writer.getvalue()
        self._write_serialized_frame(blob_name, result_data)

        # Assert
        result_dataframe = pd.read_csv(BytesIO(result_data), header=0, sep=",",
                                       encoding='utf-8-sig')
        assert_frame_equal(original_dataframe, result_dataframe)
        self.assertLess(elapsed_time.total_seconds(), 10)
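For context, a minimal sketch of the settings object these tests (and the RoundTripTests below) read from; the attribute names mirror the ones used in the test code, but the helper class and all concrete values are illustrative placeholders.

class _Bunch:
    """Tiny attribute-access wrapper, purely for this sketch."""
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

settings = _Bunch(
    workspace=_Bunch(id="<workspace-id>", token="<token>", endpoint="<endpoint>"),
    storage=_Bunch(account_name="<account>", account_key="<key>",
                   container="testdata", medium_size_blob="medium.csv",
                   blobs=["airports_wh.csv", "airports_nh.tsv", "notes.txt"]),
    diagnostics=_Bunch(write_blob_contents=False, write_serialized_frame=False),
)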
def rest_get_image(album_name, image_name, username):
    gallery_db = connect_to_db()
    albums = gallery_db.albums
    requested_album = albums.find_one({"name": album_name})
    if not requested_album:
        return jsonify({'error': "album does not exist"})
    if not (username in requested_album["write"] or username in requested_album["read"]):
        return jsonify({'error': "no permission to get images"})
    if image_name not in requested_album["images"]:
        return jsonify({'error': "no such image in album"})
    blob_service = BlobService(account_name=ACCOUNT_NAME, account_key=ACCOUNT_KEY)
    data = blob_service.get_blob_to_bytes(CONTAINER_NAME, image_name)
    response = make_response(data)
    response.headers["Content-Disposition"] = "attachment; filename=%s" % image_name
    return response
class AzureTransfer(BaseTransfer):
    def __init__(self, account_name, account_key, container_name, prefix=None):
        # NOTE: Azure wants all paths to start with a slash
        prefix = "/{}".format(prefix.lstrip("/") if prefix else "")
        super().__init__(prefix=prefix)
        self.account_name = account_name
        self.account_key = account_key
        self.container_name = container_name
        self.conn = BlobService(account_name=self.account_name, account_key=self.account_key)
        self.container = self.get_or_create_container(self.container_name)
        self.log.debug("AzureTransfer initialized")
        # XXX: AzureTransfer isn't actively tested and its error handling is probably lacking
        self.log.warning("AzureTransfer is experimental and has not been thoroughly tested")

    def get_metadata_for_key(self, key):
        key = self.format_key_for_backend(key)
        return self._list_blobs(key)[0]["metadata"]

    def _metadata_for_key(self, key):
        return self._list_blobs(key)[0]["metadata"]

    def list_path(self, key):
        path = self.format_key_for_backend(key, trailing_slash=True)
        return self._list_blobs(path)

    def _list_blobs(self, path):
        self.log.debug("Listing path %r", path)
        items = self.conn.list_blobs(self.container_name, prefix=path, delimiter="/",
                                     include="metadata")
        result = []
        for item in items:
            result.append({
                "last_modified": dateutil.parser.parse(item.properties.last_modified),
                "metadata": item.metadata,
                "name": self.format_key_from_backend(item.name),
                "size": item.properties.content_length,
            })
        return result

    def delete_key(self, key):
        key = self.format_key_for_backend(key)
        self.log.debug("Deleting key: %r", key)
        return self.conn.delete_blob(self.container_name, key)

    def get_contents_to_file(self, key, filepath_to_store_to):
        key = self.format_key_for_backend(key)
        self.log.debug("Starting to fetch the contents of: %r to: %r",
                       key, filepath_to_store_to)
        return self.conn.get_blob_to_path(self.container_name, key, filepath_to_store_to)

    def get_contents_to_fileobj(self, key, fileobj_to_store_to):
        key = self.format_key_for_backend(key)
        self.log.debug("Starting to fetch the contents of: %r", key)
        return self.conn.get_blob_to_file(self.container_name, key, fileobj_to_store_to)

    def get_contents_to_string(self, key):
        key = self.format_key_for_backend(key)
        self.log.debug("Starting to fetch the contents of: %r", key)
        return self.conn.get_blob_to_bytes(self.container_name, key), self._metadata_for_key(key)

    def store_file_from_memory(self, key, memstring, metadata=None):
        key = self.format_key_for_backend(key)
        # Azure requires all metadata keys and values to be strings;
        # guard against metadata being None.
        metadata_to_send = {str(k): str(v) for k, v in (metadata or {}).items()}
        self.conn.put_block_blob_from_bytes(self.container_name, key, memstring,
                                            x_ms_meta_name_values=metadata_to_send)

    def store_file_from_disk(self, key, filepath, metadata=None, multipart=None):
        key = self.format_key_for_backend(key)
        # Azure requires all metadata keys and values to be strings;
        # guard against metadata being None.
        metadata_to_send = {str(k): str(v) for k, v in (metadata or {}).items()}
        self.conn.put_block_blob_from_path(self.container_name, key, filepath,
                                           x_ms_meta_name_values=metadata_to_send)

    def get_or_create_container(self, container_name):
        start_time = time.time()
        self.conn.create_container(container_name)
        self.log.debug("Got/Created container: %r successfully, took: %.3fs",
                       container_name, time.time() - start_time)
        return container_name
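A hedged sketch of driving this AzureTransfer class; the credentials, container, prefix, and keys are placeholders, and BaseTransfer (with its format_key_for_backend helpers) is assumed to come from the surrounding project.

# Hypothetical credentials - replace with real account details.
transfer = AzureTransfer(
    account_name="mystorageaccount",
    account_key="<account-key>",
    container_name="backups",
    prefix="site-a",
)

# Store a small payload with metadata (coerced to strings by the class),
# then read it back together with its metadata.
transfer.store_file_from_memory("greeting", b"hello", metadata={"ttl": 3600})
blob, metadata = transfer.get_contents_to_string("greeting")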
class RoundTripTests(unittest.TestCase):
    def setUp(self):
        self.workspace = Workspace(settings.workspace.id,
                                   settings.workspace.token,
                                   settings.workspace.endpoint)
        self.blob = BlobService(settings.storage.account_name,
                                settings.storage.account_key)

    def _write_blob_contents(self, filename, data):
        if settings.diagnostics.write_blob_contents:
            with open("original-blob-" + filename, "wb") as data_file:
                data_file.write(data)

    def _write_serialized_frame(self, filename, data):
        if settings.diagnostics.write_serialized_frame:
            with open("serialized-frame-" + filename, "wb") as data_file:
                data_file.write(data)

    def test_download_blob_then_upload_as_dataframe_then_read_dataset(self):
        def datatypeid_from_header_and_format(header, format):
            if format == "csv":
                if header == "wh":
                    return DataTypeIds.GenericCSV
                else:
                    return DataTypeIds.GenericCSVNoHeader
            elif format == "tsv":
                if header == "wh":
                    return DataTypeIds.GenericTSV
                else:
                    return DataTypeIds.GenericTSVNoHeader
            elif format == "txt":
                return DataTypeIds.PlainText
            else:
                self.assertTrue(False, "Unexpected format")

        def split_blob_name(blob_name):
            # blob naming convention:
            #   name_<header>.<format>
            #   <header>: WH: with header
            #             NH: no header
            #   <format>: CSV: comma separated
            #             TSV: tab separated
            #             TXT: newline separated
            name, format = blob_name.lower().split(".")
            if format != "txt":
                name, header = name.split("_")
            else:
                header = "nh"
            return name, format, header

        for blob_name in settings.storage.blobs:
            print(blob_name)
            name, format, header = split_blob_name(blob_name)

            # Read the data from blob storage
            original_data = self.blob.get_blob_to_bytes(settings.storage.container, blob_name)
            self._write_blob_contents(blob_name, original_data)

            # Parse the data to a dataframe using Pandas
            original_dataframe = pd.read_csv(
                BytesIO(original_data),
                header=0 if header == "wh" else None,
                sep="," if format == "csv" else "\t" if format == "tsv" else "\n",
                encoding="utf-8-sig",
            )

            # Upload the dataframe as a new dataset
            dataset_name = "unittest" + name + id_generator()
            description = "safe to be deleted - " + dataset_name
            data_type_id = datatypeid_from_header_and_format(header, format)
            self.workspace.datasets.add_from_dataframe(original_dataframe, data_type_id,
                                                       dataset_name, description)

            # Get the new dataset
            dataset = self.workspace.datasets[dataset_name]
            self.assertIsNotNone(dataset)

            # Read the dataset as a dataframe
            result_data = dataset.read_as_binary()
            self._write_serialized_frame(blob_name, result_data)
            result_dataframe = dataset.to_dataframe()

            # Verify that the dataframes are equal
            assert_frame_equal(original_dataframe, result_dataframe)

    def test_azureml_example_datasets(self):
        max_size = 10 * 1024 * 1024
        skip = ["Restaurant feature data", "IMDB Movie Titles", "Book Reviews from Amazon"]

        for dataset in self.workspace.example_datasets:
            if not hasattr(dataset, "to_dataframe"):
                print("skipped (unsupported format): {0}".format(dataset.name))
                continue
            if dataset.size > max_size:
                print("skipped (max size): {0}".format(dataset.name))
                continue
            if dataset.name in skip:
                print("skipped: {0}".format(dataset.name))
                continue

            print("downloading: " + dataset.name)
            frame = dataset.to_dataframe()

            print("uploading: " + dataset.name)
            dataset_name = "unittest" + dataset.name + id_generator()
            description = "safe to be deleted - " + dataset_name
            self.workspace.datasets.add_from_dataframe(frame, dataset.data_type_id,
                                                       dataset_name, description)