def test_compute_log_manager_subscription_updates():
    from dagster.core.storage.local_compute_log_manager import LocalComputeLogManager

    with tempfile.TemporaryDirectory() as temp_dir:
        compute_log_manager = LocalComputeLogManager(temp_dir, polling_timeout=0.5)
        run_id = "fake_run_id"
        step_key = "spew"
        stdout_path = compute_log_manager.get_local_path(run_id, step_key, ComputeIOType.STDOUT)

        # make sure the parent directory to be watched exists, file exists
        ensure_dir(os.path.dirname(stdout_path))
        touch_file(stdout_path)

        # set up the subscription
        messages = []
        observable = compute_log_manager.observable(run_id, step_key, ComputeIOType.STDOUT)
        observable.subscribe(messages.append)

        # returns a single update, with 0 data
        assert len(messages) == 1
        last_chunk = messages[-1]
        assert not last_chunk.data
        assert last_chunk.cursor == 0

        with open(stdout_path, "a+") as f:
            print(HELLO_SOLID, file=f)  # pylint:disable=print-call

        # wait longer than the watchdog timeout
        time.sleep(1)
        assert len(messages) == 2
        last_chunk = messages[-1]
        assert last_chunk.data
        assert last_chunk.cursor > 0
def __init__( self, bucket, local_dir=None, inst_data=None, prefix="dagster", use_ssl=True, verify=True, verify_cert_path=None, endpoint_url=None, skip_empty_files=False, ): _verify = False if not verify else verify_cert_path self._s3_session = boto3.resource( "s3", use_ssl=use_ssl, verify=_verify, endpoint_url=endpoint_url).meta.client self._s3_bucket = check.str_param(bucket, "bucket") self._s3_prefix = check.str_param(prefix, "prefix") # proxy calls to local compute log manager (for subscriptions, etc) if not local_dir: local_dir = seven.get_system_temp_directory() self.local_manager = LocalComputeLogManager(local_dir) self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData) self._skip_empty_files = check.bool_param(skip_empty_files, "skip_empty_files")
def __init__(self, local_dir, bucket=''):
    self._s3_session = create_s3_session()
    self._s3_bucket = check.str_param(bucket, 'bucket')
    self._download_urls = {}

    # proxy calls to local compute log manager (for subscriptions, etc)
    self.local_manager = LocalComputeLogManager(local_dir)
def __init__(
    self,
    bucket,
    local_dir=None,
    inst_data=None,
    prefix='dagster',
    use_ssl=True,
    verify=True,
    verify_cert_path=None,
    endpoint_url=None,
):
    _verify = False if not verify else verify_cert_path
    self._s3_session = boto3.resource(
        's3', use_ssl=use_ssl, verify=_verify, endpoint_url=endpoint_url
    ).meta.client
    self._s3_bucket = check.str_param(bucket, 'bucket')
    self._s3_prefix = check.str_param(prefix, 'prefix')
    self._download_urls = {}

    # proxy calls to local compute log manager (for subscriptions, etc)
    if not local_dir:
        local_dir = seven.get_system_temp_directory()

    self.local_manager = LocalComputeLogManager(local_dir)
    self._inst_data = check.opt_inst_param(inst_data, 'inst_data', ConfigurableClassData)
def __init__( self, bucket, local_dir=None, inst_data=None, prefix="dagster", json_credentials_envvar=None, ): self._bucket_name = check.str_param(bucket, "bucket") self._prefix = check.str_param(prefix, "prefix") if json_credentials_envvar: json_info_str = os.environ.get(json_credentials_envvar) credentials_info = json.loads(json_info_str) self._bucket = ( storage.Client() .from_service_account_info(credentials_info) .bucket(self._bucket_name) ) else: self._bucket = storage.Client().bucket(self._bucket_name) # Check if the bucket exists check.invariant(self._bucket.exists()) # proxy calls to local compute log manager (for subscriptions, etc) if not local_dir: local_dir = seven.get_system_temp_directory() self.local_manager = LocalComputeLogManager(local_dir) self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)
def __init__(
    self,
    storage_account,
    container,
    secret_key,
    local_dir=None,
    inst_data=None,
    prefix="dagster",
):
    self._storage_account = check.str_param(storage_account, "storage_account")
    self._container = check.str_param(container, "container")
    self._blob_prefix = check.str_param(prefix, "prefix")
    check.str_param(secret_key, "secret_key")

    self._blob_client = create_blob_client(storage_account, secret_key)
    self._container_client = self._blob_client.get_container_client(container)
    self._download_urls = {}

    # proxy calls to local compute log manager (for subscriptions, etc)
    if not local_dir:
        local_dir = seven.get_system_temp_directory()

    self.local_manager = LocalComputeLogManager(local_dir)
    self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)
def __init__(self, bucket, local_dir=None, inst_data=None):
    self._s3_session = create_s3_session()
    self._s3_bucket = check.str_param(bucket, 'bucket')
    self._download_urls = {}

    # proxy calls to local compute log manager (for subscriptions, etc)
    if not local_dir:
        local_dir = seven.get_system_temp_directory()

    self.local_manager = LocalComputeLogManager(local_dir)

    super(S3ComputeLogManager, self).__init__(inst_data=inst_data)
def __init__(self, bucket, local_dir=None, inst_data=None, prefix='dagster'):
    self._s3_session = create_s3_session()
    self._s3_bucket = check.str_param(bucket, 'bucket')
    self._s3_prefix = check.str_param(prefix, 'prefix')
    self._download_urls = {}

    # proxy calls to local compute log manager (for subscriptions, etc)
    if not local_dir:
        local_dir = seven.get_system_temp_directory()

    self.local_manager = LocalComputeLogManager(local_dir)
    self._inst_data = check.opt_inst_param(inst_data, 'inst_data', ConfigurableClassData)
def from_ref(instance_ref, fallback_feature_set=None, watch_external_runs=False):
    check.inst_param(instance_ref, 'instance_ref', InstanceRef)
    check.opt_set_param(fallback_feature_set, 'fallback_feature_set', str)

    if isinstance(instance_ref, LocalInstanceRef):
        from dagster.core.storage.event_log import FilesystemEventLogStorage
        from dagster.core.storage.runs import FilesystemRunStorage
        from dagster.core.storage.local_compute_log_manager import LocalComputeLogManager

        feature_set = _dagster_feature_set(instance_ref.home_dir) or fallback_feature_set

        return DagsterInstance(
            instance_type=InstanceType.LOCAL,
            root_storage_dir=instance_ref.home_dir,
            run_storage=FilesystemRunStorage(
                _runs_directory(instance_ref.home_dir),
                watch_external_runs=watch_external_runs,
            ),
            event_storage=FilesystemEventLogStorage(_runs_directory(instance_ref.home_dir)),
            compute_log_manager=LocalComputeLogManager(
                _compute_logs_base_directory(instance_ref.home_dir)
            ),
            feature_set=feature_set,
        )
    else:
        check.failed('Unhandled instance type {}'.format(type(instance_ref)))
def test_fs_stores():
    @pipeline
    def simple():
        @solid
        def easy(context):
            context.log.info('easy')
            return 'easy'

        easy()

    with seven.TemporaryDirectory() as temp_dir:
        run_store = SqliteRunStorage.from_local(temp_dir)
        event_store = SqliteEventLogStorage(temp_dir)
        compute_log_manager = LocalComputeLogManager(temp_dir)
        instance = DagsterInstance(
            instance_type=InstanceType.PERSISTENT,
            local_artifact_storage=LocalArtifactStorage(temp_dir),
            run_storage=run_store,
            event_storage=event_store,
            compute_log_manager=compute_log_manager,
        )

        result = execute_pipeline(simple, instance=instance)

        assert run_store.has_run(result.run_id)
        assert run_store.get_run_by_id(result.run_id).status == PipelineRunStatus.SUCCESS
        assert DagsterEventType.PIPELINE_SUCCESS in [
            event.dagster_event.event_type
            for event in event_store.get_logs_for_run(result.run_id)
            if event.is_dagster_event
        ]
        stats = event_store.get_stats_for_run(result.run_id)
        assert stats.steps_succeeded == 1
        assert stats.end_time is not None
def _dagster_compute_log_manager(base_dir):
    config = dagster_instance_config(base_dir)
    compute_log_base = os.path.join(base_dir, 'storage')
    if config and config.get('compute_logs'):
        if 'module' in config['compute_logs'] and 'class' in config['compute_logs']:
            from dagster.core.storage.compute_log_manager import ComputeLogManager

            try:
                module = __import__(config['compute_logs']['module'])
                klass = getattr(module, config['compute_logs']['class'])
                check.subclass_param(klass, 'compute_log_manager', ComputeLogManager)
                kwargs = config['compute_logs'].get('config', {})
                compute_log_manager = klass(compute_log_base, **kwargs)
                check.inst_param(compute_log_manager, 'compute_log_manager', ComputeLogManager)
                return compute_log_manager
            except Exception:
                raise DagsterInvariantViolationError(
                    'Invalid dagster config in `{config_yaml_filename}`. Expecting `module`, '
                    '`class`, and `config`, returning a valid instance of '
                    '`ComputeLogManager`'.format(
                        config_yaml_filename=DAGSTER_CONFIG_YAML_FILENAME
                    )
                )

    from dagster.core.storage.local_compute_log_manager import LocalComputeLogManager

    return LocalComputeLogManager(compute_log_base)
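# Illustrative sketch (not part of the loader above): the shape of the parsed
# `compute_logs` block that _dagster_compute_log_manager expects. Only the
# `module`, `class`, and `config` keys are consumed; the specific module/class
# values below are example assumptions drawn from the docstrings elsewhere in
# this file, not defaults.
example_instance_config = {
    "compute_logs": {
        "module": "dagster_aws.s3.compute_log_manager",  # illustrative
        "class": "S3ComputeLogManager",                  # illustrative
        "config": {"bucket": "my-log-bucket"},           # passed through as **kwargs
    }
}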
def test_postgres_instance(multi_postgres):
    run_storage_conn_string, event_log_storage_conn_string = multi_postgres

    run_storage = PostgresRunStorage.create_clean_storage(run_storage_conn_string)
    event_storage = PostgresEventLogStorage.create_clean_storage(event_log_storage_conn_string)

    with seven.TemporaryDirectory() as temp_dir:
        instance = DagsterInstance(
            instance_type=InstanceType.PERSISTENT,
            local_artifact_storage=LocalArtifactStorage(temp_dir),
            run_storage=run_storage,
            event_storage=event_storage,
            compute_log_manager=LocalComputeLogManager(temp_dir),
        )

        result = execute_pipeline(simple, instance=instance)

        assert run_storage.has_run(result.run_id)
        assert run_storage.get_run_by_id(result.run_id).status == PipelineRunStatus.SUCCESS
        assert DagsterEventType.PIPELINE_SUCCESS in [
            event.dagster_event.event_type
            for event in event_storage.get_logs_for_run(result.run_id)
            if event.is_dagster_event
        ]
        stats = event_storage.get_stats_for_run(result.run_id)
        assert stats.steps_succeeded == 1
        assert stats.end_time is not None
def test_fs_stores():
    @pipeline
    def simple():
        @lambda_solid
        def easy():
            return 'easy'

        easy()

    with seven.TemporaryDirectory() as temp_dir:
        run_store = FilesystemRunStorage(temp_dir)
        event_store = FilesystemEventLogStorage(temp_dir)
        compute_log_manager = LocalComputeLogManager(temp_dir)
        instance = DagsterInstance(
            instance_type=InstanceType.LOCAL,
            root_storage_dir=temp_dir,
            run_storage=run_store,
            event_storage=event_store,
            compute_log_manager=compute_log_manager,
        )

        run = RunConfig()
        execute_pipeline(simple, run_config=run, instance=instance)

        assert run_store.has_run(run.run_id)
        assert run_store.get_run_by_id(run.run_id).status == PipelineRunStatus.SUCCESS
        assert DagsterEventType.PIPELINE_SUCCESS in [
            event.dagster_event.event_type
            for event in event_store.get_logs_for_run(run.run_id)
        ]
def __init__( self, bucket, local_dir=None, inst_data=None, prefix="dagster", ): self._bucket_name = check.str_param(bucket, "bucket") self._prefix = check.str_param(prefix, "prefix") self._bucket = storage.Client().get_bucket(self._bucket_name) # proxy calls to local compute log manager (for subscriptions, etc) if not local_dir: local_dir = seven.get_system_temp_directory() self.local_manager = LocalComputeLogManager(local_dir) self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)
def test_execute_display_command():
    with tempfile.TemporaryDirectory() as temp_dir:
        run_store = SqliteRunStorage.from_local(temp_dir)
        event_store = ConsolidatedSqliteEventLogStorage(temp_dir)
        compute_log_manager = LocalComputeLogManager(temp_dir)
        instance = DagsterInstance(
            instance_type=InstanceType.PERSISTENT,
            local_artifact_storage=LocalArtifactStorage(temp_dir),
            run_storage=run_store,
            event_storage=event_store,
            compute_log_manager=compute_log_manager,
            run_coordinator=DefaultRunCoordinator(),
            run_launcher=DefaultRunLauncher(),
        )
        run_config = {
            "solids": {
                "create_string_1_asset": {"config": {"input_str": "apple"}},
                "take_string_1_asset": {"config": {"input_str": "apple"}},
            },
            "resources": {"object_manager": {"config": {"base_dir": temp_dir}}},
        }

        # write run config to temp file
        # file is temp because intermediate storage directory is temporary
        with open(os.path.join(temp_dir, "pipeline_config.yaml"), "w") as f:
            f.write(yaml.dump(run_config))

        kwargs = {
            "config": (os.path.join(temp_dir, "pipeline_config.yaml"),),
            "pipeline": "asset_pipeline",
            "python_file": file_relative_path(
                __file__, "../../core_tests/execution_tests/memoized_dev_loop_pipeline.py"
            ),
            "tags": '{"dagster/is_memoized_run": "true"}',
        }

        with Capturing() as output:
            execute_list_versions_command(kwargs=kwargs, instance=instance)

        assert output

        # execute the pipeline once so that addresses have been populated.
        result = execute_pipeline(
            asset_pipeline,
            run_config=run_config,
            mode="only_mode",
            tags={"dagster/is_memoized_run": "true"},
            instance=instance,
        )
        assert result.success

        with Capturing() as output:
            execute_list_versions_command(kwargs=kwargs, instance=instance)

        assert output
def _sqlite_asset_instance():
    with seven.TemporaryDirectory() as temp_dir:
        instance = DagsterInstance(
            instance_type=InstanceType.EPHEMERAL,
            local_artifact_storage=LocalArtifactStorage(temp_dir),
            run_storage=InMemoryRunStorage(),
            event_storage=ConsolidatedSqliteEventLogStorage(temp_dir),
            compute_log_manager=LocalComputeLogManager(temp_dir),
            run_launcher=SyncInMemoryRunLauncher(),
        )
        yield instance
def _readonly_in_memory_instance():
    with seven.TemporaryDirectory() as temp_dir:
        yield DagsterInstance(
            instance_type=InstanceType.EPHEMERAL,
            local_artifact_storage=LocalArtifactStorage(temp_dir),
            run_storage=InMemoryRunStorage(),
            event_storage=InMemoryEventLogStorage(),
            compute_log_manager=LocalComputeLogManager(temp_dir),
            run_launcher=ExplodingRunLauncher(),
            schedule_storage=SqliteScheduleStorage.from_local(temp_dir),
        )
def _in_memory_instance():
    with tempfile.TemporaryDirectory() as temp_dir:
        yield DagsterInstance(
            instance_type=InstanceType.EPHEMERAL,
            local_artifact_storage=LocalArtifactStorage(temp_dir),
            run_storage=InMemoryRunStorage(),
            event_storage=InMemoryEventLogStorage(),
            compute_log_manager=LocalComputeLogManager(temp_dir),
            run_launcher=SyncInMemoryRunLauncher(),
            run_coordinator=DefaultRunCoordinator(),
            schedule_storage=SqliteScheduleStorage.from_local(temp_dir),
        )
def _sqlite_asset_instance():
    with tempfile.TemporaryDirectory() as temp_dir:
        instance = DagsterInstance(
            instance_type=InstanceType.EPHEMERAL,
            local_artifact_storage=LocalArtifactStorage(temp_dir),
            run_storage=InMemoryRunStorage(),
            event_storage=ConsolidatedSqliteEventLogStorage(temp_dir),
            compute_log_manager=LocalComputeLogManager(temp_dir),
            run_coordinator=DefaultRunCoordinator(),
            run_launcher=SyncInMemoryRunLauncher(),
            scheduler=FilesystemTestScheduler(temp_dir),
        )
        yield instance
def _postgres_instance():
    with seven.TemporaryDirectory() as temp_dir:
        with graphql_postgres_instance() as pg_conn_string:
            yield DagsterInstance(
                instance_type=InstanceType.EPHEMERAL,
                local_artifact_storage=LocalArtifactStorage(temp_dir),
                run_storage=TestPostgresInstance.clean_run_storage(pg_conn_string),
                event_storage=TestPostgresInstance.clean_event_log_storage(pg_conn_string),
                compute_log_manager=LocalComputeLogManager(temp_dir),
                run_launcher=SyncInMemoryRunLauncher(),
                schedule_storage=TestPostgresInstance.clean_schedule_storage(pg_conn_string),
            )
def get_ephemeral_instance(temp_dir):
    run_store = SqliteRunStorage.from_local(temp_dir)
    event_store = ConsolidatedSqliteEventLogStorage(temp_dir)
    compute_log_manager = LocalComputeLogManager(temp_dir)
    instance = DagsterInstance(
        instance_type=InstanceType.PERSISTENT,
        local_artifact_storage=LocalArtifactStorage(temp_dir),
        run_storage=run_store,
        event_storage=event_store,
        compute_log_manager=compute_log_manager,
        run_launcher=DefaultRunLauncher(),
        run_coordinator=DefaultRunCoordinator(),
    )
    return instance
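# Minimal usage sketch for the helper above (an assumption for illustration,
# mirroring how the surrounding tests root an instance in a temp directory and
# then pass it to execute_pipeline(..., instance=instance)).
import tempfile

with tempfile.TemporaryDirectory() as temp_dir:
    instance = get_ephemeral_instance(temp_dir)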
def _postgres_instance_with_grpc_api_hijack():
    with seven.TemporaryDirectory() as temp_dir:
        with graphql_postgres_instance() as pg_conn_string:
            instance = DagsterInstance(
                instance_type=InstanceType.EPHEMERAL,
                local_artifact_storage=LocalArtifactStorage(temp_dir),
                run_storage=TestPostgresInstance.clean_run_storage(pg_conn_string),
                event_storage=TestPostgresInstance.clean_event_log_storage(pg_conn_string),
                compute_log_manager=LocalComputeLogManager(temp_dir),
                run_launcher=EphemeralGrpcRunLauncher(),
                schedule_storage=TestPostgresInstance.clean_schedule_storage(pg_conn_string),
            )
            try:
                yield instance
            finally:
                instance.run_launcher.join()
def test_fs_stores():
    @pipeline
    def simple():
        @solid
        def easy(context):
            context.log.info("easy")
            return "easy"

        easy()

    with tempfile.TemporaryDirectory() as temp_dir:
        with environ({"DAGSTER_HOME": temp_dir}):
            run_store = SqliteRunStorage.from_local(temp_dir)
            event_store = SqliteEventLogStorage(temp_dir)
            compute_log_manager = LocalComputeLogManager(temp_dir)
            instance = DagsterInstance(
                instance_type=InstanceType.PERSISTENT,
                local_artifact_storage=LocalArtifactStorage(temp_dir),
                run_storage=run_store,
                event_storage=event_store,
                compute_log_manager=compute_log_manager,
                run_coordinator=DefaultRunCoordinator(),
                run_launcher=DefaultRunLauncher(),
                ref=InstanceRef.from_dir(temp_dir),
                settings={"telemetry": {"enabled": False}},
            )

            result = execute_pipeline(simple, instance=instance)

            assert run_store.has_run(result.run_id)
            assert run_store.get_run_by_id(result.run_id).status == PipelineRunStatus.SUCCESS
            assert DagsterEventType.PIPELINE_SUCCESS in [
                event.dagster_event.event_type
                for event in event_store.get_logs_for_run(result.run_id)
                if event.is_dagster_event
            ]
            stats = event_store.get_stats_for_run(result.run_id)
            assert stats.steps_succeeded == 1
            assert stats.end_time is not None
class GCSComputeLogManager(ComputeLogManager, ConfigurableClass):
    """Logs op compute function stdout and stderr to GCS.

    Users should not instantiate this class directly. Instead, use a YAML block in ``dagster.yaml``
    such as the following:

    .. code-block:: YAML

        compute_logs:
          module: dagster_gcp.gcs.compute_log_manager
          class: GCSComputeLogManager
          config:
            bucket: "mycorp-dagster-compute-logs"
            local_dir: "/tmp/cool"
            prefix: "dagster-test-"

    Args:
        bucket (str): The name of the gcs bucket to which to log.
        local_dir (Optional[str]): Path to the local directory in which to stage logs. Default:
            ``dagster.seven.get_system_temp_directory()``.
        prefix (Optional[str]): Prefix for the log file keys.
        json_credentials_envvar (Optional[str]): Env variable that contains the JSON with a private
            key and other credentials information. If this is set, GOOGLE_APPLICATION_CREDENTIALS
            will be ignored. Can be used when the private key cannot be used as a file.
        inst_data (Optional[ConfigurableClassData]): Serializable representation of the compute
            log manager when newed up from config.
    """

    def __init__(
        self,
        bucket,
        local_dir=None,
        inst_data=None,
        prefix="dagster",
        json_credentials_envvar=None,
    ):
        self._bucket_name = check.str_param(bucket, "bucket")
        self._prefix = check.str_param(prefix, "prefix")

        if json_credentials_envvar:
            json_info_str = os.environ.get(json_credentials_envvar)
            credentials_info = json.loads(json_info_str)
            self._bucket = (
                storage.Client()
                .from_service_account_info(credentials_info)
                .bucket(self._bucket_name)
            )
        else:
            self._bucket = storage.Client().bucket(self._bucket_name)

        # Check if the bucket exists
        check.invariant(self._bucket.exists())

        # proxy calls to local compute log manager (for subscriptions, etc)
        if not local_dir:
            local_dir = seven.get_system_temp_directory()

        self.local_manager = LocalComputeLogManager(local_dir)
        self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)

    @contextmanager
    def _watch_logs(self, pipeline_run, step_key=None):
        # proxy watching to the local compute log manager, interacting with the filesystem
        with self.local_manager._watch_logs(  # pylint: disable=protected-access
            pipeline_run, step_key
        ):
            yield

    @property
    def inst_data(self):
        return self._inst_data

    @classmethod
    def config_type(cls):
        return {
            "bucket": StringSource,
            "local_dir": Field(StringSource, is_required=False),
            "prefix": Field(StringSource, is_required=False, default_value="dagster"),
            "json_credentials_envvar": Field(StringSource, is_required=False),
        }

    @staticmethod
    def from_config_value(inst_data, config_value):
        return GCSComputeLogManager(inst_data=inst_data, **config_value)

    def get_local_path(self, run_id, key, io_type):
        return self.local_manager.get_local_path(run_id, key, io_type)

    def on_watch_start(self, pipeline_run, step_key):
        self.local_manager.on_watch_start(pipeline_run, step_key)

    def on_watch_finish(self, pipeline_run, step_key):
        self.local_manager.on_watch_finish(pipeline_run, step_key)
        key = self.local_manager.get_key(pipeline_run, step_key)
        self._upload_from_local(pipeline_run.run_id, key, ComputeIOType.STDOUT)
        self._upload_from_local(pipeline_run.run_id, key, ComputeIOType.STDERR)

    def is_watch_completed(self, run_id, key):
        return self.local_manager.is_watch_completed(run_id, key)

    def download_url(self, run_id, key, io_type):
        if not self.is_watch_completed(run_id, key):
            return self.local_manager.download_url(run_id, key, io_type)
        url = self._bucket.blob(self._bucket_key(run_id, key, io_type)).generate_signed_url(
            expiration=3600  # match S3 default expiration
        )
        return url

    def read_logs_file(self, run_id, key, io_type, cursor=0, max_bytes=MAX_BYTES_FILE_READ):
        if self._should_download(run_id, key, io_type):
            self._download_to_local(run_id, key, io_type)
        data = self.local_manager.read_logs_file(run_id, key, io_type, cursor, max_bytes)
        return self._from_local_file_data(run_id, key, io_type, data)

    def on_subscribe(self, subscription):
        self.local_manager.on_subscribe(subscription)

    def on_unsubscribe(self, subscription):
        self.local_manager.on_unsubscribe(subscription)

    def _should_download(self, run_id, key, io_type):
        local_path = self.get_local_path(run_id, key, io_type)
        if os.path.exists(local_path):
            return False
        return self._bucket.blob(self._bucket_key(run_id, key, io_type)).exists()

    def _from_local_file_data(self, run_id, key, io_type, local_file_data):
        is_complete = self.is_watch_completed(run_id, key)
        path = (
            "gs://{}/{}".format(self._bucket_name, self._bucket_key(run_id, key, io_type))
            if is_complete
            else local_file_data.path
        )

        return ComputeLogFileData(
            path,
            local_file_data.data,
            local_file_data.cursor,
            local_file_data.size,
            self.download_url(run_id, key, io_type),
        )

    def _upload_from_local(self, run_id, key, io_type):
        path = self.get_local_path(run_id, key, io_type)
        ensure_file(path)
        with open(path, "rb") as data:
            self._bucket.blob(self._bucket_key(run_id, key, io_type)).upload_from_file(data)

    def _download_to_local(self, run_id, key, io_type):
        path = self.get_local_path(run_id, key, io_type)
        ensure_dir(os.path.dirname(path))
        with open(path, "wb") as fileobj:
            self._bucket.blob(self._bucket_key(run_id, key, io_type)).download_to_file(fileobj)

    def _bucket_key(self, run_id, key, io_type):
        check.inst_param(io_type, "io_type", ComputeIOType)
        extension = IO_TYPE_EXTENSION[io_type]
        paths = [
            self._prefix,
            "storage",
            run_id,
            "compute_logs",
            "{}.{}".format(key, extension),
        ]
        return "/".join(paths)  # path delimiter

    def dispose(self):
        self.local_manager.dispose()
class AzureBlobComputeLogManager(ComputeLogManager, ConfigurableClass):
    '''Logs solid compute function stdout and stderr to Azure Blob Storage.

    This is also compatible with Azure Data Lake Storage.

    Users should not instantiate this class directly. Instead, use a YAML block in ``dagster.yaml``
    such as the following:

    .. code-block:: YAML

        compute_logs:
          module: dagster_azure.blob.compute_log_manager
          class: AzureBlobComputeLogManager
          config:
            storage_account: my-storage-account
            container: my-container
            credential: sas-token-or-secret-key
            prefix: "dagster-test-"
            local_dir: "/tmp/cool"

    Args:
        storage_account (str): The storage account name to which to log.
        container (str): The container (or ADLS2 filesystem) to which to log.
        secret_key (str): Secret key for the storage account. SAS tokens are not
            supported because we need a secret key to generate a SAS token for a download URL.
        local_dir (Optional[str]): Path to the local directory in which to stage logs. Default:
            ``dagster.seven.get_system_temp_directory()``.
        prefix (Optional[str]): Prefix for the log file keys.
        inst_data (Optional[ConfigurableClassData]): Serializable representation of the compute
            log manager when newed up from config.
    '''

    def __init__(
        self,
        storage_account,
        container,
        secret_key,
        local_dir=None,
        inst_data=None,
        prefix='dagster',
    ):
        self._storage_account = check.str_param(storage_account, 'storage_account')
        self._container = check.str_param(container, 'container')
        self._blob_prefix = check.str_param(prefix, 'prefix')
        check.str_param(secret_key, 'secret_key')

        self._blob_client = create_blob_client(storage_account, secret_key)
        self._container_client = self._blob_client.get_container_client(container)
        self._download_urls = {}

        # proxy calls to local compute log manager (for subscriptions, etc)
        if not local_dir:
            local_dir = seven.get_system_temp_directory()

        self.local_manager = LocalComputeLogManager(local_dir)
        self._inst_data = check.opt_inst_param(inst_data, 'inst_data', ConfigurableClassData)

    @contextmanager
    def _watch_logs(self, pipeline_run, step_key=None):
        # proxy watching to the local compute log manager, interacting with the filesystem
        with self.local_manager._watch_logs(  # pylint: disable=protected-access
            pipeline_run, step_key
        ):
            yield

    @property
    def inst_data(self):
        return self._inst_data

    @classmethod
    def config_type(cls):
        return {
            'storage_account': str,
            'container': str,
            'secret_key': str,
            'local_dir': Field(str, is_required=False),
            'prefix': Field(str, is_required=False, default_value='dagster'),
        }

    @staticmethod
    def from_config_value(inst_data, config_value):
        return AzureBlobComputeLogManager(inst_data=inst_data, **config_value)

    def get_local_path(self, run_id, key, io_type):
        return self.local_manager.get_local_path(run_id, key, io_type)

    def on_watch_start(self, pipeline_run, step_key):
        self.local_manager.on_watch_start(pipeline_run, step_key)

    def on_watch_finish(self, pipeline_run, step_key):
        self.local_manager.on_watch_finish(pipeline_run, step_key)
        key = self.local_manager.get_key(pipeline_run, step_key)
        self._upload_from_local(pipeline_run.run_id, key, ComputeIOType.STDOUT)
        self._upload_from_local(pipeline_run.run_id, key, ComputeIOType.STDERR)

    def is_watch_completed(self, run_id, key):
        return self.local_manager.is_watch_completed(run_id, key)

    def download_url(self, run_id, key, io_type):
        if not self.is_watch_completed(run_id, key):
            return self.local_manager.download_url(run_id, key, io_type)
        key = self._blob_key(run_id, key, io_type)
        if key in self._download_urls:
            return self._download_urls[key]
        blob = self._container_client.get_blob_client(key)
        sas = generate_blob_sas(
            self._storage_account,
            self._container,
            key,
            account_key=self._blob_client.credential.account_key,
        )
        url = blob.url + sas
        self._download_urls[key] = url
        return url

    def read_logs_file(self, run_id, key, io_type, cursor=0, max_bytes=MAX_BYTES_FILE_READ):
        if self._should_download(run_id, key, io_type):
            self._download_to_local(run_id, key, io_type)
        data = self.local_manager.read_logs_file(run_id, key, io_type, cursor, max_bytes)
        return self._from_local_file_data(run_id, key, io_type, data)

    def on_subscribe(self, subscription):
        self.local_manager.on_subscribe(subscription)

    def _should_download(self, run_id, key, io_type):
        local_path = self.get_local_path(run_id, key, io_type)
        if os.path.exists(local_path):
            return False
        blob_objects = self._container_client.list_blobs(self._blob_key(run_id, key, io_type))
        # Limit the generator to avoid paging since we only need one element
        # to return True
        limited_blob_objects = itertools.islice(blob_objects, 1)
        return len(list(limited_blob_objects)) > 0

    def _from_local_file_data(self, run_id, key, io_type, local_file_data):
        is_complete = self.is_watch_completed(run_id, key)
        path = (
            'https://{account}.blob.core.windows.net/{container}/{key}'.format(
                account=self._storage_account,
                container=self._container,
                key=self._blob_key(run_id, key, io_type),
            )
            if is_complete
            else local_file_data.path
        )

        return ComputeLogFileData(
            path,
            local_file_data.data,
            local_file_data.cursor,
            local_file_data.size,
            self.download_url(run_id, key, io_type),
        )

    def _upload_from_local(self, run_id, key, io_type):
        path = self.get_local_path(run_id, key, io_type)
        ensure_file(path)
        key = self._blob_key(run_id, key, io_type)
        with open(path, 'rb') as data:
            blob = self._container_client.get_blob_client(key)
            blob.upload_blob(data)

    def _download_to_local(self, run_id, key, io_type):
        path = self.get_local_path(run_id, key, io_type)
        ensure_dir(os.path.dirname(path))
        key = self._blob_key(run_id, key, io_type)
        with open(path, 'wb') as fileobj:
            blob = self._container_client.get_blob_client(key)
            blob.download_blob().readinto(fileobj)

    def _blob_key(self, run_id, key, io_type):
        check.inst_param(io_type, 'io_type', ComputeIOType)
        extension = IO_TYPE_EXTENSION[io_type]
        paths = [
            self._blob_prefix,
            'storage',
            run_id,
            'compute_logs',
            '{}.{}'.format(key, extension),
        ]
        return '/'.join(paths)  # blob path delimiter
class S3ComputeLogManager(ComputeLogManager, ConfigurableClass):
    '''Logs solid compute function stdout and stderr to S3.

    Users should not instantiate this class directly. Instead, use a YAML block in ``dagster.yaml``
    such as the following:

    .. code-block:: YAML

        compute_log_manager:
          module: dagster_aws.s3.compute_log_manager
          class: S3ComputeLogManager
          config:
            bucket: "mycorp-dagster-compute-logs"
            local_dir: "/tmp/cool"
            prefix: "dagster-test-"

    Args:
        bucket (str): The name of the s3 bucket to which to log.
        local_dir (Optional[str]): Path to the local directory in which to stage logs. Default:
            ``dagster.seven.get_system_temp_directory()``.
        prefix (Optional[str]): Prefix for the log file keys.
        inst_data (Optional[ConfigurableClassData]): Serializable representation of the compute
            log manager when newed up from config.
    '''

    def __init__(self, bucket, local_dir=None, inst_data=None, prefix='dagster'):
        self._s3_session = create_s3_session()
        self._s3_bucket = check.str_param(bucket, 'bucket')
        self._s3_prefix = check.str_param(prefix, 'prefix')
        self._download_urls = {}

        # proxy calls to local compute log manager (for subscriptions, etc)
        if not local_dir:
            local_dir = seven.get_system_temp_directory()

        self.local_manager = LocalComputeLogManager(local_dir)
        self._inst_data = check.opt_inst_param(inst_data, 'inst_data', ConfigurableClassData)

    @property
    def inst_data(self):
        return self._inst_data

    @classmethod
    def config_type(cls):
        return {
            'bucket': str,
            'local_dir': Field(str, is_required=False),
            'prefix': Field(str, is_required=False, default_value='dagster'),
        }

    @staticmethod
    def from_config_value(inst_data, config_value):
        return S3ComputeLogManager(inst_data=inst_data, **config_value)

    def get_local_path(self, run_id, step_key, io_type):
        return self.local_manager.get_local_path(run_id, step_key, io_type)

    def on_compute_start(self, step_context):
        self.local_manager.on_compute_start(step_context)

    def on_compute_finish(self, step_context):
        self.local_manager.on_compute_finish(step_context)
        self._upload_from_local(step_context.run_id, step_context.step.key, ComputeIOType.STDOUT)
        self._upload_from_local(step_context.run_id, step_context.step.key, ComputeIOType.STDERR)

    def is_compute_completed(self, run_id, step_key):
        return self.local_manager.is_compute_completed(run_id, step_key)

    def download_url(self, run_id, step_key, io_type):
        if not self.is_compute_completed(run_id, step_key):
            return self.local_manager.download_url(run_id, step_key, io_type)
        key = self._bucket_key(run_id, step_key, io_type)
        if key in self._download_urls:
            return self._download_urls[key]
        url = self._s3_session.generate_presigned_url(
            ClientMethod='get_object', Params={'Bucket': self._s3_bucket, 'Key': key}
        )
        self._download_urls[key] = url
        return url

    def read_logs_file(self, run_id, step_key, io_type, cursor=0, max_bytes=MAX_BYTES_FILE_READ):
        if self._should_download(run_id, step_key, io_type):
            self._download_to_local(run_id, step_key, io_type)
        data = self.local_manager.read_logs_file(run_id, step_key, io_type, cursor, max_bytes)
        return self._from_local_file_data(run_id, step_key, io_type, data)

    def on_subscribe(self, subscription):
        self.local_manager.on_subscribe(subscription)

    def _should_download(self, run_id, step_key, io_type):
        local_path = self.get_local_path(run_id, step_key, io_type)
        if os.path.exists(local_path):
            return False
        s3_objects = self._s3_session.list_objects(
            Bucket=self._s3_bucket, Prefix=self._bucket_key(run_id, step_key, io_type)
        )
        return len(s3_objects) > 0

    def _from_local_file_data(self, run_id, step_key, io_type, local_file_data):
        is_complete = self.is_compute_completed(run_id, step_key)
        path = (
            's3://{}/{}'.format(self._s3_bucket, self._bucket_key(run_id, step_key, io_type))
            if is_complete
            else local_file_data.path
        )

        return ComputeLogFileData(
            path,
            local_file_data.data,
            local_file_data.cursor,
            local_file_data.size,
            self.download_url(run_id, step_key, io_type),
        )

    def _upload_from_local(self, run_id, step_key, io_type):
        path = self.get_local_path(run_id, step_key, io_type)
        ensure_file(path)
        key = self._bucket_key(run_id, step_key, io_type)
        with open(path, 'rb') as data:
            self._s3_session.upload_fileobj(data, self._s3_bucket, key)

    def _download_to_local(self, run_id, step_key, io_type):
        path = self.get_local_path(run_id, step_key, io_type)
        ensure_dir(os.path.dirname(path))
        with open(path, 'wb') as fileobj:
            self._s3_session.download_fileobj(
                self._s3_bucket, self._bucket_key(run_id, step_key, io_type), fileobj
            )

    def _bucket_key(self, run_id, step_key, io_type):
        check.inst_param(io_type, 'io_type', ComputeIOType)
        extension = IO_TYPE_EXTENSION[io_type]
        paths = [
            self._s3_prefix,
            'storage',
            run_id,
            'compute_logs',
            '{}.{}'.format(step_key, extension),
        ]
        return '/'.join(paths)  # s3 path delimiter
def test_dev_loop_changing_versions():
    with seven.TemporaryDirectory() as temp_dir:
        run_store = SqliteRunStorage.from_local(temp_dir)
        event_store = ConsolidatedSqliteEventLogStorage(temp_dir)
        compute_log_manager = LocalComputeLogManager(temp_dir)
        instance = DagsterInstance(
            instance_type=InstanceType.PERSISTENT,
            local_artifact_storage=LocalArtifactStorage(temp_dir),
            run_storage=run_store,
            event_storage=event_store,
            compute_log_manager=compute_log_manager,
            run_launcher=CliApiRunLauncher(),
        )

        run_config = {
            "solids": {
                "create_string_1": {"config": {"input_str": "apple", "base_dir": temp_dir}},
                "create_string_2": {"config": {"input_str": "apple", "base_dir": temp_dir}},
                "take_string_1": {"config": {"input_str": "apple", "base_dir": temp_dir}},
                "take_string_2": {"config": {"input_str": "apple", "base_dir": temp_dir}},
                "take_string_two_inputs": {"config": {"input_str": "apple", "base_dir": temp_dir}},
            },
            "intermediate_storage": {"filesystem": {"config": {"base_dir": temp_dir}}},
        }

        result = execute_pipeline(
            basic_pipeline,
            run_config=run_config,
            mode="only_mode",
            tags={"dagster/is_memoized_run": "true"},
            instance=instance,
        )
        assert result.success
        assert not get_step_keys_to_execute(instance, basic_pipeline, run_config, "only_mode")

        run_config["solids"]["take_string_1"]["config"]["input_str"] = "banana"
        assert set(
            get_step_keys_to_execute(instance, basic_pipeline, run_config, "only_mode")
        ) == set(["take_string_1.compute", "take_string_two_inputs.compute"])

        result2 = execute_pipeline(
            basic_pipeline,
            run_config=run_config,
            mode="only_mode",
            tags={"dagster/is_memoized_run": "true"},
            instance=instance,
        )
        assert result2.success
        assert not get_step_keys_to_execute(instance, basic_pipeline, run_config, "only_mode")

        run_config["solids"]["take_string_two_inputs"]["config"]["input_str"] = "banana"
        assert get_step_keys_to_execute(instance, basic_pipeline, run_config, "only_mode") == [
            "take_string_two_inputs.compute"
        ]

        result3 = execute_pipeline(
            basic_pipeline,
            run_config=run_config,
            mode="only_mode",
            tags={"dagster/is_memoized_run": "true"},
            instance=instance,
        )
        assert result3.success
        assert not get_step_keys_to_execute(instance, basic_pipeline, run_config, "only_mode")
class S3ComputeLogManager(ComputeLogManager, ConfigurableClass):
    """Logs compute function stdout and stderr to S3.

    Users should not instantiate this class directly. Instead, use a YAML block in ``dagster.yaml``
    such as the following:

    .. code-block:: YAML

        compute_logs:
          module: dagster_aws.s3.compute_log_manager
          class: S3ComputeLogManager
          config:
            bucket: "mycorp-dagster-compute-logs"
            local_dir: "/tmp/cool"
            prefix: "dagster-test-"
            use_ssl: true
            verify: true
            verify_cert_path: "/path/to/cert/bundle.pem"
            endpoint_url: "http://alternate-s3-host.io"
            skip_empty_files: true

    Args:
        bucket (str): The name of the s3 bucket to which to log.
        local_dir (Optional[str]): Path to the local directory in which to stage logs. Default:
            ``dagster.seven.get_system_temp_directory()``.
        prefix (Optional[str]): Prefix for the log file keys.
        use_ssl (Optional[bool]): Whether or not to use SSL. Default True.
        verify (Optional[bool]): Whether or not to verify SSL certificates. Default True.
        verify_cert_path (Optional[str]): A filename of the CA cert bundle to use. Only used if
            `verify` set to False.
        endpoint_url (Optional[str]): Override for the S3 endpoint url.
        skip_empty_files (Optional[bool]): Skip upload of empty log files.
        inst_data (Optional[ConfigurableClassData]): Serializable representation of the compute
            log manager when newed up from config.
    """

    def __init__(
        self,
        bucket,
        local_dir=None,
        inst_data=None,
        prefix="dagster",
        use_ssl=True,
        verify=True,
        verify_cert_path=None,
        endpoint_url=None,
        skip_empty_files=False,
    ):
        _verify = False if not verify else verify_cert_path
        self._s3_session = boto3.resource(
            "s3", use_ssl=use_ssl, verify=_verify, endpoint_url=endpoint_url
        ).meta.client
        self._s3_bucket = check.str_param(bucket, "bucket")
        self._s3_prefix = check.str_param(prefix, "prefix")

        # proxy calls to local compute log manager (for subscriptions, etc)
        if not local_dir:
            local_dir = seven.get_system_temp_directory()

        self.local_manager = LocalComputeLogManager(local_dir)
        self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)
        self._skip_empty_files = check.bool_param(skip_empty_files, "skip_empty_files")

    @contextmanager
    def _watch_logs(self, pipeline_run, step_key=None):
        # proxy watching to the local compute log manager, interacting with the filesystem
        with self.local_manager._watch_logs(  # pylint: disable=protected-access
            pipeline_run, step_key
        ):
            yield

    @property
    def inst_data(self):
        return self._inst_data

    @classmethod
    def config_type(cls):
        return {
            "bucket": StringSource,
            "local_dir": Field(StringSource, is_required=False),
            "prefix": Field(StringSource, is_required=False, default_value="dagster"),
            "use_ssl": Field(bool, is_required=False, default_value=True),
            "verify": Field(bool, is_required=False, default_value=True),
            "verify_cert_path": Field(StringSource, is_required=False),
            "endpoint_url": Field(StringSource, is_required=False),
            "skip_empty_files": Field(bool, is_required=False, default_value=False),
        }

    @staticmethod
    def from_config_value(inst_data, config_value):
        return S3ComputeLogManager(inst_data=inst_data, **config_value)

    def get_local_path(self, run_id, key, io_type):
        return self.local_manager.get_local_path(run_id, key, io_type)

    def on_watch_start(self, pipeline_run, step_key):
        self.local_manager.on_watch_start(pipeline_run, step_key)

    def on_watch_finish(self, pipeline_run, step_key):
        self.local_manager.on_watch_finish(pipeline_run, step_key)
        key = self.local_manager.get_key(pipeline_run, step_key)
        self._upload_from_local(pipeline_run.run_id, key, ComputeIOType.STDOUT)
        self._upload_from_local(pipeline_run.run_id, key, ComputeIOType.STDERR)

    def is_watch_completed(self, run_id, key):
        return self.local_manager.is_watch_completed(run_id, key)

    def download_url(self, run_id, key, io_type):
        if not self.is_watch_completed(run_id, key):
            return self.local_manager.download_url(run_id, key, io_type)
        key = self._bucket_key(run_id, key, io_type)
        url = self._s3_session.generate_presigned_url(
            ClientMethod="get_object", Params={"Bucket": self._s3_bucket, "Key": key}
        )
        return url

    def read_logs_file(self, run_id, key, io_type, cursor=0, max_bytes=MAX_BYTES_FILE_READ):
        if self._should_download(run_id, key, io_type):
            self._download_to_local(run_id, key, io_type)
        data = self.local_manager.read_logs_file(run_id, key, io_type, cursor, max_bytes)
        return self._from_local_file_data(run_id, key, io_type, data)

    def on_subscribe(self, subscription):
        self.local_manager.on_subscribe(subscription)

    def on_unsubscribe(self, subscription):
        self.local_manager.on_unsubscribe(subscription)

    def _should_download(self, run_id, key, io_type):
        local_path = self.get_local_path(run_id, key, io_type)
        if os.path.exists(local_path):
            return False

        try:  # https://stackoverflow.com/a/38376288/14656695
            self._s3_session.head_object(
                Bucket=self._s3_bucket, Key=self._bucket_key(run_id, key, io_type)
            )
        except ClientError:
            return False

        return True

    def _from_local_file_data(self, run_id, key, io_type, local_file_data):
        is_complete = self.is_watch_completed(run_id, key)
        path = (
            "s3://{}/{}".format(self._s3_bucket, self._bucket_key(run_id, key, io_type))
            if is_complete
            else local_file_data.path
        )

        return ComputeLogFileData(
            path,
            local_file_data.data,
            local_file_data.cursor,
            local_file_data.size,
            self.download_url(run_id, key, io_type),
        )

    def _upload_from_local(self, run_id, key, io_type):
        path = self.get_local_path(run_id, key, io_type)
        ensure_file(path)
        if self._skip_empty_files and os.stat(path).st_size == 0:
            return

        key = self._bucket_key(run_id, key, io_type)
        with open(path, "rb") as data:
            self._s3_session.upload_fileobj(data, self._s3_bucket, key)

    def _download_to_local(self, run_id, key, io_type):
        path = self.get_local_path(run_id, key, io_type)
        ensure_dir(os.path.dirname(path))
        with open(path, "wb") as fileobj:
            self._s3_session.download_fileobj(
                self._s3_bucket, self._bucket_key(run_id, key, io_type), fileobj
            )

    def _bucket_key(self, run_id, key, io_type):
        check.inst_param(io_type, "io_type", ComputeIOType)
        extension = IO_TYPE_EXTENSION[io_type]
        paths = [
            self._s3_prefix,
            "storage",
            run_id,
            "compute_logs",
            "{}.{}".format(key, extension),
        ]
        return "/".join(paths)  # s3 path delimiter

    def dispose(self):
        self.local_manager.dispose()
class S3ComputeLogManager(ComputeLogManager, ConfigurableClass):
    def __init__(self, bucket, local_dir=None, inst_data=None, prefix='dagster'):
        self._s3_session = create_s3_session()
        self._s3_bucket = check.str_param(bucket, 'bucket')
        self._s3_prefix = check.str_param(prefix, 'prefix')
        self._download_urls = {}

        # proxy calls to local compute log manager (for subscriptions, etc)
        if not local_dir:
            local_dir = seven.get_system_temp_directory()

        self.local_manager = LocalComputeLogManager(local_dir)
        self._inst_data = check.opt_inst_param(inst_data, 'inst_data', ConfigurableClassData)

    @property
    def inst_data(self):
        return self._inst_data

    @classmethod
    def config_type(cls):
        return {
            'bucket': str,
            'local_dir': Field(str, is_optional=True),
            'prefix': Field(str, is_optional=True, default_value='dagster'),
        }

    @staticmethod
    def from_config_value(inst_data, config_value):
        return S3ComputeLogManager(inst_data=inst_data, **config_value)

    def get_local_path(self, run_id, step_key, io_type):
        return self.local_manager.get_local_path(run_id, step_key, io_type)

    def on_compute_start(self, step_context):
        self.local_manager.on_compute_start(step_context)

    def on_compute_finish(self, step_context):
        self.local_manager.on_compute_finish(step_context)
        self._upload_from_local(step_context.run_id, step_context.step.key, ComputeIOType.STDOUT)
        self._upload_from_local(step_context.run_id, step_context.step.key, ComputeIOType.STDERR)

    def is_compute_completed(self, run_id, step_key):
        return self.local_manager.is_compute_completed(run_id, step_key)

    def download_url(self, run_id, step_key, io_type):
        if not self.is_compute_completed(run_id, step_key):
            return self.local_manager.download_url(run_id, step_key, io_type)
        key = self._bucket_key(run_id, step_key, io_type)
        if key in self._download_urls:
            return self._download_urls[key]
        url = self._s3_session.generate_presigned_url(
            ClientMethod='get_object', Params={'Bucket': self._s3_bucket, 'Key': key}
        )
        self._download_urls[key] = url
        return url

    def read_logs_file(self, run_id, step_key, io_type, cursor=0, max_bytes=MAX_BYTES_FILE_READ):
        if self._should_download(run_id, step_key, io_type):
            self._download_to_local(run_id, step_key, io_type)
        data = self.local_manager.read_logs_file(run_id, step_key, io_type, cursor, max_bytes)
        return self._from_local_file_data(run_id, step_key, io_type, data)

    def on_subscribe(self, subscription):
        self.local_manager.on_subscribe(subscription)

    def _should_download(self, run_id, step_key, io_type):
        local_path = self.get_local_path(run_id, step_key, io_type)
        if os.path.exists(local_path):
            return False
        s3_objects = self._s3_session.list_objects(
            Bucket=self._s3_bucket, Prefix=self._bucket_key(run_id, step_key, io_type)
        )
        return len(s3_objects) > 0

    def _from_local_file_data(self, run_id, step_key, io_type, local_file_data):
        is_complete = self.is_compute_completed(run_id, step_key)
        path = (
            's3://{}/{}'.format(self._s3_bucket, self._bucket_key(run_id, step_key, io_type))
            if is_complete
            else local_file_data.path
        )

        return ComputeLogFileData(
            path,
            local_file_data.data,
            local_file_data.cursor,
            local_file_data.size,
            self.download_url(run_id, step_key, io_type),
        )

    def _upload_from_local(self, run_id, step_key, io_type):
        path = self.get_local_path(run_id, step_key, io_type)
        ensure_file(path)
        key = self._bucket_key(run_id, step_key, io_type)
        with open(path, 'rb') as data:
            self._s3_session.upload_fileobj(data, self._s3_bucket, key)

    def _download_to_local(self, run_id, step_key, io_type):
        path = self.get_local_path(run_id, step_key, io_type)
        ensure_dir(os.path.dirname(path))
        with open(path, 'wb') as fileobj:
            self._s3_session.download_fileobj(
                self._s3_bucket, self._bucket_key(run_id, step_key, io_type), fileobj
            )

    def _bucket_key(self, run_id, step_key, io_type):
        check.inst_param(io_type, 'io_type', ComputeIOType)
        extension = IO_TYPE_EXTENSION[io_type]
        paths = [
            self._s3_prefix,
            'storage',
            run_id,
            'compute_logs',
            '{}.{}'.format(step_key, extension),
        ]
        return '/'.join(paths)  # s3 path delimiter
class S3ComputeLogManager(ComputeLogManager, ConfigurableClass):
    '''Logs solid compute function stdout and stderr to S3.

    Users should not instantiate this class directly. Instead, use a YAML block in ``dagster.yaml``
    such as the following:

    .. code-block:: YAML

        compute_log_manager:
          module: dagster_aws.s3.compute_log_manager
          class: S3ComputeLogManager
          config:
            bucket: "mycorp-dagster-compute-logs"
            local_dir: "/tmp/cool"
            prefix: "dagster-test-"
            use_ssl: true
            verify: true
            verify_cert_path: "/path/to/cert/bundle.pem"
            endpoint_url: "http://alternate-s3-host.io"

    Args:
        bucket (str): The name of the s3 bucket to which to log.
        local_dir (Optional[str]): Path to the local directory in which to stage logs. Default:
            ``dagster.seven.get_system_temp_directory()``.
        prefix (Optional[str]): Prefix for the log file keys.
        use_ssl (Optional[bool]): Whether or not to use SSL. Default True.
        verify (Optional[bool]): Whether or not to verify SSL certificates. Default True.
        verify_cert_path (Optional[str]): A filename of the CA cert bundle to use. Only used if
            `verify` set to False.
        endpoint_url (Optional[str]): Override for the S3 endpoint url.
        inst_data (Optional[ConfigurableClassData]): Serializable representation of the compute
            log manager when newed up from config.
    '''

    def __init__(
        self,
        bucket,
        local_dir=None,
        inst_data=None,
        prefix='dagster',
        use_ssl=True,
        verify=True,
        verify_cert_path=None,
        endpoint_url=None,
    ):
        _verify = False if not verify else verify_cert_path
        self._s3_session = boto3.resource(
            's3', use_ssl=use_ssl, verify=_verify, endpoint_url=endpoint_url
        ).meta.client
        self._s3_bucket = check.str_param(bucket, 'bucket')
        self._s3_prefix = check.str_param(prefix, 'prefix')
        self._download_urls = {}

        # proxy calls to local compute log manager (for subscriptions, etc)
        if not local_dir:
            local_dir = seven.get_system_temp_directory()

        self.local_manager = LocalComputeLogManager(local_dir)
        self._inst_data = check.opt_inst_param(inst_data, 'inst_data', ConfigurableClassData)

    @contextmanager
    def _watch_logs(self, pipeline_run, step_key=None):
        # proxy watching to the local compute log manager, interacting with the filesystem
        with self.local_manager._watch_logs(  # pylint: disable=protected-access
            pipeline_run, step_key
        ):
            yield

    @property
    def inst_data(self):
        return self._inst_data

    @classmethod
    def config_type(cls):
        return {
            'bucket': str,
            'local_dir': Field(str, is_required=False),
            'prefix': Field(str, is_required=False, default_value='dagster'),
            'use_ssl': Field(bool, is_required=False, default_value=True),
            'verify': Field(bool, is_required=False, default_value=True),
            'verify_cert_path': Field(str, is_required=False),
            'endpoint_url': Field(str, is_required=False),
        }

    @staticmethod
    def from_config_value(inst_data, config_value):
        return S3ComputeLogManager(inst_data=inst_data, **config_value)

    def get_local_path(self, run_id, key, io_type):
        return self.local_manager.get_local_path(run_id, key, io_type)

    def on_watch_start(self, pipeline_run, step_key):
        self.local_manager.on_watch_start(pipeline_run, step_key)

    def on_watch_finish(self, pipeline_run, step_key):
        self.local_manager.on_watch_finish(pipeline_run, step_key)
        key = self.local_manager.get_key(pipeline_run, step_key)
        self._upload_from_local(pipeline_run.run_id, key, ComputeIOType.STDOUT)
        self._upload_from_local(pipeline_run.run_id, key, ComputeIOType.STDERR)

    def is_watch_completed(self, run_id, key):
        return self.local_manager.is_watch_completed(run_id, key)

    def download_url(self, run_id, key, io_type):
        if not self.is_watch_completed(run_id, key):
            return self.local_manager.download_url(run_id, key, io_type)
        key = self._bucket_key(run_id, key, io_type)
        if key in self._download_urls:
            return self._download_urls[key]
        url = self._s3_session.generate_presigned_url(
            ClientMethod='get_object', Params={'Bucket': self._s3_bucket, 'Key': key}
        )
        self._download_urls[key] = url
        return url

    def read_logs_file(self, run_id, key, io_type, cursor=0, max_bytes=MAX_BYTES_FILE_READ):
        if self._should_download(run_id, key, io_type):
            self._download_to_local(run_id, key, io_type)
        data = self.local_manager.read_logs_file(run_id, key, io_type, cursor, max_bytes)
        return self._from_local_file_data(run_id, key, io_type, data)

    def on_subscribe(self, subscription):
        self.local_manager.on_subscribe(subscription)

    def _should_download(self, run_id, key, io_type):
        local_path = self.get_local_path(run_id, key, io_type)
        if os.path.exists(local_path):
            return False
        s3_objects = self._s3_session.list_objects(
            Bucket=self._s3_bucket, Prefix=self._bucket_key(run_id, key, io_type)
        )
        return len(s3_objects) > 0

    def _from_local_file_data(self, run_id, key, io_type, local_file_data):
        is_complete = self.is_watch_completed(run_id, key)
        path = (
            's3://{}/{}'.format(self._s3_bucket, self._bucket_key(run_id, key, io_type))
            if is_complete
            else local_file_data.path
        )

        return ComputeLogFileData(
            path,
            local_file_data.data,
            local_file_data.cursor,
            local_file_data.size,
            self.download_url(run_id, key, io_type),
        )

    def _upload_from_local(self, run_id, key, io_type):
        path = self.get_local_path(run_id, key, io_type)
        ensure_file(path)
        key = self._bucket_key(run_id, key, io_type)
        with open(path, 'rb') as data:
            self._s3_session.upload_fileobj(data, self._s3_bucket, key)

    def _download_to_local(self, run_id, key, io_type):
        path = self.get_local_path(run_id, key, io_type)
        ensure_dir(os.path.dirname(path))
        with open(path, 'wb') as fileobj:
            self._s3_session.download_fileobj(
                self._s3_bucket, self._bucket_key(run_id, key, io_type), fileobj
            )

    def _bucket_key(self, run_id, key, io_type):
        check.inst_param(io_type, 'io_type', ComputeIOType)
        extension = IO_TYPE_EXTENSION[io_type]
        paths = [
            self._s3_prefix,
            'storage',
            run_id,
            'compute_logs',
            '{}.{}'.format(key, extension),
        ]
        return '/'.join(paths)  # s3 path delimiter