def download_from_s3(context):
    (bucket, key, target_folder, skip_if_present) = (
        context.solid_config.get(k)
        for k in ('bucket', 'key', 'target_folder', 'skip_if_present')
    )

    # file name is S3 key path suffix after last /
    target_file = os.path.join(target_folder, key.split('/')[-1])

    if skip_if_present and safe_isfile(target_file):
        context.log.info(
            'Skipping download, file already present at {target_file}'.format(
                target_file=target_file
            )
        )
    else:
        if not os.path.exists(target_folder):
            mkdir_p(target_folder)

        context.log.info(
            'Starting download of {bucket}/{key} to {target_file}'.format(
                bucket=bucket, key=key, target_file=target_file
            )
        )

        s3 = boto3.client('s3')
        headers = s3.head_object(Bucket=bucket, Key=key)
        logger = S3Logger(
            context.log.debug, bucket, key, target_file, int(headers['ContentLength'])
        )
        s3.download_file(Bucket=bucket, Key=key, Filename=target_file, Callback=logger)

    return target_file
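# A minimal sketch of the solid_config this solid expects, assuming it is registered with a
# config schema exposing these four keys; the values below are hypothetical placeholders.
example_download_config = {
    'bucket': 'my-bucket',           # hypothetical S3 bucket name
    'key': 'data/raw/events.csv',    # file name becomes the suffix after the last '/'
    'target_folder': '/tmp/downloads',
    'skip_if_present': True,         # short-circuits if the target file already exists
}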
def set_asset(self, context, step_output_handle, obj, asset_metadata):
    """Pickle the data and store the object to a custom file path.

    This method emits an AssetMaterialization event so the assets will be tracked by the
    Asset Catalog.
    """
    check.inst_param(step_output_handle, "step_output_handle", StepOutputHandle)
    path = check.str_param(asset_metadata.get("path"), "asset_metadata.path")

    filepath = self._get_path(path)

    # Ensure path exists
    mkdir_p(os.path.dirname(filepath))

    with open(filepath, self.write_mode) as write_obj:
        pickle.dump(obj, write_obj, PICKLE_PROTOCOL)

    return AssetMaterialization(
        asset_key=AssetKey(
            [
                context.pipeline_def.name,
                step_output_handle.step_key,
                step_output_handle.output_name,
            ]
        ),
        metadata_entries=[EventMetadataEntry.fspath(os.path.abspath(filepath))],
    )
def _load_schedules(self):
    utils.mkdir_p(self._artifacts_dir)

    for file in os.listdir(self._artifacts_dir):
        if not file.endswith('.json'):
            continue
        file_path = os.path.join(self._artifacts_dir, file)
        with open(file_path) as f:
            try:
                data = seven.json.load(f)
                schedule = RunningSchedule(
                    data['schedule_id'],
                    ScheduleDefinition(
                        name=data['name'],
                        cron_schedule=data['cron_schedule'],
                        execution_params=data['execution_params'],
                    ),
                    python_path=data['python_path'],
                    repository_path=data['repository_path'],
                )
                self._schedules[schedule.schedule_definition.name] = schedule
            except Exception as ex:  # pylint: disable=broad-except
                six.raise_from(
                    Exception(
                        'Could not parse dagit schedule from {file_name} in {dir_name}. '
                        '{ex}: {msg}'.format(
                            file_name=file,
                            dir_name=self._artifacts_dir,
                            ex=type(ex).__name__,
                            msg=ex,
                        )
                    ),
                    ex,
                )
def __init__(self, base_dir, inst_data=None):
    """Note that idempotent initialization of the SQLite database is done on a per-run_id
    basis in the body of connect, since each run is stored in a separate database."""
    self._base_dir = os.path.abspath(check.str_param(base_dir, "base_dir"))
    mkdir_p(self._base_dir)

    self._obs = None

    self._watchers = defaultdict(dict)
    self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)

    # Used to ensure that each run ID attempts to initialize its DB the first time it connects,
    # ensuring that the database will be created if it doesn't exist
    self._initialized_dbs = set()

    # Ensure that multiple threads (like the event log watcher) interact safely with each other
    self._db_lock = threading.Lock()

    if not os.path.exists(self.path_for_shard(INDEX_SHARD_NAME)):
        conn_string = self.conn_string_for_shard(INDEX_SHARD_NAME)
        engine = create_engine(conn_string, poolclass=NullPool)
        self._initdb(engine)
        self.reindex()

    super().__init__()
def get_config_dir(config_yaml=None):
    instance = DagsterInstance.get()
    config_type = celery_executor.config_field.config_type
    config_value = get_config_value_from_yaml(config_yaml)

    config_module_name = 'dagster_celery_config'

    config_dir = os.path.join(
        instance.root_directory, 'dagster_celery', 'config', str(uuid.uuid4())
    )
    mkdir_p(config_dir)
    config_path = os.path.join(
        config_dir, '{config_module_name}.py'.format(config_module_name=config_module_name)
    )

    validated_config = validate_config(config_type, config_value).value
    with open(config_path, 'w') as fd:
        if 'broker' in validated_config:
            fd.write(
                'broker_url = \'{broker_url}\'\n'.format(
                    broker_url=str(validated_config['broker'])
                )
            )
        if 'backend' in validated_config:
            fd.write(
                'result_backend = \'{result_backend}\'\n'.format(
                    result_backend=str(validated_config['backend'])
                )
            )
        if 'config_source' in validated_config:
            for key, value in validated_config['config_source'].items():
                fd.write('{key} = {value}\n'.format(key=key, value=repr(value)))

    # n.b. right now we don't attempt to clean up this cache, but it might make sense to delete
    # any files older than some time if there are more than some number of files present, etc.
    return config_dir
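# For illustration: given a validated config like the hypothetical one below, the generated
# dagster_celery_config.py module would contain:
#
#     broker_url = 'pyamqp://guest@localhost//'
#     result_backend = 'rpc://'
#     task_annotations = {'*': {'rate_limit': '10/s'}}
#
# where task_annotations comes from a 'config_source' entry and is written via repr().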
def download_file(self, context, target_file):
    check.str_param(target_file, 'target_file')

    target_path = os.path.join(self.target_folder, target_file)

    if self.skip_if_present and safe_isfile(target_path):
        context.log.info(
            'Skipping download, file already present at {target_path}'.format(
                target_path=target_path
            )
        )
    else:
        full_key = self.key + '/' + target_file
        if os.path.dirname(target_path):
            mkdir_p(os.path.dirname(target_path))

        context.log.info(
            'Starting download of {bucket}/{key} to {target_path}'.format(
                bucket=self.bucket, key=full_key, target_path=target_path
            )
        )

        headers = context.resources.s3.head_object(Bucket=self.bucket, Key=full_key)
        logger = S3Logger(
            context.log.debug, self.bucket, full_key, target_path,
            int(headers['ContentLength'])
        )
        context.resources.s3.download_file(
            Bucket=self.bucket, Key=full_key, Filename=target_path, Callback=logger
        )

    return target_path
def handle_output(self, context, obj):
    """Pickle the data and store the object to a custom file path.

    This method emits an AssetMaterialization event so the assets will be tracked by the
    Asset Catalog.
    """
    check.inst_param(context, "context", OutputContext)
    metadata = context.metadata
    path = check.str_param(metadata.get("path"), "metadata.path")

    filepath = self._get_path(path)

    # Ensure path exists
    mkdir_p(os.path.dirname(filepath))
    context.log.debug(f"Writing file at: {filepath}")

    with open(filepath, self.write_mode) as write_obj:
        pickle.dump(obj, write_obj, PICKLE_PROTOCOL)

    return AssetMaterialization(
        asset_key=AssetKey([context.pipeline_name, context.step_key, context.name]),
        metadata_entries=[EventMetadataEntry.fspath(os.path.abspath(filepath))],
    )
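# Usage note: this IO manager reads the output's definition-time metadata, so a solid would
# declare something like OutputDefinition(metadata={"path": "some/relative/path"}) for each
# output it wants routed to a custom file path (a sketch based on the dagster API of this
# era; the exact declaration site may differ).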
def set_object(self, key, obj, serialization_strategy=DEFAULT_SERIALIZATION_STRATEGY):
    check.str_param(key, 'key')
    # obj is an arbitrary Python object
    check.inst_param(serialization_strategy, 'serialization_strategy', SerializationStrategy)

    if os.path.exists(key):
        logging.warning('Removing existing path {path}'.format(path=key))
        os.unlink(key)

    # Ensure path exists
    mkdir_p(os.path.dirname(key))

    serialization_strategy.serialize_to_file(obj, key)

    return ObjectStoreOperation(
        op=ObjectStoreOperationType.SET_OBJECT,
        key=key,
        dest_key=None,
        obj=obj,
        serialization_strategy_name=serialization_strategy.name,
        object_store_name=self.name,
    )
def from_local(base_dir, inst_data=None):
    check.str_param(base_dir, 'base_dir')
    mkdir_p(base_dir)
    conn_string = 'sqlite:///{}'.format(os.path.join(base_dir, 'runs.db'))
    engine = create_engine(conn_string)
    RunStorageSQLMetadata.create_all(engine)
    return SqliteRunStorage(conn_string, inst_data)
def _load_schedules(self):
    schedules_dir = os.path.join(self._base_dir)
    utils.mkdir_p(schedules_dir)

    for repository_name in os.listdir(schedules_dir):
        if not os.path.isdir(os.path.join(schedules_dir, repository_name)):
            continue
        self._schedules[repository_name] = {}

        for file in os.listdir(os.path.join(schedules_dir, repository_name)):
            if not file.endswith('.json'):
                continue
            file_path = os.path.join(schedules_dir, repository_name, file)
            with open(file_path) as data:
                try:
                    schedule = deserialize_json_to_dagster_namedtuple(data.read())
                    self._schedules[repository_name][schedule.name] = schedule
                except Exception as ex:  # pylint: disable=broad-except
                    warnings.warn(
                        'Could not parse dagster schedule from {file_name} in {dir_name}. '
                        '{ex}: {msg}'.format(
                            file_name=file,
                            dir_name=self._base_dir,
                            ex=type(ex).__name__,
                            msg=ex,
                        )
                    )
                    continue
def sftp_solid(context):
    '''Ported from Airflow's SFTPOperator.

    sftp_solid: for transferring files from a remote host to local or vice versa. This solid
    uses ssh_resource to open an SFTP transport channel that serves as the basis for file
    transfer.
    '''
    local_filepath = context.solid_config.get('local_filepath')
    remote_filepath = context.solid_config.get('remote_filepath')
    operation = context.solid_config.get('operation')
    confirm = context.solid_config.get('confirm')

    with context.resources.ssh_resource.get_connection() as ssh_client:
        sftp_client = ssh_client.open_sftp()
        if operation == 'GET':
            local_folder = os.path.dirname(local_filepath)

            # Create intermediate directories if they don't exist
            mkdir_p(local_folder)

            context.log.info(
                'Starting to transfer from {0} to {1}'.format(remote_filepath, local_filepath)
            )
            sftp_client.get(remote_filepath, local_filepath)
        else:
            context.log.info(
                'Starting to transfer file from {0} to {1}'.format(
                    local_filepath, remote_filepath
                )
            )
            sftp_client.put(local_filepath, remote_filepath, confirm=confirm)

    return local_filepath
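# A hypothetical solid_config for the GET direction; 'operation' selects between
# sftp_client.get (remote -> local) and sftp_client.put (local -> remote), and 'confirm'
# is passed through to paramiko's put(), which stats the file after upload.
example_sftp_config = {
    'local_filepath': '/tmp/incoming/report.csv',
    'remote_filepath': '/var/data/report.csv',
    'operation': 'GET',
    'confirm': True,  # only consulted by the PUT branch
}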
def ensure_base_dir_exists(self):
    if self._base_dir_ensured:
        return

    mkdir_p(self.base_dir)

    self._base_dir_ensured = True
def events_jar():
    git_repo_root = six.ensure_str(
        subprocess.check_output(['git', 'rev-parse', '--show-toplevel']).strip()
    )

    temp_dir = os.path.join(
        get_system_temp_directory(), 'dagster_examples_tests', 'event_pipeline_demo_tests'
    )

    mkdir_p(temp_dir)
    dst = os.path.join(temp_dir, 'events.jar')

    if os.path.exists(dst):
        print('events jar already exists, skipping')  # pylint: disable=print-call
    else:
        subprocess.check_call(
            ['sbt', 'events/assembly'], cwd=os.path.join(git_repo_root, 'scala_modules')
        )

        src = os.path.join(
            git_repo_root,
            'scala_modules',
            'events/target/scala-2.11/events-assembly-0.1.0-SNAPSHOT.jar',
        )
        subprocess.check_call(['cp', src, dst])

    yield dst
def fs_file_cache(init_context):
    target_folder = init_context.resource_config["target_folder"]

    if not os.path.exists(target_folder):
        mkdir_p(target_folder)

    return FSFileCache(target_folder=target_folder, overwrite=False)
def get_config_dir(config_yaml=None):
    instance = DagsterInstance.get()

    config_module_name = "dagster_celery_config"

    config_dir = os.path.join(
        instance.root_directory, "dagster_celery", "config", str(uuid.uuid4())
    )
    mkdir_p(config_dir)
    config_path = os.path.join(
        config_dir, "{config_module_name}.py".format(config_module_name=config_module_name)
    )

    validated_config = get_validated_config(config_yaml)
    with open(config_path, "w") as fd:
        if "broker" in validated_config and validated_config["broker"]:
            fd.write(
                "broker_url = '{broker_url}'\n".format(
                    broker_url=str(validated_config["broker"])
                )
            )
        if "backend" in validated_config and validated_config["backend"]:
            fd.write(
                "result_backend = '{result_backend}'\n".format(
                    result_backend=str(validated_config["backend"])
                )
            )
        if "config_source" in validated_config and validated_config["config_source"]:
            for key, value in validated_config["config_source"].items():
                fd.write("{key} = {value}\n".format(key=key, value=repr(value)))

    # n.b. right now we don't attempt to clean up this cache, but it might make sense to delete
    # any files older than some time if there are more than some number of files present, etc.
    return config_dir
def events_jar():
    git_repo_root = six.ensure_str(
        subprocess.check_output(["git", "rev-parse", "--show-toplevel"]).strip()
    )

    temp_dir = os.path.join(
        get_system_temp_directory(), "dagster_examples_tests", "event_pipeline_demo_tests"
    )

    mkdir_p(temp_dir)
    dst = os.path.join(temp_dir, "events.jar")

    if os.path.exists(dst):
        print("events jar already exists, skipping")  # pylint: disable=print-call
    else:
        subprocess.check_call(
            ["sbt", "events/assembly"], cwd=os.path.join(git_repo_root, "scala_modules")
        )

        src = os.path.join(
            git_repo_root,
            "scala_modules",
            "events/target/scala-2.11/events-assembly-0.1.0-SNAPSHOT.jar",
        )
        subprocess.check_call(["cp", src, dst])

    yield dst
def _download_from_s3_to_file(session, context, bucket, key, target_folder, skip_if_present):
    # TODO: remove context argument once we support resource logging

    # file name is S3 key path suffix after last /
    target_file = os.path.join(target_folder, key.split('/')[-1])

    if skip_if_present and safe_isfile(target_file):
        context.log.info(
            'Skipping download, file already present at {target_file}'.format(
                target_file=target_file
            )
        )
    else:
        if not os.path.exists(target_folder):
            mkdir_p(target_folder)

        context.log.info(
            'Starting download of {bucket}/{key} to {target_file}'.format(
                bucket=bucket, key=key, target_file=target_file
            )
        )

        headers = session.head_object(Bucket=bucket, Key=key)
        logger = S3Logger(
            context.log.debug, bucket, key, target_file, int(headers['ContentLength'])
        )
        session.download_file(Bucket=bucket, Key=key, Filename=target_file, Callback=logger)

    return target_file
def from_local(cls, base_dir, inst_data=None):
    check.str_param(base_dir, "base_dir")
    mkdir_p(base_dir)
    conn_string = create_db_conn_string(base_dir, "runs")
    engine = create_engine(conn_string, poolclass=NullPool)
    alembic_config = get_alembic_config(__file__)

    should_mark_indexes = False
    with engine.connect() as connection:
        db_revision, head_revision = check_alembic_revision(alembic_config, connection)
        if not (db_revision and head_revision):
            RunStorageSqlMetadata.create_all(engine)
            engine.execute("PRAGMA journal_mode=WAL;")
            stamp_alembic_rev(alembic_config, connection)
            should_mark_indexes = True

        table_names = db.inspect(engine).get_table_names()
        if "instance_info" not in table_names:
            InstanceInfo.create(engine)

    run_storage = cls(conn_string, inst_data)

    if should_mark_indexes:
        run_storage.migrate()
        run_storage.optimize()

    return run_storage
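# A minimal usage sketch, assuming this is a classmethod on SqliteRunStorage: point it at a
# writable directory and the runs database (plus alembic stamping) is created on first use.
#
#     storage = SqliteRunStorage.from_local('/tmp/dagster_home/history')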
def get_papermill_parameters(compute_context, inputs, output_log_path):
    check.inst_param(compute_context, 'compute_context', SystemComputeExecutionContext)
    check.param_invariant(
        isinstance(compute_context.environment_dict, dict),
        'compute_context',
        'SystemComputeExecutionContext must have valid environment_dict',
    )
    check.dict_param(inputs, 'inputs', key_type=six.string_types)

    run_id = compute_context.run_id

    marshal_dir = '/tmp/dagstermill/{run_id}/marshal'.format(run_id=run_id)
    mkdir_p(marshal_dir)

    (handle, solid_subset) = ExecutionTargetHandle.get_handle(compute_context.pipeline_def)

    if not handle:
        raise DagstermillError(
            'Can\'t execute a dagstermill solid from a pipeline that wasn\'t instantiated using '
            'an ExecutionTargetHandle'
        )

    dm_handle_kwargs = handle.data._asdict()
    dm_handle_kwargs['pipeline_name'] = compute_context.pipeline_def.name

    dm_context_dict = {
        'output_log_path': output_log_path,
        'marshal_dir': marshal_dir,
        'environment_dict': compute_context.environment_dict,
    }

    dm_solid_handle_kwargs = compute_context.solid_handle._asdict()

    parameters = {}

    input_def_dict = compute_context.solid_def.input_dict
    for input_name, input_value in inputs.items():
        assert (
            input_name not in RESERVED_INPUT_NAMES
        ), 'Dagstermill solids cannot have inputs named {input_name}'.format(
            input_name=input_name
        )
        dagster_type = input_def_dict[input_name].dagster_type
        parameter_value = write_value(
            dagster_type, input_value, os.path.join(marshal_dir, 'input-{}'.format(input_name))
        )
        parameters[input_name] = parameter_value

    parameters['__dm_context'] = dm_context_dict
    parameters['__dm_handle_kwargs'] = dm_handle_kwargs
    parameters['__dm_pipeline_run_dict'] = pack_value(compute_context.pipeline_run)
    parameters['__dm_solid_handle_kwargs'] = dm_solid_handle_kwargs
    parameters['__dm_solid_subset'] = solid_subset
    parameters['__dm_instance_ref_dict'] = pack_value(compute_context.instance.get_ref())

    return parameters
def __init__(self, base_dir=None):
    self._base_dir = check.opt_str_param(base_dir, 'base_dir', base_runs_directory())
    mkdir_p(self._base_dir)
    self.file_cursors = defaultdict(lambda: (0, 0))
    # Swap these out to use lockfiles
    self.file_lock = defaultdict(gevent.lock.Semaphore)
    self._metadata_file_lock = defaultdict(gevent.lock.Semaphore)
def set_intermediate_object(
    cls, intermediate_storage, context, dagster_type, step_output_handle, value
):
    paths = ['intermediates', step_output_handle.step_key, step_output_handle.output_name]
    paths.append(value)
    mkdir_p(os.path.join(intermediate_storage.root, *paths))
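# For illustration: with intermediate_storage.root == '/tmp/store' (hypothetical), a step key
# 'my_step', output name 'result', and value 'foo', this creates the directory
# /tmp/store/intermediates/my_step/result/foo. Note the value itself becomes the final path
# component, so this only makes sense for string-like values.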
def __init__(self, base_dir):
    self._base_dir = check.str_param(base_dir, 'base_dir')
    mkdir_p(self._base_dir)

    self._known_run_ids = set([])

    self._watchers = {}
    self._obs = Observer()
    self._obs.start()
def _get_bash_script_file_path(self, instance, repository, schedule):
    check.inst_param(instance, 'instance', DagsterInstance)

    script_directory = os.path.join(instance.schedules_directory(), "scripts")
    utils.mkdir_p(script_directory)

    script_file_name = "{}.{}.sh".format(repository.name, schedule.name)
    return os.path.join(script_directory, script_file_name)
def __init__(self, bucket_name, volume):
    self.bucket_name = check.str_param(bucket_name, "bucket_name")

    # Setup bucket
    self.volume = os.path.join(tempfile.gettempdir(), check.str_param(volume, "volume"))
    bucket_location = os.path.join(self.volume, self.bucket_name)
    if not os.path.exists(bucket_location):
        mkdir_p(bucket_location)
    self.location = bucket_location

    self.blobs = {}
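# For illustration: with volume 'fake_gcs' and bucket_name 'my-bucket' (both hypothetical),
# blobs are backed by the local directory <tempdir>/fake_gcs/my-bucket, e.g.
# /tmp/fake_gcs/my-bucket on Linux.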
def __init__(self, base_dir):
    self._base_dir = check.str_param(base_dir, 'base_dir')
    mkdir_p(self._base_dir)
    self.file_cursors = defaultdict(lambda: (0, 0))
    # Swap these out to use lockfiles
    self.file_lock = defaultdict(gevent.lock.Semaphore)

    self._watchers = {}
    self._obs = Observer()
    self._obs.start()
def _get_bash_script_file_path(self, instance, schedule_origin_id):
    check.inst_param(instance, "instance", DagsterInstance)
    check.str_param(schedule_origin_id, "schedule_origin_id")

    script_directory = os.path.join(instance.schedules_directory(), "scripts")
    utils.mkdir_p(script_directory)

    script_file_name = "{}.sh".format(schedule_origin_id)
    return os.path.join(script_directory, script_file_name)
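# For illustration: assuming schedules_directory() resolves to $DAGSTER_HOME/schedules and a
# hypothetical origin id 'abc123', this returns $DAGSTER_HOME/schedules/scripts/abc123.sh,
# creating the scripts/ directory along the way.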
def write_dagster_run_meta(self, dagster_run_meta):
    check.inst_param(dagster_run_meta, 'dagster_run_meta', DagsterRunMeta)

    run_dir = os.path.join(self._base_dir, dagster_run_meta.run_id)

    mkdir_p(run_dir)

    with open(self._meta_file, 'a+') as ff:
        ff.write(seven.json.dumps(dagster_run_meta._asdict()) + '\n')
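# For illustration: each call appends one JSON object per line (JSON Lines) to the shared
# meta file, e.g. {"run_id": "...", ...}. The per-run directory is created here, but the
# metadata itself lives in self._meta_file.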
def _get_or_create_logs_directory(self, instance, schedule_origin_id):
    check.inst_param(instance, "instance", DagsterInstance)
    check.str_param(schedule_origin_id, "schedule_origin_id")

    logs_directory = os.path.join(instance.schedules_directory(), "logs", schedule_origin_id)
    if not os.path.isdir(logs_directory):
        utils.mkdir_p(logs_directory)

    return logs_directory
def get_papermill_parameters(step_context, inputs, output_log_path):
    check.inst_param(step_context, "step_context", StepExecutionContext)
    check.param_invariant(
        isinstance(step_context.run_config, dict),
        "step_context",
        "StepExecutionContext must have valid run_config",
    )
    check.dict_param(inputs, "inputs", key_type=str)

    run_id = step_context.run_id
    temp_dir = get_system_temp_directory()
    marshal_dir = os.path.normpath(os.path.join(temp_dir, "dagstermill", str(run_id), "marshal"))
    mkdir_p(marshal_dir)

    if not isinstance(step_context.pipeline, ReconstructablePipeline):
        raise DagstermillError(
            "Can't execute a dagstermill solid from a pipeline that is not reconstructable. "
            "Use the reconstructable() function if executing from python"
        )

    dm_executable_dict = step_context.pipeline.to_dict()

    dm_context_dict = {
        "output_log_path": output_log_path,
        "marshal_dir": marshal_dir,
        "run_config": step_context.run_config,
    }

    dm_solid_handle_kwargs = step_context.solid_handle._asdict()

    parameters = {}

    input_def_dict = step_context.solid_def.input_dict
    for input_name, input_value in inputs.items():
        assert (
            input_name not in RESERVED_INPUT_NAMES
        ), "Dagstermill solids cannot have inputs named {input_name}".format(
            input_name=input_name
        )
        dagster_type = input_def_dict[input_name].dagster_type
        parameter_value = write_value(
            dagster_type,
            input_value,
            os.path.join(marshal_dir, f"{str(step_context.solid_handle)}-input-{input_name}"),
        )
        parameters[input_name] = parameter_value

    parameters["__dm_context"] = dm_context_dict
    parameters["__dm_executable_dict"] = dm_executable_dict
    parameters["__dm_pipeline_run_dict"] = pack_value(step_context.pipeline_run)
    parameters["__dm_solid_handle_kwargs"] = dm_solid_handle_kwargs
    parameters["__dm_instance_ref_dict"] = pack_value(step_context.instance.get_ref())

    return parameters
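# For illustration: the returned dict mixes the user-declared inputs with dunder-prefixed
# bookkeeping parameters (__dm_context, __dm_executable_dict, ...) that dagstermill uses to
# reconstruct the pipeline context inside the notebook process launched by papermill.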
def temp_dir():
    '''Context manager for temporary directories.

    pytest implicitly wraps in try/except.
    '''
    dir_path = os.path.join('/tmp', str(uuid.uuid4()))
    mkdir_p(dir_path)

    yield dir_path

    shutil.rmtree(dir_path)
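# A minimal usage sketch. As written this is a generator, so it presumably carries a
# @contextlib.contextmanager decorator (or is consumed as a pytest fixture) in the source:
#
#     with temp_dir() as dir_path:
#         with open(os.path.join(dir_path, 'scratch.txt'), 'w') as f:
#             f.write('hello')
#     # dir_path is removed on exit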