def __new__(cls, reg_set, frozen_set):
    """Validate both set fields, then construct the HasSets record."""
    # Accepts any regular set; frozen_set must be exactly a frozenset instance.
    set_param(reg_set, "reg_set")
    inst_param(frozen_set, "frozen_set", frozenset)
    return super(HasSets, cls).__new__(cls, reg_set, frozen_set)
def _get_pipeline_subset_def(pipeline_def, solids_to_execute):
    """
    Build a pipeline which is a subset of another pipeline.
    Only includes the solids which are in solids_to_execute.
    """
    check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)
    check.set_param(solids_to_execute, "solids_to_execute", of_type=str)

    # Fail fast if any requested solid does not exist on the parent pipeline.
    for solid_name in solids_to_execute:
        if not pipeline_def.has_solid_named(solid_name):
            raise DagsterInvalidSubsetError(
                "Pipeline {pipeline_name} has no solid named {name}.".format(
                    pipeline_name=pipeline_def.name, name=solid_name
                ),
            )

    solids = list(map(pipeline_def.solid_named, solids_to_execute))
    deps = {_dep_key_of(solid): {} for solid in solids}

    # Re-wire dependencies, keeping only edges whose upstream solid is also in the subset.
    for solid in solids:
        for input_handle in solid.input_handles():
            if pipeline_def.dependency_structure.has_singular_dep(input_handle):
                output_handle = pipeline_def.dependency_structure.get_singular_dep(input_handle)
                if output_handle.solid.name in solids_to_execute:
                    deps[_dep_key_of(solid)][input_handle.input_def.name] = DependencyDefinition(
                        solid=output_handle.solid.name, output=output_handle.output_def.name
                    )
            elif pipeline_def.dependency_structure.has_multi_deps(input_handle):
                output_handles = pipeline_def.dependency_structure.get_multi_deps(input_handle)
                deps[_dep_key_of(solid)][input_handle.input_def.name] = MultiDependencyDefinition(
                    [
                        DependencyDefinition(
                            solid=output_handle.solid.name,
                            output=output_handle.output_def.name,
                        )
                        for output_handle in output_handles
                        if output_handle.solid.name in solids_to_execute
                    ]
                )

    try:
        sub_pipeline_def = PipelineSubsetDefinition(
            name=pipeline_def.name,  # should we change the name for subsetted pipeline?
            solid_defs=list({solid.definition for solid in solids}),
            mode_defs=pipeline_def.mode_definitions,
            dependencies=deps,
            _parent_pipeline_def=pipeline_def,
            tags=pipeline_def.tags,
            hook_defs=pipeline_def.hook_defs,
        )
        return sub_pipeline_def
    except DagsterInvalidDefinitionError as exc:
        # This handles the case when you construct a subset such that an unsatisfied
        # input cannot be loaded from config. Instead of throwing a DagsterInvalidDefinitionError,
        # we re-raise a DagsterInvalidSubsetError.
        raise DagsterInvalidSubsetError(
            f"The attempted subset {str_format_set(solids_to_execute)} for pipeline "
            f"{pipeline_def.name} results in an invalid pipeline"
        ) from exc
def _validate_resource_dependencies(
    mode_definitions, node_defs, dagster_type_dict, solid_dict, pipeline_hook_defs
):
    """This validation ensures that each pipeline context provides the resources that are
    required by each solid.
    """
    check.list_param(mode_definitions, "mode_definitions", of_type=ModeDefinition)
    check.list_param(node_defs, "node_defs", of_type=NodeDefinition)
    check.dict_param(dagster_type_dict, "dagster_type_dict")
    check.dict_param(solid_dict, "solid_dict")
    check.set_param(pipeline_hook_defs, "pipeline_hook_defs", of_type=HookDefinition)

    for mode_def in mode_definitions:
        mode_resources = set(mode_def.resource_defs.keys())

        # Every node's required resources must be provided by the mode.
        for node_def in node_defs:
            for required_resource in node_def.required_resource_keys:
                if required_resource not in mode_resources:
                    raise DagsterInvalidDefinitionError(
                        (
                            'Resource "{resource}" is required by solid def {node_def_name}, but is not '
                            'provided by mode "{mode_name}".'
                        ).format(
                            resource=required_resource,
                            node_def_name=node_def.name,
                            mode_name=mode_def.name,
                        )
                    )

        _validate_type_resource_deps_for_mode(mode_def, mode_resources, dagster_type_dict)

        # Intermediate storages may also require resources.
        for intermediate_storage in mode_def.intermediate_storage_defs or []:
            for required_resource in intermediate_storage.required_resource_keys:
                if required_resource not in mode_resources:
                    raise DagsterInvalidDefinitionError(
                        (
                            "Resource '{resource}' is required by intermediate storage "
                            "'{storage_name}', but is not provided by mode '{mode_name}'."
                        ).format(
                            resource=required_resource,
                            storage_name=intermediate_storage.name,
                            mode_name=mode_def.name,
                        )
                    )

        # Hooks attached to individual solids.
        for solid in solid_dict.values():
            for hook_def in solid.hook_defs:
                for required_resource in hook_def.required_resource_keys:
                    if required_resource not in mode_resources:
                        raise DagsterInvalidDefinitionError(
                            (
                                'Resource "{resource}" is required by hook "{hook_name}", but is not '
                                'provided by mode "{mode_name}".'
                            ).format(
                                resource=required_resource,
                                hook_name=hook_def.name,
                                mode_name=mode_def.name,
                            )
                        )

        # Hooks attached at the pipeline level.
        for hook_def in pipeline_hook_defs:
            for required_resource in hook_def.required_resource_keys:
                if required_resource not in mode_resources:
                    raise DagsterInvalidDefinitionError(
                        (
                            'Resource "{resource}" is required by hook "{hook_name}", but is not '
                            'provided by mode "{mode_name}".'
                        ).format(
                            resource=required_resource,
                            hook_name=hook_def.name,
                            mode_name=mode_def.name,
                        )
                    )
def subset_for_execution_from_existing_pipeline(self, solids_to_execute):
    """Subset this pipeline using already-resolved solid names.

    Takes a frozenset of resolved solid names from an existing pipeline run,
    so there's no need to parse the selection.
    """
    check.set_param(solids_to_execute, "solids_to_execute", of_type=str)
    return self._subset_for_execution(solids_to_execute)
def resource_initialization_event_generator(
    execution_plan, environment_config, pipeline_run, log_manager, resource_keys_to_init
):
    '''Initialize the resources a plan needs, yielding Dagster events along the way.

    Yields resource-init start/success/failure events, then a ScopedResourcesBuilder
    wrapping the initialized resource instances. On the way out (including error
    paths, but not generator GC), every successfully-initialized resource manager
    is torn down in LIFO order.

    Fixes over the previous revision:
    - ``if not resource_name in ...`` replaced with the idiomatic ``not in``.
    - re-raise inside the DagsterUserCodeExecutionError handler uses bare ``raise``
      (same exception object, traceback explicitly preserved).
    '''
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.inst_param(environment_config, 'environment_config', EnvironmentConfig)
    check.inst_param(pipeline_run, 'pipeline_run', PipelineRun)
    check.inst_param(log_manager, 'log_manager', DagsterLogManager)
    check.set_param(resource_keys_to_init, 'resource_keys_to_init', of_type=str)

    # For single-step plans, tag the resource log manager with the step's logging tags.
    if execution_plan.step_key_for_single_step_plans():
        step = execution_plan.get_step_by_key(execution_plan.step_key_for_single_step_plans())
        resource_log_manager = DagsterLogManager(
            pipeline_run.run_id,
            merge_dicts(log_manager.logging_tags, step.logging_tags),
            log_manager.loggers,
        )
    else:
        resource_log_manager = log_manager

    resource_instances = {}
    pipeline_def = execution_plan.pipeline_def
    mode_definition = pipeline_def.get_mode_definition(pipeline_run.mode)
    resource_managers = deque()  # torn down LIFO in the finally block
    generator_closed = False
    resource_init_times = {}
    try:
        if resource_keys_to_init:
            yield DagsterEvent.resource_init_start(
                execution_plan, resource_log_manager, resource_keys_to_init,
            )
        # Sorted for deterministic initialization order.
        for resource_name, resource_def in sorted(mode_definition.resource_defs.items()):
            if resource_name not in resource_keys_to_init:
                continue
            resource_context = InitResourceContext(
                pipeline_def=pipeline_def,
                resource_def=resource_def,
                resource_config=environment_config.resources.get(resource_name, {}).get('config'),
                run_id=pipeline_run.run_id,
                log_manager=resource_log_manager,
            )
            manager = single_resource_generation_manager(
                resource_context, resource_name, resource_def
            )
            for event in manager.generate_setup_events():
                if event:
                    yield event
            initialized_resource = check.inst(manager.get_object(), InitializedResource)
            resource_instances[resource_name] = initialized_resource.resource
            resource_init_times[resource_name] = initialized_resource.duration
            resource_managers.append(manager)

        if resource_keys_to_init:
            yield DagsterEvent.resource_init_success(
                execution_plan, resource_log_manager, resource_instances, resource_init_times
            )
        yield ScopedResourcesBuilder(resource_instances)
    except GeneratorExit:
        # Shouldn't happen, but avoid runtime-exception in case this generator gets GC-ed
        # (see https://amir.rachum.com/blog/2017/03/03/generator-cleanup/).
        generator_closed = True
        raise
    except DagsterUserCodeExecutionError as dagster_user_error:
        yield DagsterEvent.resource_init_failure(
            execution_plan,
            resource_log_manager,
            resource_keys_to_init,
            serializable_error_info_from_exc_info(dagster_user_error.original_exc_info),
        )
        raise
    finally:
        # Skip teardown when the generator was closed: yielding from a closed
        # generator would raise RuntimeError.
        if not generator_closed:
            error = None
            while len(resource_managers) > 0:
                manager = resource_managers.pop()
                try:
                    for event in manager.generate_teardown_events():
                        yield event
                except DagsterUserCodeExecutionError as dagster_user_error:
                    # Remember the most recent teardown failure but keep tearing down.
                    error = dagster_user_error
            if error:
                yield DagsterEvent.resource_teardown_failure(
                    execution_plan,
                    resource_log_manager,
                    resource_keys_to_init,
                    serializable_error_info_from_exc_info(error.original_exc_info),
                )
def create_databricks_job_solid(
    name="databricks_job",
    num_inputs=1,
    description=None,
    required_resource_keys=frozenset(["databricks_client"]),
):
    """
    Creates a solid that launches a databricks job.

    As config, the solid accepts a blob of the form described in Databricks' job API:
    https://docs.databricks.com/dev-tools/api/latest/jobs.html.

    Returns:
        SolidDefinition: A solid definition.
    """
    check.str_param(name, "name")
    check.opt_str_param(description, "description")
    check.int_param(num_inputs, "num_inputs")
    check.set_param(required_resource_keys, "required_resource_keys", of_type=str)

    # One Nothing-typed input per requested upstream dependency slot.
    input_defs = [InputDefinition("input_" + str(i), Nothing) for i in range(num_inputs)]

    @solid(
        name=name,
        description=description,
        config_schema={
            "job": Field(
                Permissive(),
                description="Databricks job run configuration, in the form described in "
                "Databricks' job API: https://docs.databricks.com/dev-tools/api/latest/jobs.html",
            ),
            "poll_interval_sec": Field(
                float,
                description="Check whether the job is done at this interval.",
                default_value=10,
            ),
            "max_wait_time_sec": Field(
                float,
                description="If the job is not complete after this length of time, raise an error.",
                default_value=(24 * 60 * 60),
            ),
        },
        input_defs=input_defs,
        output_defs=[OutputDefinition(Nothing)],
        required_resource_keys=required_resource_keys,
        tags={"kind": "databricks"},
    )
    def databricks_solid(context):
        job_config = context.solid_config["job"]
        databricks_client = context.resources.databricks_client
        run_id = databricks_client.submit_run(**job_config)

        context.log.info(
            "Launched databricks job with run id {run_id}. UI: {url}. Waiting to run to completion...".format(
                run_id=run_id, url=create_ui_url(databricks_client, context.solid_config)
            )
        )
        # Block until the remote job finishes (or the max wait time elapses).
        wait_for_run_to_complete(
            databricks_client,
            context.log,
            run_id,
            context.solid_config["poll_interval_sec"],
            context.solid_config["max_wait_time_sec"],
        )

    return databricks_solid
def test_set_param():
    # Empty sets (regular and frozen) pass through as an empty set.
    assert check.set_param(set(), 'set_param') == set()
    assert check.set_param(frozenset(), 'set_param') == set()

    # Non-set values are rejected.
    with pytest.raises(ParameterCheckError):
        check.set_param(None, 'set_param')

    with pytest.raises(ParameterCheckError):
        check.set_param('3u4', 'set_param')

    single = {1}
    assert check.set_param(single, 'set_param') == single

    # Duplicate literals collapse — set semantics.
    dupes = {1, 1, 2}
    deduped = {1, 2}
    assert check.set_param(dupes, 'set_param') == deduped
    assert check.set_param(dupes, 'set_param', of_type=int) == deduped

    # Passing a class where an instance is expected gets a targeted message.
    with pytest.raises(CheckError, match='Did you pass a class'):
        check.set_param({str}, 'set_param', of_type=int)

    # Member type mismatch is caught.
    with pytest.raises(CheckError, match='Member of set mismatches type'):
        check.set_param({'foo'}, 'set_param', of_type=int)
def with_hooks(self, hook_defs):
    """Return a new CallableSolidNode with ``hook_defs`` merged into the existing hooks."""
    validated = check.set_param(hook_defs, "hook_defs", of_type=HookDefinition)
    merged = validated.union(self.hook_defs)
    return CallableSolidNode(self.solid_def, self.given_alias, self.tags, merged)
def test_set_param():
    # Both empty set flavors validate and compare equal to set().
    assert check.set_param(set(), "set_param") == set()
    assert check.set_param(frozenset(), "set_param") == set()

    # None and strings are not sets.
    with pytest.raises(ParameterCheckError):
        check.set_param(None, "set_param")
    with pytest.raises(ParameterCheckError):
        check.set_param("3u4", "set_param")

    one_elem = {1}
    assert check.set_param(one_elem, "set_param") == one_elem

    with_dupes = {1, 1, 2}
    expected = {1, 2}
    assert check.set_param(with_dupes, "set_param") == expected
    assert check.set_param(with_dupes, "set_param", of_type=int) == expected

    # A class object as a member triggers a dedicated hint.
    with pytest.raises(CheckError, match="Did you pass a class"):
        check.set_param({str}, "set_param", of_type=int)

    # Wrong member type raises with a type-mismatch message.
    with pytest.raises(CheckError, match="Member of set mismatches type"):
        check.set_param({"foo"}, "set_param", of_type=int)
def _validate_resource_dependencies(mode_definitions, solid_defs, solid_dict, pipeline_hook_defs):
    """This validation ensures that each pipeline context provides the resources that are
    required by each solid.
    """
    check.list_param(mode_definitions, "mode_definitions", of_type=ModeDefinition)
    check.list_param(solid_defs, "solid_defs", of_type=ISolidDefinition)
    check.set_param(pipeline_hook_defs, "pipeline_hook_defs", of_type=HookDefinition)

    for mode_def in mode_definitions:
        mode_resources = set(mode_def.resource_defs.keys())

        # Solid-level resource requirements.
        for solid_def in solid_defs:
            for required_resource in solid_def.required_resource_keys:
                if required_resource not in mode_resources:
                    raise DagsterInvalidDefinitionError(
                        (
                            'Resource "{resource}" is required by solid def {solid_def_name}, but is not '
                            'provided by mode "{mode_name}".'
                        ).format(
                            resource=required_resource,
                            solid_def_name=solid_def.name,
                            mode_name=mode_def.name,
                        )
                    )

        # System storage resource requirements.
        for system_storage_def in mode_def.system_storage_defs:
            for required_resource in system_storage_def.required_resource_keys:
                if required_resource not in mode_resources:
                    raise DagsterInvalidDefinitionError(
                        (
                            "Resource '{resource}' is required by system storage "
                            "'{storage_name}', but is not provided by mode '{mode_name}'."
                        ).format(
                            resource=required_resource,
                            storage_name=system_storage_def.name,
                            mode_name=mode_def.name,
                        )
                    )

        # Solid-attached hook resource requirements.
        for solid in solid_dict.values():
            for hook_def in solid.hook_defs:
                for required_resource in hook_def.required_resource_keys:
                    if required_resource not in mode_resources:
                        raise DagsterInvalidDefinitionError(
                            (
                                'Resource "{resource}" is required by hook "{hook_name}", but is not '
                                'provided by mode "{mode_name}".'
                            ).format(
                                resource=required_resource,
                                hook_name=hook_def.name,
                                mode_name=mode_def.name,
                            )
                        )

        # Pipeline-level hook resource requirements.
        for hook_def in pipeline_hook_defs:
            for required_resource in hook_def.required_resource_keys:
                if required_resource not in mode_resources:
                    raise DagsterInvalidDefinitionError(
                        (
                            'Resource "{resource}" is required by hook "{hook_name}", but is not '
                            'provided by mode "{mode_name}".'
                        ).format(
                            resource=required_resource,
                            hook_name=hook_def.name,
                            mode_name=mode_def.name,
                        )
                    )
def with_hooks(self, hook_defs):
    """Wrap this definition in a CallableSolidNode carrying the given hook definitions."""
    from .composition import CallableSolidNode

    validated = check.set_param(hook_defs, "hook_defs", of_type=HookDefinition)
    return CallableSolidNode(self, hook_defs=frozenset(validated))
def __new__(cls, reg_set, frozen_set):
    """Check the two set arguments before delegating construction to the base tuple."""
    # reg_set may be any set; frozen_set is required to be a frozenset instance.
    set_param(reg_set, 'reg_set')
    inst_param(frozen_set, 'frozen_set', frozenset)
    return super(HasSets, cls).__new__(cls, reg_set, frozen_set)