def get_latest_job_tick(self, job_origin_id):
    check.str_param(job_origin_id, "job_origin_id")

    query = (
        db.select([JobTickTable.c.id, JobTickTable.c.tick_body])
        .select_from(JobTickTable)
        .where(JobTickTable.c.job_origin_id == job_origin_id)
        .order_by(JobTickTable.c.timestamp.desc())
        .limit(1)
    )

    rows = self.execute(query)

    if len(rows) == 0:
        return None

    return JobTick(rows[0][0], deserialize_json_to_dagster_namedtuple(rows[0][1]))
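# A minimal sketch of the serdes round-trip every snippet here relies on: rows
# store serialize_dagster_namedtuple JSON, and readers rebuild the namedtuple
# with deserialize_json_to_dagster_namedtuple. SampleTick is a hypothetical
# whitelisted tuple for illustration, not a class from the source.
from collections import namedtuple

from dagster.serdes import (
    deserialize_json_to_dagster_namedtuple,
    serialize_dagster_namedtuple,
    whitelist_for_serdes,
)


@whitelist_for_serdes
class SampleTick(namedtuple("_SampleTick", "tick_id status")):
    pass


tick = SampleTick(tick_id=1, status="STARTED")
json_str = serialize_dagster_namedtuple(tick)  # JSON tagged with a "__class__" marker
assert deserialize_json_to_dagster_namedtuple(json_str) == tick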
def cancel_execution(self, cancel_execution_request):
    check.inst_param(
        cancel_execution_request,
        "cancel_execution_request",
        CancelExecutionRequest,
    )

    res = self._query(
        "CancelExecution",
        api_pb2.CancelExecutionRequest,
        serialized_cancel_execution_request=serialize_dagster_namedtuple(
            cancel_execution_request
        ),
    )

    return deserialize_json_to_dagster_namedtuple(res.serialized_cancel_execution_result)
def external_repository(self, repository_grpc_server_origin):
    check.inst_param(
        repository_grpc_server_origin,
        "repository_grpc_server_origin",
        RepositoryGrpcServerOrigin,
    )

    res = self._query(
        "ExternalRepository",
        api_pb2.ExternalRepositoryRequest,
        serialized_repository_python_origin=serialize_dagster_namedtuple(
            repository_grpc_server_origin
        ),
    )

    return deserialize_json_to_dagster_namedtuple(res.serialized_external_repository_data)
def ExternalExecutableParams(self, request, _context):
    external_executable_args = deserialize_json_to_dagster_namedtuple(
        request.serialized_external_executable_args
    )
    check.inst_param(
        external_executable_args,
        "external_executable_args",
        ExternalExecutableArgs,
    )
    recon_repo = self._recon_repository_from_origin(
        external_executable_args.repository_origin
    )
    return api_pb2.ExternalExecutableParamsReply(
        serialized_external_execution_params_or_external_execution_params_error_data=serialize_dagster_namedtuple(
            get_external_executable_params(recon_repo, external_executable_args)
        )
    )
def get_run_by_id(self, run_id):
    """Get a run by its id.

    Args:
        run_id (str): The id of the run

    Returns:
        Optional[PipelineRun]
    """
    check.str_param(run_id, "run_id")

    query = db.select([RunsTable.c.run_body]).where(RunsTable.c.run_id == run_id)
    rows = self.fetchall(query)
    return deserialize_json_to_dagster_namedtuple(rows[0][0]) if len(rows) else None
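# Hedged usage sketch via the public instance API (illustrative run id, not from
# the source): a lookup for an unknown run returns None rather than raising.
from dagster import DagsterInstance

instance = DagsterInstance.ephemeral()
assert instance.get_run_by_id("no-such-run-id") is None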
def get_schedule_ticks_by_schedule(self, repository, schedule_name):
    check.inst_param(repository, "repository", RepositoryDefinition)
    check.str_param(schedule_name, "schedule_name")

    query = (
        db.select([ScheduleTickTable.c.id, ScheduleTickTable.c.tick_body])
        .select_from(ScheduleTickTable)
        .where(ScheduleTickTable.c.repository_name == repository.name)
        .where(ScheduleTickTable.c.schedule_name == schedule_name)
        .order_by(ScheduleTickTable.c.id.desc())
    )

    rows = self.execute(query)
    return list(
        map(lambda r: ScheduleTick(r[0], deserialize_json_to_dagster_namedtuple(r[1])), rows)
    )
def external_executable_params(self, external_executable_args):
    check.inst_param(
        external_executable_args,
        "external_executable_args",
        ExternalExecutableArgs,
    )
    res = self._query(
        "ExternalExecutableParams",
        api_pb2.ExternalExecutableParamsRequest,
        serialized_external_executable_args=serialize_dagster_namedtuple(
            external_executable_args
        ),
    )

    return deserialize_json_to_dagster_namedtuple(
        res.serialized_external_execution_params_or_external_execution_params_error_data
    )
def external_pipeline_subset(self, pipeline_subset_snapshot_args):
    check.inst_param(
        pipeline_subset_snapshot_args,
        "pipeline_subset_snapshot_args",
        PipelineSubsetSnapshotArgs,
    )

    res = self._query(
        "ExternalPipelineSubsetSnapshot",
        api_pb2.ExternalPipelineSubsetSnapshotRequest,
        serialized_pipeline_subset_snapshot_args=serialize_dagster_namedtuple(
            pipeline_subset_snapshot_args
        ),
    )

    return deserialize_json_to_dagster_namedtuple(res.serialized_external_pipeline_subset_result)
def get_logs_for_run_by_log_id(
    self,
    run_id,
    cursor=-1,
    dagster_event_type=None,
    limit=None,
):
    check.str_param(run_id, "run_id")
    check.int_param(cursor, "cursor")
    check.invariant(
        cursor >= -1,
        "Don't know what to do with negative cursor {cursor}".format(cursor=cursor),
    )
    check.opt_inst_param(dagster_event_type, "dagster_event_type", DagsterEventType)

    query = (
        db.select([SqlEventLogStorageTable.c.id, SqlEventLogStorageTable.c.event])
        .where(SqlEventLogStorageTable.c.run_id == run_id)
        .order_by(SqlEventLogStorageTable.c.id.asc())
    )
    if dagster_event_type:
        query = query.where(
            SqlEventLogStorageTable.c.dagster_event_type == dagster_event_type.value
        )

    # adjust 0 based index cursor to SQL offset
    query = query.offset(cursor + 1)

    if limit:
        query = query.limit(limit)

    with self.run_connection(run_id) as conn:
        results = conn.execute(query).fetchall()

    events = {}
    try:
        for (
            record_id,
            json_str,
        ) in results:
            events[record_id] = check.inst_param(
                deserialize_json_to_dagster_namedtuple(json_str), "event", EventLogEntry
            )
    except (seven.JSONDecodeError, DeserializationError) as err:
        raise DagsterEventLogInvalidForRun(run_id=run_id) from err

    return events
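# Hedged usage sketch (hypothetical `event_log_storage` and helper of our own,
# not from the source): cursor=-1 starts from the first record because the query
# offsets by cursor + 1, and the result is a dict keyed by log record id.
def fetch_all_events(event_log_storage, run_id):
    # returns {record_id: EventLogEntry} for every record of the run
    return event_log_storage.get_logs_for_run_by_log_id(run_id, cursor=-1)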
def sync_get_external_execution_plan_grpc(
    api_client,
    pipeline_origin,
    run_config,
    mode,
    pipeline_snapshot_id,
    solid_selection=None,
    step_keys_to_execute=None,
    known_state=None,
    instance=None,
):
    from dagster.grpc.client import DagsterGrpcClient

    check.inst_param(api_client, "api_client", DagsterGrpcClient)
    check.inst_param(pipeline_origin, "pipeline_origin", ExternalPipelineOrigin)
    check.opt_list_param(solid_selection, "solid_selection", of_type=str)
    check.dict_param(run_config, "run_config")
    check.str_param(mode, "mode")
    check.opt_nullable_list_param(step_keys_to_execute, "step_keys_to_execute", of_type=str)
    check.str_param(pipeline_snapshot_id, "pipeline_snapshot_id")
    check.opt_inst_param(known_state, "known_state", KnownExecutionState)
    check.opt_inst_param(instance, "instance", DagsterInstance)

    result = check.inst(
        deserialize_json_to_dagster_namedtuple(
            api_client.execution_plan_snapshot(
                execution_plan_snapshot_args=ExecutionPlanSnapshotArgs(
                    pipeline_origin=pipeline_origin,
                    solid_selection=solid_selection,
                    run_config=run_config,
                    mode=mode,
                    step_keys_to_execute=step_keys_to_execute,
                    pipeline_snapshot_id=pipeline_snapshot_id,
                    known_state=known_state,
                    instance_ref=instance.get_ref()
                    if instance and instance.is_persistent
                    else None,
                )
            ),
        ),
        (ExecutionPlanSnapshot, ExecutionPlanSnapshotErrorData),
    )

    if isinstance(result, ExecutionPlanSnapshotErrorData):
        raise DagsterUserCodeProcessError.from_error_info(result.error)

    return result
def watcher_thread(
    conn_string: str,
    handlers_dict: MutableMapping[str, List[CallbackAfterCursor]],
    dict_lock: threading.Lock,
    watcher_thread_exit: threading.Event,
    watcher_thread_started: threading.Event,
):
    for notif in await_pg_notifications(
        conn_string,
        channels=[CHANNEL_NAME],
        timeout=POLLING_CADENCE,
        yield_on_timeout=True,
        exit_event=watcher_thread_exit,
        started_event=watcher_thread_started,
    ):
        if notif is None:
            if watcher_thread_exit.is_set():
                break
        else:
            # the NOTIFY payload encodes "<run_id>_<event log record id>"
            run_id, index_str = notif.payload.split("_")
            with dict_lock:
                if run_id not in handlers_dict:
                    continue

            index = int(index_str)
            with dict_lock:
                handlers = handlers_dict.get(run_id, [])

            engine = create_engine(
                conn_string, isolation_level="AUTOCOMMIT", poolclass=db.pool.NullPool
            )
            try:
                with engine.connect() as conn:
                    cursor_res = conn.execute(
                        db.select([SqlEventLogStorageTable.c.event]).where(
                            SqlEventLogStorageTable.c.id == index
                        ),
                    )
                    dagster_event: EventLogEntry = deserialize_json_to_dagster_namedtuple(
                        cursor_res.scalar()
                    )
            finally:
                engine.dispose()

            with dict_lock:
                for callback_with_cursor in handlers:
                    if callback_with_cursor.start_cursor < index:
                        callback_with_cursor.callback(dagster_event)
def external_schedule_execution(self, external_schedule_execution_args):
    check.inst_param(
        external_schedule_execution_args,
        "external_schedule_execution_args",
        ExternalScheduleExecutionArgs,
    )

    chunks = list(
        self._streaming_query(
            "ExternalScheduleExecution",
            api_pb2.ExternalScheduleExecutionRequest,
            serialized_external_schedule_execution_args=serialize_dagster_namedtuple(
                external_schedule_execution_args
            ),
        )
    )

    return deserialize_json_to_dagster_namedtuple(
        "".join([chunk.serialized_chunk for chunk in chunks])
    )
def get_latest_tick(self, schedule_origin_id):
    check.str_param(schedule_origin_id, "schedule_origin_id")

    query = (
        db.select([ScheduleTickTable.c.id, ScheduleTickTable.c.tick_body])
        .select_from(ScheduleTickTable)
        .where(ScheduleTickTable.c.schedule_origin_id == schedule_origin_id)
        .order_by(ScheduleTickTable.c.timestamp.desc())
        .limit(1)
    )

    rows = self.execute(query)

    if len(rows) == 0:
        return None

    return ScheduleTick(rows[0][0], deserialize_json_to_dagster_namedtuple(rows[0][1]))
def ExternalPartitionNames(self, request, _context):
    partition_names_args = deserialize_json_to_dagster_namedtuple(
        request.serialized_partition_names_args
    )
    check.inst_param(partition_names_args, "partition_names_args", PartitionNamesArgs)
    recon_repo = self._recon_repository_from_origin(partition_names_args.repository_origin)
    return api_pb2.ExternalPartitionNamesReply(
        serialized_external_partition_names_or_external_partition_execution_error=serialize_dagster_namedtuple(
            get_partition_names(
                recon_repo,
                partition_names_args.partition_set_name,
            )
        )
    )
def test_simple_pipeline_smoke_test():
    @solid
    def solid_without_config(_):
        pass

    @pipeline
    def single_solid_pipeline():
        solid_without_config()

    config_schema_snapshot = build_config_schema_snapshot(single_solid_pipeline)
    assert config_schema_snapshot.all_config_snaps_by_key

    serialized = serialize_dagster_namedtuple(config_schema_snapshot)
    rehydrated_config_schema_snapshot = deserialize_json_to_dagster_namedtuple(serialized)
    assert config_schema_snapshot == rehydrated_config_schema_snapshot
def external_repository(self, external_repository_origin):
    check.inst_param(
        external_repository_origin,
        "external_repository_origin",
        ExternalRepositoryOrigin,
    )

    res = self._query(
        "ExternalRepository",
        api_pb2.ExternalRepositoryRequest,
        # rename this param name
        serialized_repository_python_origin=serialize_dagster_namedtuple(
            external_repository_origin
        ),
    )

    return deserialize_json_to_dagster_namedtuple(res.serialized_external_repository_data)
def execute_step_with_structured_logs_command(input_json):
    try:
        signal.signal(signal.SIGTERM, signal.getsignal(signal.SIGINT))
    except ValueError:
        warnings.warn(
            (
                "Unexpected error attempting to manage signal handling on thread {thread_name}. "
                "You should not invoke this API (execute_step_with_structured_logs) from threads "
                "other than the main thread."
            ).format(thread_name=threading.current_thread().name)
        )

    args = check.inst(deserialize_json_to_dagster_namedtuple(input_json), ExecuteStepArgs)

    with (
        DagsterInstance.from_ref(args.instance_ref)
        if args.instance_ref
        else DagsterInstance.get()
    ) as instance:
        pipeline_run = instance.get_run_by_id(args.pipeline_run_id)
        recon_pipeline = recon_pipeline_from_origin(args.pipeline_origin)
        retries = Retries.from_config(args.retries_dict)

        if args.should_verify_step:
            success = verify_step(instance, pipeline_run, retries, args.step_keys_to_execute)
            if not success:
                return

        execution_plan = create_execution_plan(
            recon_pipeline.subset_for_execution_from_existing_pipeline(
                pipeline_run.solids_to_execute
            ),
            run_config=args.run_config,
            step_keys_to_execute=args.step_keys_to_execute,
            mode=args.mode,
        )

        buff = []
        for event in execute_plan_iterator(
            execution_plan,
            pipeline_run,
            instance,
            run_config=args.run_config,
            retries=retries,
        ):
            buff.append(serialize_dagster_namedtuple(event))

        for line in buff:
            click.echo(line)
def watcher_thread(
    conn_string: str,
    engine: db.engine.Engine,
    handlers_dict: MutableMapping[str, List[CallbackAfterCursor]],
    dict_lock: threading.Lock,
    watcher_thread_exit: threading.Event,
    watcher_thread_started: threading.Event,
):
    for notif in await_pg_notifications(
        conn_string,
        channels=[CHANNEL_NAME],
        timeout=POLLING_CADENCE,
        yield_on_timeout=True,
        exit_event=watcher_thread_exit,
        started_event=watcher_thread_started,
    ):
        if notif is None:
            if watcher_thread_exit.is_set():
                break
        else:
            # the NOTIFY payload encodes "<run_id>_<event log record id>"
            run_id, index_str = notif.payload.split("_")
            with dict_lock:
                if run_id not in handlers_dict:
                    continue

            index = int(index_str)
            with dict_lock:
                handlers = handlers_dict.get(run_id, [])

            with engine.connect() as conn:
                cursor_res = conn.execute(
                    db.select([SqlEventLogStorageTable.c.event]).where(
                        SqlEventLogStorageTable.c.id == index
                    ),
                )
                dagster_event: EventLogEntry = deserialize_json_to_dagster_namedtuple(
                    cursor_res.scalar()
                )

            for callback_with_cursor in handlers:
                if callback_with_cursor.start_cursor < index:
                    try:
                        callback_with_cursor.callback(dagster_event)
                    except Exception:
                        logging.exception(
                            "Exception in callback for event watch on run %s.", run_id
                        )
def ExternalSensorExecution(self, request, _context):
    args = deserialize_json_to_dagster_namedtuple(
        request.serialized_external_sensor_execution_args
    )
    check.inst_param(args, "args", SensorExecutionArgs)
    recon_repo = self._recon_repository_from_origin(args.repository_origin)
    return api_pb2.ExternalSensorExecutionReply(
        serialized_external_sensor_execution_data_or_external_sensor_execution_error=serialize_dagster_namedtuple(
            get_external_sensor_execution(
                recon_repo,
                args.instance_ref,
                args.sensor_name,
                args.last_completion_time,
                args.last_run_key,
            )
        )
    )
def ExternalSensorExecution(self, request, _context):
    args = deserialize_json_to_dagster_namedtuple(
        request.serialized_external_sensor_execution_args
    )
    check.inst_param(args, "args", SensorExecutionArgs)
    recon_repo = self._recon_repository_from_origin(args.repository_origin)
    serialized_sensor_data = serialize_dagster_namedtuple(
        get_external_sensor_execution(
            recon_repo,
            args.instance_ref,
            args.sensor_name,
            args.last_completion_time,
            args.last_run_key,
        )
    )
    yield from self._split_serialized_data_into_chunk_events(serialized_sensor_data)
def execute_run_with_structured_logs_command(input_json):
    signal.signal(signal.SIGTERM, signal.getsignal(signal.SIGINT))

    args = check.inst(deserialize_json_to_dagster_namedtuple(input_json), ExecuteRunArgs)
    recon_pipeline = recon_pipeline_from_origin(args.pipeline_origin)

    with (
        DagsterInstance.from_ref(args.instance_ref)
        if args.instance_ref
        else DagsterInstance.get()
    ) as instance:
        buffer = []

        def send_to_buffer(event):
            buffer.append(serialize_dagster_namedtuple(event))

        _execute_run_command_body(recon_pipeline, args.pipeline_run_id, instance, send_to_buffer)

        for line in buffer:
            click.echo(line)
def ExternalScheduleExecution(self, request, _context):
    external_schedule_execution_args = deserialize_json_to_dagster_namedtuple(
        request.serialized_external_schedule_execution_args
    )
    check.inst_param(
        external_schedule_execution_args,
        "external_schedule_execution_args",
        ExternalScheduleExecutionArgs,
    )
    recon_repo = self._recon_repository_from_origin(
        external_schedule_execution_args.repository_origin
    )
    return api_pb2.ExternalScheduleExecutionReply(
        serialized_external_schedule_execution_data_or_external_schedule_execution_error=serialize_dagster_namedtuple(
            get_external_schedule_execution(recon_repo, external_schedule_execution_args)
        )
    )
def ExternalPipelineSubsetSnapshot(self, request, _context):
    pipeline_subset_snapshot_args = deserialize_json_to_dagster_namedtuple(
        request.serialized_pipeline_subset_snapshot_args
    )
    check.inst_param(
        pipeline_subset_snapshot_args,
        "pipeline_subset_snapshot_args",
        PipelineSubsetSnapshotArgs,
    )
    return api_pb2.ExternalPipelineSubsetSnapshotReply(
        serialized_external_pipeline_subset_result=serialize_dagster_namedtuple(
            get_external_pipeline_subset_result(
                recon_pipeline_from_origin(pipeline_subset_snapshot_args.pipeline_origin),
                pipeline_subset_snapshot_args.solid_selection,
            )
        )
    )
def external_schedule_execution(self, external_schedule_execution_args):
    check.inst_param(
        external_schedule_execution_args,
        "external_schedule_execution_args",
        ExternalScheduleExecutionArgs,
    )
    res = self._query(
        "ExternalScheduleExecution",
        api_pb2.ExternalScheduleExecutionRequest,
        serialized_external_schedule_execution_args=serialize_dagster_namedtuple(
            external_schedule_execution_args
        ),
    )

    return deserialize_json_to_dagster_namedtuple(
        res.serialized_external_schedule_execution_data_or_external_schedule_execution_error
    )
def external_partition_set_execution_params(self, partition_set_execution_param_args):
    check.inst_param(
        partition_set_execution_param_args,
        "partition_set_execution_param_args",
        PartitionSetExecutionParamArgs,
    )
    res = self._query(
        "ExternalPartitionSetExecutionParams",
        api_pb2.ExternalPartitionSetExecutionParamsRequest,
        serialized_partition_set_execution_param_args=serialize_dagster_namedtuple(
            partition_set_execution_param_args
        ),
    )

    return deserialize_json_to_dagster_namedtuple(
        res.serialized_external_partition_set_execution_param_data_or_external_partition_execution_error
    )
def execute_run_command(input_json):
    with capture_interrupts():
        args = check.inst(deserialize_json_to_dagster_namedtuple(input_json), ExecuteRunArgs)
        recon_pipeline = recon_pipeline_from_origin(args.pipeline_origin)

        with (
            DagsterInstance.from_ref(args.instance_ref)
            if args.instance_ref
            else DagsterInstance.get()
        ) as instance:
            buffer = []

            def send_to_buffer(event):
                buffer.append(serialize_dagster_namedtuple(event))

            _execute_run_command_body(
                recon_pipeline, args.pipeline_run_id, instance, send_to_buffer
            )

            for line in buffer:
                click.echo(line)
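# Hedged sketch of the consuming side of this stdout protocol (a parsing helper
# of our own, not from the source): each line the child process echoes is one
# serialized event, so a parent can rebuild events with the same serdes helper.
from dagster.serdes import deserialize_json_to_dagster_namedtuple


def parse_serialized_events_from_lines(lines):
    events = []
    for line in lines:
        try:
            events.append(deserialize_json_to_dagster_namedtuple(line))
        except Exception:
            # interleaved non-serdes output (logging, warnings) is skipped
            pass
    return events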
def wipe_asset(self, asset_key):
    check.inst_param(asset_key, "asset_key", AssetKey)

    event_query = db.select(
        [SqlEventLogStorageTable.c.id, SqlEventLogStorageTable.c.event]
    ).where(SqlEventLogStorageTable.c.asset_key == asset_key.to_string())

    asset_key_delete = AssetKeyTable.delete().where(  # pylint: disable=no-value-for-parameter
        AssetKeyTable.c.asset_key == asset_key.to_string()
    )

    with self.connect() as conn:
        conn.execute(asset_key_delete)
        results = conn.execute(event_query).fetchall()

    for row_id, json_str in results:
        try:
            event_record = deserialize_json_to_dagster_namedtuple(json_str)
            if not isinstance(event_record, EventRecord):
                continue

            assert event_record.dagster_event.event_specific_data.materialization.asset_key

            dagster_event = event_record.dagster_event
            event_specific_data = dagster_event.event_specific_data
            materialization = event_specific_data.materialization
            updated_materialization = Materialization(
                label=materialization.label,
                description=materialization.description,
                metadata_entries=materialization.metadata_entries,
                asset_key=None,
                skip_deprecation_warning=True,
            )
            updated_event_specific_data = event_specific_data._replace(
                materialization=updated_materialization
            )
            updated_dagster_event = dagster_event._replace(
                event_specific_data=updated_event_specific_data
            )
            updated_record = event_record._replace(dagster_event=updated_dagster_event)

            # update the event_record here
            self.update_event_log_record(row_id, updated_record)

        except seven.JSONDecodeError:
            logging.warning("Could not parse asset event record id `{}`.".format(row_id))
def test_basic_dep_fan_out(snapshot):
    @solid
    def return_one(_):
        return 1

    @solid(input_defs=[InputDefinition("value", int)])
    def passthrough(_, value):
        return value

    @pipeline
    def single_dep_pipeline():
        return_one_result = return_one()
        passthrough.alias("passone")(return_one_result)
        passthrough.alias("passtwo")(return_one_result)

    dep_structure_snapshot = build_dep_structure_snapshot_from_icontains_solids(
        single_dep_pipeline.graph
    )
    index = DependencyStructureIndex(dep_structure_snapshot)

    assert index.get_invocation("return_one")
    assert index.get_invocation("passone")
    assert index.get_invocation("passtwo")

    assert index.get_upstream_output("passone", "value") == OutputHandleSnap(
        "return_one", "result"
    )
    assert index.get_upstream_output("passtwo", "value") == OutputHandleSnap(
        "return_one", "result"
    )

    assert set(index.get_downstream_inputs("return_one", "result")) == set(
        [
            InputHandle("passthrough", "passone", "value"),
            InputHandle("passthrough", "passtwo", "value"),
        ]
    )

    assert (
        deserialize_json_to_dagster_namedtuple(
            serialize_dagster_namedtuple(dep_structure_snapshot)
        )
        == dep_structure_snapshot
    )

    pipeline_snapshot = PipelineSnapshot.from_pipeline_def(single_dep_pipeline)
    assert pipeline_snapshot == serialize_rt(pipeline_snapshot)

    snapshot.assert_match(serialize_pp(pipeline_snapshot))
    snapshot.assert_match(create_pipeline_snapshot_id(pipeline_snapshot))
def external_partition_set_execution_params(self, partition_set_execution_param_args):
    check.inst_param(
        partition_set_execution_param_args,
        "partition_set_execution_param_args",
        PartitionSetExecutionParamArgs,
    )

    chunks = list(
        self._streaming_query(
            "ExternalPartitionSetExecutionParams",
            api_pb2.ExternalPartitionSetExecutionParamsRequest,
            serialized_partition_set_execution_param_args=serialize_dagster_namedtuple(
                partition_set_execution_param_args
            ),
        )
    )

    return deserialize_json_to_dagster_namedtuple(
        "".join([chunk.serialized_chunk for chunk in chunks])
    )
def filter_dagster_events_from_pod_logs(log_lines):
    """
    Filters the raw log lines from a dagster-cli invocation to return only the lines
    containing json.

    - Log lines don't necessarily come back in order
    - Something else might log JSON
    - Docker appears to silently split very long log lines -- this is undocumented behavior

    TODO: replace with reading event logs from the DB
    """
    check.list_param(log_lines, "log_lines", str)

    coalesced_lines = []
    buffer = []
    in_split_line = False
    for line in log_lines:
        line = line.strip()
        if not in_split_line and line.startswith("{"):
            if line.endswith("}"):
                coalesced_lines.append(line)
            else:
                buffer.append(line)
                in_split_line = True
        elif in_split_line:
            buffer.append(line)
            if line.endswith("}"):  # Note: hack, this may not have been the end of the full object
                coalesced_lines.append("".join(buffer))
                buffer = []
                in_split_line = False

    events = []
    for line in coalesced_lines:
        try:
            events.append(deserialize_json_to_dagster_namedtuple(line))
        except seven.JSONDecodeError:
            pass
        except check.CheckError:
            pass

    return events
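# Hedged, self-contained check of the coalescing behavior above (assumes the
# function is in scope; SplitDemo is a hypothetical whitelisted tuple, not a
# class from the source): a serialized event split across two "log lines" is
# stitched back together, while non-JSON noise is dropped.
from collections import namedtuple

from dagster.serdes import serialize_dagster_namedtuple, whitelist_for_serdes


@whitelist_for_serdes
class SplitDemo(namedtuple("_SplitDemo", "name")):
    pass


json_str = serialize_dagster_namedtuple(SplitDemo(name="split-me"))
# simulate Docker splitting one long JSON log line into two chunks
lines = [json_str[:12], json_str[12:], "plain text that is not json"]
assert filter_dagster_events_from_pod_logs(lines) == [SplitDemo(name="split-me")]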