def _postgres_event_watcher_event_loop(conn_string, queue, run_id_dict):
    init_called = False
    queue.put(EventWatcherProcessStartedEvent())
    try:
        for notif in await_pg_notifications(
            conn_string, channels=[CHANNEL_NAME], timeout=POLLING_CADENCE, yield_on_timeout=True
        ):
            if not init_called:
                init_called = True
                queue.put(EventWatcherStart())

            if notif is not None:
                event_record = deserialize_json_to_dagster_namedtuple(notif.payload)
                if event_record.run_id in run_id_dict:
                    queue.put(EventWatcherEvent(event_record))
            else:
                # The polling window has timed out
                pass

    except Exception as e:  # pylint: disable=broad-except
        queue.put(EventWatchFailed(message=str(e)))
    finally:
        queue.put(EventWatcherEnd())

def test_kitchen_sink():
    kitchen_sink = List[
        Dict(
            {
                'opt_list_of_int': Field(List[int], is_optional=True),
                'tuple_of_things': Field(Tuple[int, str]),
                'nested_dict': Field(
                    Dict(
                        {
                            'list_list': Field(List[List[int]]),
                            'nested_selector': Field(
                                Selector(
                                    {'some_field': Field(int), 'set': Field(Optional[Set[bool]])}
                                )
                            ),
                        }
                    )
                ),
            }
        )
    ]

    kitchen_sink_meta = meta_from_dagster_type(kitchen_sink)

    rehydrated_meta = deserialize_json_to_dagster_namedtuple(
        serialize_dagster_namedtuple(kitchen_sink_meta)
    )
    assert kitchen_sink_meta == rehydrated_meta

def handle_execute_plan_result_raw(res):
    res_data = res['data']['executePlan']

    res_type = res_data['__typename']

    handle_error_states(res_type, res_data)

    if res_type == 'ExecutePlanSuccess':
        raw_event_records = [
            DagsterEventRecord(
                event_record.error_info,
                event_record.message,
                event_record.level,
                event_record.user_message,
                event_record.run_id,
                event_record.timestamp,
                event_record.step_key,
                event_record.pipeline_name,
                event_record.dagster_event,
            )
            for event_record in [
                deserialize_json_to_dagster_namedtuple(e) for e in res_data['rawEventRecords']
            ]
        ]
        return raw_event_records

    raise DagsterGraphQLClientError('Unexpected result type')

def get_logs_for_run(self, run_id, cursor=-1):
    check.str_param(run_id, 'run_id')
    check.int_param(cursor, 'cursor')
    check.invariant(
        cursor >= -1,
        'Don\'t know what to do with negative cursor {cursor}'.format(cursor=cursor),
    )

    events = []
    if not os.path.exists(self.filepath_for_run_id(run_id)):
        return events

    cursor += 1  # adjust from 0-based offset to 1-based

    try:
        with self._connect(run_id) as conn:
            results = conn.cursor().execute(FETCH_EVENTS_SQL, (str(cursor),)).fetchall()
    except sqlite3.Error as err:
        six.raise_from(EventLogInvalidForRun(run_id=run_id), err)

    try:
        for (json_str,) in results:
            events.append(
                check.inst_param(
                    deserialize_json_to_dagster_namedtuple(json_str), 'event', EventRecord
                )
            )
    except (seven.JSONDecodeError, check.CheckError) as err:
        six.raise_from(EventLogInvalidForRun(run_id=run_id), err)

    return events

def _load_schedules(self):
    schedules_dir = os.path.join(self._base_dir)
    utils.mkdir_p(schedules_dir)

    for repository_name in os.listdir(schedules_dir):
        if not os.path.isdir(os.path.join(schedules_dir, repository_name)):
            continue

        self._schedules[repository_name] = {}

        for file in os.listdir(os.path.join(schedules_dir, repository_name)):
            if not file.endswith('.json'):
                continue
            file_path = os.path.join(schedules_dir, repository_name, file)
            with open(file_path) as data:
                try:
                    schedule = deserialize_json_to_dagster_namedtuple(data.read())
                    self._schedules[repository_name][schedule.name] = schedule
                except Exception as ex:  # pylint: disable=broad-except
                    warnings.warn(
                        'Could not parse dagster schedule from {file_name} in {dir_name}. '
                        '{ex}: {msg}'.format(
                            file_name=file, dir_name=self._base_dir, ex=type(ex).__name__, msg=ex
                        )
                    )
                    continue

def watcher_thread(conn_string, queue, handlers_dict, dict_lock, watcher_thread_exit):
    done = False
    while not done and not watcher_thread_exit.is_set():
        event_list = []
        while not queue.empty():
            try:
                event_list.append(queue.get_nowait())
            except Empty:
                pass

        for event in event_list:
            if not isinstance(event, EventWatcherThreadEvents):
                warnings.warn(
                    'Event watcher thread got unexpected event {event}'.format(event=event)
                )
                continue
            if isinstance(event, EventWatcherThreadNoopEvents):
                continue
            elif isinstance(event, EventWatcherThreadEndEvents):
                done = True
            else:
                assert isinstance(event, EventWatcherEvent)
                run_id, index_str = event.payload
                index = int(index_str)
                with dict_lock:
                    handlers = handlers_dict.get(run_id, [])
                with get_conn(conn_string).cursor() as curs:
                    curs.execute(SELECT_EVENT_LOG_SQL, (index,))
                    dagster_event = deserialize_json_to_dagster_namedtuple(curs.fetchone()[0])
                for (cursor, callback) in handlers:
                    if index >= cursor:
                        callback(dagster_event)

        time.sleep(WATCHER_POLL_INTERVAL)

def test_basic_event_store():
    @solid
    def return_one(_):
        return 1

    def _solids():
        return_one()

    events, _result = gather_events(_solids)

    event_log_storage = PostgresEventLogStorage.create_nuked_storage(get_test_conn_string())

    for event in events:
        event_log_storage.store_event(event)

    rows = fetch_all_events(get_test_conn_string())

    out_events = list(map(lambda r: deserialize_json_to_dagster_namedtuple(r[0]), rows))

    assert list(map(lambda e: e.dagster_event.event_type, out_events)) == [
        DagsterEventType.PIPELINE_START,
        DagsterEventType.ENGINE_EVENT,
        DagsterEventType.STEP_START,
        DagsterEventType.STEP_OUTPUT,
        DagsterEventType.STEP_SUCCESS,
        DagsterEventType.ENGINE_EVENT,
        DagsterEventType.PIPELINE_SUCCESS,
    ]

def test_dead_events():
    snapshot = path.join(path.dirname(path.realpath(__file__)), 'dead_events.txt')
    with open(snapshot, 'r') as fd:
        objs = []
        for line in fd.readlines():
            obj = deserialize_json_to_dagster_namedtuple(line)
            assert obj is not None
            objs.append(obj)

    assert len(objs) == 6

def test_basic_solid_definition():
    @solid
    def noop_solid(_):
        pass

    solid_snap = build_solid_def_snap(noop_solid)

    assert solid_snap
    assert (
        deserialize_json_to_dagster_namedtuple(serialize_dagster_namedtuple(solid_snap))
        == solid_snap
    )

def get_schedule_by_name(self, repository, schedule_name):
    check.inst_param(repository, 'repository', RepositoryDefinition)
    check.str_param(schedule_name, 'schedule_name')

    query = (
        db.select([ScheduleTable.c.schedule_body])
        .select_from(ScheduleTable)
        .where(ScheduleTable.c.repository_name == repository.name)
        .where(ScheduleTable.c.schedule_name == schedule_name)
    )

    rows = self.execute(query)
    return deserialize_json_to_dagster_namedtuple(rows[0][0]) if len(rows) else None

def _load_historic_runs(self):
    for filename in glob.glob(os.path.join(self._base_dir, '*.json')):
        with open(filename, 'r') as fd:
            try:
                pipeline_run = deserialize_json_to_dagster_namedtuple(fd.read())
                self.add_run(pipeline_run)
            except Exception as ex:  # pylint: disable=broad-except
                print(
                    'Could not load pipeline run from {filename}, continuing.\n Original '
                    'exception: {ex}: {msg}'.format(filename=filename, ex=type(ex).__name__, msg=ex)
                )
                continue

def get_run_by_id(self, run_id):
    '''Get a run by its id.

    Args:
        run_id (str): The id of the run

    Returns:
        Optional[PipelineRun]
    '''
    check.str_param(run_id, 'run_id')

    query = db.select([RunsTable.c.run_body]).where(RunsTable.c.run_id == run_id)
    rows = self.execute(query)
    return deserialize_json_to_dagster_namedtuple(rows[0][0]) if len(rows) else None

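# A minimal sketch of the inverse write path implied by get_run_by_id above,
# assuming the same RunsTable schema and the self.execute helper used in these
# snippets; `add_run` and its signature are illustrative, not the actual
# storage API.
def add_run(self, pipeline_run):
    self.execute(
        RunsTable.insert().values(  # pylint: disable=no-value-for-parameter
            run_id=pipeline_run.run_id,
            run_body=serialize_dagster_namedtuple(pipeline_run),
        )
    )
    return pipeline_run
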
def get_schedule_ticks_by_schedule(self, repository, schedule_name):
    check.inst_param(repository, 'repository', RepositoryDefinition)
    check.str_param(schedule_name, 'schedule_name')

    query = (
        db.select([ScheduleTickTable.c.id, ScheduleTickTable.c.tick_body])
        .select_from(ScheduleTickTable)
        .where(ScheduleTickTable.c.repository_name == repository.name)
        .where(ScheduleTickTable.c.schedule_name == schedule_name)
    )

    rows = self.execute(query)
    return list(
        map(lambda r: ScheduleTick(r[0], deserialize_json_to_dagster_namedtuple(r[1])), rows)
    )

def test_simple_pipeline_smoke_test():
    @solid
    def solid_without_config(_):
        pass

    @pipeline
    def single_solid_pipeline():
        solid_without_config()

    config_schema_snapshot = build_config_schema_snapshot(single_solid_pipeline)
    assert config_schema_snapshot.all_config_snaps_by_key

    serialized = serialize_dagster_namedtuple(config_schema_snapshot)
    rehydrated_config_schema_snapshot = deserialize_json_to_dagster_namedtuple(serialized)
    assert config_schema_snapshot == rehydrated_config_schema_snapshot

def on_created(self, event):
    run_id, _extension = os.path.basename(event.src_path).split('.')

    # if we already know about the run, we kicked it off
    with self._lock:
        if self._run_storage.has_run(run_id):
            return

        with open(event.src_path, 'r') as fd:
            try:
                pipeline_run = deserialize_json_to_dagster_namedtuple(fd.read())
                self._run_storage.add_external_run(pipeline_run, event.src_path)
            except Exception as ex:  # pylint: disable=broad-except
                warnings.warn(
                    'Error trying to load .json metadata file in filesystem run '
                    'storage: {ex}: {msg}'.format(ex=type(ex).__name__, msg=ex)
                )
                return

def get_run_by_id(self, run_id):
    '''Get a run by its id.

    Args:
        run_id (str): The id of the run

    Returns:
        Optional[PipelineRun]
    '''
    check.str_param(run_id, 'run_id')

    conn = get_conn(self.conn_string)
    with conn.cursor() as curs:
        curs.execute('SELECT run_body FROM runs WHERE run_id = %s', (run_id,))
        rows = curs.fetchall()
        return deserialize_json_to_dagster_namedtuple(rows[0][0]) if len(rows) else None

def test_kitchen_sink():
    kitchen_sink = resolve_to_config_type(
        [
            {
                'opt_list_of_int': Field(int, is_optional=True),
                'nested_dict': {
                    'list_list': [[int]],
                    'nested_selector': Field(
                        Selector({'some_field': int, 'more_list': Noneable([bool])})
                    ),
                },
            }
        ]
    )

    kitchen_sink_meta = meta_from_dagster_type(kitchen_sink)

    rehydrated_meta = deserialize_json_to_dagster_namedtuple(
        serialize_dagster_namedtuple(kitchen_sink_meta)
    )
    assert kitchen_sink_meta == rehydrated_meta

def get_logs_for_run(self, run_id, cursor=-1):
    '''Get all of the logs corresponding to a run.

    Args:
        run_id (str): The id of the run for which to fetch logs.
        cursor (Optional[int]): Zero-indexed logs will be returned starting from cursor + 1,
            i.e., if cursor is -1, all logs will be returned. (default: -1)
    '''
    check.str_param(run_id, 'run_id')
    check.int_param(cursor, 'cursor')
    check.invariant(cursor >= -1, 'Cursor must be -1 or greater')

    with get_conn(self.conn_string).cursor() as curs:
        FETCH_SQL = 'SELECT event_body FROM event_log WHERE run_id = %s OFFSET %s;'
        curs.execute(FETCH_SQL, (run_id, cursor + 1))
        rows = curs.fetchall()
        return list(map(lambda r: deserialize_json_to_dagster_namedtuple(r[0]), rows))

def watcher_thread(conn_string, run_id_dict, handlers_dict, dict_lock, watcher_thread_exit):
    try:
        for notif in await_pg_notifications(
            conn_string,
            channels=[CHANNEL_NAME],
            timeout=POLLING_CADENCE,
            yield_on_timeout=True,
            exit_event=watcher_thread_exit,
        ):
            if notif is None:
                if watcher_thread_exit.is_set():
                    break
            else:
                run_id, index_str = notif.payload.split('_')
                if run_id not in run_id_dict:
                    continue

                index = int(index_str)
                with dict_lock:
                    handlers = handlers_dict.get(run_id, [])

                engine = create_engine(
                    conn_string, isolation_level='AUTOCOMMIT', poolclass=db.pool.NullPool
                )
                try:
                    res = engine.execute(
                        db.select([SqlEventLogStorageTable.c.event]).where(
                            SqlEventLogStorageTable.c.id == index
                        ),
                    )
                    dagster_event = deserialize_json_to_dagster_namedtuple(res.fetchone()[0])
                finally:
                    engine.dispose()

                for (cursor, callback) in handlers:
                    if index >= cursor:
                        callback(dagster_event)
    except psycopg2.OperationalError:
        pass

def get_logs_for_run(self, run_id, cursor=-1):
    '''Get all of the logs corresponding to a run.

    Args:
        run_id (str): The id of the run for which to fetch logs.
        cursor (Optional[int]): Zero-indexed logs will be returned starting from cursor + 1,
            i.e., if cursor is -1, all logs will be returned. (default: -1)
    '''
    check.str_param(run_id, 'run_id')
    check.int_param(cursor, 'cursor')
    check.invariant(
        cursor >= -1,
        'Don\'t know what to do with negative cursor {cursor}'.format(cursor=cursor),
    )

    # cursor starts at 0 & auto-increment column starts at 1 so adjust
    cursor = cursor + 1

    query = (
        db.select([SqlEventLogStorageTable.c.event])
        .where(SqlEventLogStorageTable.c.run_id == run_id)
        .where(SqlEventLogStorageTable.c.id > cursor)
        .order_by(SqlEventLogStorageTable.c.id.asc())
    )

    with self.connect(run_id) as conn:
        results = conn.execute(query).fetchall()

    events = []
    try:
        for (json_str,) in results:
            events.append(
                check.inst_param(
                    deserialize_json_to_dagster_namedtuple(json_str), 'event', EventRecord
                )
            )
    except (seven.JSONDecodeError, check.CheckError) as err:
        six.raise_from(DagsterEventLogInvalidForRun(run_id=run_id), err)

    return events

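# Hedged usage sketch of the cursor contract documented above; `storage` and
# `run_id` are assumed to exist. A cursor of -1 (the default) returns every
# record; passing the index of the last record seen returns only later ones.
all_events = storage.get_logs_for_run(run_id)
newer_events = storage.get_logs_for_run(run_id, cursor=len(all_events) - 1)
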
def test_basic_event_store(conn_string):
    @solid
    def return_one(_):
        return 1

    def _solids():
        return_one()

    events, _result = gather_events(_solids)

    event_log_storage = PostgresEventLogStorage.create_clean_storage(conn_string)
    for event in events:
        event_log_storage.store_event(event)

    rows = fetch_all_events(conn_string)

    out_events = list(map(lambda r: deserialize_json_to_dagster_namedtuple(r[0]), rows))

    # messages can come out of order
    assert Counter(event_types(out_events)) == Counter(
        [
            DagsterEventType.PIPELINE_START,
            DagsterEventType.ENGINE_EVENT,
            DagsterEventType.STEP_START,
            DagsterEventType.STEP_SUCCESS,
            DagsterEventType.PIPELINE_SUCCESS,
            DagsterEventType.STEP_OUTPUT,
            DagsterEventType.ENGINE_EVENT,
        ]
    )

    assert sorted_event_types(out_events) == [
        DagsterEventType.PIPELINE_START,
        DagsterEventType.ENGINE_EVENT,
        DagsterEventType.STEP_START,
        DagsterEventType.STEP_OUTPUT,
        DagsterEventType.STEP_SUCCESS,
        DagsterEventType.ENGINE_EVENT,
        DagsterEventType.PIPELINE_SUCCESS,
    ]

def test_solid_definition_kitchen_sink():
    @solid(
        input_defs=[
            InputDefinition('arg_one', str, description='desc1'),
            InputDefinition('arg_two', int),
        ],
        output_defs=[
            OutputDefinition(name='output_one', dagster_type=str),
            OutputDefinition(
                name='output_two', dagster_type=int, description='desc2', is_required=False
            ),
        ],
        config={'foo': int},
        description='a description',
        tags={'a_tag': 'yup'},
        required_resource_keys={'a_resource'},
    )
    def kitchen_sink_solid(_, arg_two, arg_one):  # out of order to test positional_inputs
        assert arg_one
        assert arg_two
        raise Exception('should not execute')

    kitchen_sink_solid_snap = build_solid_def_snap(kitchen_sink_solid)

    assert kitchen_sink_solid_snap
    assert kitchen_sink_solid_snap.name == 'kitchen_sink_solid'
    assert len(kitchen_sink_solid_snap.input_def_snaps) == 2
    assert [inp.name for inp in kitchen_sink_solid_snap.input_def_snaps] == ['arg_one', 'arg_two']
    assert [inp.dagster_type_key for inp in kitchen_sink_solid_snap.input_def_snaps] == [
        'String',
        'Int',
    ]
    assert kitchen_sink_solid_snap.get_input_snap('arg_one').description == 'desc1'
    assert [out.name for out in kitchen_sink_solid_snap.output_def_snaps] == [
        'output_one',
        'output_two',
    ]
    assert [out.dagster_type_key for out in kitchen_sink_solid_snap.output_def_snaps] == [
        'String',
        'Int',
    ]
    assert kitchen_sink_solid_snap.get_output_snap('output_two').description == 'desc2'
    assert kitchen_sink_solid_snap.get_output_snap('output_two').is_required is False
    assert (
        kitchen_sink_solid_snap.config_field_snap.type_key
        == kitchen_sink_solid.config_field.config_type.key
    )
    assert kitchen_sink_solid_snap.required_resource_keys == ['a_resource']
    assert kitchen_sink_solid_snap.tags == {'a_tag': 'yup'}
    assert kitchen_sink_solid.positional_inputs == ['arg_two', 'arg_one']
    assert (
        deserialize_json_to_dagster_namedtuple(
            serialize_dagster_namedtuple(kitchen_sink_solid_snap)
        )
        == kitchen_sink_solid_snap
    )

def execute(pipeline_context, execution_plan):
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.param_invariant(
        isinstance(pipeline_context.executor_config, CeleryConfig),
        'pipeline_context',
        'Expected executor_config to be CeleryConfig got {}'.format(
            pipeline_context.executor_config
        ),
    )

    celery_config = pipeline_context.executor_config
    storage = pipeline_context.environment_dict.get('storage')

    if (celery_config.broker and not is_local_uri(celery_config.broker)) or (
        celery_config.backend and not is_local_uri(celery_config.backend)
    ):
        check.invariant(
            storage.get('s3') or storage.get('gcs'),
            'Must use S3 or GCS storage with non-local Celery broker: {broker} '
            'and backend: {backend}'.format(
                broker=celery_config.broker, backend=celery_config.backend
            ),
        )
    else:
        check.invariant(
            not storage.get('in_memory'),
            'Cannot use in-memory storage with Celery, use filesystem, S3, or GCS',
        )

    pipeline_name = pipeline_context.pipeline_def.name

    handle_dict = pipeline_context.execution_target_handle.to_dict()

    instance_ref_dict = pipeline_context.instance.get_ref().to_dict()

    environment_dict = dict(pipeline_context.environment_dict, execution={'in_process': {}})

    mode = pipeline_context.mode_def.name

    run_id = pipeline_context.pipeline_run.run_id

    app = make_app(celery_config)

    task_signatures = {}  # Dict[step_key, celery.Signature]
    apply_kwargs = defaultdict(dict)  # Dict[step_key, Dict[str, Any]]

    priority_for_step = lambda step: (
        -1 * int(step.tags.get('dagster-celery/priority', task_default_priority))
    )
    priority_for_key = lambda step_key: (-1 * apply_kwargs[step_key]['priority'])
    _warn_on_priority_misuse(pipeline_context, execution_plan)

    for step_key in execution_plan.step_keys_to_execute:
        step = execution_plan.get_step_by_key(step_key)
        priority = int(step.tags.get('dagster-celery/priority', task_default_priority))
        queue = step.tags.get('dagster-celery/queue', task_default_queue)
        task = create_task(app)

        variables = {
            'executionParams': {
                'selector': {'name': pipeline_name},
                'environmentConfigData': environment_dict,
                'mode': mode,
                'executionMetadata': {'runId': run_id},
                'stepKeys': [step_key],
            }
        }
        task_signatures[step_key] = task.si(handle_dict, variables, instance_ref_dict)
        apply_kwargs[step_key] = {
            'priority': priority,
            'queue': queue,
            'routing_key': '{queue}.execute_query'.format(queue=queue),
        }

    step_results = {}  # Dict[ExecutionStep, celery.AsyncResult]
    step_success = {}
    step_errors = {}
    completed_steps = set({})  # Set[step_key]
    active_execution = execution_plan.start(sort_key_fn=priority_for_step)
    stopping = False

    while (not active_execution.is_complete and not stopping) or step_results:
        results_to_pop = []
        for step_key, result in sorted(step_results.items(), key=lambda x: priority_for_key(x[0])):
            if result.ready():
                try:
                    step_events = result.get()
                except Exception:  # pylint: disable=broad-except
                    # We will want to do more to handle the exception here... maybe subclass Task
                    # Certainly yield an engine or pipeline event
                    step_events = []
                    step_errors[step_key] = serializable_error_info_from_exc_info(sys.exc_info())
                    stopping = True
                for step_event in step_events:
                    event = deserialize_json_to_dagster_namedtuple(step_event)
                    yield event
                    if event.is_step_success:
                        step_success[step_key] = True
                    elif event.is_step_failure:
                        step_success[step_key] = False
                results_to_pop.append(step_key)
                completed_steps.add(step_key)
        for step_key in results_to_pop:
            if step_key in step_results:
                del step_results[step_key]
                was_success = step_success.get(step_key)
                if was_success is True:
                    active_execution.mark_success(step_key)
                elif was_success is False:
                    active_execution.mark_failed(step_key)
                else:
                    # check errors list?
                    pipeline_context.log.error(
                        'Step {key} finished without success or failure event, assuming '
                        'failure.'.format(key=step_key)
                    )
                    active_execution.mark_failed(step_key)

        # process skips from failures or uncovered inputs
        for event in active_execution.skipped_step_events_iterator(pipeline_context):
            yield event

        # don't add any new steps if we are stopping
        if stopping:
            continue

        # This is a slight refinement. If we have n workers idle and schedule m > n steps for
        # execution, the first n steps will be picked up by the idle workers in the order in
        # which they are scheduled (and the following m-n steps will be executed in priority
        # order, provided that it takes longer to execute a step than to schedule it). The test
        # case has m >> n to exhibit this behavior in the absence of this sort step.
        for step in active_execution.get_steps_to_execute():
            try:
                step_results[step.key] = task_signatures[step.key].apply_async(
                    **apply_kwargs[step.key]
                )
            except Exception:
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    'Encountered error during celery task submission.',
                    event_specific_data=EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info()),
                    ),
                )
                raise

        time.sleep(TICK_SECONDS)

    if step_errors:
        raise DagsterSubprocessError(
            'During celery execution errors occurred in workers:\n{error_list}'.format(
                error_list='\n'.join(
                    [
                        '[{step}]: {err}'.format(step=key, err=err.to_string())
                        for key, err in step_errors.items()
                    ]
                )
            ),
            subprocess_error_infos=list(step_errors.values()),
        )

def execute(pipeline_context, execution_plan):
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.param_invariant(
        isinstance(pipeline_context.executor_config, CeleryConfig),
        'pipeline_context',
        'Expected executor_config to be CeleryConfig got {}'.format(
            pipeline_context.executor_config
        ),
    )

    celery_config = pipeline_context.executor_config

    pipeline_name = pipeline_context.pipeline_def.name

    handle_dict = pipeline_context.execution_target_handle.to_dict()

    instance_ref_dict = pipeline_context.instance.get_ref().to_dict()

    environment_dict = dict(pipeline_context.environment_dict, execution={'in_process': {}})

    mode = pipeline_context.mode_def.name

    run_id = pipeline_context.pipeline_run.run_id

    app = make_app(celery_config)

    pending_steps = execution_plan.execution_deps()

    task_signatures = {}  # Dict[step_key, celery.Signature]
    apply_kwargs = defaultdict(dict)  # Dict[step_key, Dict[str, Any]]

    sort_by_priority = lambda step_key: (-1 * apply_kwargs[step_key]['priority'])

    for step_key in execution_plan.step_keys_to_execute:
        step = execution_plan.get_step_by_key(step_key)
        priority = step.metadata.get('dagster-celery/priority', task_default_priority)
        queue = step.metadata.get('dagster-celery/queue', task_default_queue)
        task = create_task(app)

        variables = {
            'executionParams': {
                'selector': {'name': pipeline_name},
                'environmentConfigData': environment_dict,
                'mode': mode,
                'executionMetadata': {'runId': run_id},
                'stepKeys': [step_key],
            }
        }
        task_signatures[step_key] = task.si(handle_dict, variables, instance_ref_dict)
        apply_kwargs[step_key] = {
            'priority': priority,
            'queue': queue,
            'routing_key': '{queue}.execute_query'.format(queue=queue),
        }

    step_results = {}  # Dict[ExecutionStep, celery.AsyncResult]
    completed_steps = set({})  # Set[step_key]

    while pending_steps or step_results:
        results_to_pop = []
        for step_key, result in sorted(step_results.items(), key=lambda x: sort_by_priority(x[0])):
            if result.ready():
                try:
                    step_events = result.get()
                except Exception:  # pylint: disable=broad-except
                    # We will want to do more to handle the exception here... maybe subclass Task
                    # Certainly yield an engine or pipeline event
                    step_events = []
                for step_event in step_events:
                    yield deserialize_json_to_dagster_namedtuple(step_event)
                results_to_pop.append(step_key)
                completed_steps.add(step_key)
        for step_key in results_to_pop:
            if step_key in step_results:
                del step_results[step_key]

        pending_to_pop = []
        for step_key, requirements in pending_steps.items():
            if requirements.issubset(completed_steps):
                pending_to_pop.append(step_key)

        # This is a slight refinement. If we have n workers idle and schedule m > n steps for
        # execution, the first n steps will be picked up by the idle workers in the order in
        # which they are scheduled (and the following m-n steps will be executed in priority
        # order, provided that it takes longer to execute a step than to schedule it). The test
        # case has m >> n to exhibit this behavior in the absence of this sort step.
        to_execute = sorted(pending_to_pop, key=sort_by_priority)
        for step_key in to_execute:
            try:
                step_results[step_key] = task_signatures[step_key].apply_async(
                    **apply_kwargs[step_key]
                )
            except Exception:
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    'Encountered error during celery task submission.',
                    event_specific_data=EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info()),
                    ),
                )
                raise

        for step_key in pending_to_pop:
            if step_key in pending_steps:
                del pending_steps[step_key]

        time.sleep(TICK_SECONDS)

def _rows_to_runs(self, rows):
    return list(map(lambda r: deserialize_json_to_dagster_namedtuple(r[0]), rows))

def get_run_by_id(self, run_id):
    path = self._known_runs[run_id]
    with open(path, 'r') as fd:
        return deserialize_json_to_dagster_namedtuple(fd.read())

def test_deserialize_json_to_dagster_namedtuple_types_ok():
    unpacked_tuple = deserialize_json_to_dagster_namedtuple('{"foo": "bar"}')
    assert unpacked_tuple
    assert unpacked_tuple['foo'] == 'bar'

def test_deserialize_json_to_dagster_namedtuple_invalid_types(bad_obj):
    with pytest.raises(ParameterCheckError):
        deserialize_json_to_dagster_namedtuple(bad_obj)

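# A minimal, self-contained round-trip sketch of the serdes machinery the
# tests above exercise. `ExampleTuple` is a hypothetical namedtuple, and the
# import path may vary across dagster versions.
from collections import namedtuple

from dagster.core.serdes import (
    deserialize_json_to_dagster_namedtuple,
    serialize_dagster_namedtuple,
    whitelist_for_serdes,
)


@whitelist_for_serdes
class ExampleTuple(namedtuple('_ExampleTuple', 'foo bar')):
    pass


def test_example_tuple_round_trip():
    original = ExampleTuple(foo='a', bar=1)
    # serialization records the class name, so deserialization can rehydrate
    # the whitelisted namedtuple rather than a plain dict
    assert deserialize_json_to_dagster_namedtuple(
        serialize_dagster_namedtuple(original)
    ) == original
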
def execute(pipeline_context, execution_plan):
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.param_invariant(
        isinstance(pipeline_context.executor_config, CeleryConfig),
        'pipeline_context',
        'Expected executor_config to be CeleryConfig got {}'.format(
            pipeline_context.executor_config
        ),
    )

    celery_config = pipeline_context.executor_config
    storage = pipeline_context.environment_dict.get('storage')

    if (celery_config.broker and not is_local_uri(celery_config.broker)) or (
        celery_config.backend and not is_local_uri(celery_config.backend)
    ):
        check.invariant(
            storage.get('s3') or storage.get('gcs'),
            'Must use S3 or GCS storage with non-local Celery broker: {broker} '
            'and backend: {backend}'.format(
                broker=celery_config.broker, backend=celery_config.backend
            ),
        )
    else:
        check.invariant(
            not storage.get('in_memory'),
            'Cannot use in-memory storage with Celery, use filesystem, S3, or GCS',
        )

    app = make_app(celery_config)

    priority_for_step = lambda step: (
        -1 * int(step.tags.get('dagster-celery/priority', task_default_priority))
        + -1 * _get_run_priority(pipeline_context)
    )
    priority_for_key = lambda step_key: (
        priority_for_step(execution_plan.get_step_by_key(step_key))
    )
    _warn_on_priority_misuse(pipeline_context, execution_plan)

    step_results = {}  # Dict[ExecutionStep, celery.AsyncResult]
    step_errors = {}
    completed_steps = set({})  # Set[step_key]
    active_execution = execution_plan.start(
        retries=pipeline_context.executor_config.retries, sort_key_fn=priority_for_step
    )
    stopping = False

    while (not active_execution.is_complete and not stopping) or step_results:
        results_to_pop = []
        for step_key, result in sorted(step_results.items(), key=lambda x: priority_for_key(x[0])):
            if result.ready():
                try:
                    step_events = result.get()
                except Exception:  # pylint: disable=broad-except
                    # We will want to do more to handle the exception here... maybe subclass Task
                    # Certainly yield an engine or pipeline event
                    step_events = []
                    step_errors[step_key] = serializable_error_info_from_exc_info(sys.exc_info())
                    stopping = True
                for step_event in step_events:
                    event = deserialize_json_to_dagster_namedtuple(step_event)
                    yield event
                    active_execution.handle_event(event)
                results_to_pop.append(step_key)
                completed_steps.add(step_key)
        for step_key in results_to_pop:
            if step_key in step_results:
                del step_results[step_key]
                active_execution.verify_complete(pipeline_context, step_key)

        # process skips from failures or uncovered inputs
        for event in active_execution.skipped_step_events_iterator(pipeline_context):
            yield event

        # don't add any new steps if we are stopping
        if stopping:
            continue

        # This is a slight refinement. If we have n workers idle and schedule m > n steps for
        # execution, the first n steps will be picked up by the idle workers in the order in
        # which they are scheduled (and the following m-n steps will be executed in priority
        # order, provided that it takes longer to execute a step than to schedule it). The test
        # case has m >> n to exhibit this behavior in the absence of this sort step.
        for step in active_execution.get_steps_to_execute():
            try:
                queue = step.tags.get('dagster-celery/queue', task_default_queue)
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    'Submitting celery task for step "{step_key}" to queue "{queue}".'.format(
                        step_key=step.key, queue=queue
                    ),
                    EngineEventData(marker_start=DELEGATE_MARKER),
                    step_key=step.key,
                )
                step_results[step.key] = _submit_task(app, pipeline_context, step, queue)
            except Exception:
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    'Encountered error during celery task submission.',
                    event_specific_data=EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info()),
                    ),
                )
                raise

        time.sleep(TICK_SECONDS)

    if step_errors:
        raise DagsterSubprocessError(
            'During celery execution errors occurred in workers:\n{error_list}'.format(
                error_list='\n'.join(
                    [
                        '[{step}]: {err}'.format(step=key, err=err.to_string())
                        for key, err in step_errors.items()
                    ]
                )
            ),
            subprocess_error_infos=list(step_errors.values()),
        )

def from_json(json_str):
    return deserialize_json_to_dagster_namedtuple(json_str)
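
# Hedged usage sketch: a from_json like the one above is typically the inverse
# of a to_json built on serialize_dagster_namedtuple, so a whitelisted value
# survives the round trip. `SomeSerializableClass` and `value` are hypothetical.
rehydrated = SomeSerializableClass.from_json(value.to_json())
assert rehydrated == value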