def _generate_run_name(self, af_context: Optional[AirflowTaskContext]) -> str:
    """
    Pick a name for this run.

    Runs originating from Airflow get a fixed "await sync" marker name
    (so they are recognizable until the sync happens); any other run gets
    a human-friendly random name, seeded by the run uid for determinism.
    """
    if af_context is None:
        return get_random_name(seed=self.run_uid)
    return f"Airflow-run-await-sync_{self.run_uid}"
def run(self):
    """
    Generates bogus data and writes it into the :py:meth:`~.Streams.output` target.
    """
    logger.warning("Hey, this is streams task!")

    total_lines = 1000
    with self.output().open("w") as out_stream:
        for _ in range(total_lines):
            # keep the call order: randint, name, randint
            left = random.randint(0, 999)
            name = get_random_name()
            right = random.randint(0, 999)
            out_stream.write("{} {} {}\n".format(left, name, right))
    log_metric("lines", total_lines)
def __init__(
    self,
    context,  # type: DatabandContext
    job_name,
    run_uid=None,  # type: Optional[UUID]
    scheduled_run_info=None,  # type: Optional[ScheduledRunInfo]
    existing_run=None,
    source=UpdateSource.dbnd,  # type:Optional[UpdateSource]
    af_context=None,
    is_orchestration=False,
):
    """
    Build the in-memory state of a run: identity (uid/name), tracking
    objects, and the run/log folder locations.

    :param context: active DatabandContext providing settings, env and tracking store
    :param job_name: logical job name; also used to build the ad-hoc dag_id
    :param run_uid: reuse an existing run uid (marks the run as existing)
    :param scheduled_run_info: scheduler metadata; falls back to env vars
    :param existing_run: explicit "this run already exists" flag
    :param source: who created the run (dbnd / airflow tracking / ...)
    :param af_context: Airflow task context when tracking an airflow run
    :param is_orchestration: True for orchestration runs, False for tracking
    """
    self.context = context
    s = self.context.settings  # type: DatabandSettings

    self.job_name = job_name

    self.description = s.run.description
    self.is_archived = s.run.is_archived
    self.source = source
    self.is_orchestration = is_orchestration

    self.existing_run = existing_run or False
    # this was added to allow the scheduler to create the run which will be
    # continued by the actual run command instead of having 2 separate runs
    if not run_uid and DBND_RUN_UID in os.environ:
        # we pop so if this run spawns subprocesses with their own runs they will
        # be associated using the sub-runs mechanism instead of being fused into
        # this run directly
        run_uid = os.environ.pop(DBND_RUN_UID)
    if run_uid:
        self.run_uid = run_uid
        self.existing_run = True
    else:
        self.run_uid = get_uuid()

    # if user provided name - use it;
    # otherwise generate one: airflow runs get the "await sync" marker name,
    # everything else gets a human friendly random name (see _generate_run_name)
    self.name = s.run.name or self._generate_run_name(af_context)
    self.execution_date = unique_execution_date()

    self.is_tracked = True

    # tracking/orchestration main task
    self.root_task = None  # type: Optional[Task]

    # task run that wraps execution (tracking or orchestration)
    self._driver_task_run = None

    # ORCHESTRATION: execution of the run
    self.run_executor = None  # type: Optional[RunExecutor]

    # dag_id , execution_date are used by Airflow,
    # should be deprecated (still used by DB tracking)
    self.dag_id = AD_HOC_DAG_PREFIX + self.job_name

    # RUN STATE
    self._run_state = None
    self.task_runs = []  # type: List[TaskRun]
    self.task_runs_by_id = {}
    self.task_runs_by_af_id = {}

    self.target_origin = TargetIdentitySourceMap()
    self.describe = RunBanner(self)
    self.tracker = RunTracker(self, tracking_store=self.context.tracking_store)

    # ALL RUN CONTEXT SPECIFIC thing
    self.root_run_info = RootRunInfo.from_env(current_run=self)
    self.scheduled_run_info = scheduled_run_info or ScheduledRunInfo.from_env(
        self.run_uid
    )

    self.env = self.context.env

    # per-run folder under the env log root:
    # log/<date>/<timestamp>_<job>_<run name>
    self.run_folder_prefix = os.path.join(
        "log",
        self.execution_date.strftime("%Y-%m-%d"),
        "%s_%s_%s"
        % (
            self.execution_date.strftime("%Y-%m-%dT%H%M%S.%f"),
            self.job_name,
            self.name,
        ),
    )
    self.run_root = self.env.dbnd_root.folder(self.run_folder_prefix)
    self.run_local_root = self.env.dbnd_local_root.folder(self.run_folder_prefix)

    # local engine never submits itself
    self.local_engine = build_engine_config(self.env.local_engine).clone(
        require_submit=False
    )

    # counters of dynamically created airflow tasks, per task name
    self.dynamic_af_tasks_count = dict()
    self.af_context = af_context

    self.start_time = None
    self.finished_time = None
def __init__(
    self,
    context,
    task_or_task_name,
    run_uid=None,
    scheduled_run_info=None,
    send_heartbeat=True,
    existing_run=None,
    job_name=None,
):
    # type:(DatabandContext, Union[Task, str] , Optional[UUID], Optional[ScheduledRunInfo], Optional[bool]) -> None
    """
    Build the in-memory state of a run around a root task (or its name):
    identity (uid/name/run_id), tracking objects, engines and executor type.

    :param context: active DatabandContext providing settings, env and tracking store
    :param task_or_task_name: the root Task instance, or its task name as a string
    :param run_uid: reuse an existing run uid (marks the run as existing)
    :param scheduled_run_info: scheduler metadata; falls back to env vars
    :param send_heartbeat: whether this run should emit heartbeats
    :param existing_run: explicit override for the "existing run" flag
    :param job_name: logical job name; defaults to the root task name
    :raises TypeError: if task_or_task_name is neither a Task nor a string
    """
    self.context = context
    s = self.context.settings  # type: DatabandSettings

    if isinstance(task_or_task_name, six.string_types):
        self.root_task_name = task_or_task_name
        self.root_task = None
    elif isinstance(task_or_task_name, Task):
        self.root_task_name = task_or_task_name.task_name
        self.root_task = task_or_task_name
    else:
        # was a bare `raise` with no active exception, which surfaced as a
        # confusing "RuntimeError: No active exception to re-raise"
        raise TypeError(
            "task_or_task_name must be a Task or a task name string, got %r"
            % (task_or_task_name,)
        )

    self.job_name = job_name or self.root_task_name

    # user provided name or a human friendly random one
    self.name = s.run.name or get_random_name()
    self.description = s.run.description
    self.is_archived = s.run.is_archived

    # this was added to allow the scheduler to create the run which will be
    # continued by the actual run command instead of having 2 separate runs
    if not run_uid and DBND_RUN_UID in os.environ:
        # we pop so if this run spawns subprocesses with their own runs they will
        # be associated using the sub-runs mechanism instead of being fused into
        # this run directly
        run_uid = os.environ.pop(DBND_RUN_UID)
    if run_uid:
        self.run_uid = run_uid
        self.existing_run = True
    else:
        self.run_uid = get_uuid()
        self.existing_run = False

    if existing_run is not None:
        self.existing_run = existing_run

    # this is so the scheduler can create a run with partial information and then
    # have the subprocess running the actual cmd fill in the details
    self.resubmit_run = (
        DBND_RESUBMIT_RUN in os.environ
        and os.environ.pop(DBND_RESUBMIT_RUN) == "true"
    )

    # AIRFLOW, move into executor
    # dag_id , execution_date and run_id is used by airflow
    self.dag_id = self.root_task_name
    self.execution_date = unique_execution_date()
    run_id = s.run.id
    if not run_id:
        # we need this name, otherwise Airflow will try to manage our local jobs at scheduler
        # ..zombies cleanup and so on
        run_id = "backfill_{0}_{1}".format(self.name, self.execution_date.isoformat())
    self.run_id = run_id

    self._template_vars = self._build_template_vars()

    self.is_tracked = True

    self.runtime_errors = []

    # RUN STATE
    self._run_state = None
    self.task_runs = []  # type: List[TaskRun]
    self.task_runs_by_id = {}
    self.task_runs_by_af_id = {}

    self.target_origin = TargetIdentitySourceMap()
    self.describe = DescribeRun(self)
    self.tracker = RunTracker(self, tracking_store=self.context.tracking_store)

    # ALL RUN CONTEXT SPECIFIC thing
    self.root_run_info = RootRunInfo.from_env(current_run=self)
    self.scheduled_run_info = scheduled_run_info or ScheduledRunInfo.from_env(
        self.run_uid
    )

    # now we can add driver task
    self.driver_task_run = None  # type: Optional[TaskRun]
    self.root_task_run = None  # type: Optional[TaskRun]

    # per-run folder under the env log root:
    # log/<date>/<timestamp>_<root task>_<run name>
    self.run_folder_prefix = os.path.join(
        "log",
        self.execution_date.strftime("%Y-%m-%d"),
        "%s_%s_%s"
        % (
            self.execution_date.strftime("%Y-%m-%dT%H%M%S.%f"),
            self.root_task_name,
            self.name,
        ),
    )

    self.run_config = self.context.settings.run  # type: RunConfig
    self.env = env = self.context.env
    self.local_engine = self._get_engine_config(env.local_engine)
    self.remote_engine = self._get_engine_config(env.remote_engine or env.local_engine)

    # run config overrides env defaults when explicitly set
    self.submit_driver = (
        self.run_config.submit_driver
        if self.run_config.submit_driver is not None
        else env.submit_driver
    )
    self.submit_tasks = (
        self.run_config.submit_tasks
        if self.run_config.submit_tasks is not None
        else env.submit_tasks
    )
    self.task_executor_type, self.parallel = calculate_task_executor_type(
        self.submit_tasks, self.remote_engine, self.context.settings
    )

    self.sends_heartbeat = send_heartbeat