def streaming_ping(self, sequence_length, echo):
    check.int_param(sequence_length, "sequence_length")
    check.str_param(echo, "echo")
    for res in self._streaming_query(
        "StreamingPing",
        api_pb2.StreamingPingRequest,
        sequence_length=sequence_length,
        echo=echo,
    ):
        yield {
            "sequence_number": res.sequence_number,
            "echo": res.echo,
        }
def ephemeral_grpc_api_client(
    loadable_target_origin=None, force_port=False, max_retries=10, max_workers=1
):
    check.opt_inst_param(loadable_target_origin, "loadable_target_origin", LoadableTargetOrigin)
    check.bool_param(force_port, "force_port")
    check.int_param(max_retries, "max_retries")

    with GrpcServerProcess(
        loadable_target_origin=loadable_target_origin,
        force_port=force_port,
        max_retries=max_retries,
        max_workers=max_workers,
    ).create_ephemeral_client() as client:
        yield client
def __init__(self, num_allowed_rows, error_tolerance=0):
    self.num_allowed_rows = check.int_param(num_allowed_rows, "num_allowed_rows")
    self.error_tolerance = abs(check.int_param(error_tolerance, "error_tolerance"))
    if self.error_tolerance > self.num_allowed_rows:
        raise ValueError("Tolerance can't be greater than the number of rows you expect.")
    description = "Dataframe must have {} +- {} rows.".format(
        self.num_allowed_rows, self.error_tolerance
    )
    super(RowCountConstraint, self).__init__(
        error_description=description, markdown_description=description
    )
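
# Usage sketch (hypothetical values): a constraint accepting dataframes with
# 100 +- 5 rows. Assumes the dagster-pandas Constraint base class is in scope;
# the constraint itself would be applied by the dataframe validation machinery.
def _example_row_count_constraint():
    return RowCountConstraint(num_allowed_rows=100, error_tolerance=5)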
def from_level(cls, level):
    check.int_param(level, "level")
    if level == logging.CRITICAL:
        return DauphinLogLevel.CRITICAL
    elif level == logging.ERROR:
        return DauphinLogLevel.ERROR
    elif level == logging.INFO:
        return DauphinLogLevel.INFO
    elif level == logging.WARNING:
        return DauphinLogLevel.WARNING
    elif level == logging.DEBUG:
        return DauphinLogLevel.DEBUG
    else:
        check.failed("Invalid log level: {level}".format(level=level))
def wait_for_log(self, log, log_bucket, log_key, waiter_delay=30, waiter_max_attempts=20):
    """Wait for gzipped EMR logs to appear on S3.

    Note that EMR syncs logs to S3 every 5 minutes, so this may take a long time.

    Args:
        log (Logger): The logger to use for status messages.
        log_bucket (str): S3 bucket where the log is expected to appear.
        log_key (str): S3 key for the log file.
        waiter_delay (int): Seconds to wait between attempts to check S3 for the log file.
        waiter_max_attempts (int): Number of attempts before giving up on waiting.

    Raises:
        EmrError: Raised if we waited the full duration and the logs did not appear.

    Returns:
        str: Contents of the log file.
    """
    check.str_param(log_bucket, "log_bucket")
    check.str_param(log_key, "log_key")
    check.int_param(waiter_delay, "waiter_delay")
    check.int_param(waiter_max_attempts, "waiter_max_attempts")

    log.info(
        "Attempting to get log: s3://{log_bucket}/{log_key}".format(
            log_bucket=log_bucket, log_key=log_key
        )
    )

    s3 = _wrap_aws_client(boto3.client("s3"), min_backoff=self.check_cluster_every)
    waiter = s3.get_waiter("object_exists")
    try:
        waiter.wait(
            Bucket=log_bucket,
            Key=log_key,
            WaiterConfig={"Delay": waiter_delay, "MaxAttempts": waiter_max_attempts},
        )
    except WaiterError as err:
        six.raise_from(EmrError("EMR log file did not appear on S3 after waiting"), err)

    obj = BytesIO(s3.get_object(Bucket=log_bucket, Key=log_key)["Body"].read())
    gzip_file = gzip.GzipFile(fileobj=obj)
    return gzip_file.read().decode("utf-8")
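
# Usage sketch: fetch a gzipped EMR step log from S3. The bucket, key, and the
# `runner`/`log` objects are hypothetical stand-ins; real use requires AWS
# credentials and an EMR cluster that has synced its logs.
def _example_wait_for_log(runner, log):
    return runner.wait_for_log(
        log,
        log_bucket="my-emr-logs",
        log_key="logs/j-CLUSTER/steps/s-STEP/stdout.gz",
    )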
def __init__(
    self,
    shutdown_server_event,
    loadable_target_origin=None,
    heartbeat=False,
    heartbeat_timeout=30,
):
    super(DagsterApiServer, self).__init__()

    check.bool_param(heartbeat, "heartbeat")
    check.int_param(heartbeat_timeout, "heartbeat_timeout")
    check.invariant(heartbeat_timeout > 0, "heartbeat_timeout must be greater than 0")

    self._shutdown_server_event = check.inst_param(
        shutdown_server_event, "shutdown_server_event", seven.ThreadingEventType
    )
    self._loadable_target_origin = check.opt_inst_param(
        loadable_target_origin, "loadable_target_origin", LoadableTargetOrigin
    )

    # Dict[str, (multiprocessing.Process, DagsterInstance)]
    self._executions = {}
    # Dict[str, multiprocessing.Event]
    self._termination_events = {}
    self._termination_times = {}
    self._execution_lock = threading.Lock()

    self._repository_symbols_and_code_pointers = LazyRepositorySymbolsAndCodePointers(
        loadable_target_origin
    )

    self.__last_heartbeat_time = time.time()
    if heartbeat:
        self.__heartbeat_thread = threading.Thread(
            target=self._heartbeat_thread,
            args=(heartbeat_timeout,),
        )
        self.__heartbeat_thread.daemon = True
        self.__heartbeat_thread.start()
    else:
        self.__heartbeat_thread = None

    self.__cleanup_thread = threading.Thread(target=self._cleanup_thread, args=())
    self.__cleanup_thread.daemon = True
    self.__cleanup_thread.start()
def step_context_to_step_run_ref(
    step_context: SystemStepExecutionContext,
    prior_attempts_count: int,
    package_dir: Optional[str] = None,
) -> StepRunRef:
    """
    Args:
        step_context (SystemStepExecutionContext): The step context.
        prior_attempts_count (int): The number of times this step has been tried before in the
            same pipeline run.
        package_dir (Optional[str]): If set, the reconstruction file code pointer will be
            converted to a module code pointer relative to the package root. This enables
            executing steps in remote setups where the package containing the pipeline resides
            at a different location on the filesystem in the remote environment than in the
            environment executing the plan process.

    Returns (StepRunRef):
        A reference to the step.
    """
    check.inst_param(step_context, "step_context", SystemStepExecutionContext)
    check.int_param(prior_attempts_count, "prior_attempts_count")

    retry_mode = step_context.retry_mode

    recon_pipeline = step_context.pipeline
    if package_dir:
        if isinstance(recon_pipeline, ReconstructablePipeline) and isinstance(
            recon_pipeline.repository.pointer, FileCodePointer
        ):
            recon_pipeline = ReconstructablePipeline(
                repository=ReconstructableRepository(
                    pointer=ModuleCodePointer(
                        _module_in_package_dir(
                            recon_pipeline.repository.pointer.python_file, package_dir
                        ),
                        recon_pipeline.repository.pointer.fn_name,
                    ),
                ),
                pipeline_name=recon_pipeline.pipeline_name,
                solids_to_execute=recon_pipeline.solids_to_execute,
            )

    return StepRunRef(
        run_config=step_context.run_config,
        pipeline_run=step_context.pipeline_run,
        run_id=step_context.pipeline_run.run_id,
        step_key=step_context.step.key,
        retry_mode=retry_mode,
        recon_pipeline=recon_pipeline,
        prior_attempts_count=prior_attempts_count,
    )
def backoff(
    fn,
    retry_on,
    args=None,
    kwargs=None,
    max_retries=BACKOFF_MAX_RETRIES,
    delay_generator=None,
):
    """Straightforward backoff implementation.

    Note that this doesn't implement any jitter on the delays, so it probably won't be
    appropriate for highly parallel situations.

    Args:
        fn (Callable): The function to wrap in a backoff/retry loop.
        retry_on (Tuple[Exception, ...]): The exception classes on which to retry. Note that we
            don't (yet) have any support for matching the exception messages.
        args (Optional[List[Any]]): Positional args to pass to the callable.
        kwargs (Optional[Dict[str, Any]]): Keyword args to pass to the callable.
        max_retries (Optional[Int]): The maximum number of times to retry a failed fn call.
            Set to 0 for no retries. Default: 4.
        delay_generator (Generator[float, None, None]): Generates the successive delays between
            retry attempts. Defaults to a fresh backoff_delay_generator().
    """
    check.callable_param(fn, "fn")
    retry_on = check.tuple_param(retry_on, "retry_on")
    args = check.opt_list_param(args, "args")
    kwargs = check.opt_dict_param(kwargs, "kwargs", key_type=str)
    check.int_param(max_retries, "max_retries")
    if delay_generator is None:
        # Create a fresh generator per call: a generator default argument would be shared
        # (and progressively consumed) across calls.
        delay_generator = backoff_delay_generator()
    check.generator_param(delay_generator, "delay_generator")

    retries = 0
    to_raise = None

    try:
        return fn(*args, **kwargs)
    except retry_on as exc:
        to_raise = exc

    while retries < max_retries:
        time.sleep(six.next(delay_generator))
        try:
            return fn(*args, **kwargs)
        except retry_on as exc:
            retries += 1
            to_raise = exc
            continue
    raise to_raise
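
# Usage sketch: retry a transiently failing callable on ConnectionError, up to
# three times. `fetch_remote_state` is a hypothetical stand-in for any callable
# that may fail transiently.
def _example_backoff(fetch_remote_state):
    return backoff(
        fetch_remote_state,
        retry_on=(ConnectionError,),
        max_retries=3,
    )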
def open_server_process(
    port,
    socket,
    loadable_target_origin=None,
    max_workers=1,
    heartbeat=False,
    heartbeat_timeout=30,
    lazy_load_user_code=False,
):
    check.invariant((port or socket) and not (port and socket), "Set only port or socket")
    check.opt_inst_param(loadable_target_origin, "loadable_target_origin", LoadableTargetOrigin)
    check.int_param(max_workers, "max_workers")

    subprocess_args = (
        [
            loadable_target_origin.executable_path
            if loadable_target_origin and loadable_target_origin.executable_path
            else sys.executable,
            "-m",
            "dagster.grpc",
        ]
        + (["--port", str(port)] if port else [])
        + (["--socket", socket] if socket else [])
        + ["-n", str(max_workers)]
        + (["--heartbeat"] if heartbeat else [])
        + (["--heartbeat-timeout", str(heartbeat_timeout)] if heartbeat_timeout else [])
        + (["--lazy-load-user-code"] if lazy_load_user_code else [])
    )

    if loadable_target_origin:
        subprocess_args += (
            (["-f", loadable_target_origin.python_file]
             if loadable_target_origin.python_file else [])
            + (["-m", loadable_target_origin.module_name]
               if loadable_target_origin.module_name else [])
            + (["-d", loadable_target_origin.working_directory]
               if loadable_target_origin.working_directory else [])
            + (["-a", loadable_target_origin.attribute]
               if loadable_target_origin.attribute else [])
        )

    server_process = open_ipc_subprocess(subprocess_args, stdout=subprocess.PIPE)
    ready = wait_for_grpc_server(server_process)
    if ready:
        return server_process
    else:
        if server_process.poll() is None:
            server_process.terminate()
        return None
def __init__(
    self,
    # How long each process should run before a new process should be created the next
    # time a given origin is requested (which will pick up any changes that have been
    # made to the code)
    reload_interval,
    # How long the process can live without a heartbeat before it dies. You should ensure
    # that either heartbeat_ttl is greater than reload_interval (so that the process will
    # reload before it ends due to heartbeat failure), or, if reload_interval is 0, that
    # any processes returned by this registry have at least one
    # GrpcServerRepositoryLocation hitting the server with a heartbeat while you want the
    # process to stay running.
    heartbeat_ttl,
    # How long to wait for the server to start up and receive connections before timing out
    startup_timeout,
):
    # ProcessRegistryEntry map of servers being currently returned, keyed by origin ID
    self._active_entries = {}

    self._waited_for_processes = False

    check.invariant(
        heartbeat_ttl > reload_interval,
        "Heartbeat TTL must be larger than reload interval, or processes could die due to "
        "TTL failure before they are reloaded",
    )

    self._reload_interval = check.int_param(reload_interval, "reload_interval")
    self._heartbeat_ttl = check.int_param(heartbeat_ttl, "heartbeat_ttl")
    self._startup_timeout = check.int_param(startup_timeout, "startup_timeout")

    self._lock = threading.Lock()

    self._all_processes = []

    self._cleanup_thread_shutdown_event = None
    self._cleanup_thread = None
    if self._reload_interval > 0:
        self._cleanup_thread_shutdown_event = threading.Event()
        self._cleanup_thread = threading.Thread(
            target=self._clear_old_processes,
            name="grpc-server-registry-cleanup",
            args=(self._cleanup_thread_shutdown_event, self._reload_interval),
        )
        self._cleanup_thread.daemon = True
        self._cleanup_thread.start()
def generate_pipeline(name, size, connect_factor=1.0):
    check.int_param(size, "size")
    check.invariant(size > 3, "Cannot create pipelines with fewer than 4 nodes")
    check.float_param(connect_factor, "connect_factor")

    random.seed(name)

    # generate nodes
    solids = {}
    for i in range(size):
        num_inputs = random.randint(1, 3)
        num_outputs = random.randint(1, 3)
        num_cfg = random.randint(0, 5)
        solid_id = "{}_solid_{}".format(name, i)
        solids[solid_id] = generate_solid(
            solid_id=solid_id,
            num_inputs=num_inputs,
            num_outputs=num_outputs,
            num_cfg=num_cfg,
        )

    solid_ids = list(solids.keys())

    # connections
    deps = defaultdict(dict)
    for _ in range(int(size * connect_factor)):
        # choose output
        out_idx = random.randint(0, len(solid_ids) - 2)
        out_solid_id = solid_ids[out_idx]
        output_solid = solids[out_solid_id]
        output_name = output_solid.output_defs[
            random.randint(0, len(output_solid.output_defs) - 1)
        ].name

        # choose input
        in_idx = random.randint(out_idx + 1, len(solid_ids) - 1)
        in_solid_id = solid_ids[in_idx]
        input_solid = solids[in_solid_id]
        input_name = input_solid.input_defs[
            random.randint(0, len(input_solid.input_defs) - 1)
        ].name

        # map
        deps[in_solid_id][input_name] = DependencyDefinition(out_solid_id, output_name)

    return PipelineDefinition(name=name, solid_defs=list(solids.values()), dependencies=deps)
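
# Usage sketch: a seed-stable synthetic pipeline of 10 solids with roughly one
# generated cross-solid dependency per solid (connect_factor=1.0). Seeding on
# the name makes repeated calls with the same arguments deterministic.
def _example_generate_pipeline():
    return generate_pipeline("perf_test", size=10, connect_factor=1.0)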
@contextmanager
def get_temp_file_names(number):
    check.int_param(number, "number")

    temp_file_names = list()
    for _ in itertools.repeat(None, number):
        handle, temp_file_name = tempfile.mkstemp()
        os.close(handle)  # just need the name - avoid leaking the file descriptor
        temp_file_names.append(temp_file_name)
    try:
        yield tuple(temp_file_names)
    finally:
        for temp_file_name in temp_file_names:
            _unlink_swallow_errors(temp_file_name)
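
# Usage sketch: two scratch paths, both unlinked when the with-block exits.
def _example_get_temp_file_names():
    with get_temp_file_names(2) as (path_a, path_b):
        with open(path_a, "w") as f:
            f.write("scratch data")
        # path_b is available for a second scratch file; both are cleaned up here.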
def __new__(
    cls,
    ticks_started,
    ticks_succeeded,
    ticks_skipped,
    ticks_failed,
):
    return super(JobTickStatsSnapshot, cls).__new__(
        cls,
        ticks_started=check.int_param(ticks_started, "ticks_started"),
        ticks_succeeded=check.int_param(ticks_succeeded, "ticks_succeeded"),
        ticks_skipped=check.int_param(ticks_skipped, "ticks_skipped"),
        ticks_failed=check.int_param(ticks_failed, "ticks_failed"),
    )
def construct_secretsmanager_client(
    max_attempts: int, region_name: Optional[str] = None, profile_name: Optional[str] = None
):
    check.int_param(max_attempts, "max_attempts")
    check.opt_str_param(region_name, "region_name")
    check.opt_str_param(profile_name, "profile_name")

    client_session = boto3.session.Session(profile_name=profile_name)
    secrets_manager = client_session.client(
        "secretsmanager",
        region_name=region_name,
        config=construct_boto_client_retry_config(max_attempts),
    )

    return secrets_manager
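
# Usage sketch (hypothetical region): a Secrets Manager client whose boto3
# retry config allows up to five attempts per API call. Doing anything useful
# with it requires valid AWS credentials.
def _example_secretsmanager_client():
    return construct_secretsmanager_client(max_attempts=5, region_name="us-east-1")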
def ephemeral_grpc_api_client(
    loadable_target_origin=None, force_port=False, max_retries=10, max_workers=None
):
    check.opt_inst_param(loadable_target_origin, "loadable_target_origin", LoadableTargetOrigin)
    check.bool_param(force_port, "force_port")
    check.int_param(max_retries, "max_retries")

    with GrpcServerProcess(
        loadable_target_origin=loadable_target_origin,
        force_port=force_port,
        max_retries=max_retries,
        max_workers=max_workers,
        lazy_load_user_code=True,
    ).create_ephemeral_client() as client:
        yield client
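
# Usage sketch: pairing the ephemeral client with the streaming_ping method
# defined earlier. Assumes this generator is wrapped with
# contextlib.contextmanager at its definition site and that the ephemeral
# client exposes streaming_ping.
def _example_ephemeral_streaming_ping():
    with ephemeral_grpc_api_client() as client:
        for res in client.streaming_ping(sequence_length=3, echo="hello"):
            print(res["sequence_number"], res["echo"])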
def open_server_process(
    port,
    socket,
    loadable_target_origin=None,
    max_workers=1,
    heartbeat=False,
    heartbeat_timeout=30,
    lazy_load_user_code=False,
):
    check.invariant((port or socket) and not (port and socket), "Set only port or socket")
    check.opt_inst_param(loadable_target_origin, "loadable_target_origin", LoadableTargetOrigin)
    check.int_param(max_workers, "max_workers")

    with seven.TemporaryDirectory() as temp_dir:
        output_file = os.path.join(
            temp_dir, "grpc-server-startup-{uuid}".format(uuid=uuid.uuid4().hex)
        )

        subprocess_args = (
            [
                loadable_target_origin.executable_path
                if loadable_target_origin and loadable_target_origin.executable_path
                else sys.executable,
                "-m",
                "dagster.grpc",
            ]
            + (["--port", str(port)] if port else [])
            + (["--socket", socket] if socket else [])
            + ["-n", str(max_workers)]
            + (["--heartbeat"] if heartbeat else [])
            + (["--heartbeat-timeout", str(heartbeat_timeout)] if heartbeat_timeout else [])
            + (["--lazy-load-user-code"] if lazy_load_user_code else [])
            + (["--ipc-output-file", output_file])
        )

        if loadable_target_origin:
            subprocess_args += loadable_target_origin.get_cli_args()

        server_process = open_ipc_subprocess(subprocess_args)

        try:
            wait_for_grpc_server(output_file)
        except:
            if server_process.poll() is None:
                server_process.terminate()
            raise

        return server_process
def __init__(
    self,
    retries: RetryMode,
    max_concurrent: int,
    start_method: Optional[str] = None,
    explicit_forkserver_preload: Optional[List[str]] = None,
):
    self._retries = check.inst_param(retries, "retries", RetryMode)
    max_concurrent = max_concurrent if max_concurrent else multiprocessing.cpu_count()
    self._max_concurrent = check.int_param(max_concurrent, "max_concurrent")
    start_method = check.opt_str_param(start_method, "start_method")
    valid_starts = multiprocessing.get_all_start_methods()

    if start_method is None:
        start_method = "spawn"

    if start_method not in valid_starts:
        raise DagsterUnmetExecutorRequirementsError(
            f"The selected start_method '{start_method}' is not available. "
            f"Only {valid_starts} are valid options on {sys.platform} python {sys.version}.",
        )
    self._start_method = start_method
    self._explicit_forkserver_preload = explicit_forkserver_preload
def __new__(cls, max_retries=1, delay=None):
    experimental_class_warning("RetryPolicy")
    return super().__new__(
        cls,
        max_retries=check.int_param(max_retries, "max_retries"),
        delay=check.opt_numeric_param(delay, "delay"),
    )
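
# Usage sketch: a policy permitting three retries with a half-second delay
# between attempts, per the constructor signature above.
def _example_retry_policy():
    return RetryPolicy(max_retries=3, delay=0.5)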
def __new__(
    cls,
    run_config: Dict[str, object],
    pipeline_run: PipelineRun,
    run_id: str,
    retry_mode: RetryMode,
    step_key: str,
    recon_pipeline: ReconstructablePipeline,
    prior_attempts_count: int,
    known_state: Optional["KnownExecutionState"],
    run_group: Optional[Sequence[PipelineRun]],
    upstream_output_events: Optional[Sequence["EventLogEntry"]],
):
    from dagster.core.execution.plan.state import KnownExecutionState
    from dagster.core.storage.event_log import EventLogEntry

    return super(StepRunRef, cls).__new__(
        cls,
        check.dict_param(run_config, "run_config", key_type=str),
        check.inst_param(pipeline_run, "pipeline_run", PipelineRun),
        check.str_param(run_id, "run_id"),
        check.inst_param(retry_mode, "retry_mode", RetryMode),
        check.str_param(step_key, "step_key"),
        check.inst_param(recon_pipeline, "recon_pipeline", ReconstructablePipeline),
        check.int_param(prior_attempts_count, "prior_attempts_count"),
        check.opt_inst_param(known_state, "known_state", KnownExecutionState),
        check.opt_list_param(run_group, "run_group", of_type=PipelineRun),
        check.opt_list_param(
            upstream_output_events, "upstream_output_events", of_type=EventLogEntry
        ),
    )
def __new__(cls, record_id, update_timestamp):
    return super(RunStatusSensorCursor, cls).__new__(
        cls,
        record_id=check.int_param(record_id, "record_id"),
        update_timestamp=check.str_param(update_timestamp, "update_timestamp"),
    )
def __new__(cls, tick_id, schedule_tick_data):
    return super(ScheduleTick, cls).__new__(
        cls,
        check.int_param(tick_id, "tick_id"),
        check.inst_param(schedule_tick_data, "schedule_tick_data", ScheduleTickData),
    )
def update_event_log_record(self, record_id, event):
    """Utility method for migration scripts to update the SQL representation of event records."""
    check.int_param(record_id, "record_id")
    check.inst_param(event, "event", EventRecord)
    dagster_event_type = None
    if event.is_dagster_event:
        dagster_event_type = event.dagster_event.event_type_value

    with self.connect(run_id=event.run_id) as conn:
        conn.execute(
            SqlEventLogStorageTable.update()  # pylint: disable=no-value-for-parameter
            .where(SqlEventLogStorageTable.c.id == record_id)
            .values(
                event=serialize_dagster_namedtuple(event),
                dagster_event_type=dagster_event_type,
                timestamp=utc_datetime_from_timestamp(event.timestamp),
                step_key=event.step_key,
            )
        )
def get_run_records(
    self,
    filters: Optional[PipelineRunsFilter] = None,
    limit: Optional[int] = None,
    order_by: Optional[str] = None,
    ascending: bool = False,
) -> List[RunRecord]:
    filters = check.opt_inst_param(
        filters, "filters", PipelineRunsFilter, default=PipelineRunsFilter()
    )
    limit = check.opt_int_param(limit, "limit")

    # only fetch columns we use to build RunRecord
    query = self._runs_query(
        filters=filters,
        limit=limit,
        columns=["id", "run_body", "create_timestamp", "update_timestamp"],
        order_by=order_by,
        ascending=ascending,
    )
    rows = self.fetchall(query)

    return [
        RunRecord(
            storage_id=check.int_param(row["id"], "id"),
            pipeline_run=deserialize_as(
                check.str_param(row["run_body"], "run_body"), PipelineRun
            ),
            create_timestamp=check.inst(row["create_timestamp"], datetime),
            update_timestamp=check.inst(row["update_timestamp"], datetime),
        )
        for row in rows
    ]
def __init__(self, instance, max_concurrent_runs):
    check.inst_param(instance, "instance", DagsterInstance)
    self._delegate = SubprocessExecutionManager(instance)
    self._max_concurrent_runs = check.int_param(max_concurrent_runs, "max_concurrent_runs")
    self._multiprocessing_context = get_multiprocessing_context()
    self._queue = self._multiprocessing_context.JoinableQueue(maxsize=0)
    gevent.spawn(self._clock)
def __init__(self, instance, interval_seconds):
    self._instance = check.inst_param(instance, "instance", DagsterInstance)
    self._logger = get_default_daemon_logger(type(self).__name__)
    self.interval_seconds = check.int_param(interval_seconds, "interval_seconds")
    self.last_iteration_time = None
def __init__(
    self,
    region,
    check_cluster_every=30,
    aws_access_key_id=None,
    aws_secret_access_key=None,
):
    """This object encapsulates various utilities for interacting with EMR clusters and invoking
    steps (jobs) on them.

    See also :py:class:`~dagster_aws.emr.EmrPySparkResource`, which wraps this job runner in a
    resource for pyspark workloads.

    Args:
        region (str): AWS region to use.
        check_cluster_every (int, optional): How frequently to poll boto3 APIs for updates, in
            seconds. Defaults to 30.
        aws_access_key_id (str, optional): AWS access key ID. Defaults to None, which will use
            the default boto3 credentials chain.
        aws_secret_access_key (str, optional): AWS secret access key. Defaults to None, which
            will use the default boto3 credentials chain.
    """
    self.region = check.str_param(region, "region")

    # This is in seconds
    self.check_cluster_every = check.int_param(check_cluster_every, "check_cluster_every")
    self.aws_access_key_id = check.opt_str_param(aws_access_key_id, "aws_access_key_id")
    self.aws_secret_access_key = check.opt_str_param(
        aws_secret_access_key, "aws_secret_access_key"
    )
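
# Usage sketch: instantiating the runner with the default boto3 credential
# chain, polling cluster state every 60 seconds. Assumes the enclosing class
# is the dagster_aws EmrJobRunner referenced in the docstring above.
def _example_emr_runner():
    return EmrJobRunner(region="us-west-2", check_cluster_every=60)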
def __init__(self, max_retries: int = 1, seconds_to_wait: Optional[Union[float, int]] = None):
    super(RetryRequested, self).__init__()
    self.max_retries = check.int_param(max_retries, "max_retries")
    self.seconds_to_wait = check.opt_numeric_param(seconds_to_wait, "seconds_to_wait")
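
# Usage sketch: converting a transient failure into a retry request, chaining
# the original exception. `flaky_call` is a hypothetical stand-in.
def _example_retry_requested(flaky_call):
    try:
        return flaky_call()
    except ConnectionError as e:
        raise RetryRequested(max_retries=3, seconds_to_wait=2) from e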
def __new__(cls, solid_name, input_name, fan_in_index):
    return super(FanInInputPointer, cls).__new__(
        cls,
        check.str_param(solid_name, "solid_name"),
        check.str_param(input_name, "input_name"),
        check.int_param(fan_in_index, "fan_in_index"),
    )