def get_data_from_sql(partition: table_batch, index: str,
                      secrets: Dict[str, str]) -> Union[table_data, None]:
    stmt = f"""SELECT * FROM "{secrets['database_schema']}"."{partition.table_name}" ORDER BY "{index}" ASC OFFSET %(offset)s LIMIT %(limit)s"""
    get_logger().debug(stmt)
    con = get_rds_engine(secrets=secrets)
    df = pd.read_sql(sql=stmt,
                     con=con,
                     params=dict(offset=partition.offset,
                                 limit=partition.limit))
    if len(df) == 0:
        get_logger().warning(
            f'Unable to locate any rows in table_name: {partition.table_name} '
            f'OFFSET: {partition.offset} LIMIT: {partition.limit}')
        return None
    assert len(df.columns) == len(
        partition.data_types
    ), f'Mismatched columns in table_name: {partition.table_name}'

    # ensure we're using the appropriate data-types for parquet;
    # this is especially important for columns which allow NULL in Postgres, and NaN in pandas
    for c in df.columns:
        assert c in partition.data_types, f'Missing data_type for column: {c}'
        # Since pandas represents timestamps at nanosecond resolution,
        # the timespan that can be represented using a 64-bit integer is limited to approximately 584 years
        # ref: http://pandas-docs.github.io/pandas-docs-travis/user_guide/timeseries.html#timeseries-timestamp-limits
        if partition.data_types[c] == 'datetime64[ms]':
            # values outside the representable min/max are coerced to NaT
            df[c] = pd.to_datetime(arg=df[c],
                                   errors='coerce',
                                   unit='ms',
                                   origin='unix')
        else:
            df[c] = df[c].astype(partition.data_types[c])

    # create the placeholder columns used to partition the parquet file later on
    assert 'created_at' in df.columns, f'Missing column "created_at" in table_name: {partition.table_name}'
    df['year'] = df['created_at'].dt.year.astype(np.int16)
    df['month'] = df['created_at'].dt.month.astype(np.int8)
    df['day'] = df['created_at'].dt.day.astype(np.int8)

    ntf = NamedTemporaryFile(prefix=f"{partition.table_name}_",
                             suffix='.parquet',
                             delete=False)
    filename = Path(ntf.name)
    df.to_parquet(path=filename,
                  engine='pyarrow',
                  compression='snappy',
                  index=False)
    con.dispose()
    return table_data(partition.table_name, filename)
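
# A minimal usage sketch, not part of the pipeline itself; the 'id' ORDER BY
# column and the `secrets` / `some_table_batch` values are assumed for
# illustration only:
#
#   result = get_data_from_sql(partition=some_table_batch, index='id', secrets=secrets)
#   if result is not None:
#       print(result.filename)  # path to the snappy-compressed parquet file on local disk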
def flatten_nested_list(
        nested_list: List[List[Union[table_batch, table_data]]],
        max_concurrent_connections: int = 0
) -> List[Union[table_batch, table_data]]:
    results = [item for sublist in nested_list for item in sublist]
    get_logger().info(
        f"Discovered: {len(results)} total records from nested_list: {len(nested_list)}"
    )
    return results
def prepare_table_data_for_parquet_directory(
        grouped_table_data: List[table_data],
        first_index: int,
        num_of_records_in_batch: int,
        destination_directory: str = None) -> Union[List[table_data], None]:
    total_records = 0
    if len(grouped_table_data) == 0:
        get_logger().warning('Unable to locate any data to insert...')
        return None
    table_name = grouped_table_data[-1].table_name
    total_partitions = len(grouped_table_data)
    directory = Path(destination_directory or mkdtemp(suffix=f"_{table_name}"))
    # if this is an incremental load, we should download the _metadata files from s3 locally
    # so that the appends update them correctly
    # if first_index != 0:
    #     s3 = get_s3_connection(secrets=secrets)
    #     for filename in ['_metadata', '_common_metadata']:
    #         # _metadata must always exist locally
    #         path = Path(secrets['s3_bucket'] + s3.sep + table_name + s3.sep + filename)
    #         s3.get(rpath=path.as_posix(), lpath=(directory / filename).as_posix())
    for i, data in enumerate(grouped_table_data):
        if data is None:
            get_logger().debug(
                "Unable to locate a table_data object to attempt an insert")
            continue
        # the first record needs to re-create the parquet metadata when first_index == 0 and i == 0;
        # everything else should append to that parquet dataset one-at-a-time
        recreate_metadata = (first_index == 0) and (i == 0)
        size = pandas_to_local_parquet(
            directory=directory,
            data=data,
            num_of_records_in_batch=num_of_records_in_batch,
            append=not recreate_metadata,
            idx=f"{i+1:05}/{total_partitions:05}")
        if size == 0:
            get_logger().warning(
                f"Unable to upload contents of file: {data.filename}")
        total_records += size
    get_logger().info(
        f"Prepared: {total_records} from table_name: {table_name}")
    # return the list of all files under this directory
    results = [
        table_data(table_name, _) for _ in directory.glob('**/*')
        if _.is_file()
    ]
    # assert sorted(results)[0].filename.as_posix().endswith('_metadata'), 'Missing _metadata file in directory!'
    get_logger().info(
        f"Discovered {len(results)} parquet files to upload for {table_name}")
    return results
def purge_transient_folders(filename_list: List[List[table_data]]) -> bool:
    data = filename_list[-1]
    if data is None:
        get_logger().warning("Unable to locate any folders to purge")
        return False
    directory = get_parent_folder_name(data=data)
    get_logger().info(f"Purging directory: {directory}")
    rmtree(directory)
    return True
def pandas_to_local_parquet(directory: os.PathLike,
                            data: table_data,
                            num_of_records_in_batch: int,
                            append: bool = True,
                            idx: str = '') -> int:
    df = pd.read_parquet(path=data.filename)
    if len(df) == 0:
        get_logger().warning(
            f"Unable to locate any rows in file: {data.filename}")
        return 0
    partition_cols = ['year', 'month', 'day']
    for p in partition_cols:
        assert p in df.columns, f'Missing column "{p}", cannot continue with S3 upload'
    get_logger().info(
        f"[{idx}] Attempting to prepare {len(df)} records, {data.filename.stat().st_size/(1024*1024)} MB from file: {data.filename}"
    )
    # df.to_parquet(
    #     # path=f"s3://{path}",
    #     path=directory,
    #     engine='pyarrow',
    #     compression='snappy',
    #     partition_cols=partition_cols,
    #     index=False,
    #     allow_truncated_timestamps=True,
    #     # flavor='spark',
    #     # filesystem=s3
    # )

    # use Dask to write the _metadata and _common_metadata files;
    # write to local disk first, then use the aws cli to sync the files to s3
    # TODO: ref https://github.com/dask/dask/issues/6867
    dd.from_pandas(data=df, chunksize=num_of_records_in_batch).to_parquet(
        # path=f"s3://{path}",
        path=directory,
        append=append,
        engine='pyarrow',
        compression='snappy',
        partition_on=partition_cols,
        ignore_divisions=True,
        # storage_options=dict(
        #     anon=False,
        #     key=secrets['s3_access_key'],
        #     secret=secrets['s3_secret_key'],
        #     use_ssl=True,
        #     client_kwargs=dict(
        #         endpoint_url=secrets['s3_server'],
        #     )
        # ),
        write_index=False)
    return len(df)
def get_table_data_types(
    table: str,
    secrets: Dict[str, str],
) -> Dict[str, Any]:
    # ensure pandas uses data-types that match the SQL schema
    stmt = """SELECT column_name, is_nullable, data_type, udt_name
              FROM information_schema.columns
              WHERE table_catalog = %(database)s
                AND table_schema = %(schema)s
                AND table_name = %(table_name)s
              ORDER BY ordinal_position ASC"""
    get_logger().debug(stmt)
    con = get_rds_engine(secrets=secrets)
    dt = pd.read_sql(sql=stmt,
                     con=con,
                     params=dict(database=secrets['database_name'],
                                 schema=secrets['database_schema'],
                                 table_name=table))
    dt['is_nullable'] = dt['is_nullable'].apply(lambda _: _ == 'YES')
    get_logger().info(f"Discovered {len(dt)} columns for table_name: {table}")
    mapper = dict()
    # nullable integer columns must be converted to a type that can hold NaN (i.e. floats)
    for idx, row in dt.iterrows():
        if row.data_type == 'bigint':
            mapper[row.column_name] = np.float64 if row.is_nullable else np.int64
        elif row.data_type == 'integer':
            mapper[row.column_name] = np.float32 if row.is_nullable else np.int32
        elif row.data_type == 'smallint':
            mapper[row.column_name] = np.float32 if row.is_nullable else np.int16
        elif row.data_type == 'boolean':
            mapper[row.column_name] = 'bool'
        elif row.data_type in ['double precision', 'numeric']:
            mapper[row.column_name] = np.float64
        elif row.udt_name in ['timestamp', 'date']:
            mapper[row.column_name] = 'datetime64[ms]'
        elif row.udt_name in ['varchar', 'text', 'json', 'jsonb']:
            mapper[row.column_name] = str
        else:
            raise RuntimeError(f'Unknown data_type: {row.data_type}')
    con.dispose()
    return mapper
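
# For illustration only: given a hypothetical table with columns
# (id bigint NOT NULL, score numeric NULL, created_at timestamp), the mapper
# returned above would look roughly like:
#
#   {'id': np.int64, 'score': np.float64, 'created_at': 'datetime64[ms]'}
#
# i.e. nullable integer columns are widened to floats so NaN can be represented.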
def wait_on_visible(driver: RemoteWebDriver, xpath: str, timeout: int = 60):
    try:
        resolved = WebDriverWait(driver, timeout=timeout).until(
            EC.visibility_of_element_located((By.XPATH, xpath)))
        return resolved
    except (TimeoutException, ) as ex:
        get_logger().error(
            f'URL: {driver.current_url} unable to locate XPATH: {xpath} in timeout: {timeout}'
        )
        raise ex
    except (InvalidSelectorException, ) as ex:
        raise ex
    except (NoSuchElementException, ElementNotVisibleException,
            InvalidElementStateException) as ex:
        get_logger().error(
            f'URL: {driver.current_url} unable to locate XPATH: {xpath}')
        raise ex
def sync_with_s3(data: table_data, secrets: Dict[str, str]) -> Path:
    s3 = get_s3_connection(secrets=secrets)
    directory = get_parent_folder_name(data=data)
    destination = data.filename.as_posix().split(directory.as_posix())[-1]
    assert len(
        destination
    ) > 0, f'Unable to determine destination from directory: {directory} filename: {data.filename}'
    path = Path(secrets['s3_bucket'] + s3.sep + data.table_name + s3.sep +
                destination).as_posix()
    get_logger().debug(
        f"Upload {data.filename.stat().st_size/(1024*1024)} MB from file: {data.filename} to {path}"
    )
    s3.put(lpath=data.filename.as_posix(), rpath=path)
    return data.filename
def click_on_xpath(driver: RemoteWebDriver, xpath: str, timeout: int = 60):
    time.sleep(random.uniform(0.5, 1.))
    try:
        resolved = WebDriverWait(driver, timeout=timeout).until(
            EC.element_to_be_clickable((By.XPATH, xpath)))
        resolved.click()
        return resolved
    except (TimeoutException, ) as ex:
        get_logger().error(
            f'Unable to locate element: {xpath} within {timeout} seconds')
        raise ex
    except (InvalidSelectorException, ) as ex:
        raise ex
    except (NoSuchElementException, ElementNotVisibleException,
            InvalidElementStateException) as ex:
        raise ex
def __init__(self, state_handlers: Iterable[Callable] = None):
    if state_handlers is not None and not isinstance(state_handlers,
                                                     collections.Sequence):
        raise TypeError("state_handlers should be iterable.")
    self.state_handlers = state_handlers or []
    self.logger = logging.get_logger(type(self).__name__)
def identify_s3_files_to_purge(
        table_name: str, first_index: int,
        secrets: Dict[str, str]) -> Union[List[str], None]:
    s3 = get_s3_connection(secrets=secrets)
    path = Path(secrets['s3_bucket'] + s3.sep + table_name).as_posix()
    if first_index > 0:
        get_logger().warning(
            f"first_index: {first_index} implies we don't want to purge any existing data from s3 for: {path}"
        )
        return []
    # results = s3.glob(path=path + s3.sep + '**' + s3.sep + '*metadata')
    results = s3.glob(path=path + s3.sep + '**' + s3.sep + '*.parquet')
    get_logger().info(f"Discovered {len(results)} files to purge from {path}")
    return results
def test_temporary_config_sets_and_resets(caplog):
    with temporary_logger_config(
            level=logging.CRITICAL,
            stream_fmt="%(message)s",
            stream_datefmt="%H:%M:%S",
    ):
        logger = get_logger()
        assert logger.level == logging.CRITICAL
        for handler in logger.handlers:
            if isinstance(handler, logging.StreamHandler):
                assert handler.formatter._fmt == "%(message)s"
                assert handler.formatter.datefmt == "%H:%M:%S"

        logger.info("Info log not shown")
        logger.critical("Critical log shown")

    logger.info("Info log shown")

    for handler in logger.handlers:
        handler.flush()

    output = caplog.text
    assert "Info log not shown" not in output
    assert "Critical log shown" in output
    assert "Info log shown" in output

    assert logger.level == logging.DEBUG
    for handler in logger.handlers:
        if isinstance(handler, logging.StreamHandler):
            assert handler.formatter._fmt != "%(message)s"
            assert handler.formatter.datefmt != "%H:%M:%S"
def test_temporary_config_resets_on_exception(caplog):
    with pytest.raises(ValueError):
        with temporary_logger_config(level=logging.CRITICAL):
            raise ValueError()

    logger = get_logger()
    assert logger.level == logging.DEBUG
def __init__(self,
             private_registry: bool = False,
             docker_secret: str = None) -> None:
    self.identifier_label = str(uuid.uuid4())
    self.private_registry = private_registry
    self.docker_secret = docker_secret or "DOCKER_REGISTRY_CREDENTIALS"
    self.logger = logging.get_logger("CloudEnvironment")
def ensure_oauth_access_token(self):
    """Retrieves an OAuth 2.0 access token using the client credentials grant
    and stores it in the request session."""
    logger = get_logger()
    now = datetime.utcnow()
    if self._expires_at is None or now >= self._expires_at:
        logger.info('Token is expired or missing, requesting a new one.')
        data = {
            'grant_type': 'client_credentials',
            'client_id': self.client_id,
            'client_secret': self.client_secret,
            'token_type': self.token_type,
        }
        response = requests.post(self.auth_url,
                                 data=data,
                                 hooks={'response': log_response_hook})
        data = response.json()
        self._session.auth = SuppliedAuth(
            data['access_token'], data.get('token_type', self.token_type))
        self._expires_at = now + timedelta(seconds=data['expires_in'])
        logger.info("Acquired a token that expires at {}".format(
            self._expires_at.isoformat()))
def get_s3_connection(secrets: Dict[str, str]) -> S3FileSystem:
    # ref: https://s3fs.readthedocs.io/en/latest/#credentials
    conn = S3FileSystem(anon=False,
                        key=secrets['s3_access_key'],
                        secret=secrets['s3_secret_key'],
                        use_ssl=True,
                        client_kwargs=dict(
                            endpoint_url=secrets['s3_server'], ))
    # verify the bucket exists
    if not conn.exists(secrets['s3_bucket']):
        get_logger().warning(
            f"Unable to locate bucket, will attempt to create it now: {secrets['s3_bucket']}"
        )
        conn.mkdirs(secrets['s3_bucket'])
    return conn
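
# The `secrets` mapping used throughout this module is expected to contain at
# least the keys shown below; the values here are placeholders for
# illustration, not real credentials:
#
#   secrets = {
#       's3_server': 'https://s3.example.com',
#       's3_access_key': '...',
#       's3_secret_key': '...',
#       's3_bucket': 'my-bucket',
#       'database_name': 'mydb',
#       'database_schema': 'public',
#   }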
def log_response_hook(response, *args, **kwargs):  # pylint: disable=unused-argument
    """Log summary information about every request made."""
    logger = get_logger()
    logger.info("[{}] [{}] [{}] {}".format(response.request.method,
                                           response.status_code,
                                           response.elapsed.total_seconds(),
                                           response.url))
def __init__(
    self,
    name: str,
    schedule: prefect.schedules.Schedule = None,
    environment: Environment = None,
    storage: Storage = None,
    tasks: Iterable[Task] = None,
    edges: Iterable[Edge] = None,
    reference_tasks: Iterable[Task] = None,
    state_handlers: List[Callable] = None,
    on_failure: Callable = None,
    validate: bool = None,
    result_handler: ResultHandler = None,
):
    self._cache = {}  # type: dict

    if not name:
        raise ValueError("A name must be provided for the flow.")

    self.name = name
    self.logger = logging.get_logger("Flow: {}".format(self.name))
    self.schedule = schedule
    self.environment = environment or prefect.environments.RemoteEnvironment()
    self.storage = storage
    self.result_handler = (
        result_handler or prefect.engine.get_default_result_handler_class()())

    self.tasks = set()  # type: Set[Task]
    self.edges = set()  # type: Set[Edge]
    self.constants = collections.defaultdict(
        dict)  # type: Dict[Task, Dict[str, Any]]

    for t in tasks or []:
        self.add_task(t)
    self.set_reference_tasks(reference_tasks or [])
    for e in edges or []:
        self.add_edge(
            upstream_task=e.upstream_task,
            downstream_task=e.downstream_task,
            key=e.key,
            mapped=e.mapped,
            validate=validate,
        )

    self._prefect_version = prefect.__version__

    if state_handlers and not isinstance(state_handlers,
                                         collections.Sequence):
        raise TypeError("state_handlers should be iterable.")
    self.state_handlers = state_handlers or []
    if on_failure is not None:
        self.state_handlers.append(
            callback_factory(on_failure, check=lambda s: s.is_failed()))

    super().__init__()
def __init__(self,
             value: Any = None,
             bucket: str = None,
             credentials_secret: str = None,
             **kwargs: Any) -> None:
    self.bucket = bucket
    self.credentials_secret = credentials_secret
    self.logger = logging.get_logger(type(self).__name__)
    super().__init__(value, **kwargs)
def task_filter_links(
        links: T.Union[T.List[str], Result],
        gaming_platform: T.Union[str, Parameter],
        tbl: T.Union[sa.Table, Result]) -> T.Union[T.List[str], Result]:
    """Remove any links which we have 'recently' scraped"""
    stmt = sa.select([tbl.c.source_url]).where(
        sa.and_(
            tbl.c.platform == gaming_platform,
            tbl.c.source_url.in_(links),
            # tbl.c.scraped_on > datetime.datetime.utcnow() - datetime.timedelta(days=1)
        ))
    rp = tbl.bind.execute(stmt)
    results = set([_[0] for _ in rp.fetchall()])
    output = list(set(links).difference(results))
    get_logger().info(f'Discovered {len(output)} links to parse')
    return output
def default_logger():
    handler = logging.StreamHandler()
    handler.setFormatter(DatadogFormatter())
    logger = get_logger()
    logger.addHandler(handler)
    logger.info(f"Beginning Flow run for '{prefect.context.flow_name}'")
    logger.info(f"Task '{prefect.context.task_name}': Starting task run...")
def __init__(
    self,
    api_key_id: str,
    api_token: str,
) -> None:
    self.api_key_id = api_key_id
    self.api_token = api_token
    self.logger = get_logger()
    self._api_url = "https://api.getmontecarlo.com/graphql"
def __init__(
    self,
    labels: Iterable[str] = None,
    on_start: Callable = None,
    on_exit: Callable = None,
) -> None:
    self.labels = set(labels) if labels else set()
    self.on_start = on_start
    self.on_exit = on_exit
    self.logger = logging.get_logger(type(self).__name__)
def load_and_run_flow() -> None:
    """
    Loads a flow (and the corresponding environment), then runs the flow with
    the environment. This is useful for environments whose `execute` method
    schedules a job that later needs to run the flow.

    Raises:
        - ValueError: if no `flow_run_id` is found in context
    """
    logger = logging.get_logger("Environment")
    try:
        flow_run_id = prefect.context.get("flow_run_id")

        if not flow_run_id:
            raise ValueError("No flow run ID found in context.")

        query = {
            "query": {
                with_args("flow_run", {"where": {
                    "id": {
                        "_eq": flow_run_id
                    }
                }}): {
                    "flow": {
                        "name": True,
                        "storage": True
                    },
                }
            }
        }

        client = Client()
        result = client.graphql(query)
        flow_run = result.data.flow_run[0]

        flow_data = flow_run.flow
        storage_schema = prefect.serialization.storage.StorageSchema()
        storage = storage_schema.load(flow_data.storage)

        # populate global secrets
        secrets = prefect.context.get("secrets", {})
        for secret in storage.secrets:
            secrets[secret] = prefect.tasks.secrets.PrefectSecret(
                name=secret).run()

        with prefect.context(secrets=secrets):
            flow = storage.get_flow(flow_data.name)
            flow.environment.run(flow)
    except Exception as exc:
        logger.exception(
            "Unexpected error raised during flow run: {}".format(exc))
        raise exc
def create_data_partitions(table_name: str, first_index: int, last_index: int,
                           secrets: Dict[str, str],
                           num_of_records_in_batch: int) -> List[table_batch]:
    """Create partitions for table_name based on the estimated row count and batch size"""
    dt = get_table_data_types(table=table_name, secrets=secrets)
    stmt = """SELECT reltuples::BIGINT AS estimate
              FROM pg_class
              WHERE relname = %(table_name)s
              ORDER BY reltuples DESC
              LIMIT 1"""
    get_logger().debug(stmt)
    con = get_rds_engine(secrets=secrets)
    df = pd.read_sql(sql=stmt, con=con, params=dict(table_name=table_name))
    row_estimate = df.iloc[0]['estimate'] + num_of_records_in_batch
    get_logger().info(
        f"Row estimate for table_name: {table_name} {row_estimate}")
    rows_to_pull = last_index if 0 < last_index < row_estimate else row_estimate
    assert first_index < row_estimate, f'first_index: {first_index} is greater than row_estimate: {row_estimate}'
    batch_size = last_index - first_index if 0 < last_index < num_of_records_in_batch else num_of_records_in_batch
    # TODO: validate batch size
    directory = Path(mkdtemp(suffix=f"_{table_name}"))
    table_partitions = [
        table_batch(table_name, dt, i, batch_size, directory)
        for i in range(first_index, rows_to_pull, batch_size)
    ]
    get_logger().info(
        f"Created {len(table_partitions)} partitions from table_name: {table_name}"
    )
    con.dispose()
    return table_partitions
def run_with_thread_timeout(
    fn: Callable,
    args: Sequence = (),
    kwargs: Mapping = None,
    timeout: int = None,
    logger: Logger = None,
    name: str = None,
) -> Any:
    """
    Helper function for implementing timeouts on function executions.
    Implemented by setting a `signal` alarm on a timer. Must be run in the main thread.

    Args:
        - fn (callable): the function to execute
        - args (Sequence): arguments to pass to the function
        - kwargs (Mapping): keyword arguments to pass to the function
        - timeout (int): the length of time to allow for execution before raising a
            `TimeoutError`, represented as an integer in seconds
        - logger (Logger): an optional logger to use. If not passed, a logger for the
            `prefect.executors.run_with_thread_timeout` namespace will be created.
        - name (str): an optional name to attach to logs for this function run, defaults
            to the name of the given function. Provides an interface for passing task
            names for logs.

    Returns:
        - the result of `fn(*args, **kwargs)`

    Raises:
        - TimeoutError: if function execution exceeds the allowed timeout
        - ValueError: if run from outside the main thread
    """
    logger = logger or get_logger()
    name = name or f"Function '{fn.__name__}'"
    kwargs = kwargs or {}

    if timeout is None:
        return fn(*args, **kwargs)

    def error_handler(signum, frame):  # type: ignore
        raise TimeoutError("Execution timed out.")

    try:
        # Set the signal handler for alarms
        signal.signal(signal.SIGALRM, error_handler)
        # Raise the alarm if `timeout` seconds pass
        logger.debug(f"{name}: Sending alarm with {timeout}s timeout...")
        signal.alarm(timeout)
        logger.debug(f"{name}: Executing function in main thread...")
        return fn(*args, **kwargs)
    finally:
        signal.alarm(0)
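
# A minimal usage sketch, assuming it is called from the main thread on a
# platform that supports SIGALRM (the function and timeout below are chosen
# purely for illustration):
#
#   result = run_with_thread_timeout(fn=sum, args=([1, 2, 3],), timeout=5)
#   assert result == 6  # raises TimeoutError instead if `fn` runs longer than 5 seconds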
def run(self) -> None:
    logger = get_logger("threaded_heartbeat")
    client = Client()
    iter_count = 0
    with prefect.context({
            "flow_run_id": self.flow_run_id,
            "running_with_backend": True
    }):
        with log_heartbeat_failure(logger):
            while iter_count < (self.num or 1) and (
                    self.stop_event.is_set() is False):
                send_heartbeat(self.flow_run_id, client, logger)
                iter_count += 1 if self.num else 0
                self.stop_event.wait(timeout=config.cloud.heartbeat_interval)
def __init__(
    self,
    value: Any = None,
    result_handler: ResultHandler = None,
    validators: Iterable[Callable] = None,
    run_validators: bool = True,
    location: str = None,
):
    self.value = value
    self.safe_value = NoResult  # type: SafeResult
    self.result_handler = result_handler  # type: ignore
    self.validators = validators
    self.run_validators = run_validators
    self.location = location
    self.logger = logging.get_logger(type(self).__name__)
def __init__(
    self,
    min_workers: int = 1,
    max_workers: int = 2,
    private_registry: bool = False,
    docker_secret: str = None,
) -> None:
    self.min_workers = min_workers
    self.max_workers = max_workers
    self.identifier_label = str(uuid.uuid4())
    self.private_registry = private_registry
    if self.private_registry:
        self.docker_secret = docker_secret or "DOCKER_REGISTRY_CREDENTIALS"
    else:
        self.docker_secret = None  # type: ignore
    self.logger = logging.get_logger("CloudEnvironment")
def multiprocessing_safe_run_and_retrieve(
    queue: multiprocessing.Queue,
    payload: bytes,
) -> None:
    """
    Gets the return value from a function and puts it in a multiprocessing-safe
    container. Helper function for `run_with_multiprocess_timeout`, must be defined
    top-level so it can be pickled and sent to `multiprocessing.Process`

    Passing the payload serialized allows us to escape the limitations of the python
    native pickler which will fail on tasks defined in scripts because of name
    mismatches. Whilst this particular example only affects the `func` arg, any of
    the others could be affected by other pickle limitations as well.

    Args:
        - queue (multiprocessing.Queue): The queue to pass the resulting payload to
        - payload (bytes): A serialized dictionary containing the data required to run
            the function. Should be serialized with `cloudpickle.dumps`
            Expects the following keys:
            - fn (Callable): The function to call
            - args (list): Positional argument values to call the function with
            - kwargs (Mapping): Keyword arguments to call the function with
            - context (dict): The prefect context dictionary to use during execution
            - name (str): an optional name to attach to logs for this function run,
                defaults to the name of the given function. Provides an interface for
                passing task names for logs.
            - logger (Logger): the logger to use
    """
    request = cloudpickle.loads(payload)
    fn: Callable = request["fn"]
    context: dict = request.get("context", {})
    args: Sequence = request.get("args", [])
    kwargs: Mapping = request.get("kwargs", {})
    name: str = request.get("name", f"Function '{fn.__name__}'")
    logger: Logger = request.get("logger") or get_logger()

    try:
        with prefect.context(context):
            logger.debug(f"{name}: Executing...")
            return_val = fn(*args, **kwargs)
    except Exception as exc:
        return_val = exc

    logger.debug(f"{name}: Passing result back to main process...")
    queue.put(cloudpickle.dumps(return_val))