def get_headers():
    """Retrieves headers (e.g., user agent string) from environment variables

    Retrieves user agent string information to use in requests to third-party
    services.

    Args:
        N/A

    Returns:
        Headers dict for the requests library, in the form:
            {'User-Agent': '<user agent string>'}

    Raises:
        Exception: General exception, since scraper cannot proceed without this
    """
    in_prod = environment.in_gcp()
    if not in_prod:
        user_agent_string = (
            "For any issues, concerns, or rate constraints, "
            "e-mail [email protected]"
        )
    else:
        user_agent_string = secrets.get_secret("user_agent")

    if not user_agent_string:
        raise Exception("No user agent string")

    headers = {"User-Agent": user_agent_string}
    return headers
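# A minimal usage sketch (not from the source): passing the headers returned by
# `get_headers()` to the `requests` library. The URL is a hypothetical placeholder
# for a scraped third-party service.
import requests

response = requests.get("https://example.com/roster", headers=get_headers())
response.raise_for_status()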
def __init__(self) -> None:
    prefix = "" if not in_gcp() else f"{project_id()}-"
    self.allowlist_path = GcsfsFilePath.from_absolute_path(
        f"{prefix}case-triage-data/allowlist_v2.json"
    )
    self.allowed_users: List[str] = []
    self.admin_users: List[str] = []
def __init__(self, write_key: str):
    is_local = not in_gcp()
    # When `send` is set to False, we do not send any logs to Segment.
    # We also set `debug` to True locally for more logging during development.
    self.client = Client(
        write_key,
        send=(not is_local),
        debug=is_local,
    )
def direct_ingest_storage_directory(self) -> GcsfsDirectoryPath:
    if in_gcp():
        return gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code=self.region_code,
            system_level=SystemLevel.STATE,
            ingest_instance=DirectIngestInstance.PRIMARY,
        )

    # Local override
    return GcsfsDirectoryPath.from_absolute_path(
        f"recidiviz-staging-direct-ingest-state-storage/{self.region_code.lower()}"
    )
def init_engines_for_server_postgres_instances(cls) -> None:
    if not environment.in_gcp():
        logging.info(
            "Environment is not GCP, not connecting to postgres instances."
        )
        return

    cls.init_engine(SchemaType.JAILS)
    cls.init_engine(SchemaType.STATE)
    cls.init_engine(SchemaType.OPERATIONS)
    cls.init_engine(SchemaType.JUSTICE_COUNTS)
    cls.init_engine(SchemaType.CASE_TRIAGE)
def _regions_matching_environment(region_codes: Set[str]) -> Set[str]:
    """Filter to regions with the matching environment.

    If we are running locally, include all supported regions.
    """
    if not environment.in_gcp():
        return region_codes
    gcp_env = environment.get_gcp_environment()
    return {
        region_code
        for region_code in region_codes
        if regions.get_region(region_code).environment == gcp_env
    }
def _file_pointer_for_path(self, path: GcsfsFilePath, encoding: str) -> TextIO:
    """Returns a file pointer for the given path."""
    # From the GCSFileSystem docs
    # (https://gcsfs.readthedocs.io/en/latest/api.html#gcsfs.core.GCSFileSystem),
    # 'google_default' means we should look for local credentials set up via
    # `gcloud login`. The project this is reading from may have to match the project
    # default you have set locally (check via `gcloud info` and set via
    # `gcloud config set project [PROJECT_ID]`). If we are running in the GCP
    # environment, we should be able to query the internal metadata for credentials.
    token = "google_default" if not environment.in_gcp() else "cloud"
    return self.gcs_file_system.open(path.uri(), encoding=encoding, token=token)
def get_engine_for_database(
    cls, database_key: SQLAlchemyDatabaseKey
) -> Optional[Engine]:
    """Retrieve the engine for a given database.

    Will attempt to create the engine if it does not already exist.
    """
    if database_key not in cls._engine_for_database:
        if not environment.in_gcp():
            logging.info(
                "Environment is not GCP, not connecting to postgres instance for [%s].",
                database_key,
            )
            return None
        cls.init_engine(database_key)
    return cls._engine_for_database.get(database_key, None)
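# Hedged usage sketch (not from the source): looking up an engine by schema-level
# database key, reusing `SQLAlchemyDatabaseKey.for_schema` as seen elsewhere in this
# section. Outside GCP this returns None, so callers must handle that case.
from sqlalchemy import text

engine = SQLAlchemyEngineManager.get_engine_for_database(
    SQLAlchemyDatabaseKey.for_schema(SchemaType.OPERATIONS)
)
if engine is not None:
    with engine.connect() as connection:
        connection.execute(text("SELECT 1"))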
def start_timers(self) -> None:
    """Starts store refresh timers for all stores that are a subclass of the
    AdminPanelStore class."""
    if in_gcp() or in_development():
        stores_with_timers = [
            self.ingest_metadata_store,
            self.validation_metadata_store,
            self.ingest_data_freshness_store,
            self.validation_status_store,
        ]
        for store in stores_with_timers:
            RepeatedTimer(
                15 * 60, store.recalculate_store, run_immediately=True
            ).start()
def store_validation_results(
    validation_results: List[ValidationResultForStorage],
) -> None:
    if not environment.in_gcp():
        logging.info(
            "Skipping storing [%d] validation results in BigQuery.",
            len(validation_results),
        )
        return

    bq_client = BigQueryClientImpl()
    bq_client.insert_into_table(
        bq_client.dataset_ref_for_id(VALIDATION_RESULTS_BIGQUERY_ADDRESS.dataset_id),
        VALIDATION_RESULTS_BIGQUERY_ADDRESS.table_id,
        [result.to_serializable() for result in validation_results],
    )
def create_cache_ingest_file_as_parquet_task(
    gcs_file: GcsfsFilePath,
    separator: str,
    encoding: str,
    quoting: int,
    custom_line_terminator: Optional[str],
) -> None:
    if in_gcp():
        task_manager = (
            AdminPanelDataDiscoveryCloudTaskManager()
        )  # type: AbstractAdminPanelDataDiscoveryCloudTaskManager
    else:
        task_manager = DevelopmentAdminPanelDataDiscoveryCloudTaskManager()

    task_manager.create_cache_ingest_file_as_parquet_task(
        gcs_file, separator, encoding, quoting, custom_line_terminator
    )
def get_proxies(use_test=False):
    """Retrieves proxy username/pass from environment variables

    Retrieves proxy information to use in requests to third-party services.
    If not in production environment, defaults to test proxy credentials
    (so problems during test runs don't risk our main proxy IP's reputation).

    Args:
        use_test: (bool) Use test proxy credentials, not prod

    Returns:
        Proxies dict for requests library, in the form:
            {'<protocol>': '<http://<proxy creds>@<proxy url>'}

    Raises:
        Exception: General exception, since scraper cannot proceed without this
    """
    if not environment.in_gcp() or use_test:
        return None

    user_var = "proxy_user"
    pass_var = "proxy_password"

    proxy_url = secrets.get_secret("proxy_url")

    if proxy_url is None:
        raise Exception("No proxy url")

    # On the proxy side, a random IP is chosen for any session name it has not seen
    # before. Collisions can still happen, so we append a random number to the
    # session name to reduce the odds.
    base_proxy_user = secrets.get_secret(user_var)
    proxy_user = PROXY_USER_TEMPLATE.format(base_proxy_user, random.random())
    proxy_password = secrets.get_secret(pass_var)

    if (base_proxy_user is None) or (proxy_password is None):
        raise Exception("No proxy user/pass")

    proxy_credentials = proxy_user + ":" + proxy_password
    proxy_request_url = "http://" + proxy_credentials + "@" + proxy_url

    proxies = {"http": proxy_request_url, "https": proxy_request_url}

    return proxies
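# A minimal sketch (not from the source) combining `get_proxies()` and `get_headers()`
# in a single request. Outside GCP, `get_proxies()` returns None, which `requests`
# treats as "no proxy". The URL is a hypothetical placeholder.
import requests

response = requests.get(
    "https://example.com/roster",
    headers=get_headers(),
    proxies=get_proxies(),
)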
def setup() -> None:
    """Setup logging"""
    # Set the region on log records.
    logging.setLogRecordFactory(ContextualLogRecord)
    logger = logging.getLogger()

    # Send logs directly via the logging client if possible. This ensures trace
    # ids are propagated and allows us to send structured messages.
    if environment.in_gcp():
        client = Client()
        structured_handler = StructuredAppEngineHandler(client)
        handlers.setup_logging(structured_handler, log_level=logging.INFO)

        before_request_handler = StructuredAppEngineHandler(
            client, name=BEFORE_REQUEST_LOG
        )
        logging.getLogger(BEFORE_REQUEST_LOG).addHandler(before_request_handler)

        # Streams unstructured logs to stdout - these logs will still show up
        # under the appengine.googleapis.com/stdout Stackdriver logs bucket,
        # even if other logs are stalled on the global interpreter lock or some
        # other issue.
        stdout_handler = logging.StreamHandler(sys.stdout)
        handlers.setup_logging(stdout_handler, log_level=logging.INFO)

        for handler in logger.handlers:
            if not isinstance(
                handler, (StructuredAppEngineHandler, logging.StreamHandler)
            ):
                logger.removeHandler(handler)
    else:
        logging.basicConfig()

    for handler in logger.handlers:
        # If we aren't writing directly to Stackdriver, prefix the log with important
        # context that would be in the labels.
        if not isinstance(handler, StructuredAppEngineHandler):
            handler.setFormatter(
                logging.Formatter(
                    "[pid: %(process)d] (%(region)s) %(module)s/%(funcName)s : %(message)s"
                )
            )

    # Export gunicorn errors using the same handlers as other logs, so that they
    # go to Stackdriver in production.
    gunicorn_logger = logging.getLogger("gunicorn.error")
    gunicorn_logger.handlers = logger.handlers
def __init__(self) -> None:
    self.database_key = SQLAlchemyDatabaseKey.for_schema(SchemaType.CASE_TRIAGE)

    prefix = "" if not in_gcp() else f"{project_id()}-"
    self.allowlist_path = GcsfsFilePath.from_absolute_path(
        f"{prefix}case-triage-data/allowlist_v2.json"
    )
    self.feature_gate_path = GcsfsFilePath.from_absolute_path(
        f"{prefix}case-triage-data/feature_variants.json"
    )
    self.case_triage_allowed_users: List[str] = []
    self.case_triage_admin_users: List[str] = []
    self.case_triage_demo_users: List[str] = []

    # Map from feature name to a map of email addresses to variants
    # of the feature that they are in.
    self.feature_variants: Dict[str, Dict[str, FeatureGateInfo]] = {}
def _get_metadata(url: str) -> Optional[str]:
    if url in _metadata_cache:
        return _metadata_cache[url]

    if not allow_local_metadata_call:
        if environment.in_test() or not environment.in_gcp():
            raise RuntimeError(
                "May not be called from test, should this have a local override?"
            )

    try:
        r = requests.get(BASE_METADATA_URL + url, headers=HEADERS, timeout=TIMEOUT)
        r.raise_for_status()

        _metadata_cache[url] = r.text
        return r.text
    except Exception as e:
        logging.error("Failed to fetch metadata [%s]: [%s]", url, e)
        return None
def setup():
    """Setup logging"""
    # Set the region on log records.
    default_factory = logging.getLogRecordFactory()
    logging.setLogRecordFactory(partial(region_record_factory, default_factory))

    logger = logging.getLogger()

    # Send logs directly via the logging client if possible. This ensures trace
    # ids are propagated and allows us to send structured messages.
    if environment.in_gcp():
        client = Client()
        handler = StructuredAppEngineHandler(client)
        handlers.setup_logging(handler, log_level=logging.INFO)

        # Streams unstructured logs to stdout - these logs will still show up
        # under the appengine.googleapis.com/stdout Stackdriver logs bucket,
        # even if other logs are stalled on the global interpreter lock or some
        # other issue.
        stdout_handler = logging.StreamHandler(sys.stdout)
        handlers.setup_logging(stdout_handler, log_level=logging.INFO)

        for handler in logger.handlers:
            if not isinstance(
                handler, (StructuredAppEngineHandler, logging.StreamHandler)
            ):
                logger.removeHandler(handler)
    else:
        logging.basicConfig()

    for handler in logger.handlers:
        # If writing directly to Stackdriver, send a structured message.
        if isinstance(handler, StructuredAppEngineHandler):
            handler.setFormatter(StructuredLogFormatter())
        # Otherwise, the default stream handler requires a string.
        else:
            handler.setFormatter(
                logging.Formatter("(%(region)s) %(module)s/%(funcName)s : %(message)s")
            )

    # Export gunicorn errors using the same handlers as other logs, so that they
    # go to Stackdriver in production.
    gunicorn_logger = logging.getLogger("gunicorn.error")
    gunicorn_logger.handlers = logger.handlers
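# Hedged sketch (not from the source): call `setup()` once at process start, then log
# normally. In GCP, records flow through StructuredAppEngineHandler to Stackdriver;
# locally they go to the basicConfig stream handler with the "(%(region)s)" prefix.
setup()
logging.getLogger(__name__).info("Logging configured; in_gcp=%s", environment.in_gcp())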
def _discovery_task() -> Tuple[str, int]:
    """Cloud task responsible for orchestrating ingest data parquet-ification tasks,
    loading parqueted files, and applying the DataDiscoveryArgs filters against the data

    Example:
        POST /admin/data_discovery/discovery_task

    Request Body:
        discovery_id: (string) The ID of this discovery task, as returned by /create_discovery

    Returns: N/A
    """
    body = get_cloud_task_json_body()

    if in_gcp():
        discover_data(body["discovery_id"])
    else:
        # Run discovery in a thread locally
        threading.Thread(target=discover_data, args=[body["discovery_id"]]).start()

    return "", HTTPStatus.OK
def retry_grpc(
    num_retries: int, fn: Callable[..., ReturnType], *args: Any, **kwargs: Any
) -> ReturnType:
    """Retries a function call some number of times"""
    time_to_sleep = random.uniform(5, RETRY_SLEEP)
    for i in range(num_retries + 1):
        try:
            return fn(*args, **kwargs)
        except exceptions.InternalServerError as e:
            if i == num_retries:
                raise
            if "GOAWAY" in str(e) or "Deadline Exceeded" in str(e):
                logging.exception("Received exception: ")
                if environment.in_gcp():
                    logging.warning("Sleeping %.2f seconds and retrying", time_to_sleep)
                    time.sleep(time_to_sleep)
                continue
            else:
                raise
    raise exceptions.ServiceUnavailable(f"Function unsuccessful {num_retries + 1} times")
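# Hedged usage sketch (not from the source): retrying a gRPC-backed call up to three
# extra times. `client.fetch_rows` is a hypothetical bound method standing in for any
# callable that may raise a google.api_core InternalServerError with GOAWAY / deadline
# text; its arguments are forwarded through *args/**kwargs.
rows = retry_grpc(3, client.fetch_rows, "my_table", limit=100)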
def get_data_discovery_cache() -> redis.Redis:
    """Returns a client for the data discovery Redis instance.

    Redis commands can be issued directly to this client and all connection handling
    is done inside `redis.Redis`. Idle connections will be closed by `redis.Redis`
    automatically.

    To query cached data discovery information, you may want to provide this `Redis`
    instance to a `RedisCommunicator`, `DataDiscoveryArgsFactory`, or
    `SingleIngestFileParquetCache` class.
    """
    if not in_gcp():
        return redis.Redis()

    redis_host = get_secret("data_discovery_redis_host")
    redis_port = get_secret("data_discovery_redis_port")
    if redis_host and redis_port:
        return redis.Redis(
            host=redis_host,
            port=int(redis_port),
        )

    raise ValueError("Cannot find data discovery redis secrets")
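# A minimal sketch (not from the source): issuing Redis commands directly against the
# returned client, as the docstring above suggests. The key name is a hypothetical
# placeholder; `set`/`get` with an expiry are standard redis-py calls.
cache = get_data_discovery_cache()
cache.set("data_discovery:example_status", "in-progress", ex=3600)
status = cache.get("data_discovery:example_status")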
def _create_discovery() -> flask.Response:
    """Endpoint responsible for creating and enqueueing a new discovery task

    Example:
        POST /admin/data_discovery/create_discovery

    Request Body:
        JSON representation of the `DataDiscoveryArgs` data class

    Returns:
        JSON representation of the hydrated `DataDiscoveryArgs` data class
    """
    data_discovery_args = DataDiscoveryArgsFactory.create(**request.get_json())

    if in_gcp():
        task_manager: AbstractAdminPanelDataDiscoveryCloudTaskManager = (
            AdminPanelDataDiscoveryCloudTaskManager()
        )
    else:
        task_manager = DevelopmentAdminPanelDataDiscoveryCloudTaskManager()

    task_manager.create_discovery_task(data_discovery_args)

    return jsonify(attr.asdict(data_discovery_args))
def add_ingest_ops_routes(bp: Blueprint, admin_stores: AdminStores) -> None:
    """Adds routes for ingest operations."""

    project_id = GCP_PROJECT_STAGING if not in_gcp() else metadata.project_id()
    STATE_INGEST_EXPORT_URI = f"gs://{project_id}-cloud-sql-exports"

    @bp.route("/api/ingest_operations/fetch_ingest_state_codes", methods=["POST"])
    @requires_gae_auth
    def _fetch_ingest_state_codes() -> Tuple[str, HTTPStatus]:
        all_state_codes = (
            admin_stores.ingest_operations_store.state_codes_launched_in_env
        )
        state_code_info = fetch_state_codes(all_state_codes)
        return jsonify(state_code_info), HTTPStatus.OK

    # Start an ingest run for a specific instance
    @bp.route(
        "/api/ingest_operations/<state_code_str>/start_ingest_run", methods=["POST"]
    )
    @requires_gae_auth
    def _start_ingest_run(state_code_str: str) -> Tuple[str, HTTPStatus]:
        state_code = _get_state_code_from_str(state_code_str)
        instance = request.json["instance"]
        admin_stores.ingest_operations_store.start_ingest_run(state_code, instance)
        return "", HTTPStatus.OK

    # Update ingest queues
    @bp.route(
        "/api/ingest_operations/<state_code_str>/update_ingest_queues_state",
        methods=["POST"],
    )
    @requires_gae_auth
    def _update_ingest_queues_state(state_code_str: str) -> Tuple[str, HTTPStatus]:
        state_code = _get_state_code_from_str(state_code_str)
        new_queue_state = request.json["new_queue_state"]
        admin_stores.ingest_operations_store.update_ingest_queues_state(
            state_code, new_queue_state
        )
        return "", HTTPStatus.OK

    # Get all ingest queues and their state for given state code
    @bp.route("/api/ingest_operations/<state_code_str>/get_ingest_queue_states")
    @requires_gae_auth
    def _get_ingest_queue_states(state_code_str: str) -> Tuple[str, HTTPStatus]:
        state_code = _get_state_code_from_str(state_code_str)
        ingest_queue_states = (
            admin_stores.ingest_operations_store.get_ingest_queue_states(state_code)
        )
        return jsonify(ingest_queue_states), HTTPStatus.OK

    # Get summaries of all ingest instances for state
    @bp.route("/api/ingest_operations/<state_code_str>/get_ingest_instance_summaries")
    @requires_gae_auth
    def _get_ingest_instance_summaries(state_code_str: str) -> Tuple[str, HTTPStatus]:
        state_code = _get_state_code_from_str(state_code_str)
        ingest_instance_summaries = (
            admin_stores.ingest_operations_store.get_ingest_instance_summaries(
                state_code
            )
        )
        return jsonify(ingest_instance_summaries), HTTPStatus.OK

    @bp.route("/api/ingest_operations/export_database_to_gcs", methods=["POST"])
    @requires_gae_auth
    def _export_database_to_gcs() -> Tuple[str, HTTPStatus]:
        try:
            state_code = StateCode(request.json["stateCode"])
            ingest_instance = DirectIngestInstance(
                request.json["ingestInstance"].upper()
            )
            db_version = ingest_instance.database_version(
                system_level=SystemLevel.STATE, state_code=state_code
            )
        except ValueError:
            return "invalid parameters provided", HTTPStatus.BAD_REQUEST

        lock_manager = DirectIngestRegionLockManager.for_state_ingest(
            state_code, ingest_instance
        )
        if not lock_manager.can_proceed():
            return (
                "other locks blocking ingest have been acquired; aborting operation",
                HTTPStatus.CONFLICT,
            )

        db_key = SQLAlchemyDatabaseKey.for_state_code(state_code, db_version)
        cloud_sql_client = CloudSQLClientImpl(project_id=project_id)
        operation_id = cloud_sql_client.export_to_gcs_sql(
            db_key,
            GcsfsFilePath.from_absolute_path(
                f"{STATE_INGEST_EXPORT_URI}/{db_version.value}/{state_code.value}"
            ),
        )
        if operation_id is None:
            return (
                "Cloud SQL export operation was not started successfully.",
                HTTPStatus.INTERNAL_SERVER_ERROR,
            )

        operation_succeeded = cloud_sql_client.wait_until_operation_completed(
            operation_id, seconds_to_wait=GCS_IMPORT_EXPORT_TIMEOUT_SEC
        )
        if not operation_succeeded:
            return (
                "Cloud SQL export did not complete within 60 seconds",
                HTTPStatus.INTERNAL_SERVER_ERROR,
            )

        return operation_id, HTTPStatus.OK

    @bp.route("/api/ingest_operations/import_database_from_gcs", methods=["POST"])
    @requires_gae_auth
    def _import_database_from_gcs() -> Tuple[str, HTTPStatus]:
        try:
            state_code = StateCode(request.json["stateCode"])
            db_version = SQLAlchemyStateDatabaseVersion(
                request.json["importToDatabaseVersion"].lower()
            )
            ingest_instance = DirectIngestInstance.for_state_database_version(
                database_version=db_version, state_code=state_code
            )
            exported_db_version = SQLAlchemyStateDatabaseVersion(
                request.json["exportedDatabaseVersion"].lower()
            )
        except ValueError:
            return "invalid parameters provided", HTTPStatus.BAD_REQUEST

        if db_version == SQLAlchemyStateDatabaseVersion.LEGACY:
            return "ingestInstance cannot be LEGACY", HTTPStatus.BAD_REQUEST

        lock_manager = DirectIngestRegionLockManager.for_state_ingest(
            state_code, ingest_instance=ingest_instance
        )
        if not lock_manager.can_proceed():
            return (
                "other locks blocking ingest have been acquired; aborting operation",
                HTTPStatus.CONFLICT,
            )

        db_key = SQLAlchemyDatabaseKey.for_state_code(state_code, db_version)
        cloud_sql_client = CloudSQLClientImpl(project_id=project_id)
        operation_id = cloud_sql_client.import_gcs_sql(
            db_key,
            GcsfsFilePath.from_absolute_path(
                f"{STATE_INGEST_EXPORT_URI}/{exported_db_version.value}/{state_code.value}"
            ),
        )
        if operation_id is None:
            return (
                "Cloud SQL import operation was not started successfully.",
                HTTPStatus.INTERNAL_SERVER_ERROR,
            )

        operation_succeeded = cloud_sql_client.wait_until_operation_completed(
            operation_id, seconds_to_wait=GCS_IMPORT_EXPORT_TIMEOUT_SEC
        )
        if not operation_succeeded:
            return (
                "Cloud SQL import did not complete within 60 seconds",
                HTTPStatus.INTERNAL_SERVER_ERROR,
            )

        return operation_id, HTTPStatus.OK

    @bp.route("/api/ingest_operations/acquire_ingest_lock", methods=["POST"])
    @requires_gae_auth
    def _acquire_ingest_lock() -> Tuple[str, HTTPStatus]:
        try:
            state_code = StateCode(request.json["stateCode"])
            ingest_instance = DirectIngestInstance(request.json["ingestInstance"])
        except ValueError:
            return "invalid parameters provided", HTTPStatus.BAD_REQUEST

        lock_manager = DirectIngestRegionLockManager.for_state_ingest(
            state_code, ingest_instance=ingest_instance
        )
        try:
            lock_manager.acquire_lock()
        except GCSPseudoLockAlreadyExists:
            return "lock already exists", HTTPStatus.CONFLICT

        if not lock_manager.can_proceed():
            try:
                lock_manager.release_lock()
            except Exception as e:
                logging.exception(e)
            return (
                "other locks blocking ingest have been acquired; releasing lock",
                HTTPStatus.CONFLICT,
            )

        return "", HTTPStatus.OK

    @bp.route("/api/ingest_operations/release_ingest_lock", methods=["POST"])
    @requires_gae_auth
    def _release_ingest_lock() -> Tuple[str, HTTPStatus]:
        try:
            state_code = StateCode(request.json["stateCode"])
            ingest_instance = DirectIngestInstance(request.json["ingestInstance"])
        except ValueError:
            return "invalid parameters provided", HTTPStatus.BAD_REQUEST

        lock_manager = DirectIngestRegionLockManager.for_state_ingest(
            state_code, ingest_instance=ingest_instance
        )
        try:
            lock_manager.release_lock()
        except GCSPseudoLockDoesNotExist:
            return "lock does not exist", HTTPStatus.NOT_FOUND

        return "", HTTPStatus.OK

    @bp.route("/api/ingest_operations/pause_direct_ingest_instance", methods=["POST"])
    @requires_gae_auth
    def _pause_direct_ingest_instance() -> Tuple[str, HTTPStatus]:
        try:
            state_code = StateCode(request.json["stateCode"])
            ingest_instance = DirectIngestInstance(request.json["ingestInstance"])
        except ValueError:
            return "invalid parameters provided", HTTPStatus.BAD_REQUEST

        ingest_status_manager = DirectIngestInstanceStatusManager(
            region_code=state_code.value, ingest_instance=ingest_instance
        )
        try:
            ingest_status_manager.pause_instance()
        except Exception:
            return (
                "something went wrong pausing the instance",
                HTTPStatus.INTERNAL_SERVER_ERROR,
            )

        return "", HTTPStatus.OK
def test_in_prod_true(mock_os):
    mock_os.return_value = "production"
    assert environment.in_gcp()
def test_in_prod_false(mock_os):
    mock_os.return_value = "not production"
    assert not environment.in_gcp()
def should_persist() -> bool:
    """Determines whether objects should be written to the database in this context."""
    return environment.in_gcp() or strtobool(
        os.environ.get("PERSIST_LOCALLY", "false")
    )
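# Hedged sketch (not from the source): outside GCP, persistence is opt-in via the
# PERSIST_LOCALLY environment variable, which strtobool interprets ("true", "1", "yes", ...).
import os

os.environ["PERSIST_LOCALLY"] = "true"
assert should_persist()  # also true whenever environment.in_gcp() is True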
from recidiviz.case_triage.util import get_local_secret, get_rate_limit_storage_uri
from recidiviz.persistence.database.schema_utils import SchemaType
from recidiviz.persistence.database.sqlalchemy_database_key import SQLAlchemyDatabaseKey
from recidiviz.persistence.database.sqlalchemy_engine_manager import (
    SQLAlchemyEngineManager,
)
from recidiviz.tools.postgres import local_postgres_helpers
from recidiviz.utils.auth.auth0 import (
    Auth0Config,
    build_auth0_authorization_decorator,
    get_userinfo,
)
from recidiviz.utils.environment import in_development, in_gcp, in_test
from recidiviz.utils.timer import RepeatedTimer

# Sentry setup
if in_gcp():
    sentry_sdk.init(
        # not a secret!
        dsn="https://[email protected]/5623757",
        integrations=[FlaskIntegration()],
        # This value may need to be adjusted over time as usage increases.
        traces_sample_rate=1.0,
    )

# Flask setup
static_folder = os.path.abspath(
    os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        "../../frontends/case-triage/build/",
    )
)
app.register_blueprint(
    scrape_aggregate_reports_blueprint, url_prefix="/scrape_aggregate_reports"
)
app.register_blueprint(store_single_count_blueprint, url_prefix="/single_count")
app.register_blueprint(cloud_sql_to_bq_blueprint, url_prefix="/cloud_sql_to_bq")
app.register_blueprint(backup_manager_blueprint, url_prefix="/backup_manager")
app.register_blueprint(dataflow_monitor_blueprint, url_prefix="/dataflow_monitor")
app.register_blueprint(validation_manager_blueprint, url_prefix="/validation_manager")
app.register_blueprint(
    calculation_data_storage_manager_blueprint,
    url_prefix="/calculation_data_storage_manager",
)
app.register_blueprint(reporting_endpoint_blueprint, url_prefix="/reporting")
app.register_blueprint(export_blueprint, url_prefix="/export")
app.register_blueprint(justice_counts_control, url_prefix="/justice_counts")

if environment.in_gcp():
    SQLAlchemyEngineManager.init_engines_for_server_postgres_instances()

# Export traces and metrics to stackdriver if running in GCP
if environment.in_gcp():
    monitoring.register_stackdriver_exporter()
    trace_exporter = stackdriver_trace.StackdriverExporter(
        project_id=metadata.project_id(), transport=AsyncTransport
    )
    trace_sampler = trace.CompositeSampler(
        {
            "/direct/process_job": samplers.AlwaysOnSampler(),
            # There are a lot of scraper requests, so they can use the default rate of 1 in 10k.
            "/scraper/": samplers.ProbabilitySampler(),
            "/scrape_aggregate_reports/": samplers.ProbabilitySampler(),
        },
def __init__(self) -> None:
    if in_development():
        with local_project_id_override(GCP_PROJECT_STAGING):
            self._initialize_stores()
    elif in_gcp():
        self._initialize_stores()