def start_sync_run(
    self,
    sync_id: int,
    wait_for_completion: bool,
    wait_time_between_api_calls: int,
    max_wait_time: int,
) -> Dict:
    """
    Start a new sync run. Optionally, wait for run completion.
    The sync run is triggered via the
    [Start new sync run API](https://hightouch.io/docs/syncs/api/#start-a-new-sync-run).

    Args:
        - sync_id (int): The sync identifier.
        - wait_for_completion (bool): Whether to wait for the sync run to complete.
        - wait_time_between_api_calls (int): The number of seconds to wait between
            API calls. Used only if `wait_for_completion` is `True`.
        - max_wait_time (int): The maximum number of seconds to wait for the sync
            run to complete. Used only if `wait_for_completion` is `True`.

    Raises:
        - `prefect.engine.signals.FAIL` if the sync run takes more than
            `max_wait_time` seconds to complete.

    Returns:
        - If `wait_for_completion` is `True`, the JSON response containing the
            status of the sync run.
        - If `wait_for_completion` is `False`, the JSON response containing
            information about the start sync run action.
    """
    url = f"{self.__HIGHTOUCH_START_NEW_SYNC_RUN_URL}/{sync_id}"
    with self.session.post(url) as response:
        if response.status_code != 200:
            msg = f"Error while starting sync run. Error is: {response.reason}."
            raise FAIL(message=msg)
        start_sync_response = response.json()

    if wait_for_completion:
        elapsed_wait_time = 0
        while not max_wait_time or elapsed_wait_time <= max_wait_time:
            sync_status_response = self.get_sync_run_status(sync_id=sync_id)
            sync_status = sync_status_response["sync"]["sync_status"]

            if sync_status == "success":
                return sync_status_response
            else:
                time.sleep(wait_time_between_api_calls)
                elapsed_wait_time += wait_time_between_api_calls

        msg = "Sync run exceeded `max_wait_time`"
        raise FAIL(message=msg)
    else:
        return start_sync_response
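# A minimal, illustrative sketch of the polling pattern used by `start_sync_run`
# above when `wait_for_completion` is True: call a status function until it
# reports success, or fail once `max_wait_time` is exceeded. `get_status` is a
# hypothetical stand-in for `get_sync_run_status`, not part of the original client.
import time

def poll_until_success(get_status, wait_between_calls: int, max_wait_time: int) -> dict:
    elapsed = 0
    while not max_wait_time or elapsed <= max_wait_time:
        status = get_status()
        if status["sync"]["sync_status"] == "success":
            return status
        time.sleep(wait_between_calls)
        elapsed += wait_between_calls
    raise TimeoutError("Sync run exceeded `max_wait_time`")

# e.g. poll_until_success(lambda: {"sync": {"sync_status": "success"}}, 5, 60)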
def run(
    self,
    job_name: str = None,
    job_definition: str = None,
    job_queue: str = None,
    batch_kwargs: dict = None,
    credentials: str = None,
):
    """
    Submit a job to the AWS Batch job service.

    Args:
        - job_name (str, optional): The AWS batch job name.
        - job_definition (str, optional): The AWS batch job definition.
        - job_queue (str, optional): Name of the AWS batch job queue.
        - batch_kwargs (dict, optional): Additional keyword arguments to pass to the
            boto3 `submit_job` function. See the
            [submit_job](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/batch.html#Batch.Client.submit_job)  # noqa
            documentation for more details.
        - credentials (str, optional): your AWS credentials passed from an upstream
            Secret task; this Secret must be a JSON string with two keys:
            `ACCESS_KEY` and `SECRET_ACCESS_KEY` which will be passed directly to
            `boto3`. If not provided here or in context, `boto3` will fall back on
            standard AWS rules for authentication.
    """
    if not job_name:
        raise ValueError("A job name must be provided.")

    if not job_definition:
        raise ValueError("A job definition must be provided.")

    if not job_queue:
        raise ValueError("A job queue must be provided.")

    if not batch_kwargs:
        batch_kwargs = {}

    batch_client = get_boto_client("batch", credentials=credentials, **self.boto_kwargs)

    try:
        response = batch_client.submit_job(
            jobName=job_name,
            jobQueue=job_queue,
            jobDefinition=job_definition,
            **batch_kwargs,
        )
    except Exception as e:
        self.logger.error("Failed to submit job", exc_info=True)
        raise FAIL(f"Failed to submit job '{job_name}' to AWS Batch.") from e

    if not response.get("jobId"):
        raise FAIL(f"AWS Batch submit response contains no job ID: {response}")

    return response["jobId"]
def run(
    self,
    bucket_name: str = None,
    blob: str = None,
    project: str = None,
    wait_seconds: int = 0,
    fail_if_not_found: bool = True,
    credentials: dict = None,
    request_timeout: Union[float, Tuple[float, float]] = 60,
) -> bool:
    """
    Run method for this Task. Invoked by _calling_ this Task after initialization
    within a Flow context.

    Note that some arguments are required for the task to run, and must be
    provided _either_ at initialization _or_ as arguments.

    Args:
        - bucket_name (str, optional): the bucket to check
        - blob (str, optional): object for which to search within the bucket
        - project (str, optional): default Google Cloud project to work within.
            If not provided, will be inferred from your Google Cloud credentials
        - wait_seconds (int, optional): retry until the blob is found or until
            `wait_seconds` have elapsed, whichever comes first. Defaults to 0
        - fail_if_not_found (bool, optional): will raise a FAIL signal if the blob
            is not found. Defaults to True
        - credentials (dict, optional): a JSON document containing Google Cloud
            credentials. You should provide these at runtime with an upstream
            Secret task. If not provided, Prefect will first check `context` for
            `GCP_CREDENTIALS` and lastly will use default Google client logic.
        - request_timeout (Union[float, Tuple[float, float]], optional): the number
            of seconds the transport should wait for the server response.
            Can also be passed as a tuple (connect_timeout, read_timeout).

    Returns:
        - bool: whether the object exists

    Raises:
        - ValueError: if `bucket_name` or `blob` are missing
        - FAIL: if the object is not found and `fail_if_not_found` is True
    """
    if None in [bucket_name, blob]:
        raise ValueError("Missing bucket_name or blob")

    # create client
    client = get_storage_client(project=project, credentials=credentials)
    bucket = client.bucket(bucket_name)

    blob_exists = None

    # poll with exponential backoff until the blob is found
    # or `wait_seconds` is exhausted
    wait, n = 0, 1
    while wait <= wait_seconds and not blob_exists:
        sleep(n)
        wait += n
        n *= 2
        blob_exists = storage.Blob(bucket=bucket, name=blob).exists(client)

    if fail_if_not_found and not blob_exists:
        raise FAIL(message="Blob not found")

    return blob_exists
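# A dry run of the exponential backoff loop above (no GCS calls): the sleep
# intervals double on each attempt, so with `wait_seconds=10` the task sleeps
# 1s, 2s, 4s, and 8s before giving up.
wait_seconds = 10  # illustrative value
wait, n, intervals = 0, 1, []
while wait <= wait_seconds:
    intervals.append(n)
    wait += n
    n *= 2
print(intervals)  # [1, 2, 4, 8] -> cumulative waits of 1, 3, 7, 15 seconds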
def test_submission_fail(self, batch_client):
    batch_client.submit_job = MagicMock(side_effect=FAIL())
    task = BatchSubmit(
        job_definition="job_def", job_name="job_name", job_queue="queue123"
    )
    with pytest.raises(FAIL, match="Failed to submit job 'job_name'"):
        task.run()
def run(
    self,
    container_id: str = None,
    docker_server_url: str = "unix:///var/run/docker.sock",
    raise_on_exit_code: bool = True,
) -> dict:
    """
    Task run method.

    Args:
        - container_id (str, optional): The id of a container to wait on
        - docker_server_url (str, optional): URL for the Docker server. Defaults to
            `unix:///var/run/docker.sock` however other hosts such as
            `tcp://0.0.0.0:2375` can be provided
        - raise_on_exit_code (bool, optional): whether to raise a `FAIL` signal for
            a nonzero exit code; defaults to `True`

    Returns:
        - dict: a dictionary with `StatusCode` and `Error` keys

    Raises:
        - ValueError: if `container_id` is `None`
        - FAIL: if `raise_on_exit_code` is `True` and the container exits with a
            nonzero exit code
    """
    if not container_id:
        raise ValueError("A container id must be provided.")

    # 'import docker' is expensive time-wise, we should do this just-in-time to keep
    # the 'import prefect' time low
    import docker

    self.logger.debug("Starting to wait on container with id {}".format(container_id))
    client = docker.APIClient(base_url=docker_server_url, version="auto")

    result = client.wait(container=container_id)
    if raise_on_exit_code and (
        (result.get("Error") is not None) or result.get("StatusCode")
    ):
        try:
            logs = client.logs(container_id)
            self.logger.error(logs.decode())
        except Exception as exc:
            self.logger.exception(exc)
        raise FAIL(
            "{id} failed with exit code {code}: {msg}".format(
                id=container_id,
                code=result.get("StatusCode"),
                msg=result.get("Error"),
            )
        )
    self.logger.debug("Completed waiting on container with id {}".format(container_id))
    return result
def _get_data_from_url(self, api_url: str, params: Dict) -> Dict:
    """
    Retrieve data from a Cube.js API.

    Args:
        - api_url (str): The URL of the Cube API to call.
        - params (dict): Parameters to be passed to the API call.

    Raises:
        - `prefect.engine.signals.FAIL` if the response has `status_code != 200`.
        - `prefect.engine.signals.FAIL` if the REST API takes longer than
            `max_wait_time` seconds to respond.

    Returns:
        - Cube.js REST API JSON response
    """
    session = Session()
    session.headers = {
        "Content-type": "application/json",
        "Authorization": self.api_token,
    }

    elapsed_wait_time = 0
    while not self.max_wait_time or elapsed_wait_time <= self.max_wait_time:
        with session.get(url=api_url, params=params) as response:
            if response.status_code == 200:
                data = response.json()

                # Cube.js returns a "Continue wait" error while the query is
                # still running; sleep and poll again.
                if "error" in data and "Continue wait" in data["error"]:
                    time.sleep(self.wait_api_call_secs)
                    elapsed_wait_time += self.wait_api_call_secs
                    continue
                else:
                    return data
            else:
                raise FAIL(
                    message=f"Cube.js load API failed! Error is: {response.reason}"
                )

    msg = f"Cube.js load API took longer than {self.max_wait_time} seconds to provide a response."
    raise FAIL(message=msg)
def run(
    self,
    container_id: str = None,
    docker_server_url: str = "unix:///var/run/docker.sock",
    raise_on_exit_code: bool = True,
    extra_docker_kwargs: dict = None,
) -> dict:
    """
    Task run method.

    Args:
        - container_id (str, optional): The id of a container to wait on
        - docker_server_url (str, optional): URL for the Docker server. Defaults to
            `unix:///var/run/docker.sock` however other hosts such as
            `tcp://0.0.0.0:2375` can be provided
        - raise_on_exit_code (bool, optional): whether to raise a `FAIL` signal for
            a nonzero exit code; defaults to `True`
        - extra_docker_kwargs (dict, optional): Extra keyword arguments to pass
            through to the Docker call (cf. method `wait`). See
            https://docker-py.readthedocs.io/en/stable/api.html for more details

    Returns:
        - dict: a dictionary with `StatusCode` and `Error` keys

    Raises:
        - ValueError: if `container_id` is `None`
        - FAIL: if `raise_on_exit_code` is `True` and the container exits with a
            nonzero exit code
    """
    if not container_id:
        raise ValueError("A container id must be provided.")

    # 'import docker' is expensive time-wise, we should do this just-in-time to keep
    # the 'import prefect' time low
    import docker

    self.logger.debug(f"Waiting on container {container_id}")
    client = docker.APIClient(base_url=docker_server_url, version="auto")

    result = client.wait(container=container_id, **(extra_docker_kwargs or {}))
    if raise_on_exit_code and (
        (result.get("Error") is not None) or result.get("StatusCode")
    ):
        try:
            logs = client.logs(container_id)
            self.logger.error(logs.decode())
        except Exception as exc:
            self.logger.exception(exc)
        raise FAIL(
            "{id} failed with exit code {code}: {msg}".format(
                id=container_id,
                code=result.get("StatusCode"),
                msg=result.get("Error"),
            )
        )
    self.logger.debug(f"Container {container_id} has finished")
    return result
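# A hedged usage sketch for the wait task above in a Prefect 1.x flow; the task
# class name `WaitOnContainer` and the container id are assumptions for
# illustration, and a Docker daemon must be reachable at the default socket.
from prefect import Flow
from prefect.tasks.docker import WaitOnContainer  # assumed import path

wait_on_container = WaitOnContainer()

with Flow("wait-on-container") as flow:
    # raises FAIL if the container exits with a nonzero status code
    status = wait_on_container(container_id="my-container-id")  # hypothetical id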
def get_sync_run_status(self, sync_id: int) -> Dict:
    """
    Return the status of a sync run.
    The status is obtained by calling the
    [Get sync run status API](https://hightouch.io/docs/syncs/api/#get-the-status-of-a-sync-run).

    Args:
        - sync_id (int): The sync identifier.

    Raises:
        - `prefect.engine.signals.FAIL` if the response status code is not 200.

    Returns:
        - The JSON response containing information about the status of
            the sync run.
    """
    url = f"{self.__HIGHTOUCH_GET_SYNC_RUN_STATUS}/{sync_id}"
    with self.session.get(url) as response:
        if response.status_code != 200:
            msg = f"Error while retrieving sync run status. Error is: {response.reason}."
            raise FAIL(message=msg)

        return response.json()
def run(
    self,
    api_key: str = None,
    api_key_env_var: str = None,
    mql_server_url: str = None,
    mql_server_url_env_var: str = None,
    model_key_id: int = None,
    materialization_name: str = None,
    start_time: str = None,
    end_time: str = None,
    output_table: str = None,
    force: bool = False,
    wait_for_creation: bool = True,
):
    """
    Task run method to create a materialization against a Transform metrics
    layer deployment. All parameters can be provided either during task
    initialization or directly in this `run` method.

    Args:
        - api_key (str, optional): Transform API Key to be used to connect
            to Transform MQL Server.
        - api_key_env_var (str, optional): The name of the environment variable
            that contains the API Key to be used to connect to Transform MQL Server.
        - mql_server_url (str, optional): The URL of the Transform MQL Server
            from which to create the materialization.
        - mql_server_url_env_var (str, optional): The name of the environment
            variable that contains the URL of the Transform MQL Server from which
            to create the materialization.
        - model_key_id (int, optional): The unique identifier of the Transform
            model against which the materialization will be created.
        - materialization_name (str, optional): The name of the Transform
            materialization to create.
        - start_time (str, optional): The UTC start time of the materialization.
        - end_time (str, optional): The UTC end time of the materialization.
        - output_table (str, optional): The name of the database table, in the form
            of `schema_name.table_name`, where the materialization will be created.
        - force (bool, optional): Whether to force the materialization creation.
            Defaults to `False`.
        - wait_for_creation (bool, optional): Whether to wait for the
            materialization creation to complete. Defaults to `True`.

    Raises:
        - `ValueError` if both `api_key` and `api_key_env_var` are missing.
        - `ValueError` if both `mql_server_url` and `mql_server_url_env_var`
            are missing.
        - `ValueError` if `materialization_name` is missing.
        - `prefect.engine.signals.FAIL` if the connection with the Transform
            server cannot be established.
        - `prefect.engine.signals.FAIL` if the materialization creation
            process fails.

    Returns:
        - An `MqlQueryStatusResp` object if `wait_for_creation` is `False`.
        - An `MqlMaterializeResp` object if `wait_for_creation` is `True`.
    """
    # Raise error if both api_key and api_key_env_var are missing
    if not (api_key or api_key_env_var):
        msg = "Both `api_key` and `api_key_env_var` are missing."
        raise ValueError(msg)

    # Raise error if api_key is missing and env var is not found
    if not api_key and api_key_env_var not in os.environ:
        msg = "`api_key` is missing and `api_key_env_var` not found in env vars."
        raise ValueError(msg)

    mql_api_key = api_key or os.environ[api_key_env_var]

    # Raise error if both mql_server_url and mql_server_url_env_var are missing
    if not (mql_server_url or mql_server_url_env_var):
        msg = "Both `mql_server_url` and `mql_server_url_env_var` are missing."
        raise ValueError(msg)

    # Raise error if mql_server_url is missing and env var is not found
    if not mql_server_url and mql_server_url_env_var not in os.environ:
        msg = "`mql_server_url` is missing and `mql_server_url_env_var` not found in env vars."
        raise ValueError(msg)

    mql_url = mql_server_url or os.environ[mql_server_url_env_var]

    if not materialization_name:
        msg = "`materialization_name` is missing."
        raise ValueError(msg)

    use_async = not wait_for_creation

    try:
        mql_client = MQLClient(
            api_key=mql_api_key, mql_server_url=mql_url, use_async=use_async
        )
    except (AuthException, URLException) as e:
        msg = f"Cannot connect to Transform server! Error is: {e.msg}"
        raise FAIL(message=msg)

    response = None
    if use_async:
        response = mql_client.create_materialization(
            materialization_name=materialization_name,
            start_time=start_time,
            end_time=end_time,
            model_key_id=model_key_id,
            output_table=output_table,
            force=force,
        )
        if response.is_failed:
            msg = f"Transform materialization async creation failed! Error is: {response.error}"
            raise FAIL(message=msg)
    else:
        try:
            response = mql_client.materialize(
                materialization_name=materialization_name,
                start_time=start_time,
                end_time=end_time,
                model_key_id=model_key_id,
                output_table=output_table,
                force=force,
            )
        except QueryRuntimeException as e:
            msg = f"Transform materialization sync creation failed! Error is: {e.msg}"
            raise FAIL(message=msg)

    return response
def run(
    self,
    uri: str = None,
    dataset_id: str = None,
    table: str = None,
    project: str = None,
    schema: List[bigquery.SchemaField] = None,
    location: str = "US",
    credentials: dict = None,
    **kwargs,
):
    """
    Run method for this Task. Invoked by _calling_ this Task within a Flow context,
    after initialization.

    Args:
        - uri (str, optional): GCS path to load data from
        - dataset_id (str, optional): the id of a destination dataset to write the
            records to; if not provided here, will default to the one provided at
            initialization
        - table (str, optional): the name of a destination table to write the
            records to; if not provided here, will default to the one provided at
            initialization
        - project (str, optional): the project to initialize the BigQuery Client
            with; if not provided, will default to the one inferred from your
            credentials
        - schema (List[bigquery.SchemaField], optional): the schema to use when
            creating the table
        - location (str, optional): location of the dataset that will be written
            to; defaults to "US"
        - credentials (dict, optional): a JSON document containing Google Cloud
            credentials. You should provide these at runtime with an upstream
            Secret task. If not provided, Prefect will first check `context` for
            `GCP_CREDENTIALS` and lastly will use default Google client logic.
        - **kwargs (optional): additional kwargs to pass to the
            `bigquery.LoadJobConfig`; see the documentation here:
            https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.client.Client.html

    Raises:
        - ValueError: if all required arguments haven't been provided
        - FAIL: if the load job results in an error

    Returns:
        - google.cloud.bigquery.job.LoadJob: the response from `load_table_from_uri`
    """
    # check for any argument inconsistencies
    if dataset_id is None or table is None:
        raise ValueError("Both dataset_id and table must be provided.")

    # create client
    client = get_bigquery_client(project=project, credentials=credentials)

    # get table reference
    table_ref = client.dataset(dataset_id).table(table)

    # load data
    autodetect = kwargs.pop("autodetect", True)
    job_config = bigquery.LoadJobConfig(autodetect=autodetect, **kwargs)
    if schema:
        job_config.schema = schema

    load_job = client.load_table_from_uri(
        uri,
        table_ref,
        location=location,
        job_config=job_config,
    )
    try:
        load_job.result()  # block until job is finished
    except Exception as exception:
        # surface any per-row errors reported by the job before failing
        for error in load_job.errors or []:
            self.logger.error(error)
        raise FAIL(exception) from exception

    # remove unpickleable attributes
    load_job._client = None
    load_job._completion_lock = None

    return load_job
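# A sketch of kwargs the task above forwards to `bigquery.LoadJobConfig`,
# assuming a CSV file in GCS; the bucket, dataset, and table names are
# hypothetical.
from google.cloud import bigquery

load_kwargs = dict(
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,                 # skip the CSV header row
    write_disposition="WRITE_TRUNCATE",  # replace any existing table contents
)
# e.g. task.run(uri="gs://my-bucket/data.csv", dataset_id="my_dataset",
#               table="my_table", **load_kwargs)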
def third_task():
    from prefect.engine.signals import FAIL

    raise FAIL(message=my_logger("This is sensitive data"))
def transform(data):
    """Multiply the input by 10"""
    raise FAIL("I am a failure")
def trythis(x, fail=False):
    if fail:
        raise FAIL()
    return [k + 1 for k in x]
def run(
    self,
    airbyte_server_host: str = None,
    airbyte_server_port: int = None,
    airbyte_api_version: str = None,
    connection_id: str = None,
    poll_interval_s: int = 15,
) -> dict:
    """
    Task run method for triggering an Airbyte Connection.

    *It is assumed that the user will have previously configured a
    Source & Destination into a Connection.* e.g. MySql -> CSV

    An invocation of `run` will attempt to start a sync job for the specified
    `connection_id` representing the Connection in Airbyte. `run` will poll the
    Airbyte Server for the Connection status and will only complete when the
    sync has completed or when it receives an error status code from an API call.

    Args:
        - airbyte_server_host (str, optional): Hostname of Airbyte server where
            connection is configured. Will overwrite the value provided at init
            if provided.
        - airbyte_server_port (int, optional): Port that the Airbyte server is
            listening on. Will overwrite the value provided at init if provided.
        - airbyte_api_version (str, optional): Version of Airbyte API to use to
            trigger connection sync. Will overwrite the value provided at init
            if provided.
        - connection_id (str, optional): if provided, will overwrite the value
            provided at init.
        - poll_interval_s (int, optional): this task polls the Airbyte API for
            status; if provided, this value will override the default polling
            time of 15 seconds.

    Returns:
        - dict: the `connection_id`, the connection `status`, and the final
            `job_status` with its `job_created_at` / `job_updated_at` timestamps
    """
    if not connection_id:
        raise ValueError("Value for parameter `connection_id` *must* be provided.")

    try:
        uuid.UUID(connection_id)
    except (TypeError, ValueError):
        raise ValueError(
            "Parameter `connection_id` *must* be a valid UUID, "
            "i.e. 32 hex characters, including hyphens."
        )

    # see https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com
    # /rapidoc-api-docs.html#overview
    airbyte_base_url = (
        f"http://{airbyte_server_host}:"
        f"{airbyte_server_port}/api/{airbyte_api_version}"
    )

    session = requests.Session()
    self._check_health_status(session, airbyte_base_url)
    self.logger.info(
        f"Getting Airbyte Connection {connection_id}, poll interval "
        f"{poll_interval_s} seconds, airbyte_base_url {airbyte_base_url}"
    )

    connection_status = self._get_connection_status(
        session, airbyte_base_url, connection_id
    )
    if connection_status == self.CONNECTION_STATUS_ACTIVE:
        # Trigger manual sync on the Connection ...
        job_id, job_created_at = self._trigger_manual_sync_connection(
            session, airbyte_base_url, connection_id
        )

        job_status = self.JOB_STATUS_PENDING
        # possible job states: pending | running | incomplete | failed
        # | succeeded | cancelled
        while job_status not in [self.JOB_STATUS_FAILED, self.JOB_STATUS_SUCCEEDED]:
            job_status, job_created_at, job_updated_at = self._get_job_status(
                session, airbyte_base_url, job_id
            )
            if job_status == self.JOB_STATUS_SUCCEEDED:
                self.logger.info(f"Job {job_id} succeeded.")
            elif job_status == self.JOB_STATUS_FAILED:
                self.logger.error(f"Job {job_id} failed.")
            else:
                # wait for next poll interval
                sleep(poll_interval_s)

        return {
            "connection_id": connection_id,
            "status": connection_status,
            "job_status": job_status,
            "job_created_at": job_created_at,
            "job_updated_at": job_updated_at,
        }
    elif connection_status == self.CONNECTION_STATUS_INACTIVE:
        self.logger.error(
            f"Please enable the Connection {connection_id} in Airbyte Server."
        )
        raise FAIL(
            f"Please enable the Connection {connection_id} in Airbyte Server."
        )
    elif connection_status == self.CONNECTION_STATUS_DEPRECATED:
        self.logger.error(f"Connection {connection_id} is deprecated.")
        raise FAIL(f"Connection {connection_id} is deprecated.")
def run(
    self,
    client: str = None,
    waiter_name: str = None,
    waiter_definition: dict = None,
    waiter_kwargs: dict = None,
    credentials: str = None,
):
    """
    Task for waiting on a long-running AWS job. Uses the underlying boto3 waiter
    functionality.

    Args:
        - client (str): The AWS client on which to wait (e.g., 'batch', 'ec2', etc)
        - waiter_name (str, optional): The name of the waiter to instantiate.
            Can be a boto-supported waiter or one of prefect's custom waiters.
            Currently, prefect offers three additional waiters for AWS Batch:
            `"JobExists"` waits for a job to be instantiated, `"JobRunning"` waits
            for a job to start running, and `"JobComplete"` waits for a job to
            finish. You can find the definitions for all prefect-defined waiters
            [here](https://github.com/PrefectHQ/prefect/tree/master/src/prefect/tasks/aws/waiters).  # noqa
            You may also use a custom waiter name, if you supply an accompanying
            waiter definition dict.
        - waiter_definition (dict, optional): A valid custom waiter model, as a
            dict. Note that if you supply a custom definition, it is assumed that
            the provided 'waiter_name' is contained within the waiter definition
            dict.
        - waiter_kwargs (dict, optional): Arguments to pass to the
            `waiter.wait(...)` method. Will depend upon the specific waiter being
            called.
        - credentials (str, optional): your AWS credentials passed from an upstream
            Secret task; this Secret must be a JSON string with two keys:
            `ACCESS_KEY` and `SECRET_ACCESS_KEY` which will be passed directly to
            `boto3`. If not provided here or in context, `boto3` will fall back on
            standard AWS rules for authentication.
    """
    if not client:
        raise ValueError("An AWS client string must be provided.")

    if not waiter_name:
        raise ValueError("A waiter name must be provided.")

    if not waiter_kwargs:
        waiter_kwargs = {}

    boto_client = get_boto_client(client, credentials=credentials, **self.boto_kwargs)
    if waiter_definition:
        # Use user-provided waiter definition
        waiter_model = WaiterModel(waiter_definition)
        waiter = create_waiter_with_client(waiter_name, waiter_model, boto_client)
    else:
        # Use either boto-provided or prefect-provided waiter
        if waiter_name in boto_client.waiter_names:
            waiter = boto_client.get_waiter(waiter_name)
        else:
            waiter = self._load_prefect_waiter(boto_client, client, waiter_name)

    try:
        waiter.wait(**waiter_kwargs)
    except WaiterError as e:
        raise FAIL(
            f"AWS {client} waiter '{waiter_name}' failed with: {str(e)}"
        ) from e
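# A sketch of a custom waiter definition the task above accepts via
# `waiter_definition`, following the botocore waiter model format. The waiter
# name, delay, and acceptors below are illustrative, not one of prefect's
# bundled waiters.
waiter_definition = {
    "version": 2,
    "waiters": {
        "MyJobComplete": {  # must match the `waiter_name` passed to the task
            "operation": "DescribeJobs",
            "delay": 5,  # seconds between polls
            "maxAttempts": 100,
            "acceptors": [
                {
                    "matcher": "pathAll",
                    "argument": "jobs[].status",
                    "expected": "SUCCEEDED",
                    "state": "success",
                },
                {
                    "matcher": "pathAny",
                    "argument": "jobs[].status",
                    "expected": "FAILED",
                    "state": "failure",
                },
            ],
        }
    },
}
# e.g. task.run(client="batch", waiter_name="MyJobComplete",
#               waiter_definition=waiter_definition,
#               waiter_kwargs={"jobs": ["<job-id>"]})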
def run(
    self,
    api_secret: str = None,
    api_secret_env_var: str = None,
    from_date: str = None,
    to_date: str = None,
    limit: int = None,
    event: Union[str, List[str]] = None,
    where: str = None,
    parse_response: bool = False,
    use_eu_server: bool = False,
    group_events: bool = False,
):
    """
    Task run method to request a data export from Mixpanel using the Export API.

    Args:
        - api_secret (str, optional): The API secret key to use to authenticate
            to Mixpanel. Can also be provided via env var.
        - api_secret_env_var (str, optional): The name of the env var that contains
            the API secret key to use to authenticate to Mixpanel. `api_secret`
            takes precedence over `api_secret_env_var`.
        - from_date (str, optional): Start date of the export request. If provided
            as a string, it should be in the format `YYYY-MM-DD`. Default value is
            `2011-07-10`. This date is inclusive.
        - to_date (str, optional): End date of the export request. If provided as a
            string, it should be in the format `YYYY-MM-DD`. Default value is
            `prefect.context.today`. This date is inclusive.
        - limit (int, optional): The max number of events to return.
        - event (str, list, optional): The event, or events, that you wish to get
            the data for.
        - where (str, optional): An expression to filter events by. More info on
            expression sequence structure can be found at
            https://developer.mixpanel.com/reference/segmentation-expressions.
        - parse_response (bool, optional): Whether to parse the response into a
            JSON object. Default value is `False`.
        - use_eu_server (bool, optional): Whether to use the Mixpanel EU server to
            retrieve data. More info at
            https://help.mixpanel.com/hc/en-us/articles/360039135652-Data-Residency-in-EU.
            Default is `False`.
        - group_events (bool, optional): Whether to group events with the same
            name. This is taken into account only if `parse_response` is `True`.

    Returns:
        - if `parse_response` is False, a `str` response pulled from the Export API
            (which is basically a JSONL string).
        - if `parse_response` is True and `group_events` is True, a `dict` where
            each key contains homogeneous events.
        - if `parse_response` is True and `group_events` is False, a `list` of JSON
            objects obtained by parsing the response.

    Raises:
        - `ValueError` if both `api_secret` and `api_secret_env_var` are missing.
        - `ValueError` if `api_secret` is missing and `api_secret_env_var` is
            not found.
        - `prefect.engine.signals.FAIL` if the Mixpanel API returns an error.
    """
    if not api_secret and not api_secret_env_var:
        raise ValueError("Missing both `api_secret` and `api_secret_env_var`.")
    elif not api_secret and api_secret_env_var not in os.environ:
        raise ValueError("Missing `api_secret` and `api_secret_env_var` not found.")

    if api_secret:
        self.logger.debug("Got secret from `api_secret`")
        secret = api_secret
    else:
        self.logger.debug("Got secret from env var passed from `api_secret_env_var`")
        secret = os.environ[api_secret_env_var]

    params = {"from_date": from_date, "to_date": to_date}

    if limit:
        params["limit"] = limit
    if event:
        params["event"] = json.dumps([event] if isinstance(event, str) else event)
    if where:
        params["where"] = where

    url = "https://{server}.mixpanel.com/api/2.0/export".format(
        server="data-eu" if use_eu_server else "data"
    )

    response = requests.get(
        url=url,
        auth=HTTPBasicAuth(secret, ""),
        headers={"Accept": "application/json"},
        params=params,
    )

    if response.status_code != 200:
        msg = f"""
        Mixpanel export API error.
        Status code: {response.status_code}
        Reason: {response.reason}
        Text: {response.text}
        """
        raise FAIL(message=msg)

    events = response.text
    if not events:
        return None
    elif parse_response:
        received_events = [json.loads(event) for event in events.splitlines()]
        if group_events:
            grouped_events = defaultdict(list)
            for received_event in received_events:
                grouped_events[received_event["event"]].append(
                    received_event["properties"]
                )
            return dict(grouped_events)
        return received_events
    else:
        return events
def run(
    self,
    subdomain: str = None,
    url: str = None,
    api_secret: str = None,
    api_secret_env_var: str = "CUBEJS_API_SECRET",
    query: Union[Dict, List[Dict]] = None,
    security_context: Union[str, Dict] = None,
    wait_time_between_api_calls: int = 10,
    max_wait_time: int = None,
):
    """
    Task run method to perform a query using Cube.js load API.

    Args:
        - subdomain (str, optional): The subdomain to use to get the data.
            If provided, `subdomain` takes precedence over `url`.
            This is likely to be useful to Cube Cloud users.
        - url (str, optional): The URL to use to get the data.
            This is likely to be useful to users of self-hosted Cube.js.
        - api_secret (str, optional): The API secret used to generate an
            API token for authentication.
            If provided, it takes precedence over `api_secret_env_var`.
        - api_secret_env_var (str, optional): The name of the env var that contains
            the API secret to use to generate an API token for authentication.
            Defaults to `CUBEJS_API_SECRET`.
        - query (dict, list, optional): `dict` or `list` representing valid Cube.js
            queries. If you pass multiple queries, then be aware of Cube.js Data
            Blending. More info at
            https://cube.dev/docs/rest-api#api-reference-v-1-load and at
            https://cube.dev/docs/schema/advanced/data-blending.
            Query format can be found at: https://cube.dev/docs/query-format.
        - security_context (str, dict, optional): The security context to use
            during authentication. If the security context does not contain an
            expiration period, then a 7-day expiration period is added
            automatically. More info at https://cube.dev/docs/security/context.
        - wait_time_between_api_calls (int, optional): The number of seconds to
            wait between API calls. Defaults to 10.
        - max_wait_time (int, optional): The number of seconds to wait for the
            Cube.js load API to return a response.

    Raises:
        - ValueError if both `subdomain` and `url` are missing.
        - ValueError if `api_secret` is missing and `api_secret_env_var` cannot
            be found.
        - ValueError if `query` is missing.
        - `prefect.engine.signals.FAIL` if the Cube.js load API fails.
        - `prefect.engine.signals.FAIL` if the Cube.js load API takes more than
            `max_wait_time` seconds to respond.

    Returns:
        - The Cube.js JSON response.
    """
    if not subdomain and not url:
        raise ValueError("Missing both `subdomain` and `url`.")

    if not api_secret and api_secret_env_var not in os.environ:
        raise ValueError("Missing `api_secret` and `api_secret_env_var` not found.")

    if not query:
        raise ValueError("Missing `query`.")

    cube_base_url = self.__CUBEJS_CLOUD_BASE_URL
    if subdomain:
        cube_base_url = f"{cube_base_url.format(subdomain=subdomain)}/cubejs-api"
    else:
        cube_base_url = url
    query_api_url = f"{cube_base_url}/v1/load"

    self.logger.debug(f"Query URL: {query_api_url}")

    secret = api_secret if api_secret else os.environ[api_secret_env_var]

    if security_context:
        extended_context = security_context
        if "exp" not in security_context and "expiresIn" not in security_context:
            extended_context["expiresIn"] = "7d"
        api_token = jwt.encode(payload=extended_context, key=secret, algorithm="HS256")
        self.logger.debug("JWT token generated with security context.")
    else:
        api_token = jwt.encode(payload={}, key=secret)

    session = Session()
    session.headers = {
        "Content-type": "application/json",
        "Authorization": api_token,
    }

    params = {"query": json.dumps(query)}

    wait_api_call_secs = (
        wait_time_between_api_calls if wait_time_between_api_calls > 0 else 10
    )
    elapsed_wait_time = 0
    while not max_wait_time or elapsed_wait_time <= max_wait_time:
        with session.get(url=query_api_url, params=params) as response:
            self.logger.debug(f"URL is: {response.url}")

            if response.status_code == 200:
                data = response.json()

                if "error" in data and "Continue wait" in data["error"]:
                    msg = (
                        "Cube.js load API still running. "
                        f"Waiting {wait_api_call_secs} seconds before retrying"
                    )
                    self.logger.info(msg)
                    time.sleep(wait_api_call_secs)
                    elapsed_wait_time += wait_api_call_secs
                    continue
                else:
                    return data
            else:
                raise FAIL(
                    message=f"Cube.js load API failed! Error is: {response.reason}"
                )

    raise FAIL(
        message=f"Cube.js load API took longer than {max_wait_time} seconds to provide a response."
    )
def run(
    self,
    subdomain: str = None,
    email_address: str = None,
    api_token: str = None,
    api_token_env_var: str = None,
    start_time: Union[int, datetime] = None,
    cursor: str = None,
    exclude_deleted: bool = None,
    include_entities: List[str] = None,
):
    """
    Task run method to perform an incremental export of tickets from Zendesk.

    Args:
        - subdomain (str, optional): The Zendesk subdomain to use to export tickets.
        - email_address (str, optional): The email address to use to authenticate
            on Zendesk.
        - api_token (str, optional): The API token to use to authenticate on
            Zendesk. If passed, it will take precedence over `api_token_env_var`.
        - api_token_env_var (str, optional): The name of the env var which contains
            the API token to use to authenticate on Zendesk.
        - start_time (int, datetime, optional): The start time to use to export
            tickets. Can be passed as an epoch timestamp or a `datetime` object.
        - cursor (str, optional): The cursor to use to export tickets.
            If passed, it will take precedence over `start_time`.
        - exclude_deleted (bool, optional): Whether to exclude deleted tickets.
            Defaults to `False`.
        - include_entities (str, list, optional): Optional list of entities to side
            load. More info at
            https://developer.zendesk.com/documentation/ticketing/using-the-zendesk-api/side_loading/.

    Raises:
        - `ValueError` if both `api_token` and `api_token_env_var` are missing.
        - `ValueError` if `api_token` is missing and `api_token_env_var` cannot
            be found.
        - `ValueError` if `subdomain` is missing.
        - `ValueError` if `email_address` is missing.
        - `ValueError` if both `start_time` and `cursor` are missing.
        - `prefect.engine.signals.FAIL` if the Zendesk API call fails.

    Returns:
        - A `dict` containing the list of tickets and, optionally, the included
            entities.
    """
    if not api_token and not api_token_env_var:
        raise ValueError("Both `api_token` and `api_token_env_var` are missing.")

    if not api_token and api_token_env_var not in os.environ:
        raise ValueError("`api_token` is missing and `api_token_env_var` not found.")

    token = api_token if api_token else os.environ[api_token_env_var]

    if not subdomain:
        raise ValueError("`subdomain` is missing.")

    if not email_address:
        raise ValueError("`email_address` is missing.")

    if not start_time and not cursor:
        raise ValueError("Both `start_time` and `cursor` are missing.")

    base_url = self._ZENDESK_API_BASE_URL.format(subdomain=subdomain)
    export_url = f"{base_url}/incremental/tickets/cursor.json"

    if cursor:
        self.logger.debug("Got cursor")
        export_url = f"{export_url}?cursor={cursor}"
    elif start_time:
        self.logger.debug("Got start_time")
        start_datetime = (
            start_time if isinstance(start_time, int) else int(start_time.timestamp())
        )
        export_url = f"{export_url}?start_time={start_datetime}"

    if exclude_deleted:
        export_url = f"{export_url}&exclude_deleted=true"

    if include_entities:
        if isinstance(include_entities, str):
            include_entities_str = include_entities
        elif isinstance(include_entities, list):
            include_entities_str = ",".join(list(set(include_entities)))
        export_url = f"{export_url}&include={include_entities_str}"

    session = requests.Session()
    session.auth = f"{email_address}/token", token

    end_of_stream = False
    tickets = defaultdict(list)

    while not end_of_stream:
        with session.get(export_url) as response:
            self.logger.debug(f"Export URL is: {export_url}")

            if response.status_code == 429:
                retry_after_seconds = int(response.headers["retry-after"])
                self.logger.warning(
                    "API rate limit reached! "
                    f"Waiting for {retry_after_seconds} seconds before retrying."
                )
                time.sleep(retry_after_seconds + 1)
                continue
            elif response.status_code != 200:
                msg = f"""
                Zendesk API call failed!
                Status: {response.status_code}
                Reason: {response.reason}
                """
                raise FAIL(message=msg)

            content = response.json()

            tickets["tickets"].extend(content["tickets"])

            if include_entities:
                for include_entity in list(set(include_entities)):
                    if include_entity in content.keys():
                        tickets[include_entity].extend(content[include_entity])

            end_of_stream = content["end_of_stream"]
            export_url = content["after_url"]
            cursor = content["after_cursor"]

            if not end_of_stream:
                # Try to avoid the rate limit: 10 requests per minute
                time.sleep(0.1)

    return tickets
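# A hedged usage sketch for the incremental export above: the first run is
# seeded with a UTC `start_time`; the subdomain, email address, and entity
# names are hypothetical.
from datetime import datetime, timezone

start = datetime(2022, 1, 1, tzinfo=timezone.utc)
# e.g. result = task.run(subdomain="mycompany", email_address="me@example.com",
#                        api_token_env_var="ZENDESK_API_TOKEN",
#                        start_time=start, include_entities=["users"])
# result["tickets"] holds the exported tickets; result["users"] the side-loaded users.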
def do_something():
    raise FAIL("test")
def i_am_bad():
    raise FAIL("Oh no!")
def run(
    self,
    server_uri: str = None,
    user: str = None,
    password: str = None,
    db_name: str = None,
    server_uri_env_var: str = None,
    user_env_var: str = None,
    password_env_var: str = None,
    db_name_env_var: str = None,
    cypher_query: str = None,
    return_result_as: str = __DEFAULT_RETURN_RESULT_TYPE,
):
    """
    Task run method to run a Cypher query against Neo4j.

    Args:
        - server_uri (str, optional): The Neo4j URI to connect to. More information
            regarding the accepted forms for `server_uri` can be found at
            https://py2neo.org/2021.1/profiles.html. This parameter, if provided,
            takes precedence over `server_uri_env_var`.
        - user (str, optional): The user to use to connect to Neo4j.
            This parameter, if provided, takes precedence over `user_env_var`.
        - password (str, optional): The password to use to connect to Neo4j.
            This parameter, if provided, takes precedence over `password_env_var`.
        - db_name (str, optional): The database name where the Cypher query will
            run. This parameter, if provided, takes precedence over
            `db_name_env_var`.
        - server_uri_env_var (str, optional): The name of the environment variable
            that contains the Neo4j server URI to connect to.
        - user_env_var (str, optional): The name of the environment variable that
            contains the user to use to connect to Neo4j.
        - password_env_var (str, optional): The name of the environment variable
            that contains the password to use to connect to Neo4j.
        - db_name_env_var (str, optional): The name of the environment variable
            that contains the database name where the Cypher query will run.
        - cypher_query (str, optional): The Cypher query to run. More information
            about the Cypher query language can be found at
            https://neo4j.com/developer/cypher/.
        - return_result_as (str, optional): How to return the result. Accepted
            values are `raw`, `dataframe`. Defaults to `raw` (which will return a
            `list` of `dict`). Applies only when the query result is not empty.

    Returns:
        - `None` if the query result is empty.
        - The original result if `return_result_as` is `raw`.
        - A `pandas.DataFrame` if `return_result_as` is `dataframe`.

    Raises:
        - `ValueError` if both `server_uri` and `server_uri_env_var` are `None`.
        - `ValueError` if `server_uri` is `None` and `server_uri_env_var` is
            not found.
        - `ValueError` if both `user` and `user_env_var` are `None`.
        - `ValueError` if `user` is `None` and `user_env_var` is not found.
        - `ValueError` if both `password` and `password_env_var` are `None`.
        - `ValueError` if `password` is `None` and `password_env_var` is not found.
        - `ValueError` if `db_name` is `None` and `db_name_env_var` is not found.
        - `ValueError` if `cypher_query` is `None`.
        - `ValueError` if `return_result_as` is not one of `raw`, `dataframe`.
        - `prefect.engine.signals.FAIL` if any error occurs while establishing the
            connection with Neo4j.
        - `prefect.engine.signals.FAIL` if any error occurs while running the
            Cypher query.
    """
    if not server_uri and not server_uri_env_var:
        msg = "Please provide either the `server_uri` or the `server_uri_env_var`."
        raise ValueError(msg)

    if not server_uri and server_uri_env_var not in os.environ:
        msg = f"`{server_uri_env_var}` not found in environment variables."
        raise ValueError(msg)

    neo4j_uri = server_uri or os.environ[server_uri_env_var]

    if not user and not user_env_var:
        msg = "Please provide either the `user` or the `user_env_var`."
        raise ValueError(msg)

    if not user and user_env_var not in os.environ:
        msg = f"`{user_env_var}` not found in environment variables."
        raise ValueError(msg)

    neo4j_user = user or os.environ[user_env_var]

    if not password and not password_env_var:
        msg = "Please provide either the `password` or the `password_env_var`."
        raise ValueError(msg)

    if not password and password_env_var not in os.environ:
        msg = f"`{password_env_var}` not found in environment variables."
        raise ValueError(msg)

    neo4j_password = password or os.environ[password_env_var]

    neo4j_db_name = None
    if db_name:
        neo4j_db_name = db_name
    elif db_name_env_var and db_name_env_var not in os.environ:
        msg = f"`{db_name_env_var}` not found in environment variables."
        raise ValueError(msg)
    elif db_name_env_var and db_name_env_var in os.environ:
        neo4j_db_name = os.environ[db_name_env_var]

    if not cypher_query:
        raise ValueError("Please provide a value for `cypher_query`.")

    if return_result_as not in self.__ACCEPTED_RETURN_RESULT_TYPES:
        msg = f"Illegal value for `return_result_as`. Illegal value is: {return_result_as}."
        raise ValueError(msg)

    try:
        graph = Graph(
            profile=neo4j_uri,
            name=neo4j_db_name,
            auth=(neo4j_user, neo4j_password),
        )
    except ConnectionUnavailable as e:
        msg = f"Error while connecting to Neo4j. Exception: {str(e)}"
        raise FAIL(message=msg)

    try:
        r = graph.run(cypher_query)
    except ClientError as e:
        msg = f"Error while running Cypher query. Exception: {str(e)}"
        raise FAIL(message=msg)

    result = r.data()
    if not result:
        return None
    elif return_result_as == "dataframe":
        return r.to_data_frame()
    else:
        return result
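# A short usage sketch for the Cypher task above; the query and env var names
# are illustrative. With `return_result_as="dataframe"` the rows come back as
# a `pandas.DataFrame` instead of a list of dicts.
cypher = "MATCH (m:Movie) RETURN m.title AS title LIMIT 5"
# e.g. df = task.run(server_uri_env_var="NEO4J_URI", user_env_var="NEO4J_USER",
#                    password_env_var="NEO4J_PASSWORD", cypher_query=cypher,
#                    return_result_as="dataframe")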