def test_transform(schema, actual, expected, expected_warns, capsys):
    """Normalize ``actual`` in place against ``schema`` and verify the result.

    Also checks stdout: a warning text must appear when ``expected_warns``
    is given, and stdout must stay empty otherwise.
    """
    transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)
    transformer.transform(actual, schema)
    # Compare via serialized JSON so key order and value types both count.
    assert json.dumps(actual) == json.dumps(expected)
    captured = capsys.readouterr().out
    if not expected_warns:
        assert not captured
    else:
        assert expected_warns in captured
def test_transform(schema, actual, expected, expected_warns, caplog):
    """Normalize ``actual`` in place and verify both the data and the log output.

    When ``expected_warns`` is set, exactly the first captured record must be
    an "airbyte" WARNING carrying that message; otherwise nothing may be logged.
    """
    transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)
    transformer.transform(actual, schema)
    # Serialize both sides so type coercions (e.g. int -> str) are visible.
    assert json.dumps(actual) == json.dumps(expected)
    if not expected_warns:
        assert len(caplog.records) == 0
    else:
        first_record = caplog.records[0]
        assert first_record.name == "airbyte"
        assert first_record.levelname == "WARNING"
        assert first_record.message == expected_warns
def test_source_config_transform(abstract_source, catalog):
    """Both HTTP and non-HTTP streams must normalize records to their schema.

    Each stream emits ``{"value": 23}`` with a schema that declares the field
    as a string, so every record read back must contain ``{"value": "23"}``.
    """
    logger_mock = MagicMock()
    http_stream, non_http_stream = abstract_source.streams(None)
    for stream in (http_stream, non_http_stream):
        stream.transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)
    http_stream.get_json_schema.return_value = non_http_stream.get_json_schema.return_value = SCHEMA
    http_stream.read_records.return_value = [{"value": 23}]
    non_http_stream.read_records.return_value = [{"value": 23}]
    records = list(abstract_source.read(logger=logger_mock, config={}, catalog=catalog, state={}))
    assert len(records) == 2
    assert [r.record.data for r in records] == [{"value": "23"}] * 2
class NotAStream:
    # Deliberately invalid setup: registering a custom transform on a
    # transformer built WITHOUT TransformConfig.CustomSchemaNormalization.
    # Executing this class body is expected to raise (the same pattern is
    # asserted via pytest.raises elsewhere in these tests).
    transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)

    @transformer.registerCustomTransform
    def transform_cb(instance, schema):
        pass
class BaseLookerStream(HttpStream, ABC):
    """Base looker class"""

    # Coerce record values toward the declared JSON schema.
    transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)

    def __init__(self, *, domain: str, **kwargs: Any):
        self._domain = domain
        super().__init__(**kwargs)

    @property
    def primary_key(self) -> Optional[Union[str, List[str]]]:
        # No common primary key across Looker streams.
        return None

    @property
    def authenticator(self) -> TokenAuthenticator:
        # Prefer an authenticator already attached to the session, if any.
        session_auth = self._session.auth
        if session_auth:
            return session_auth
        return super().authenticator

    @property
    def url_base(self) -> str:
        return "https://{}/api/{}/".format(self._domain, API_VERSION)

    def next_page_token(self, response: requests.Response, **kwargs: Any) -> Optional[Mapping[str, Any]]:
        # Base streams are single-page; subclasses override when paginated.
        return None
class AirtableStream(HttpStream, ABC):
    """Stream over a single Airtable table within a base."""

    url_base = "https://api.airtable.com/v0/"
    primary_key = "id"
    transformer: TypeTransformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)

    def __init__(self, base_id: str, table_name: str, schema, **kwargs):
        super().__init__(**kwargs)
        self.base_id = base_id
        self.table_name = table_name
        self.schema = schema

    @property
    def name(self):
        # The stream is named after the table it reads.
        return self.table_name

    def get_json_schema(self) -> Mapping[str, Any]:
        # Schema is supplied by the caller at construction time.
        return self.schema

    def next_page_token(
            self,
            response: requests.Response) -> Optional[Mapping[str, Any]]:
        # Airtable signals more pages via an "offset" cursor in the body.
        offset = response.json().get("offset", None)
        return {"offset": offset} if offset else None

    def request_params(
            self,
            stream_state: Mapping[str, Any],
            stream_slice: Mapping[str, any] = None,
            next_page_token: Mapping[str, Any] = None) -> MutableMapping[str, Any]:
        # The only query parameter we ever send is the pagination cursor.
        return next_page_token if next_page_token else {}

    def process_records(self, records):
        # Flatten Airtable's envelope: lift "fields" to the top level and
        # keep id/createdTime under reserved "_airtable_*" keys.
        for record in records:
            yield {
                "_airtable_id": record.get("id"),
                "_airtable_created_time": record.get("createdTime"),
                **record.get("fields", {}),
            }

    def parse_response(self, response: requests.Response,
                       **kwargs) -> Iterable[Mapping]:
        yield from self.process_records(response.json().get("records", []))

    def path(self,
             stream_state: Mapping[str, Any] = None,
             stream_slice: Mapping[str, Any] = None,
             next_page_token: Mapping[str, Any] = None) -> str:
        return f"{self.base_id}/{self.table_name}"
class NotAStream:
    # Test helper combining custom and default normalization. The asserts
    # inside the callback verify ordering: the default schema cast has already
    # run (int 12 -> "12") by the time the custom transform is invoked.
    transformer = TypeTransformer(TransformConfig.CustomSchemaNormalization | TransformConfig.DefaultSchemaNormalization)

    @transformer.registerCustomTransform
    def transform_cb(instance, schema):
        # Check default conversion applied
        assert instance == "12"
        assert schema == SIMPLE_SCHEMA["properties"]["value"]
        return "transformed"
class Transactions(PaypalTransactionStream):
    """List Paypal Transactions on a specific date range
    API Docs: https://developer.paypal.com/docs/integration/direct/transaction-search/#list-transactions
    Endpoint: /v1/reporting/transactions
    """

    data_field = "transaction_details"
    # Composite (nested) primary key / cursor paths into the record.
    primary_key = [["transaction_info", "transaction_id"]]
    cursor_field = ["transaction_info", "transaction_initiation_date"]
    transformer = TypeTransformer(TransformConfig.CustomSchemaNormalization)

    # TODO handle API error when 1 request returns more than 10000 records.
    # https://github.com/airbytehq/airbyte/issues/4404
    records_per_request = 10000

    def path(self, **kwargs) -> str:
        return "transactions"

    def next_page_token(
            self,
            response: requests.Response) -> Optional[Mapping[str, Any]]:
        """Advance to the next page while PayPal reports more pages."""
        decoded_response = response.json()
        total_pages = decoded_response.get("total_pages")
        page_number = decoded_response.get("page")
        # Fix: guard missing pagination metadata — the original compared
        # possibly-None values, raising TypeError on Python 3.
        if page_number is None or total_pages is None:
            return None
        if page_number < total_pages:
            return {"page": page_number + 1}
        return None

    def request_params(
            self,
            stream_state: Mapping[str, Any],
            stream_slice: Mapping[str, any] = None,
            next_page_token: Mapping[str, Any] = None) -> MutableMapping[str, Any]:
        """Build the date-windowed, paginated query for /transactions."""
        page_number = 1
        if next_page_token:
            page_number = next_page_token.get("page")
        return {
            "start_date": stream_slice["start_date"],
            "end_date": stream_slice["end_date"],
            "fields": "all",
            "page_size": self.page_size,
            "page": page_number,
        }

    @transformer.registerCustomTransform
    def transform_function(original_value: Any, field_schema: Dict[str, Any]) -> Any:
        """Cast string-encoded numerics to the schema-declared number/integer type."""
        # Fix: use .get — schemas without a plain "type" key (e.g. a type
        # list) previously raised KeyError; now they fall through unchanged.
        schema_type = field_schema.get("type")
        if isinstance(original_value, str) and schema_type == "number":
            return float(original_value)
        if isinstance(original_value, str) and schema_type == "integer":
            return int(original_value)
        return original_value
class FreshserviceStream(HttpStream, ABC):
    """Base Freshservice stream, paginated via the HTTP ``Link: next`` header."""

    primary_key = "id"
    order_field = "updated_at"
    page_size = 30
    transformer: TypeTransformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)

    def __init__(self, start_date: str = None, domain_name: str = None, **kwargs):
        super().__init__(**kwargs)
        self._start_date = start_date
        self.domain_name = domain_name

    @property
    def url_base(self) -> str:
        return "https://{}/api/v2/".format(self.domain_name)

    def next_page_token(
            self,
            response: requests.Response) -> Optional[Mapping[str, Any]]:
        # requests parses the Link header into response.links; extract the
        # "page" query parameter of the "next" URL, if one was provided.
        next_link = response.links.get("next")
        if not next_link:
            return None
        query_params = dict(parse_qsl(urlparse(next_link.get("url")).query))
        return {"page": query_params.get("page")}

    def request_params(
            self,
            stream_state: Mapping[str, Any],
            stream_slice: Mapping[str, any] = None,
            next_page_token: Mapping[str, Any] = None) -> MutableMapping[str, Any]:
        params = {"per_page": self.page_size}
        if next_page_token:
            params.update(next_page_token)
        return params

    def parse_response(self, response: requests.Response,
                       **kwargs) -> Iterable[Mapping]:
        # Responses are wrapped in an object keyed by self.object_name;
        # when object_name is None the payload itself is the record list.
        json_response = response.json()
        if self.object_name is not None:
            yield from json_response.get(self.object_name, [])
        else:
            yield from json_response

    def path(self,
             stream_state: Mapping[str, Any] = None,
             stream_slice: Mapping[str, Any] = None,
             next_page_token: Mapping[str, Any] = None) -> str:
        return self.object_name
class ChannelReports(HttpSubStream):
    "https://developers.google.com/youtube/reporting/v1/reports/channel_reports"

    name = None
    primary_key = None
    cursor_field = "date"
    url_base = ""
    transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)

    def __init__(self, name: str, dimensions: List[str], **kwargs):
        # Stream name and key are supplied per-report at construction time.
        self.name = name
        self.primary_key = dimensions
        super().__init__(**kwargs)

    def next_page_token(
            self,
            response: requests.Response) -> Optional[Mapping[str, Any]]:
        # A report is downloaded as one CSV file — there is no pagination.
        return None

    def parse_response(self, response: requests.Response,
                       **kwargs) -> Iterable[Mapping]:
        # The report body is CSV text; stream it out row by row as dicts.
        yield from csv.DictReader(io.StringIO(response.text))

    def get_updated_state(
            self, current_stream_state: MutableMapping[str, Any],
            latest_record: Mapping[str, Any]) -> Mapping[str, Any]:
        latest_cursor = latest_record[self.cursor_field]
        if not current_stream_state:
            return {self.cursor_field: latest_cursor}
        previous_cursor = current_stream_state[self.cursor_field]
        return {self.cursor_field: max(previous_cursor, latest_cursor)}

    def path(self,
             stream_state: Mapping[str, Any] = None,
             stream_slice: Mapping[str, Any] = None,
             next_page_token: Mapping[str, Any] = None) -> str:
        # The parent stream hands us a direct download URL for the report.
        return stream_slice["parent"]["downloadUrl"]

    def read_records(self, *, stream_slice: Mapping[str, Any] = None,
                     **kwargs) -> Iterable[Mapping[str, Any]]:
        if stream_slice.get("parent"):
            yield from super().read_records(stream_slice=stream_slice, **kwargs)
        else:
            # Nothing to download without a parent record.
            self.logger.info("no data from parent stream")
def test_transform_wrong_config():
    """Invalid TransformConfig combinations must be rejected with clear errors."""
    # NoTransform is mutually exclusive with every other flag.
    with pytest.raises(Exception, match="NoTransform option cannot be combined with other flags."):
        TypeTransformer(TransformConfig.NoTransform | TransformConfig.DefaultSchemaNormalization)

    # Registering a custom transform requires CustomSchemaNormalization; the
    # error fires while the class body executes, inside the raises context.
    with pytest.raises(Exception, match="Please set TransformConfig.CustomSchemaNormalization config before registering custom normalizer"):

        class NotAStream:
            transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)

            @transformer.registerCustomTransform
            def transform_cb(instance, schema):
                pass
class FacebookPagesStream(HttpStream, ABC):
    """Base stream for the Facebook Pages Graph API (cursor-paginated)."""

    url_base = "https://graph.facebook.com/v12.0/"
    primary_key = "id"
    # Envelope key under which the Graph API returns record lists.
    data_field = "data"
    transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)

    def __init__(
        self,
        access_token: str = None,
        page_id: str = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self._access_token = access_token
        self._page_id = page_id

    @property
    def path_param(self):
        # Endpoint path parameters use the singular form of the stream name.
        return self.name[:-1]

    def next_page_token(
            self,
            response: requests.Response) -> Optional[Mapping[str, Any]]:
        """Return the cursor for the next page, or None when done."""
        data = response.json()
        # Fix: return None (not {}) at end of pagination, matching the
        # Optional[...] contract; {} only worked by accident of truthiness.
        if not data.get("data") or not data.get("paging"):
            return None
        return {
            "limit": 100,
            "after": data.get("paging", {}).get("cursors", {}).get("after"),
        }

    def request_params(
        self,
        stream_state: Mapping[str, Any],
        stream_slice: Mapping[str, any] = None,
        next_page_token: Mapping[str, Any] = None,
    ) -> MutableMapping[str, Any]:
        """Always send the access token; merge in any pagination cursor."""
        next_page_token = next_page_token or {}
        return {"access_token": self._access_token, **next_page_token}

    def parse_response(self, response: requests.Response,
                       **kwargs) -> Iterable[Mapping]:
        """Yield records from the data envelope, or the whole payload if none."""
        if not self.data_field:
            yield response.json()
            # Fix: stop here — the original fell through and parsed the body
            # a second time looking up a falsy data_field key.
            return
        yield from response.json().get(self.data_field, [])
class ConfluenceStream(HttpStream, ABC):
    """Base stream for the Confluence REST API (offset/limit pagination)."""

    url_base = "https://{}/wiki/rest/api/"
    primary_key = "id"
    limit = 50
    # Running offset, advanced as pages are consumed (instance-level state).
    start = 0
    # Sub-resources to expand in responses; subclasses may override.
    expand = []
    transformer: TypeTransformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)

    def __init__(self, config: Dict):
        super().__init__(authenticator=config["authenticator"])
        self.config = config
        self.url_base = self.url_base.format(config["domain_name"])

    def next_page_token(
            self,
            response: requests.Response) -> Optional[Mapping[str, Any]]:
        """Advance the offset while the API advertises a "next" link."""
        json_response = response.json()
        # Fix: "_links" may be absent — the original called .get("next") on
        # None and raised AttributeError.
        links = json_response.get("_links") or {}
        if links.get("next"):
            self.start += self.limit
            return {"start": self.start}
        return None

    def request_params(
            self,
            stream_state: Mapping[str, Any],
            stream_slice: Mapping[str, any] = None,
            next_page_token: Mapping[str, Any] = None) -> MutableMapping[str, Any]:
        params = {"limit": self.limit, "expand": ",".join(self.expand)}
        if next_page_token:
            params.update({"start": next_page_token["start"]})
        return params

    def parse_response(self, response: requests.Response,
                       **kwargs) -> Iterable[Mapping]:
        # Records are wrapped in a "results" array.
        yield from response.json().get("results", [])

    def path(self,
             stream_state: Mapping[str, Any] = None,
             stream_slice: Mapping[str, Any] = None,
             next_page_token: Mapping[str, Any] = None) -> str:
        return self.api_name
class SellerFeedbackReports(IncrementalReportsAmazonSPStream):
    """
    Field definitions: https://sellercentral.amazon.com/help/hub/reference/G202125660
    """

    name = "GET_SELLER_FEEDBACK_DATA"
    cursor_field = "Date"
    transformer: TypeTransformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization | TransformConfig.CustomSchemaNormalization)

    @transformer.registerCustomTransform
    def transform_function(original_value: Any, field_schema: Dict[str, Any]) -> Any:
        # Feedback report dates arrive as "M/D/YY"; rewrite them to ISO date
        # strings so they satisfy the schema's "date" format. Everything else
        # (including empty values) passes through untouched.
        is_date_field = "format" in field_schema and field_schema["format"] == "date"
        if original_value and is_date_field:
            return pendulum.from_format(original_value, "M/D/YY").to_date_string()
        return original_value
class BulkSalesforceStream(SalesforceStream):
    # Reads Salesforce data through the asynchronous Bulk API 2.0:
    # create a query job, poll it to completion, download the CSV result,
    # then delete the job. Pagination is emulated with WHERE-clause filters.
    page_size = 30000
    DEFAULT_WAIT_TIMEOUT_MINS = 10
    MAX_CHECK_INTERVAL_SECONDS = 2.0
    MAX_RETRY_NUMBER = 3

    def __init__(self, wait_timeout: Optional[int], **kwargs):
        super().__init__(**kwargs)
        # Fall back to the default when wait_timeout is None or 0.
        self._wait_timeout = wait_timeout or self.DEFAULT_WAIT_TIMEOUT_MINS

    def path(self, **kwargs) -> str:
        return f"/services/data/{self.sf_api.version}/jobs/query"

    transformer = TypeTransformer(TransformConfig.CustomSchemaNormalization | TransformConfig.DefaultSchemaNormalization)

    @transformer.registerCustomTransform
    def transform_empty_string_to_none(instance, schema):
        """
        BULK API returns a `csv` file, where all values are initially as string type.
        This custom transformer replaces empty lines with `None` value.
        """
        if isinstance(instance, str) and not instance.strip():
            instance = None
        return instance

    @default_backoff_handler(max_tries=5, factor=15)
    def _send_http_request(self, method: str, url: str, json: dict = None):
        # Authenticated request with retry/backoff; raises on HTTP errors so
        # callers can inspect error.response.
        headers = self.authenticator.get_auth_header()
        response = self._session.request(method, url=url, headers=headers, json=json)
        response.raise_for_status()
        return response

    def create_stream_job(self, query: str, url: str) -> Optional[str]:
        """
        docs: https://developer.salesforce.com/docs/atlas.en-us.api_asynch.meta/api_asynch/create_job.htm

        Returns the new job id, or None when the sobject cannot be queried
        via the Bulk API (those errors are logged, not raised).
        """
        json = {
            "operation": "queryAll",
            "query": query,
            "contentType": "CSV",
            "columnDelimiter": "COMMA",
            "lineEnding": "LF"
        }
        try:
            response = self._send_http_request("POST", url, json=json)
            job_id = response.json()["id"]
            self.logger.info(f"Created Job: {job_id} to sync {self.name}")
            return job_id
        except exceptions.HTTPError as error:
            if error.response.status_code in [
                    codes.FORBIDDEN, codes.BAD_REQUEST
            ]:
                # Some sobjects are simply not Bulk-API capable (compound
                # fields, unsupported entities); log and skip rather than fail.
                error_data = error.response.json()[0]
                if (error_data.get("message", "") ==
                        "Selecting compound data not supported in Bulk Query"
                    ) or (error_data.get("errorCode", "") == "INVALIDENTITY"
                          and "is not supported by the Bulk API"
                          in error_data.get("message", "")):
                    self.logger.error(
                        f"Cannot receive data for stream '{self.name}' using BULK API, error message: '{error_data.get('message')}'"
                    )
                elif error.response.status_code == codes.FORBIDDEN and not error_data.get(
                        "errorCode", "") == "REQUEST_LIMIT_EXCEEDED":
                    self.logger.error(
                        f"Cannot receive data for stream '{self.name}', error message: '{error_data.get('message')}'"
                    )
                else:
                    raise error
            else:
                raise error

    def wait_for_job(self, url: str) -> str:
        """Poll the job status until terminal or the wait timeout elapses."""
        # using "seconds" argument because self._wait_timeout can be changed by tests
        expiration_time: DateTime = pendulum.now().add(
            seconds=int(self._wait_timeout * 60.0))
        job_status = "InProgress"
        delay_timeout = 0
        delay_cnt = 0
        job_info = None
        # minimal starting delay is 0.5 seconds.
        # this value was received empirically
        time.sleep(0.5)
        while pendulum.now() < expiration_time:
            job_info = self._send_http_request("GET", url=url).json()
            job_status = job_info["state"]
            if job_status in ["JobComplete", "Aborted", "Failed"]:
                return job_status
            # Exponentially growing poll interval, capped at
            # MAX_CHECK_INTERVAL_SECONDS.
            if delay_timeout < self.MAX_CHECK_INTERVAL_SECONDS:
                delay_timeout = 0.5 + math.exp(delay_cnt) / 1000.0
                delay_cnt += 1
            time.sleep(delay_timeout)
            job_id = job_info["id"]
            self.logger.info(
                f"Sleeping {delay_timeout} seconds while waiting for Job: {self.name}/{job_id}"
                f" to complete. Current state: {job_status}")
        self.logger.warning(
            f"Not wait the {self.name} data for {self._wait_timeout} minutes, data: {job_info}!!"
        )
        return job_status

    def execute_job(self, query: str, url: str) -> str:
        """Create a job and wait for it, retrying up to MAX_RETRY_NUMBER times.

        Returns the job URL on success, None when the job could not be
        created; raises when every attempt ends Aborted/Failed.
        """
        job_status = "Failed"
        for i in range(0, self.MAX_RETRY_NUMBER):
            job_id = self.create_stream_job(query=query, url=url)
            if not job_id:
                return None
            job_full_url = f"{url}/{job_id}"
            job_status = self.wait_for_job(url=job_full_url)
            if job_status not in ["UploadComplete", "InProgress"]:
                break
            # Still not terminal after the timeout: abort and retry.
            self.logger.error(
                f"Waiting error. Try to run this job again {i+1}/{self.MAX_RETRY_NUMBER}..."
            )
            self.abort_job(url=job_full_url)
            job_status = "Aborted"
        if job_status in ["Aborted", "Failed"]:
            self.delete_job(url=job_full_url)
            raise Exception(
                f"Job for {self.name} stream using BULK API was failed.")
        return job_full_url

    def filter_null_bytes(self, s: str):
        """
        https://github.com/airbytehq/airbyte/issues/8300
        """
        # Strip NUL bytes that corrupt downstream CSV parsing.
        res = s.replace("\x00", "")
        if len(res) < len(s):
            self.logger.warning(
                "Filter 'null' bytes from string, size reduced %d -> %d chars",
                len(s), len(res))
        return res

    def download_data(self, url: str) -> Tuple[int, dict]:
        """Yield (row_index, record_dict) pairs from the job's CSV results.

        Row 0 is the header. NOTE(review): `head` would be unbound if the
        CSV were empty — presumably the Bulk API always emits a header row.
        """
        job_data = self._send_http_request("GET", f"{url}/results")
        decoded_content = self.filter_null_bytes(
            job_data.content.decode("utf-8"))
        csv_data = csv.reader(decoded_content.splitlines(), delimiter=",")
        for i, row in enumerate(csv_data):
            if i == 0:
                head = row
            else:
                yield i, dict(zip(head, row))

    def abort_job(self, url: str):
        data = {"state": "Aborted"}
        self._send_http_request("PATCH", url=url, json=data)
        self.logger.warning("Broken job was aborted")

    def delete_job(self, url: str):
        self._send_http_request("DELETE", url=url)

    def next_page_token(self, last_record: dict) -> str:
        # "Pagination" is a WHERE filter on the primary key of the last
        # record; returns None (implicitly) for unsupported streams.
        if self.primary_key and self.name not in UNSUPPORTED_FILTERING_STREAMS:
            return f"WHERE {self.primary_key} >= '{last_record[self.primary_key]}' "

    def read_records(
        self,
        sync_mode: SyncMode,
        cursor_field: List[str] = None,
        stream_slice: Mapping[str, Any] = None,
        stream_state: Mapping[str, Any] = None,
    ) -> Iterable[Mapping[str, Any]]:
        """Run Bulk jobs page by page until a short page or no next token."""
        stream_state = stream_state or {}
        next_page_token = None
        while True:
            params = self.request_params(stream_state=stream_state,
                                         stream_slice=stream_slice,
                                         next_page_token=next_page_token)
            path = self.path(stream_state=stream_state,
                             stream_slice=stream_slice,
                             next_page_token=next_page_token)
            job_full_url = self.execute_job(query=params["q"],
                                            url=f"{self.url_base}{path}")
            if not job_full_url:
                return
            count = 0
            for count, record in self.download_data(url=job_full_url):
                yield record
            self.delete_job(url=job_full_url)
            if count < self.page_size:
                # this is a last page
                break
            next_page_token = self.next_page_token(record)
            if not next_page_token:
                # not found a next page data.
                break
class BigcommerceStream(HttpStream, ABC):
    # Base stream for the BigCommerce REST API, with date-time values
    # normalized to ISO format via a custom transform.
    # Latest Stable Release
    api_version = "v3"
    # Page size
    limit = 250
    # Define primary key as sort key for full_refresh, or very first sync for incremental_refresh
    primary_key = "id"
    order_field = "date_modified:asc"
    filter_field = "date_modified:min"
    # Envelope key the API wraps record lists in.
    data = "data"
    transformer: TypeTransformer = TypeTransformer(
        TransformConfig.DefaultSchemaNormalization | TransformConfig.CustomSchemaNormalization)

    def __init__(self, start_date: str, store_hash: str, access_token: str,
                 **kwargs):
        super().__init__(**kwargs)
        self.start_date = start_date
        self.store_hash = store_hash
        self.access_token = access_token

    @transformer.registerCustomTransform
    def transform_function(original_value: Any,
                           field_schema: Dict[str, Any]) -> Any:
        """
        This functions tries to handle the various date-time formats BigCommerce API returns and normalize the values to isoformat.
        """
        if "format" in field_schema and field_schema["format"] == "date-time":
            if not original_value:  # Some dates are empty strings: "".
                return None
            transformed_value = None
            supported_formats = [
                "YYYY-MM-DD", "YYYY-MM-DDTHH:mm:ssZZ", "YYYY-MM-DDTHH:mm:ss[Z]",
                "ddd, D MMM YYYY HH:mm:ss ZZ"
            ]
            # NOTE(review): the loop tries every format and keeps the LAST
            # successful parse (no break on success) — confirm this is
            # intentional before adding an early exit.
            for format in supported_formats:
                try:
                    transformed_value = str(
                        pendulum.from_format(
                            original_value, format))  # str() returns isoformat
                except ValueError:
                    continue
            if not transformed_value:
                raise ValueError(
                    f"Unsupported date-time format for {original_value}")
            return transformed_value
        return original_value

    @property
    def url_base(self) -> str:
        return f"https://api.bigcommerce.com/stores/{self.store_hash}/{self.api_version}/"

    def next_page_token(
            self,
            response: requests.Response) -> Optional[Mapping[str, Any]]:
        # Page forward while meta.pagination reports more pages.
        json_response = response.json()
        meta = json_response.get("meta", None)
        if meta:
            pagination = meta.get("pagination", None)
            if pagination and pagination.get("current_page") < pagination.get(
                    "total_pages"):
                return dict(page=pagination.get("current_page") + 1)
            else:
                return None

    def request_params(
            self,
            stream_state: Mapping[str, Any],
            stream_slice: Mapping[str, any] = None,
            next_page_token: Mapping[str, Any] = None) -> MutableMapping[str, Any]:
        # First page filters from start_date; subsequent pages carry the
        # page number from next_page_token instead.
        params = {"limit": self.limit}
        params.update({"sort": self.order_field})
        if next_page_token:
            params.update(**next_page_token)
        else:
            params[self.filter_field] = self.start_date
        return params

    def request_headers(
            self,
            stream_state: Mapping[str, Any],
            stream_slice: Mapping[str, Any] = None,
            next_page_token: Mapping[str, Any] = None) -> Mapping[str, Any]:
        headers = super().request_headers(stream_state=stream_state,
                                          stream_slice=stream_slice,
                                          next_page_token=next_page_token)
        headers.update({
            "Accept": "application/json",
            "Content-Type": "application/json"
        })
        return headers

    def parse_response(self, response: requests.Response,
                       **kwargs) -> Iterable[Mapping]:
        # Unwrap the data envelope; when self.data is None the payload
        # itself is the record list.
        json_response = response.json()
        records = json_response.get(
            self.data, []) if self.data is not None else json_response
        yield from records
class Engage(MixpanelStream):
    """Return list of all users
    API Docs: https://developer.mixpanel.com/reference/engage
    Endpoint: https://mixpanel.com/api/2.0/engage
    """

    http_method: str = "POST"
    data_field: str = "results"
    primary_key: str = "distinct_id"
    page_size: int = 1000  # min 100
    # Total result count from the first page; used to decide when to stop
    # paginating.
    _total: Any = None

    # enable automatic object mutation to align with desired schema before outputting to the destination
    transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)

    def path(self, **kwargs) -> str:
        return "engage"

    def request_body_json(
        self,
        stream_state: Mapping[str, Any],
        stream_slice: Mapping[str, Any] = None,
        next_page_token: Mapping[str, Any] = None,
    ) -> Optional[Mapping]:
        return {"include_all_users": True}

    def request_params(
            self,
            stream_state: Mapping[str, Any],
            stream_slice: Mapping[str, any] = None,
            next_page_token: Mapping[str, Any] = None) -> MutableMapping[str, Any]:
        params = {"page_size": self.page_size}
        if next_page_token:
            params.update(next_page_token)
        return params

    def next_page_token(
            self,
            response: requests.Response) -> Optional[Mapping[str, Any]]:
        # Keep the session_id stable across pages; stop once page_size *
        # (page + 1) covers the total reported by the first page.
        decoded_response = response.json()
        page_number = decoded_response.get("page")
        total = decoded_response.get("total")  # exist only on first page
        if total:
            self._total = total
        if self._total and page_number is not None and self._total > self.page_size * (
                page_number + 1):
            return {
                "session_id": decoded_response.get("session_id"),
                "page": page_number + 1,
            }
        else:
            self._total = None
            return None

    def process_response(self, response: requests.Response,
                         **kwargs) -> Iterable[Mapping]:
        """Flatten Mixpanel engage records into plain dicts.

        Each API record looks like::

            {"$distinct_id": "...", "$properties": {"$browser": "Chrome", ...}}

        The "$distinct_id" becomes "distinct_id", and each property key has
        its leading "$" (Mixpanel's marker for reserved properties) removed.
        """
        records = response.json().get(self.data_field, {})
        for record in records:
            item = {"distinct_id": record["$distinct_id"]}
            properties = record["$properties"]
            for property_name in properties:
                this_property_name = property_name
                if property_name.startswith("$"):
                    # Just remove leading '$' for 'reserved' mixpanel properties name, example:
                    # from API: '$browser'
                    # to stream: 'browser'
                    this_property_name = this_property_name[1:]
                item[this_property_name] = properties[property_name]
            yield item

    def get_json_schema(self) -> Mapping[str, Any]:
        """
        :return: A dict of the JSON schema representing this stream.

        The default implementation of this method looks for a JSONSchema file with the same name as this stream's "name" property.
        Override as needed.
        """
        schema = super().get_json_schema()

        # Set whether to allow additional properties for engage and export endpoints
        # Event and Engage properties are dynamic and depend on the properties provided on upload,
        # when the Event or Engage (user/person) was created.
        schema["additionalProperties"] = self.additional_properties

        # Map Mixpanel property types to JSON-schema fragments; unknown
        # types fall back to nullable string below.
        types = {
            "boolean": {
                "type": ["null", "boolean"]
            },
            "number": {
                "type": ["null", "number"],
                "multipleOf": 1e-20
            },
            "datetime": {
                "type": ["null", "string"],
                "format": "date-time"
            },
            "object": {
                "type": ["null", "object"],
                "additionalProperties": True
            },
            "list": {
                "type": ["null", "array"],
                "required": False,
                "items": {}
            },
            "string": {
                "type": ["null", "string"]
            },
        }

        # read existing Engage schema from API
        schema_properties = EngageSchema(
            **self.get_stream_params()).read_records(
                sync_mode=SyncMode.full_refresh)
        for property_entry in schema_properties:
            property_name: str = property_entry["name"]
            property_type: str = property_entry["type"]
            if property_name.startswith("$"):
                # Just remove leading '$' for 'reserved' mixpanel properties name, example:
                # from API: '$browser'
                # to stream: 'browser'
                property_name = property_name[1:]
            # Do not overwrite 'standard' hard-coded properties, add 'custom' properties
            if property_name not in schema["properties"]:
                schema["properties"][property_name] = types.get(
                    property_type, {"type": ["null", "string"]})

        return schema
class MondayStream(HttpStream, ABC):
    # Base stream for the Monday.com GraphQL API: the query (including the
    # field selection built from the JSON schema) is sent as a "query"
    # request parameter.
    url_base: str = "https://api.monday.com/v2"
    primary_key: str = "id"
    # Running page counter (instance-level pagination state).
    page: int = 1

    transformer: TypeTransformer = TypeTransformer(
        TransformConfig.DefaultSchemaNormalization)

    def next_page_token(
            self,
            response: requests.Response) -> Optional[Mapping[str, Any]]:
        # NOTE(review): self.page is incremented on every response, even
        # when no token is returned — confirm streams are not reused across
        # reads, otherwise the counter never resets.
        json_response = response.json().get("data", {})
        records = json_response.get(self.name.lower(), [])
        self.page += 1
        if records:
            return {"page": self.page}

    def load_schema(self):
        """
        Load schema from file and make a GraphQL query
        """
        # Turn the JSON schema's property names into a GraphQL selection
        # set, nesting one level for object-typed properties.
        script_dir = os.path.dirname(__file__)
        schema_path = os.path.join(script_dir,
                                   f"schemas/{self.name.lower()}.json")
        with open(schema_path) as f:
            schema_dict = json.load(f)
        schema = schema_dict["properties"]
        graphql_schema = []
        for col in schema:
            if "properties" in schema[col]:
                nested_ids = ",".join(schema[col]["properties"])
                graphql_schema.append(f"{col}{{{nested_ids}}}")
            else:
                graphql_schema.append(col)
        return ",".join(graphql_schema)

    def should_retry(self, response: requests.Response) -> bool:
        # Monday API return code 200 with and errors key if complexity is too high.
        # https://api.developer.monday.com/docs/complexity-queries
        # NOTE(review): is_complex_query is the raw "errors" value (a list),
        # so this can return a non-bool truthy value — works in boolean
        # context, but confirm callers only use it as a condition.
        is_complex_query = response.json().get("errors")
        if is_complex_query:
            self.logger.error(response.text)
        return response.status_code == 429 or 500 <= response.status_code < 600 or is_complex_query

    @property
    def retry_factor(self) -> int:
        return 15

    def request_params(
            self,
            stream_state: Mapping[str, Any],
            stream_slice: Mapping[str, any] = None,
            next_page_token: Mapping[str, Any] = None) -> MutableMapping[str, Any]:
        graphql_params = {}
        if next_page_token:
            graphql_params.update(next_page_token)

        graphql_query = ",".join(
            [f"{k}:{v}" for k, v in graphql_params.items()])

        # Monday uses a query string to pass in environments
        params = {
            "query":
            f"query {{ {self.name.lower()} ({graphql_query}) {{ {self.load_schema()} }} }}"
        }
        return params

    def parse_response(self, response: requests.Response,
                       **kwargs) -> Iterable[Mapping]:
        # Records live under data.<stream_name> in the GraphQL response.
        json_response = response.json().get("data", {})
        records = json_response.get(self.name.lower(), [])
        yield from records

    def path(self,
             stream_state: Mapping[str, Any] = None,
             stream_slice: Mapping[str, Any] = None,
             next_page_token: Mapping[str, Any] = None) -> str:
        # The GraphQL endpoint is the url_base itself.
        return ""
class BulkSalesforceStream(SalesforceStream):
    """Salesforce stream that reads through the asynchronous Bulk API 2.0:
    create a query job, poll it until completion, then download and stream
    the CSV results. Falls back to the standard (non-bulk) stream when the
    bulk job fails outright."""

    page_size = 30000
    # total polling budget for one job, in seconds
    DEFAULT_WAIT_TIMEOUT_SECONDS = 600
    # cap for the exponentially-growing poll interval
    MAX_CHECK_INTERVAL_SECONDS = 2.0
    # how many times a stuck job is aborted and re-created before giving up
    MAX_RETRY_NUMBER = 3

    def path(self, next_page_token: Mapping[str, Any] = None, **kwargs: Any) -> str:
        return f"/services/data/{self.sf_api.version}/jobs/query"

    transformer = TypeTransformer(TransformConfig.CustomSchemaNormalization
                                  | TransformConfig.DefaultSchemaNormalization)

    @default_backoff_handler(max_tries=5, factor=15)
    def _send_http_request(self, method: str, url: str, json: dict = None, stream: bool = False):
        """Issue an authenticated request; log the response body of any
        non-200/204 reply before raising via raise_for_status()."""
        headers = self.authenticator.get_auth_header()
        response = self._session.request(method, url=url, headers=headers, json=json, stream=stream)
        if response.status_code not in [200, 204]:
            self.logger.error(
                f"error body: {response.text}, sobject options: {self.sobject_options}"
            )
        response.raise_for_status()
        return response

    def create_stream_job(self, query: str, url: str) -> Optional[str]:
        """
        Create a Bulk API query job and return its id, or None when this
        sobject cannot be read via the Bulk API (logged, not raised).
        docs: https://developer.salesforce.com/docs/atlas.en-us.api_asynch.meta/api_asynch/create_job.html
        """
        json = {
            "operation": "queryAll",
            "query": query,
            "contentType": "CSV",
            "columnDelimiter": "COMMA",
            "lineEnding": "LF"
        }
        try:
            response = self._send_http_request("POST", url, json=json)
            job_id: str = response.json()["id"]
            return job_id
        except exceptions.HTTPError as error:
            if error.response.status_code in [
                    codes.FORBIDDEN, codes.BAD_REQUEST
            ]:
                # Some sobjects can't be used with the BULK API. Every API version may have its own
                # list of such sobjects, and part of them is generated dynamically, so we can't know
                # them up front — the only way is to match the error messages. Known reasons:
                # 1) some Salesforce sobjects (streams) are simply not supported by the BULK API.
                # 2) access to a sobject (stream) is not available.
                # 3) the sobject is not queryable directly; it can only be reached as part of a
                #    response from another sobject. E.g.:
                #    initial query: "Select Id, Subject from ActivityHistory" -> error
                #    updated query: "Select Name, (Select Subject,ActivityType from ActivityHistories) from Contact"
                #    The second variant would require per-sobject customisation, and such subqueries
                #    don't support the CSV response format anyway.
                error_data = error.response.json()[0]
                error_code = error_data.get("errorCode")
                error_message = error_data.get("message", "")
                if error_message == "Selecting compound data not supported in Bulk Query" or (
                        error_code == "INVALIDENTITY"
                        and "is not supported by the Bulk API" in error_message):
                    self.logger.error(
                        f"Cannot receive data for stream '{self.name}' using BULK API, "
                        f"sobject options: {self.sobject_options}, error message: '{error_message}'"
                    )
                elif error.response.status_code == codes.FORBIDDEN and error_code != "REQUEST_LIMIT_EXCEEDED":
                    self.logger.error(
                        f"Cannot receive data for stream '{self.name}' ,"
                        f"sobject options: {self.sobject_options}, error message: '{error_message}'"
                    )
                elif error.response.status_code == codes.BAD_REQUEST and error_message.endswith(
                        "does not support query"):
                    self.logger.error(
                        f"The stream '{self.name}' is not queryable, "
                        f"sobject options: {self.sobject_options}, error message: '{error_message}'"
                    )
                else:
                    raise error
            else:
                raise error
        return None

    def wait_for_job(self, url: str) -> str:
        """Poll the job's status endpoint until it reaches a terminal state or
        the DEFAULT_WAIT_TIMEOUT_SECONDS budget runs out.

        :return: the last observed job state string.
        """
        expiration_time: DateTime = pendulum.now().add(
            seconds=self.DEFAULT_WAIT_TIMEOUT_SECONDS)
        job_status = "InProgress"
        delay_timeout = 0.0
        delay_cnt = 0
        job_info = None
        # minimal starting delay is 0.5 seconds.
        # this value was received empirically
        time.sleep(0.5)
        while pendulum.now() < expiration_time:
            job_info = self._send_http_request("GET", url=url).json()
            job_status = job_info["state"]
            if job_status in ["JobComplete", "Aborted", "Failed"]:
                if job_status != "JobComplete":
                    # this is only job metadata without payload
                    error_message = job_info.get("errorMessage")
                    if not error_message:
                        # not all failed responses carry "errorMessage"; show the full body instead
                        error_message = job_info
                    self.logger.error(
                        f"JobStatus: {job_status}, sobject options: {self.sobject_options}, error message: '{error_message}'"
                    )
                return job_status
            if delay_timeout < self.MAX_CHECK_INTERVAL_SECONDS:
                # exponential ramp-up of the poll interval, capped by MAX_CHECK_INTERVAL_SECONDS
                delay_timeout = 0.5 + math.exp(delay_cnt) / 1000.0
                delay_cnt += 1
            time.sleep(delay_timeout)
            job_id = job_info["id"]
            self.logger.info(
                f"Sleeping {delay_timeout} seconds while waiting for Job: {self.name}/{job_id} to complete. Current state: {job_status}"
            )
        self.logger.warning(
            f"Not wait the {self.name} data for {self.DEFAULT_WAIT_TIMEOUT_SECONDS} seconds, data: {job_info}!!"
        )
        return job_status

    def execute_job(self, query: str,
                    url: str) -> Tuple[Optional[str], Optional[str]]:
        """Create a job and wait for it, retrying a job that stays stuck in
        upload/in-progress up to MAX_RETRY_NUMBER times.

        :return: (job URL or None, final job status or None).
        """
        job_status = "Failed"
        for i in range(0, self.MAX_RETRY_NUMBER):
            job_id = self.create_stream_job(query=query, url=url)
            if not job_id:
                # sobject not usable via BULK API (already logged) — nothing to run
                return None, None
            job_full_url = f"{url}/{job_id}"
            job_status = self.wait_for_job(url=job_full_url)
            if job_status not in ["UploadComplete", "InProgress"]:
                break
            self.logger.error(
                f"Waiting error. Try to run this job again {i + 1}/{self.MAX_RETRY_NUMBER}..."
            )
            self.abort_job(url=job_full_url)
            job_status = "Aborted"
        if job_status in ["Aborted", "Failed"]:
            self.delete_job(url=job_full_url)
            return None, job_status
        return job_full_url, job_status

    def filter_null_bytes(self, s: str):
        """Strip NUL characters from a decoded chunk, logging when anything
        was removed. https://github.com/airbytehq/airbyte/issues/8300"""
        res = s.replace("\x00", "")
        if len(res) < len(s):
            self.logger.warning(
                "Filter 'null' bytes from string, size reduced %d -> %d chars",
                len(s), len(res))
        return res

    def download_data(self, url: str, chunk_size: float = 1024) -> os.PathLike:
        """
        Retrieves binary data result from successfully `executed_job`, using chunks, to avoid local memory limitaions.
        @ url: string - the url of the `executed_job`
        @ chunk_size: float - the buffer size for each chunk to fetch from stream, in bytes, default: 1024 bytes
        Returns the string with file path of downloaded binary data. Saved temporarily.
        """
        # set filepath for binary data from response
        tmp_file = os.path.realpath(os.path.basename(url))
        with closing(
                self._send_http_request("GET", f"{url}/results",
                                        stream=True)) as response:
            with open(tmp_file, "w") as data_file:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    # NOTE(review): writelines() on a str iterates per character;
                    # functionally equivalent to write() here — confirm before changing.
                    data_file.writelines(
                        self.filter_null_bytes(self.decode(chunk)))
        # check the file exists
        if os.path.isfile(tmp_file):
            return tmp_file
        else:
            raise TmpFileIOError(
                f"The IO/Error occured while verifying binary data. Stream: {self.name}, file {tmp_file} doesn't exist."
            )

    def read_with_chunks(
            self,
            path: str = None,
            chunk_size: int = 100) -> Iterable[Tuple[int, Mapping[str, Any]]]:
        """
        Reads the downloaded binary data, using lines chunks, set by `chunk_size`.
        @ path: string - the path to the downloaded temporarily binary data.
        @ chunk_size: int - the number of lines to read at a time, default: 100 lines / time.
        The tmp file is always removed afterwards (finally clause).
        """
        try:
            with open(path, "r", encoding=self.encoding) as data:
                chunks = pd.read_csv(data,
                                     chunksize=chunk_size,
                                     iterator=True,
                                     dialect="unix")
                for chunk in chunks:
                    # normalize pandas NaN to None so records are JSON-friendly
                    chunk = chunk.replace({
                        nan: None
                    }).to_dict(orient="records")
                    for row in chunk:
                        yield row
        except pd.errors.EmptyDataError as e:
            self.logger.info(f"Empty data received. {e}")
            yield from []
        except IOError as ioe:
            raise TmpFileIOError(
                f"The IO/Error occured while reading tmp data. Called: {path}. Stream: {self.name}",
                ioe)
        finally:
            # remove binary tmp file, after data is read
            os.remove(path)

    def abort_job(self, url: str):
        # PATCHing state=Aborted cancels a job that never completed
        data = {"state": "Aborted"}
        self._send_http_request("PATCH", url=url, json=data)
        self.logger.warning("Broken job was aborted")

    def delete_job(self, url: str):
        self._send_http_request("DELETE", url=url)

    def next_page_token(
            self,
            last_record: Mapping[str, Any]) -> Optional[Mapping[str, Any]]:
        """Paginate by primary key: next page starts at the last record's pk value."""
        if self.primary_key and self.name not in UNSUPPORTED_FILTERING_STREAMS:
            return {
                "next_token":
                f"WHERE {self.primary_key} >= '{last_record[self.primary_key]}' "
            }  # type: ignore[index]
        return None

    def request_params(
            self, stream_state: Mapping[str, Any],
            stream_slice: Mapping[str, Any] = None,
            next_page_token: Mapping[str, Any] = None) -> MutableMapping[str, Any]:
        """
        Build the SOQL query selecting every schema property, with pk-based
        pagination appended when supported.
        Salesforce SOQL Query: https://developer.salesforce.com/docs/atlas.en-us.232.0.api_rest.meta/api_rest/dome_queryall.htm
        """
        selected_properties = self.get_json_schema().get("properties", {})
        query = f"SELECT {','.join(selected_properties.keys())} FROM {self.name} "
        if next_page_token:
            query += next_page_token["next_token"]
        if self.primary_key and self.name not in UNSUPPORTED_FILTERING_STREAMS:
            query += f"ORDER BY {self.primary_key} ASC LIMIT {self.page_size}"
        return {"q": query}

    def read_records(
        self,
        sync_mode: SyncMode,
        cursor_field: List[str] = None,
        stream_slice: Mapping[str, Any] = None,
        stream_state: Mapping[str, Any] = None,
    ) -> Iterable[Mapping[str, Any]]:
        """Run bulk jobs page by page, yielding each downloaded record;
        falls back to the standard stream when a job fails."""
        stream_state = stream_state or {}
        next_page_token = None

        while True:
            params = self.request_params(stream_state=stream_state,
                                         stream_slice=stream_slice,
                                         next_page_token=next_page_token)
            path = self.path(stream_state=stream_state,
                             stream_slice=stream_slice,
                             next_page_token=next_page_token)
            job_full_url, job_status = self.execute_job(
                query=params["q"], url=f"{self.url_base}{path}")
            if not job_full_url:
                if job_status == "Failed":
                    # As a rule, BULK logic returns an unhandled error. For instance:
                    # error message: 'Unexpected exception encountered in query processing.
                    # Please contact support with the following id: 326566388-63578 (-436445966)'"
                    # Thus we can try to switch to a GET sync request because its response returns an obvious error message
                    standard_instance = self.get_standard_instance()
                    self.logger.warning(
                        "switch to STANDARD(non-BULK) sync. Because the SalesForce BULK job has returned a failed status"
                    )
                    yield from standard_instance.read_records(
                        sync_mode=sync_mode,
                        cursor_field=cursor_field,
                        stream_slice=stream_slice,
                        stream_state=stream_state)
                    return
                raise SalesforceException(
                    f"Job for {self.name} stream using BULK API was failed.")

            count = 0
            record: Mapping[str, Any] = {}
            for record in self.read_with_chunks(
                    self.download_data(url=job_full_url)):
                count += 1
                yield record
            self.delete_job(url=job_full_url)

            if count < self.page_size:
                # Salesforce doesn't give a next token or something to know the request was
                # the last page. The connector syncs batches of `page_size` and
                # considers a batch smaller than `page_size` to be the last page.
                break

            next_page_token = self.next_page_token(record)
            if not next_page_token:
                # not found a next page data.
                break

    def get_standard_instance(self) -> SalesforceStream:
        """Returns a instance of standard logic(non-BULK) with same settings"""
        stream_kwargs = dict(
            sf_api=self.sf_api,
            pk=self.pk,
            stream_name=self.stream_name,
            schema=self.schema,
            sobject_options=self.sobject_options,
            authenticator=self.authenticator,
        )
        new_cls: Type[SalesforceStream] = SalesforceStream
        if isinstance(self, BulkIncrementalSalesforceStream):
            stream_kwargs.update({
                "replication_key": self.replication_key,
                "start_date": self.start_date
            })
            new_cls = IncrementalSalesforceStream

        return new_cls(**stream_kwargs)
class SalesforceStream(HttpStream, ABC):
    """Standard (non-bulk) Salesforce REST stream reading via queryAll with
    SOQL, paginating through `nextRecordsUrl`."""

    page_size = 2000

    transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)
    # current charset used by decode(); may flip to ISO-8859-1 on decode failure
    encoding = DEFAULT_ENCODING

    def __init__(self,
                 sf_api: Salesforce,
                 pk: str,
                 stream_name: str,
                 sobject_options: Mapping[str, Any] = None,
                 schema: dict = None,
                 **kwargs):
        super().__init__(**kwargs)
        self.sf_api = sf_api
        self.pk = pk
        self.stream_name = stream_name
        self.schema: Mapping[str, Any] = schema  # type: ignore[assignment]
        self.sobject_options = sobject_options

    def decode(self, chunk):
        """
        Most Salesforce instances use UTF-8, but some use ISO-8859-1.
        By default, we'll decode using UTF-8, and fallback to ISO-8859-1 if it doesn't work.
        Note: on fallback the instance's `encoding` attribute is switched permanently.
        See implementation considerations for more details
        https://developer.salesforce.com/docs/atlas.en-us.api.meta/api/implementation_considerations.htm
        """
        if self.encoding == DEFAULT_ENCODING:
            try:
                decoded = chunk.decode(self.encoding)
                return decoded
            except UnicodeDecodeError as e:
                self.encoding = "ISO-8859-1"
                self.logger.info(
                    f"Could not decode chunk. Falling back to {self.encoding} encoding. Error: {e}"
                )
                # retry with the fallback encoding (takes the else branch below)
                return self.decode(chunk)
        else:
            return chunk.decode(self.encoding)

    @property
    def name(self) -> str:
        return self.stream_name

    @property
    def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]:
        return self.pk

    @property
    def url_base(self) -> str:
        return self.sf_api.instance_url

    def path(self, next_page_token: Mapping[str, Any] = None, **kwargs: Any) -> str:
        if next_page_token:
            """
            If `next_page_token` is set, subsequent requests use `nextRecordsUrl`.
            """
            next_token: str = next_page_token["next_token"]
            return next_token
        return f"/services/data/{self.sf_api.version}/queryAll"

    def next_page_token(
            self,
            response: requests.Response) -> Optional[Mapping[str, Any]]:
        # Salesforce returns `nextRecordsUrl` when more records are available
        response_data = response.json()
        next_token = response_data.get("nextRecordsUrl")
        return {"next_token": next_token} if next_token else None

    def request_params(
            self, stream_state: Mapping[str, Any],
            stream_slice: Mapping[str, Any] = None,
            next_page_token: Mapping[str, Any] = None) -> MutableMapping[str, Any]:
        """
        Build the SOQL query over all schema properties.
        Salesforce SOQL Query: https://developer.salesforce.com/docs/atlas.en-us.232.0.api_rest.meta/api_rest/dome_queryall.htm
        """
        if next_page_token:
            """
            If `next_page_token` is set, subsequent requests use `nextRecordsUrl`, and do not include any parameters.
            """
            return {}

        selected_properties = self.get_json_schema().get("properties", {})
        query = f"SELECT {','.join(selected_properties.keys())} FROM {self.name} "
        if self.primary_key and self.name not in UNSUPPORTED_FILTERING_STREAMS:
            query += f"ORDER BY {self.primary_key} ASC"

        return {"q": query}

    def parse_response(self, response: requests.Response,
                       **kwargs) -> Iterable[Mapping]:
        yield from response.json()["records"]

    def get_json_schema(self) -> Mapping[str, Any]:
        # schema is generated lazily from the Salesforce sobject description
        if not self.schema:
            self.schema = self.sf_api.generate_schema(self.name)
        return self.schema

    def read_records(
        self,
        sync_mode: SyncMode,
        cursor_field: List[str] = None,
        stream_slice: Mapping[str, Any] = None,
        stream_state: Mapping[str, Any] = None,
    ) -> Iterable[Mapping[str, Any]]:
        """Delegate to HttpStream.read_records, downgrading known per-sobject
        access/queryability errors to log messages instead of failures."""
        try:
            yield from super().read_records(sync_mode=sync_mode,
                                            cursor_field=cursor_field,
                                            stream_slice=stream_slice,
                                            stream_state=stream_state)
        except exceptions.HTTPError as error:
            """
            There are several types of Salesforce sobjects that require additional processing:
              1. Sobjects for which the user, after setting up the data using Airbyte, restricted access,
                 and we will receive 403 HTTP errors.
              2. There are streams that do not allow you to make a sample using Salesforce `query` or `queryAll`.
                 And since we use a dynamic method of generating streams for Salesforce connector - at the stage of discover,
                 we cannot filter out these streams, so we catch them at the stage of reading data.
            """
            error_data = error.response.json()[0]
            if error.response.status_code in [
                    codes.FORBIDDEN, codes.BAD_REQUEST
            ]:
                error_code = error_data.get("errorCode", "")
                # NOTE(review): the second clause is unreachable as written — any code equal to
                # "INVALID_TYPE_FOR_OPERATION" already satisfies the first inequality.
                # Possibly `and` was intended; confirm before changing behavior.
                if error_code != "REQUEST_LIMIT_EXCEEDED" or error_code == "INVALID_TYPE_FOR_OPERATION":
                    self.logger.error(
                        f"Cannot receive data for stream '{self.name}', error message: '{error_data.get('message')}'"
                    )
                    return
            raise error
class PardotStream(HttpStream, ABC):
    """Base stream for the Pardot query API.

    Pages by filtering on the cursor value of the last record returned, and
    keeps incremental state as the maximum cursor value seen so far.
    """

    url_base = "https://pi.pardot.com/api/"
    api_version = "4"
    time_filter_template = "%Y-%m-%dT%H:%M:%SZ"
    primary_key = "id"
    # streams with an integer cursor use 0 (not "") as the blank state value
    is_integer_state = False
    transformer: TypeTransformer = TypeTransformer(
        TransformConfig.DefaultSchemaNormalization)

    def __init__(self, config: Dict, **kwargs):
        super().__init__(**kwargs)
        self.config = config

    def next_page_token(
            self,
            response: requests.Response) -> Optional[Mapping[str, Any]]:
        """Page by requesting records after the last cursor value seen."""
        body = response.json().get("result", {})
        total = body.get("total_results")
        if not (total and total > 0):
            return None
        # A single record comes back as a dict rather than a list — no paging then.
        page_records = body[self.data_key]
        if isinstance(page_records, list):
            return {self.filter_param: page_records[-1][self.cursor_field]}
        return None

    def request_headers(
            self, stream_state: Mapping[str, Any],
            stream_slice: Mapping[str, Any] = None,
            next_page_token: Mapping[str, Any] = None) -> Mapping[str, Any]:
        """Every request must carry the configured business-unit header."""
        return {
            "Pardot-Business-Unit-Id": self.config["pardot_business_unit_id"]
        }

    def request_params(
            self, stream_state: Mapping[str, Any],
            stream_slice: Mapping[str, any] = None,
            next_page_token: Mapping[str, Any] = None) -> MutableMapping[str, Any]:
        query: MutableMapping[str, Any] = {"format": "json"}
        configured_start = self.config.get("start_date", None)
        if configured_start:
            query["created_after"] = pendulum.parse(
                configured_start,
                strict=False).strftime(self.time_filter_template)
        if next_page_token:
            query.update(**next_page_token)
        return query

    def parse_response(self, response: requests.Response,
                       **kwargs) -> Iterable[Mapping]:
        """Yield records from the `result` envelope; a lone record arrives as
        a bare dict rather than a list."""
        body = response.json().get("result", {})
        total = body.get("total_results")
        if self.data_key in body:
            payload = body[self.data_key]
            if isinstance(payload, dict):
                yield payload
            elif total and total > 0:
                yield from payload

    def path(self,
             stream_state: Mapping[str, Any] = None,
             stream_slice: Mapping[str, Any] = None,
             next_page_token: Mapping[str, Any] = None) -> str:
        return f"{self.object_name}/version/{self.api_version}/do/query"

    def get_updated_state(
            self, current_stream_state: MutableMapping[str, Any],
            latest_record: Mapping[str, Any]) -> Mapping[str, Any]:
        """State is the greater of the stored and the latest cursor values."""
        blank = 0 if self.is_integer_state else ""
        newest = latest_record.get(self.cursor_field, blank)
        stored = current_stream_state.get(self.cursor_field, blank)
        return {self.cursor_field: max(newest, stored)}

    def filter_records_newer_than_state(
            self,
            stream_state: Mapping[str, Any] = None,
            records_slice: Mapping[str, Any] = None) -> Iterable:
        """Drop records older than the saved cursor; pass everything through
        when there is no state yet."""
        if not stream_state:
            yield from records_slice
            return
        threshold = stream_state.get(self.cursor_field)
        for record in records_slice:
            if record[self.cursor_field] >= threshold:
                yield record
class AppsflyerStream(HttpStream, ABC):
    """Base stream for AppsFlyer raw-data/aggregate CSV export endpoints."""

    primary_key = None
    # CSV columns always present in the report
    main_fields = ()
    # optional extra CSV columns requested via the `additional_fields` param
    additional_fields = ()
    maximum_rows = 1_000_000
    transformer: TypeTransformer = TypeTransformer(
        TransformConfig.DefaultSchemaNormalization
        | TransformConfig.CustomSchemaNormalization)

    def __init__(self,
                 app_id: str,
                 api_token: str,
                 timezone: str,
                 start_date: Union[date, str] = None,
                 end_date: Union[date, str] = None,
                 **kwargs):
        super().__init__(**kwargs)
        self.app_id = app_id
        self.api_token = api_token
        self.start_date = start_date
        self.end_date = end_date
        self.timezone = pendulum.timezone(timezone)

    @property
    def url_base(self) -> str:
        return f"https://hq.appsflyer.com/export/{self.app_id}/"

    def next_page_token(
            self,
            response: requests.Response) -> Optional[Mapping[str, Any]]:
        # AppsFlyer export endpoints return the whole report in one response
        return None

    def request_params(
            self, stream_state: Mapping[str, Any],
            stream_slice: Mapping[str, any] = None,
            next_page_token: Mapping[str, Any] = None) -> MutableMapping[str, Any]:
        # NOTE(review): requests a fixed yesterday→today window; self.start_date /
        # self.end_date are stored but not used here — confirm intent.
        params = {
            "api_token": self.api_token,
            "from": pendulum.yesterday(self.timezone).to_date_string(),
            "to": pendulum.today(self.timezone).to_date_string(),
            "timezone": self.timezone.name,
            "maximum_rows": self.maximum_rows,
        }

        if self.additional_fields:
            additional_fields = (",").join(self.additional_fields)
            params["additional_fields"] = additional_fields

        return params

    def parse_response(self, response: requests.Response,
                       **kwargs) -> Iterable[Mapping]:
        """Parse the CSV body into dicts keyed by the declared field names."""
        # `add` concatenates the two field tuples (presumably operator.add — confirm import)
        fields = add(self.main_fields, self.additional_fields
                     ) if self.additional_fields else self.main_fields
        csv_data = map(lambda x: x.decode("utf-8"), response.iter_lines())
        reader = csv.DictReader(csv_data, fields)

        # Skip CSV Header
        next(reader, {})

        yield from reader

    def is_aggregate_reports_reached_limit(
            self, response: requests.Response) -> bool:
        # aggregate-report quota errors come back as 403 with this text
        template = "Limit reached for "
        is_forbidden = response.status_code == HTTPStatus.FORBIDDEN
        is_template_match = template in response.text

        return is_forbidden and is_template_match

    def is_raw_data_reports_reached_limit(self,
                                          response: requests.Response) -> bool:
        # raw-data quota errors come back as 400 with this text
        template = "Your API calls limit has been reached for report type"
        is_bad_request = response.status_code == HTTPStatus.BAD_REQUEST
        is_template_match = template in response.text

        return is_bad_request and is_template_match

    def should_retry(self, response: requests.Response) -> bool:
        # retry quota errors in addition to the default retry conditions
        is_aggregate_reports_reached_limit = self.is_aggregate_reports_reached_limit(
            response)
        is_raw_data_reports_reached_limit = self.is_raw_data_reports_reached_limit(
            response)
        is_rejected = is_aggregate_reports_reached_limit or is_raw_data_reports_reached_limit

        return is_rejected or super().should_retry(response)

    def backoff_time(self, response: requests.Response) -> Optional[float]:
        """Raw-data quota resets at UTC midnight; aggregate quota retries
        after a minute; anything else uses the default backoff."""
        if self.is_raw_data_reports_reached_limit(response):
            now = pendulum.now("UTC")
            midnight = pendulum.tomorrow("UTC")
            wait_time = (midnight - now).seconds
        elif self.is_aggregate_reports_reached_limit(response):
            wait_time = 60
        else:
            return super().backoff_time(response)

        AirbyteLogger().log(
            "INFO", f"Rate limit exceded. Retry in {wait_time} seconds.")
        return wait_time

    @transformer.registerCustomTransform
    def transform_function(original_value: Any,
                           field_schema: Dict[str, Any]) -> Any:
        """Normalize empty/placeholder CSV cells to None and floats to Decimal."""
        if original_value == "" or original_value == "N/A" or original_value == "NULL":
            return None
        if isinstance(original_value, float):
            return Decimal(original_value)
        return original_value
class FBMarketingStream(Stream, ABC):
    """Base stream class"""

    primary_key = "id"
    transformer: TypeTransformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)

    page_size = 100
    # streams must opt in before `include_deleted` takes effect
    enable_deleted = False
    # prefix used to build the `<entity>.delivery_info` filter field
    entity_prefix = None

    def __init__(self, api: API, include_deleted: bool = False, **kwargs):
        super().__init__(**kwargs)
        self._api = api
        self._include_deleted = include_deleted if self.enable_deleted else False

    @cached_property
    def fields(self) -> List[str]:
        """All property names from the stream's JSON schema, queried as-is."""
        return [*self.get_json_schema().get("properties", {})]

    @backoff_policy
    def execute_in_batch(self, requests: Iterable[FacebookRequest]) -> Sequence[MutableMapping[str, Any]]:
        """Run the given Facebook requests as a single API batch and return
        the collected JSON payloads; any per-request failure aborts the batch."""
        collected = []

        def on_success(response: FacebookResponse):
            collected.append(response.json())

        def on_failure(response: FacebookResponse):
            raise response.error()

        batch: FacebookAdsApiBatch = self._api.api.new_batch()
        for fb_request in requests:
            batch.add_request(fb_request, success=on_success, failure=on_failure)
        failed_requests = batch.execute()
        if failed_requests:
            raise FacebookAPIException(f"Batch has failed {len(failed_requests)} requests")

        return collected

    def read_records(
        self,
        sync_mode: SyncMode,
        cursor_field: List[str] = None,
        stream_slice: Mapping[str, Any] = None,
        stream_state: Mapping[str, Any] = None,
    ) -> Iterable[Mapping[str, Any]]:
        """Main read method used by CDK"""
        query_params = self.request_params(stream_state=stream_state)
        for item in self._read_records(params=query_params):
            yield self._extend_record(item, fields=self.fields)

    def _read_records(self, params: Mapping[str, Any]) -> Iterable:
        """Wrapper around query to backoff errors.
        We have default implementation because we still can override read_records so this method is not mandatory.
        """
        return []

    @backoff_policy
    def _extend_record(self, obj: Any, **kwargs):
        """Wrapper around api_get to backoff errors"""
        return obj.api_get(**kwargs).export_all_data()

    def request_params(self, **kwargs) -> MutableMapping[str, Any]:
        """Parameters that should be passed to query_records method"""
        query_params: MutableMapping[str, Any] = {"limit": self.page_size}
        if self._include_deleted:
            query_params.update(self._filter_all_statuses())
        return query_params

    def _filter_all_statuses(self) -> MutableMapping[str, Any]:
        """Filter that covers all possible statuses thus including deleted/archived records"""
        statuses = [
            "active", "archived", "completed", "limited", "not_delivering",
            "deleted", "not_published", "pending_review", "permanently_deleted",
            "recently_completed", "recently_rejected", "rejected", "scheduled",
            "inactive",
        ]
        status_filter = {
            "field": f"{self.entity_prefix}.delivery_info",
            "operator": "IN",
            "value": statuses,
        }
        return {"filtering": [status_filter]}
class FullRefreshTiktokStream(TiktokStream, ABC):
    """Full-refresh TikTok stream: one stream slice per advertiser id."""

    primary_key = "id"
    fields: List[str] = None

    transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization
                                  | TransformConfig.CustomSchemaNormalization)

    @transformer.registerCustomTransform
    def transform_function(original_value: Any,
                           field_schema: Dict[str, Any]) -> Any:
        """Custom transform: map TikTok's "-" placeholder to None and coerce
        floats to Decimal for precise numeric normalization."""
        if original_value == "-":
            return None
        elif isinstance(original_value, float):
            return Decimal(original_value)
        return original_value

    def __init__(self, start_date: str, end_date: str, **kwargs):
        super().__init__(**kwargs)
        # kept so AdvertiserIds can be instantiated with the same settings
        self.kwargs = kwargs
        # convert a start date to TikTok format
        # example: "2021-08-24" => "2021-08-24 00:00:00"
        self._start_time = pendulum.parse(
            start_date or DEFAULT_START_DATE).strftime("%Y-%m-%d 00:00:00")
        # convert end date to TikTok format
        # example: "2021-08-24" => "2021-08-24 00:00:00"
        self._end_time = pendulum.parse(
            end_date or DEFAULT_END_DATE).strftime("%Y-%m-%d 00:00:00")
        self.max_cursor_date = None
        # remaining advertiser ids; drained by stream_slices (see is_finished)
        self._advertiser_ids = []

    @staticmethod
    def convert_array_param(arr: List[Union[str, int]]) -> str:
        # TikTok expects array parameters serialized as JSON strings
        return json.dumps(arr)

    def get_advertiser_ids(self) -> Iterable[int]:
        if self.is_sandbox:
            # for sandbox: just return advertiser_id provided in spec
            ids = [self._advertiser_id]
        else:
            # for prod: return list of all available ids from AdvertiserIds stream:
            advertiser_ids = AdvertiserIds(**self.kwargs).read_records(
                sync_mode=SyncMode.full_refresh)
            ids = [
                advertiser["advertiser_id"] for advertiser in advertiser_ids
            ]
        self._advertiser_ids = ids
        return ids

    def stream_slices(self, **kwargs) -> Iterable[Optional[Mapping[str, Any]]]:
        """Each stream slice is for separate advertiser id"""
        self.get_advertiser_ids()
        while self._advertiser_ids:
            # self._advertiser_ids need to be exhausted so that JsonUpdatedState knows
            # when all stream slices are processed (stream.is_finished)
            advertiser_id = self._advertiser_ids.pop(0)
            yield {"advertiser_id": advertiser_id}

    @property
    def is_finished(self):
        # True once stream_slices has drained every advertiser id
        return len(self._advertiser_ids) == 0

    def request_params(
        self,
        stream_state: Mapping[str, Any] = None,
        next_page_token: Mapping[str, Any] = None,
        stream_slice: Mapping[str, Any] = None,
        **kwargs,
    ) -> MutableMapping[str, Any]:
        params = {"page_size": self.page_size}
        if self.fields:
            params["fields"] = self.convert_array_param(self.fields)
        if stream_slice:
            # injects the slice's advertiser_id into the request
            params.update(stream_slice)
        return params
class FBMarketingStream(Stream, ABC):
    """Base stream class"""

    primary_key = "id"
    transformer: TypeTransformer = TypeTransformer(
        TransformConfig.DefaultSchemaNormalization)

    # use batch API to retrieve details for each record in a stream
    use_batch = True
    # this flag will override `include_deleted` option for streams that does not support it
    enable_deleted = True
    # entity prefix for `include_deleted` filter, it usually matches singular version of stream name
    entity_prefix = None

    def __init__(self,
                 api: "API",
                 include_deleted: bool = False,
                 page_size: int = 100,
                 max_batch_size: int = 50,
                 **kwargs):
        super().__init__(**kwargs)
        self._api = api
        # explicit None-guards keep the documented defaults even when None is passed
        self.page_size = page_size if page_size is not None else 100
        self._include_deleted = include_deleted if self.enable_deleted else False
        self.max_batch_size = max_batch_size if max_batch_size is not None else 50

    @cached_property
    def fields(self) -> List[str]:
        """List of fields that we want to query, for now just all properties from stream's schema"""
        return list(self.get_json_schema().get("properties", {}).keys())

    def _execute_batch(self, batch: FacebookAdsApiBatch) -> None:
        """Execute batch, retry in case of failures"""
        # FacebookAdsApiBatch.execute() returns a new batch of failed requests (or None)
        while batch:
            batch = batch.execute()
            if batch:
                logger.info("Retry failed requests in batch")

    def execute_in_batch(
            self, pending_requests: Iterable[FacebookRequest]
    ) -> Iterable[MutableMapping[str, Any]]:
        """Execute list of requests in batches, flushing collected records
        every `max_batch_size` requests."""
        records = []

        def success(response: FacebookResponse):
            records.append(response.json())

        def failure(response: FacebookResponse):
            raise RuntimeError(
                f"Batch request failed with response: {response.body()}")

        api_batch: FacebookAdsApiBatch = self._api.api.new_batch()
        for request in pending_requests:
            api_batch.add_request(request, success=success, failure=failure)
            if len(api_batch) == self.max_batch_size:
                # flush the full batch, then start a fresh one
                self._execute_batch(api_batch)
                yield from records
                records = []
                api_batch: FacebookAdsApiBatch = self._api.api.new_batch()

        # flush whatever is left in the final partial batch
        self._execute_batch(api_batch)
        yield from records

    def read_records(
        self,
        sync_mode: SyncMode,
        cursor_field: List[str] = None,
        stream_slice: Mapping[str, Any] = None,
        stream_state: Mapping[str, Any] = None,
    ) -> Iterable[Mapping[str, Any]]:
        """Main read method used by CDK"""
        records_iter = self.list_objects(params=self.request_params(
            stream_state=stream_state))
        # with pending=use_batch, api_get yields FacebookRequest objects instead of executing
        loaded_records_iter = (record.api_get(fields=self.fields,
                                              pending=self.use_batch)
                               for record in records_iter)
        if self.use_batch:
            loaded_records_iter = self.execute_in_batch(loaded_records_iter)

        for record in loaded_records_iter:
            if isinstance(record, AbstractObject):
                yield record.export_all_data()  # convert FB object to dict
            else:
                yield record  # execute_in_batch will emit dicts

    @abstractmethod
    def list_objects(self, params: Mapping[str, Any]) -> Iterable:
        """List FB objects, these objects will be loaded in read_records later with their details.

        :param params: params to make request
        :return: list of FB objects to load
        """

    def request_params(self, **kwargs) -> MutableMapping[str, Any]:
        """Parameters that should be passed to query_records method"""
        params = {"limit": self.page_size}

        if self._include_deleted:
            params.update(self._filter_all_statuses())

        return params

    def _filter_all_statuses(self) -> MutableMapping[str, Any]:
        """Filter that covers all possible statuses thus including deleted/archived records"""
        filt_values = [
            "active",
            "archived",
            "completed",
            "limited",
            "not_delivering",
            "deleted",
            "not_published",
            "pending_review",
            "permanently_deleted",
            "recently_completed",
            "recently_rejected",
            "rejected",
            "scheduled",
            "inactive",
        ]

        return {
            "filtering": [
                {
                    "field": f"{self.entity_prefix}.delivery_info",
                    "operator": "IN",
                    "value": filt_values
                },
            ],
        }
class TwilioStream(HttpStream, ABC):
    """Base stream for the Twilio REST API.

    Pages via the `next_page_uri` returned in each response body and
    normalizes RFC2822 datetimes to ISO8601 through a custom transform.
    """

    url_base = TWILIO_API_URL_BASE
    primary_key = "sid"
    page_size = 1000
    transformer: TypeTransformer = TypeTransformer(
        TransformConfig.DefaultSchemaNormalization
        | TransformConfig.CustomSchemaNormalization)

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @property
    def data_field(self):
        # JSON key holding the list of records; defaults to the stream name
        return self.name

    @property
    def changeable_fields(self):
        """
        :return list of changeable fields that should be removed from the records
        """
        return []

    def path(self, **kwargs):
        return f"{self.name.title()}.json"

    def next_page_token(
            self,
            response: requests.Response) -> Optional[Mapping[str, Any]]:
        """Twilio returns a relative `next_page_uri`; reuse its query string
        as the params for the next request."""
        stream_data = response.json()
        next_page_uri = stream_data.get("next_page_uri")
        if next_page_uri:
            next_url = urlparse(next_page_uri)
            next_page_params = dict(parse_qsl(next_url.query))
            return next_page_params

    def parse_response(self, response: requests.Response,
                       stream_state: Mapping[str, Any],
                       **kwargs) -> Iterable[Mapping]:
        """
        :return an iterable containing each record in the response

        Volatile fields listed in `changeable_fields` are stripped in place.
        Bug fix: the previous implementation yielded each cleaned record inside
        the cleanup loop AND again via the trailing `yield from records`,
        emitting every record twice whenever `changeable_fields` was non-empty.
        Records are now yielded exactly once.
        """
        records = response.json().get(self.data_field, [])
        if self.changeable_fields:
            for record in records:
                for field in self.changeable_fields:
                    record.pop(field, None)
        yield from records

    def backoff_time(self, response: requests.Response) -> Optional[float]:
        """This method is called if we run into the rate limit.
        Twilio puts the retry time in the `Retry-After` response header so we
        we return that value. If the response is anything other than a 429 (e.g: 5XX)
        fall back on default retry behavior.
        Rate Limits Docs: https://support.twilio.com/hc/en-us/articles/360032845014-Verify-V2-Rate-Limiting"""

        backoff_time = response.headers.get("Retry-After")
        if backoff_time is not None:
            return float(backoff_time)

    def request_params(self,
                       stream_state: Mapping[str, Any],
                       next_page_token: Mapping[str, Any] = None,
                       **kwargs) -> MutableMapping[str, Any]:
        params = super().request_params(stream_state=stream_state,
                                        next_page_token=next_page_token,
                                        **kwargs)
        params["PageSize"] = self.page_size
        if next_page_token:
            params.update(**next_page_token)
        return params

    @transformer.registerCustomTransform
    def custom_transform_function(original_value: Any,
                                  field_schema: Mapping[str, Any]) -> Any:
        """Convert RFC2822 datetimes to ISO8601 in UTC; leave everything else as-is."""
        if original_value and field_schema.get("format") == "date-time":
            try:
                return pendulum.from_format(
                    original_value, "ddd, D MMM YYYY HH:mm:ss ZZ").in_timezone(
                        "UTC").to_iso8601_string()
            except ValueError:
                # Twilio API returns datetime in two formats:
                #   - RFC2822, like "Fri, 11 Dec 2020 04:28:40 +0000";
                #   - ISO8601, like "2020-12-11T04:29:09Z".
                # If `ValueError` exception was raised this means that datetime was already in
                # ISO8601 format and there is no need in transforming anything.
                pass
        return original_value
class Stream(ABC):
    """
    Base abstract class for an Airbyte Stream. Makes no assumption of the Stream's underlying transport protocol.
    """

    # TypeTransformer object to perform output data transformation
    transformer: TypeTransformer = TypeTransformer(TransformConfig.NoTransform)

    # Use self.logger in subclasses to log any messages
    @property
    def logger(self):
        return logging.getLogger(f"airbyte.streams.{self.name}")

    @property
    def name(self) -> str:
        """
        :return: Stream name. By default this is the implementing class name, but it can be overridden as needed.
        """
        return casing.camel_to_snake(type(self).__name__)

    @abstractmethod
    def read_records(
        self,
        sync_mode: SyncMode,
        cursor_field: List[str] = None,
        stream_slice: Mapping[str, Any] = None,
        stream_state: Mapping[str, Any] = None,
    ) -> Iterable[Mapping[str, Any]]:
        """
        This method should be overridden by subclasses to read records based on the inputs
        """

    def get_json_schema(self) -> Mapping[str, Any]:
        """
        :return: A dict of the JSON schema representing this stream.

        The default implementation of this method looks for a JSONSchema file with the same name as this stream's "name" property.
        Override as needed.
        """
        # TODO show an example of using pydantic to define the JSON schema, or reading an OpenAPI spec
        package = package_name_from_class(self.__class__)
        return ResourceSchemaLoader(package).get_schema(self.name)

    def as_airbyte_stream(self) -> AirbyteStream:
        """Build the catalog (AirbyteStream) representation of this stream."""
        airbyte_stream = AirbyteStream(
            name=self.name,
            json_schema=dict(self.get_json_schema()),
            supported_sync_modes=[SyncMode.full_refresh],
        )
        if self.supports_incremental:
            airbyte_stream.source_defined_cursor = self.source_defined_cursor
            airbyte_stream.supported_sync_modes.append(SyncMode.incremental)  # type: ignore
            airbyte_stream.default_cursor_field = self._wrapped_cursor_field()
        primary_keys = Stream._wrapped_primary_key(self.primary_key)
        if primary_keys:
            airbyte_stream.source_defined_primary_key = primary_keys
        return airbyte_stream

    @property
    def supports_incremental(self) -> bool:
        """
        :return: True if this stream supports incrementally reading data
        """
        return bool(self._wrapped_cursor_field())

    def _wrapped_cursor_field(self) -> List[str]:
        # Normalize the cursor to a list-of-strings path.
        if isinstance(self.cursor_field, str):
            return [self.cursor_field]
        return self.cursor_field

    @property
    def cursor_field(self) -> Union[str, List[str]]:
        """
        Override to return the default cursor field used by this stream e.g: an API entity might always use created_at as the cursor field.

        :return: The name of the field used as a cursor. If the cursor is nested, return an array consisting of the path to the cursor.
        """
        return []

    @property
    def source_defined_cursor(self) -> bool:
        """
        Return False if the cursor can be configured by the user.
        """
        return True

    @property
    @abstractmethod
    def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]:
        """
        :return: string if single primary key, list of strings if composite primary key, list of list of strings if composite primary key consisting of nested fields.
          If the stream has no primary keys, return None.
        """

    def stream_slices(
        self, *, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None
    ) -> Iterable[Optional[Mapping[str, Any]]]:
        """
        Override to define the slices for this stream. See the stream slicing section of the docs for more information.

        :param sync_mode:
        :param cursor_field:
        :param stream_state:
        :return:
        """
        return [None]

    @property
    def state_checkpoint_interval(self) -> Optional[int]:
        """
        Decides how often to checkpoint state (i.e: emit a STATE message). E.g: if this returns a value of 100, then state is persisted after reading
        100 records, then 200, 300, etc.. A good default value is 1000 although your mileage may vary depending on the underlying data source.

        Checkpointing a stream avoids re-reading records in the case a sync is failed or cancelled.

        Return None if state should not be checkpointed, e.g: because records returned from the underlying data source are not returned in
        ascending order with respect to the cursor field. In those cases, state must only be saved once the full stream has been read.
        """
        return None

    def get_updated_state(self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]):
        """
        Override to extract state from the latest record. Needed to implement incremental sync.

        Inspects the latest record extracted from the data source and the current state object and return an updated state object.

        For example: if the state object is based on created_at timestamp, and the current state is {'created_at': 10}, and the latest_record is
        {'name': 'octavia', 'created_at': 20 } then this method would return {'created_at': 20} to indicate state should be updated to this object.

        :param current_stream_state: The stream's current state object
        :param latest_record: The latest record extracted from the stream
        :return: An updated state object
        """
        return {}

    @staticmethod
    def _wrapped_primary_key(keys: Optional[Union[str, List[str], List[List[str]]]]) -> Optional[List[List[str]]]:
        """
        :return: wrap the primary_key property in a list of list of strings required by the Airbyte Stream object.
        """
        if not keys:
            return None
        if isinstance(keys, str):
            return [[keys]]
        if not isinstance(keys, list):
            raise ValueError("Element must be either list or str.")
        wrapped = []
        for part in keys:
            if isinstance(part, str):
                wrapped.append([part])
            elif isinstance(part, list):
                wrapped.append(part)
            else:
                raise ValueError("Element must be either list or str.")
        return wrapped
class SalesforceStream(HttpStream, ABC):
    """Base stream for Salesforce objects, read through the SOQL ``queryAll`` REST endpoint."""

    page_size = 2000
    transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)

    def __init__(self, sf_api: Salesforce, pk: str, stream_name: str, schema: dict = None, **kwargs):
        super().__init__(**kwargs)
        self.sf_api = sf_api
        self.pk = pk
        self.stream_name = stream_name
        self.schema = schema

    @property
    def name(self) -> str:
        return self.stream_name

    @property
    def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]:
        return self.pk

    @property
    def url_base(self) -> str:
        return self.sf_api.instance_url

    def path(self, **kwargs) -> str:
        return f"/services/data/{self.sf_api.version}/queryAll"

    def next_page_token(self, response: requests.Response) -> Optional[str]:
        """Return a SOQL WHERE clause keyed on the last record's primary key.

        Fix: the return annotation was ``str`` although the method implicitly
        returns None when the last page has been reached.
        """
        response_data = response.json()
        if len(response_data["records"]) == self.page_size and self.primary_key and self.name not in UNSUPPORTED_FILTERING_STREAMS:
            return f"WHERE {self.primary_key} >= '{response_data['records'][-1][self.primary_key]}' "
        return None

    def request_params(
        self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, any] = None, next_page_token: Mapping[str, Any] = None
    ) -> MutableMapping[str, Any]:
        """
        Salesforce SOQL Query: https://developer.salesforce.com/docs/atlas.en-us.232.0.api_rest.meta/api_rest/dome_queryall.htm
        """
        selected_properties = self.get_json_schema().get("properties", {})

        # Salesforce BULK API currently does not support loading fields with data type base64 and compound data
        if self.sf_api.api_type == "BULK":
            selected_properties = {
                key: value
                for key, value in selected_properties.items()
                if value.get("format") != "base64" and "object" not in value["type"]
            }

        query = f"SELECT {','.join(selected_properties.keys())} FROM {self.name} "
        if next_page_token:
            query += next_page_token

        if self.primary_key and self.name not in UNSUPPORTED_FILTERING_STREAMS:
            query += f"ORDER BY {self.primary_key} ASC LIMIT {self.page_size}"
        return {"q": query}

    def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]:
        yield from response.json()["records"]

    def get_json_schema(self) -> Mapping[str, Any]:
        # Schema is generated lazily on first access and cached on the instance.
        if not self.schema:
            self.schema = self.sf_api.generate_schema([self.name])
        return self.schema

    def read_records(self, **kwargs) -> Iterable[Mapping[str, Any]]:
        """Read records, logging (instead of raising) FORBIDDEN errors that are
        not caused by exceeding the request limit."""
        try:
            yield from super().read_records(**kwargs)
        except exceptions.HTTPError as error:
            error_data = error.response.json()[0]
            # A plain permission error on one object should not abort the whole sync;
            # REQUEST_LIMIT_EXCEEDED, however, must propagate so the sync backs off/fails.
            if error.response.status_code == codes.FORBIDDEN and error_data.get("errorCode", "") != "REQUEST_LIMIT_EXCEEDED":
                self.logger.error(f"Cannot receive data for stream '{self.name}', error message: '{error_data.get('message')}'")
            else:
                # Bare `raise` preserves the original traceback without adding a frame.
                raise
class FreshsalesStream(HttpStream, ABC):
    """Base stream for the Freshsales CRM API.

    Pagination is driven by a mutable ``self.page`` counter because the API
    exposes no next-page token in its responses.
    """

    url_base = "https://{}/crm/sales/api/"
    primary_key = "id"
    order_field = "updated_at"
    transformer: TypeTransformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)

    def __init__(self, domain_name: str, **kwargs):
        super().__init__(**kwargs)
        self.url_base = self.url_base.format(domain_name)
        self.domain_name = domain_name
        self.page = 1

    def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
        """
        There is no next page token in the response, so we increment the page
        param until a page comes back empty.

        NOTE(review): this returns the page number (an int), not a Mapping as
        annotated; the base class only checks truthiness, and request_params
        reads self.page directly — confirm before changing.
        """
        list_result = response.json().get(self.object_name, [])
        if list_result:
            self.page += 1
            return self.page
        else:
            return None

    def request_params(
        self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, any] = None, next_page_token: Mapping[str, Any] = None
    ) -> MutableMapping[str, Any]:
        # Pagination state comes from self.page (see next_page_token), not from next_page_token.
        params = {"page": self.page, "sort": self.order_field, "sort_type": "asc"}
        return params

    def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]:
        """Yield records from the object envelope, or the raw body if the stream has no object_name."""
        json_response = response.json()
        records = json_response.get(self.object_name, []) if self.object_name is not None else json_response
        yield from records

    def _get_filters(self) -> List:
        """
        Some streams require a filter_id to be passed in. This function gets all available filters.

        Fix: the previous ``try/except RequestException as e: raise e`` block was a
        no-op re-raise (and restarted the traceback one frame lower); errors now
        propagate naturally.
        """
        filters_url = f"https://{self.domain_name}/crm/sales/api/{self.object_name}/filters"
        auth = self.authenticator.get_auth_header()
        r = requests.get(filters_url, headers=auth)
        r.raise_for_status()
        return r.json().get("filters")

    def get_view_id(self):
        """
        Iterate over all available filters and return the id of the filter whose
        name matches this stream's ``filter_name``; None when the stream declares
        no filter_name.
        """
        if hasattr(self, "filter_name"):
            filters = self._get_filters()
            return next(filter["id"] for filter in filters if filter["name"] == self.filter_name)
        return None

    def path(
        self, stream_state: Mapping[str, Any] = None, stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None
    ) -> str:
        view_id = self.get_view_id()
        return f"{self.object_name}/view/{view_id}"
class SourceZendeskSupportStream(HttpStream, ABC):
    """Basic Zendesk class"""

    primary_key = "id"

    page_size = 100
    created_at_field = "created_at"
    updated_at_field = "updated_at"

    transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)

    def __init__(self, subdomain: str, **kwargs):
        super().__init__(**kwargs)

        # add the custom value for generation of a zendesk domain
        self._subdomain = subdomain

    @property
    def url_base(self) -> str:
        return f"https://{self._subdomain}.zendesk.com/api/v2/"

    @staticmethod
    def _parse_next_page_number(response: requests.Response) -> Optional[int]:
        """Parses a response and tries to find next page number"""
        next_page = response.json().get("next_page")
        if next_page:
            return dict(parse_qsl(urlparse(next_page).query)).get("page")
        return None

    def backoff_time(self, response: requests.Response) -> Union[int, float]:
        """
        The rate limit is 700 requests per minute
        # monitoring-your-request-activity
        See https://developer.zendesk.com/api-reference/ticketing/account-configuration/usage_limits/
        The response has a Retry-After header that tells you for how many seconds to wait before retrying.
        """
        # `retry_after` is already an int, so `> 0` alone covers the old
        # `retry_after and retry_after > 0` double check.
        retry_after = int(response.headers.get("Retry-After", 0))
        if retry_after > 0:
            return retry_after

        # the header X-Rate-Limit returns a amount of requests per minute
        # we try to wait twice as long
        rate_limit = float(response.headers.get("X-Rate-Limit", 0))
        if rate_limit > 0:
            return (60.0 / rate_limit) * 2
        return super().backoff_time(response)

    @staticmethod
    def str2datetime(str_dt: str) -> Optional[datetime]:
        """convert string to datetime object
        Input example: '2021-07-22T06:55:55Z' FORMAT : "%Y-%m-%dT%H:%M:%SZ"

        Fix: the return annotation was ``datetime`` although None is returned
        for empty input.
        """
        if not str_dt:
            return None
        return datetime.strptime(str_dt, DATETIME_FORMAT)

    @staticmethod
    def datetime2str(dt: datetime) -> str:
        """convert datetime object to string
        Output example: '2021-07-22T06:55:55Z' FORMAT : "%Y-%m-%dT%H:%M:%SZ"
        """
        return datetime.strftime(dt.replace(tzinfo=pytz.UTC), DATETIME_FORMAT)