def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
    streams = []
    # Get the queue name by taking the substring after the last /
    stream_name = self.parse_queue_name(config["queue_url"])
    logger.debug("Amazon SQS Source Stream Discovery - stream is: " + stream_name)
    json_schema = {
        "$schema": "http://json-schema.org/draft-07/schema#",
        "type": "object",
        "properties": {
            "id": {"type": "string"},
            "body": {"type": "string"},
            "attributes": {"type": ["object", "null"]},
        },
    }
    streams.append(AirbyteStream(name=stream_name, json_schema=json_schema, supported_sync_modes=["full_refresh"]))
    return AirbyteCatalog(streams=streams)
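A brief usage sketch for the discovery step. It assumes the method above lives on a connector class (called SourceAmazonSqs here; the actual class name may differ) and uses a placeholder queue URL.

source = SourceAmazonSqs()  # assumed connector class name
logger = AirbyteLogger()
config = {"queue_url": "https://sqs.eu-west-1.amazonaws.com/123456789012/my-queue"}  # placeholder URL

catalog = source.discover(logger, config)
# The single discovered stream is named after the last path segment of the queue URL.
print([stream.name for stream in catalog.streams])  # expected: ["my-queue"]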
def establish_connection(config: json, logger: AirbyteLogger) -> Connection:
    """
    Creates a connection to Firebolt database using the parameters provided.

    :param config: Json object containing db credentials.
    :param logger: AirbyteLogger instance to print logs.

    :return: PEP-249 compliant database Connection object.
    """
    logger.debug("Connecting to Firebolt.")
    connection = connect(**parse_config(config, logger))
    logger.debug("Connection to Firebolt established.")
    return connection
async def establish_async_connection(config: json, logger: AirbyteLogger) -> AsyncConnection:
    """
    Creates an async connection to Firebolt database using the parameters provided.
    This connection can be used for parallel operations.

    :param config: Json object containing db credentials.
    :param logger: AirbyteLogger instance to print logs.

    :return: PEP-249 compliant async database Connection object.
    """
    logger.debug("Connecting to Firebolt.")
    connection = await async_connect(**parse_config(config, logger))
    logger.debug("Connection to Firebolt established.")
    return connection
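A minimal usage sketch for the two helpers above. It assumes parse_config, connect, and async_connect come from the surrounding Firebolt destination module; the config keys and query are hypothetical placeholders, and the async cursor's execute is assumed to be awaitable.

import asyncio

def sync_example() -> None:
    logger = AirbyteLogger()
    config = {"username": "<user>", "password": "<pass>", "database": "<db>"}  # hypothetical keys
    connection = establish_connection(config, logger)
    cursor = connection.cursor()       # standard PEP-249 call
    cursor.execute("SELECT 1")         # placeholder query
    connection.close()

async def async_example() -> None:
    logger = AirbyteLogger()
    config = {"username": "<user>", "password": "<pass>", "database": "<db>"}  # hypothetical keys
    connection = await establish_async_connection(config, logger)
    cursor = connection.cursor()
    await cursor.execute("SELECT 1")   # assumed awaitable on the async connection
    # connection cleanup omitted for brevity

sync_example()
asyncio.run(async_example())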
def json_type_to_pyarrow_type(typ: str, reverse: bool = False, logger: AirbyteLogger = AirbyteLogger()) -> str:
    """
    Converts a Json type to a PyArrow type (or the other way around if reverse=True).

    :param typ: Json type if reverse is False, else PyArrow type
    :param reverse: switch to True for PyArrow type -> Json type, defaults to False
    :param logger: defaults to AirbyteLogger()
    :return: PyArrow type if reverse is False, else Json type
    """
    str_typ = str(typ)
    # This is a map of airbyte types to pyarrow types.
    # The first element of each pyarrow tuple is the one to use where a single type is required.
    map = {
        "boolean": ("bool_", "bool"),
        "integer": ("int64", "int8", "int16", "int32", "uint8", "uint16", "uint32", "uint64"),
        "number": ("float64", "float16", "float32", "decimal128", "decimal256", "halffloat", "float", "double"),
        "string": ("large_string", "string"),
        # TODO: support object type rather than coercing to string
        "object": ("large_string",),
        # TODO: support array type rather than coercing to string
        "array": ("large_string",),
        "null": ("large_string",),
    }
    if not reverse:
        for json_type, pyarrow_types in map.items():
            if str_typ.lower() == json_type:
                # better way might be necessary when we decide to handle more type complexity
                return str(getattr(pa, pyarrow_types[0]).__call__())
        logger.debug(f"JSON type '{str_typ}' is not mapped, falling back to default conversion to large_string")
        return str(pa.large_string())
    else:
        for json_type, pyarrow_types in map.items():
            if any(str_typ.startswith(pa_type) for pa_type in pyarrow_types):
                return json_type
        logger.debug(f"PyArrow type '{str_typ}' is not mapped, falling back to default conversion to string")
        return "string"  # default type if unspecified in map
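A short sketch of how the mapping above behaves in both directions, assuming pyarrow is imported as pa in the same module.

import pyarrow as pa

# JSON -> PyArrow: "integer" resolves to the first tuple element, int64.
assert json_type_to_pyarrow_type("integer") == str(pa.int64())
# PyArrow -> JSON (reverse=True): any int*/uint* type maps back to "integer".
assert json_type_to_pyarrow_type("int32", reverse=True) == "integer"
# Unmapped JSON types fall back to large_string.
assert json_type_to_pyarrow_type("unknown") == str(pa.large_string())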
def read(
    self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
) -> Generator[AirbyteMessage, None, None]:
    stream_name = self.parse_queue_name(config["queue_url"])
    logger.debug("Amazon SQS Source Read - stream is: " + stream_name)

    # Required properties
    queue_url = config["queue_url"]
    queue_region = config["region"]
    delete_messages = config["delete_messages"]

    # Optional properties
    max_batch_size = config.get("max_batch_size", 10)
    max_wait_time = config.get("max_wait_time", 20)
    visibility_timeout = config.get("visibility_timeout")
    attributes_to_return = config.get("attributes_to_return")
    if attributes_to_return is None:
        attributes_to_return = ["All"]
    else:
        attributes_to_return = attributes_to_return.split(",")

    # Sensitive properties
    access_key = config["access_key"]
    secret_key = config["secret_key"]

    logger.debug("Amazon SQS Source Read - Creating SQS connection ---")
    session = boto3.Session(aws_access_key_id=access_key, aws_secret_access_key=secret_key, region_name=queue_region)
    sqs = session.resource("sqs")
    queue = sqs.Queue(url=queue_url)
    logger.debug("Amazon SQS Source Read - Connected to SQS Queue ---")

    timed_out = False
    while not timed_out:
        try:
            logger.debug("Amazon SQS Source Read - Beginning message poll ---")
            messages = queue.receive_messages(
                MessageAttributeNames=attributes_to_return, MaxNumberOfMessages=max_batch_size, WaitTimeSeconds=max_wait_time
            )

            if not messages:
                logger.debug("Amazon SQS Source Read - No messages received during poll, time out reached ---")
                timed_out = True
                break

            for msg in messages:
                logger.debug("Amazon SQS Source Read - Message received: " + msg.message_id)

                if visibility_timeout:
                    logger.debug("Amazon SQS Source Read - Setting message visibility timeout: " + msg.message_id)
                    self.change_message_visibility(msg, visibility_timeout)
                    logger.debug("Amazon SQS Source Read - Message visibility timeout set: " + msg.message_id)

                data = {
                    "id": msg.message_id,
                    "body": msg.body,
                    "attributes": msg.message_attributes,
                }

                # TODO: Support a 'BATCH OUTPUT' mode that outputs the full batch in a single AirbyteRecordMessage
                yield AirbyteMessage(
                    type=Type.RECORD,
                    record=AirbyteRecordMessage(stream=stream_name, data=data, emitted_at=int(datetime.now().timestamp()) * 1000),
                )

                if delete_messages:
                    logger.debug("Amazon SQS Source Read - Deleting message: " + msg.message_id)
                    self.delete_message(msg)
                    logger.debug("Amazon SQS Source Read - Message deleted: " + msg.message_id)
                    # TODO: Delete messages in batches to reduce amount of requests?

        except ClientError as error:
            raise Exception("Error in AWS Client: " + str(error))
def check(self, logger: AirbyteLogger, config: json) -> AirbyteConnectionStatus:
    try:
        if "max_batch_size" in config:
            # Max batch size must be between 1 and 10
            if config["max_batch_size"] > 10 or config["max_batch_size"] < 1:
                raise Exception("max_batch_size must be between 1 and 10")
        if "max_wait_time" in config:
            # Max wait time must be between 1 and 20
            if config["max_wait_time"] > 20 or config["max_wait_time"] < 1:
                raise Exception("max_wait_time must be between 1 and 20")

        # Required properties
        queue_url = config["queue_url"]
        logger.debug("Amazon SQS Source Config Check - queue_url: " + queue_url)
        queue_region = config["region"]
        logger.debug("Amazon SQS Source Config Check - region: " + queue_region)

        # Sensitive properties
        access_key = config["access_key"]
        logger.debug("Amazon SQS Source Config Check - access_key (ends with): " + access_key[-1])
        secret_key = config["secret_key"]
        logger.debug("Amazon SQS Source Config Check - secret_key (ends with): " + secret_key[-1])

        logger.debug("Amazon SQS Source Config Check - Starting connection test ---")
        session = boto3.Session(aws_access_key_id=access_key, aws_secret_access_key=secret_key, region_name=queue_region)
        sqs = session.resource("sqs")
        queue = sqs.Queue(url=queue_url)
        if hasattr(queue, "attributes"):
            logger.debug("Amazon SQS Source Config Check - Connection test successful ---")
            return AirbyteConnectionStatus(status=Status.SUCCEEDED)
        else:
            return AirbyteConnectionStatus(status=Status.FAILED, message="Amazon SQS Source Config Check - Could not connect to queue")
    except ClientError as e:
        return AirbyteConnectionStatus(status=Status.FAILED, message=f"Amazon SQS Source Config Check - Error in AWS Client: {str(e)}")
    except Exception as e:
        return AirbyteConnectionStatus(
            status=Status.FAILED, message=f"Amazon SQS Source Config Check - An exception occurred: {str(e)}"
        )
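A minimal end-to-end sketch that exercises check() and read() above. It assumes the methods live on a connector class (SourceAmazonSqs here; the actual name may differ) and uses placeholder credentials and queue URL. Note that this read() implementation never inspects its catalog argument, so None is passed for brevity.

source = SourceAmazonSqs()  # assumed connector class name
logger = AirbyteLogger()
config = {
    "queue_url": "https://sqs.eu-west-1.amazonaws.com/123456789012/example-queue",  # placeholder
    "region": "eu-west-1",
    "delete_messages": False,
    "max_batch_size": 10,
    "max_wait_time": 20,
    "access_key": "<aws-access-key-id>",      # placeholder
    "secret_key": "<aws-secret-access-key>",  # placeholder
}

status = source.check(logger, config)
if status.status == Status.SUCCEEDED:
    # read() above does not use the catalog argument, so None is acceptable here.
    for message in source.read(logger, config, catalog=None, state={}):
        print(message.record.data["body"])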