def insert(self, data):
    """
    Insert given data to Azure Blob Storage

    :param dict data: a dictionary containing the data to be inserted
    :raises ConnectorError: if the required 'blob' field is missing from the input
    :raises RecoverableConnectorError: if the upload fails with an Azure error
    """
    # The field 'blob' must be present in the data. It contains the blob file path to be uploaded.
    if 'blob' not in data:
        raise ConnectorError("Field 'blob' is not given in the input")

    file_path = data['blob']
    filename = os.path.basename(file_path)
    path_prefix = get_variable('PATH_PREFIX', '')
    # Generate the object path by concatenating (optional) path prefix and filename
    object_path = os.path.join(path_prefix, filename)
    try:
        blob_client = self.blob_service_client.get_blob_client(
            container=get_variable('CONTAINER'), blob=object_path)
        # Use a dedicated name for the file handle so it does not shadow the 'data' parameter
        with open(file_path, "rb") as blob_file:
            blob_client.upload_blob(data=blob_file,
                                    timeout=int(get_variable('TIMEOUT', '10')))
    except azure.core.exceptions.AzureError as e:
        raise RecoverableConnectorError(f"Failed to insert blob: {e}")
    logger.info("Blob inserted successfully")
def insert(self, data):
    """
    Insert given data to GCS

    :param dict data: a dictionary containing the data to be inserted
    :raises ConnectorError: if the required 'blob' field is missing from the input
    :raises RecoverableConnectorError: if the upload fails with a Google Cloud error
    """
    # The field 'blob' must be present in the data. It contains the blob file path to be uploaded.
    if 'blob' not in data:
        raise ConnectorError("Field 'blob' is not given in the input")

    file_path = data['blob']
    filename = os.path.basename(file_path)
    # Keep the bucket *name* and the bucket *object* in separate variables instead of
    # rebinding one name to two different things
    bucket_name = get_variable('BUCKET')
    path_prefix = get_variable('PATH_PREFIX', '')
    # Generate the destination blob name by concatenating (optional) path prefix and filename
    object_path = os.path.join(path_prefix, filename)
    try:
        bucket = self.client.bucket(bucket_name=bucket_name)
        blob = bucket.blob(object_path, chunk_size=self.chunk_size)
        blob.upload_from_filename(filename=file_path)
    except exceptions.GoogleCloudError as e:
        raise RecoverableConnectorError(f"Failed to insert blob: {e}")
    logger.info("Blob inserted successfully")
def insert(self, data):
    """
    Insert given data to S3 storage

    :param dict data: a dictionary containing the data to be inserted
    """
    # Refuse requests that do not carry the path of the file to be uploaded
    if 'blob' not in data:
        raise ConnectorError("Field 'blob' is not given in the input")

    source_path = data['blob']
    base_name = os.path.basename(source_path)
    target_bucket = get_variable('BUCKET')
    prefix = get_variable('PATH_PREFIX', '')
    # The object key is the (optional) prefix followed by the file's base name
    destination_key = os.path.join(prefix, base_name)
    try:
        # upload_file accepts a file name, a bucket name, and an object name; it
        # transparently splits large files into chunks and uploads them in parallel
        self.s3_client.upload_file(source_path, target_bucket, destination_key)
    except (botocore.exceptions.ClientError,
            botocore.exceptions.BotoCoreError) as e:
        raise RecoverableConnectorError(f"Failed to insert blob: {e}")
    except boto3.exceptions.S3UploadFailedError as e:
        raise ConnectorError(f"Failed to insert blob: {e}")
    logger.info("Blob inserted successfully")
def connect(self):
    """
    Connect to RabbitMQ

    :return: a new connection object to RabbitMQ
    """
    # Build the credentials separately for readability
    credentials = pika.PlainCredentials(username=get_variable('USERNAME'),
                                        password=get_variable('PASSWORD'))
    parameters = pika.ConnectionParameters(
        host=get_variable('HOST'),
        port=int(get_variable('PORT')),
        credentials=credentials,
        virtual_host=get_variable('VIRTUAL_HOST', '/'),
        socket_timeout=int(get_variable('TIMEOUT', '10')),
        blocked_connection_timeout=int(get_variable('TIMEOUT', '10')),
        stack_timeout=int(get_variable('TIMEOUT', '10')),
        heartbeat=int(get_variable('TIMEOUT', '10')))
    try:
        self.connection = pika.BlockingConnection(parameters)
    except pika.exceptions.ProbableAuthenticationError as e:
        raise ConnectorError(
            f"Failed to authenticate at RabbitMQ server: {e}")
    except (socket.gaierror, pika.exceptions.AMQPError) as e:
        raise RecoverableConnectorError(
            f"Failed to connect to RabbitMQ: {e}")
def insert(self, data):
    """
    Insert given data to Kinesis stream

    :param dict data: a dictionary containing the data to be inserted
    :raises RecoverableConnectorError: if putting the record on the stream fails
    """
    # Convert the input data dictionary to a single Kinesis record. Generate the partition key randomly.
    record = {
        'Data': json.dumps(data),
        'PartitionKey': ''.join(
            random.sample(
                string.ascii_uppercase + string.ascii_lowercase + string.digits,
                16))
    }
    try:
        self.kinesis_client.put_record(StreamName=get_variable('STREAM'),
                                       Data=record['Data'],
                                       PartitionKey=record['PartitionKey'])
    except (botocore.exceptions.ClientError,
            botocore.exceptions.BotoCoreError) as e:
        # Only ClientError carries the structured 'response' payload;
        # BotoCoreError has no 'response' attribute, so accessing it would
        # raise AttributeError inside the handler. Fall back to str(e).
        if isinstance(e, botocore.exceptions.ClientError):
            message = e.response['Error']['Message']
        else:
            message = str(e)
        raise RecoverableConnectorError(f"Failed to insert record: {message}")
    logger.info("Data inserted successfully")
def setup(self):
    """
    Connect to Amazon Kinesis Stream

    :return: boto3.client client: the client representing the Kinesis stream
    """
    # Gather client settings in one place before constructing the client
    client_settings = {
        'service_name': 'kinesis',
        'region_name': get_variable('REGION'),
        'aws_access_key_id': get_variable('ACCESS_KEY'),
        'aws_secret_access_key': get_variable('SECRET_KEY'),
    }
    try:
        return boto3.client(**client_settings)
    except (botocore.exceptions.ClientError,
            botocore.exceptions.BotoCoreError) as e:
        raise ConnectorError(f"Failed to initialise Kinesis client {e}")
def setup(self):
    """
    Setup the S3 client

    :return: boto3.client client: the client representing the S3 storage
    """
    # Gather client settings in one place before constructing the client
    client_settings = {
        'service_name': 's3',
        'region_name': get_variable('REGION'),
        'aws_access_key_id': get_variable('ACCESS_KEY'),
        'aws_secret_access_key': get_variable('SECRET_KEY'),
    }
    try:
        return boto3.client(**client_settings)
    except (botocore.exceptions.ClientError,
            botocore.exceptions.BotoCoreError) as e:
        raise ConnectorError(f"Failed to initialise S3 client {e}")
def connect(self):
    """
    Connect to InfluxDB server database

    :return: connection: a new connection object to the database
    """
    self.connection = None
    try:
        # Build the client first, then derive the synchronous write API from it;
        # the write API object is what the connector uses as its 'connection'
        client = influxdb_client.InfluxDBClient(url=get_variable('URL'),
                                                token=get_variable('TOKEN'),
                                                org=get_variable('ORGANIZATION'),
                                                debug=False)
        self.connection = client.write_api(write_options=SYNCHRONOUS)
    except urllib3.exceptions.HTTPError as e:
        raise RecoverableConnectorError(
            f"Failed to connect to database: {e}")
def insert(self, data): """ Insert given data to InfluxDB bucket. If the insert fails due to lost database connection, retry inserting. :param dict data: a dictionary containing the data to be inserted """ # Including the connect in the insert method such that it can benefit from retrying if not self.connection: self.connect() point_values = {} if 'time' in data: point_values['time'] = data['time'] if 'tags' in data and data['tags']: point_values['tags'] = self._extract_elements( elements_csv=data['tags']) try: point_values['measurement'] = data['measurement'] fields = self._extract_elements(elements_csv=data['fields'], elements_type='fields') if fields: point_values['fields'] = fields else: raise ConnectorError( "Request input field named 'fields' must contain at least one element" ) point = influxdb_client.Point.from_dict( dictionary=point_values, write_precision=self._match_write_precision(data)) self.connection.write(get_variable('BUCKET'), record=point) except dateutil.parser._parser.ParserError: self.connection = None raise ConnectorError( f"Failed to insert data: request input field 'time' is wrongly formatted" ) except KeyError as e: self.connection = None raise ConnectorError( f"Failed to insert data: request data missing required input field named {e}" ) except influxdb_client.rest.ApiException as e: self.connection = None raise ConnectorError(f"Failed to insert data: {e}") logger.info("Data inserted successfully")
def connect(self):
    """
    Connect to MSSQL database

    :return: connection: a new connection object to the database
    """
    self.connection = None
    db_user = get_variable('USERNAME')
    db_password = get_variable('PASSWORD')
    db_host = get_variable('HOST')
    db_port = get_variable('PORT', '1433')
    db_name = get_variable('DATABASE')
    login_timeout = get_variable('TIMEOUT', '60')
    try:
        url = (f"mssql+pyodbc://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}?"
               "driver=ODBC Driver 17 for SQL Server")
        # fast_executemany speeds up the insertion up to a 100-fold
        engine = sqlalchemy.create_engine(
            url=url,
            fast_executemany=True,
            connect_args={"timeout": int(login_timeout)})
        self.connection = engine.connect()
    except (sqlalchemy.exc.OperationalError,
            sqlalchemy.exc.InterfaceError) as e:
        raise RecoverableConnectorError(
            f"Failed to connect to database: {e}")
def setup(self):
    """
    Setup the Azure Blob Storage client

    :return: azure.storage.blob.BlobServiceClient client: a blob service client
    """
    # The connection string carries account, key, and endpoint information
    connection_string = get_variable('CONNECTION_STRING', '')
    try:
        return azure.storage.blob.BlobServiceClient.from_connection_string(
            conn_str=connection_string)
    except azure.core.exceptions.AzureError as e:
        raise ConnectorError(
            f"Failed to initialise Azure Storage client: {e}")
def setup(self):
    """
    Setup the GCS client

    :return: storage.Client client: the client which is created with the retrieved JSON credentials
    """
    try:
        # Persist the service-account key to the prepared temp file so the
        # client library can read the credentials from disk
        key_bytes = bytearray(get_variable('JSON_KEY_FILE'), 'utf-8')
        self.key_file.write(key_bytes)
        self.key_file.flush()
        return storage.Client.from_service_account_json(
            json_credentials_path=self.key_file.name)
    except (json.decoder.JSONDecodeError, TypeError, ValueError) as e:
        raise ConnectorError(f"Failed to initialise GCS client: {e}")
def connect(self):
    """
    Connect to PostgreSQL database

    :return: connection: a new connection object to the database
    """
    self.connection = None
    try:
        # Collect the connection settings in one mapping before dialing out
        connection_settings = {
            'host': get_variable('HOST'),
            'port': int(get_variable('PORT', '5432')),
            'user': get_variable('USERNAME'),
            'password': get_variable('PASSWORD'),
            'database': get_variable('DATABASE'),
            'connect_timeout': int(get_variable('TIMEOUT', '10')),
        }
        self.connection = psycopg2.connect(**connection_settings)
    except psycopg2.Error as e:
        raise RecoverableConnectorError(f"Failed to connect to database: {e}")
def connect(self):
    """
    Connect to MySQL database

    :return: connection: a new connection object to the database
    """
    self.connection = None
    try:
        # Name each setting before passing it on, for easier debugging
        db_host = get_variable('HOST')
        db_port = int(get_variable('PORT', '3306'))
        db_user = get_variable('USERNAME')
        db_password = get_variable('PASSWORD')
        db_name = get_variable('DATABASE')
        timeout_seconds = int(get_variable('TIMEOUT', '10'))
        self.connection = MySQLdb.connect(host=db_host,
                                          port=db_port,
                                          user=db_user,
                                          password=db_password,
                                          database=db_name,
                                          connect_timeout=timeout_seconds)
    except (MySQLdb.OperationalError, MySQLdb.Error) as e:
        raise RecoverableConnectorError(
            f"Failed to connect to database: {e}")
def retrieve(self):
    """
    Retrieve data from RabbitMQ, maximum MAX_OUTPUT messages at a time. Retry when failing due to a lost
    connection.

    :return dict|list: dictionary with the values expected as output of the deployment, or a list of those
        dictionaries
    :raises RecoverableConnectorError: if an AMQP operation fails before any message was collected
    """
    # Including the connect in the retrieve method such that it can benefit from retrying
    if not self.connection:
        self.connect()

    data = []

    def _on_amqp_error(e, action):
        # Shared handling for AMQP failures (previously duplicated verbatim for
        # the get and the ack paths, with the get path logging the wrong action).
        # If we already collected some messages that were acknowledged, we need to
        # return those now - otherwise these would be lost.
        if len(data) > 0:
            logger.warning(
                f"Failed to {action}: {e}. Returning already collected messages"
            )
            return True  # signal the caller to stop and return the partial batch
        # If nothing was collected yet, drop the connection and raise to retry
        self.connection = None
        raise RecoverableConnectorError(f"Failed to {action}: {e}")

    # Collect MAX_OUTPUT messages in one call at most
    for _ in range(int(get_variable('MAX_OUTPUT', '50'))):
        try:
            channel = self.connection.channel()
            method_frame, header_frame, body = channel.basic_get(
                'input_connector')
        except pika.exceptions.AMQPError as e:
            if _on_amqp_error(e, "retrieve message from RabbitMQ"):
                break
        if not method_frame:
            # No more messages available, end the loop
            break
        try:
            channel.basic_ack(method_frame.delivery_tag)
        except pika.exceptions.AMQPError as e:
            if _on_amqp_error(e, "acknowledge message"):
                break
        data.append(body.decode('utf-8'))

    logger.info(f"Retrieved {len(data)} rows")
    return data