Example No. 1
    def insert(self, data):
        """
        Insert the given data into Azure Blob Storage

        :param dict data: a dictionary containing the data to be inserted
        """

        # The field 'blob' must be present in the data. It contains the blob file path to be uploaded.
        if 'blob' not in data:
            raise ConnectorError("Field 'blob' is not given in the input")

        file_path = data['blob']
        filename = os.path.basename(file_path)

        path_prefix = get_variable('PATH_PREFIX', '')

        # Generate the object path by concatenating (optional) path prefix and filename
        object_path = os.path.join(path_prefix, filename)

        try:
            blob_client = self.blob_service_client.get_blob_client(
                container=get_variable('CONTAINER'), blob=object_path)

            with open(file_path, "rb") as file_obj:
                blob_client.upload_blob(
                    data=file_obj,
                    timeout=int(get_variable('TIMEOUT', '10')))

        except azure.core.exceptions.AzureError as e:
            raise RecoverableConnectorError(f"Failed to insert blob: {e}")

        logger.info("Blob inserted successfully")
Example No. 2
    def insert(self, data):
        """
        Insert the given data into GCS

        :param dict data: a dictionary containing the data to be inserted
        """

        # The field 'blob' must be present in the data. It contains the blob file path to be uploaded.
        if 'blob' not in data:
            raise ConnectorError("Field 'blob' is not given in the input")

        file_path = data['blob']
        filename = os.path.basename(file_path)

        bucket = get_variable('BUCKET')
        path_prefix = get_variable('PATH_PREFIX', '')

        # Generate the destination blob name by concatenating (optional) path prefix and filename
        object_path = os.path.join(path_prefix, filename)

        try:
            bucket = self.client.bucket(bucket_name=bucket)
            blob = bucket.blob(object_path, chunk_size=self.chunk_size)
            blob.upload_from_filename(filename=file_path)

        except exceptions.GoogleCloudError as e:
            raise RecoverableConnectorError(f"Failed to insert blob: {e}")

        logger.info("Blob inserted successfully")
Example No. 3
    def insert(self, data):
        """
        Insert the given data into S3 storage

        :param dict data: a dictionary containing the data to be inserted
        """

        # The field 'blob' must be present in the data. It contains the blob file path to be uploaded.
        if 'blob' not in data:
            raise ConnectorError("Field 'blob' is not given in the input")

        file_path = data['blob']
        filename = os.path.basename(file_path)

        bucket = get_variable('BUCKET')
        path_prefix = get_variable('PATH_PREFIX', '')

        # Generate the object path by concatenating (optional) path prefix and filename
        object_path = os.path.join(path_prefix, filename)

        try:
            # The upload_file method accepts a file name, a bucket name, and an object name. The method handles large
            # files by splitting them into smaller chunks and uploading each chunk in parallel.
            self.s3_client.upload_file(file_path, bucket, object_path)

        except (botocore.exceptions.ClientError, botocore.exceptions.BotoCoreError) as e:
            raise RecoverableConnectorError(f"Failed to insert blob: {e}")
        except boto3.exceptions.S3UploadFailedError as e:
            raise ConnectorError(f"Failed to insert blob: {e}")

        logger.info("Blob inserted successfully")
Example No. 4
    def connect(self):
        """
        Connect to RabbitMQ

        :return: a new connection object to RabbitMQ
        """

        parameters = pika.ConnectionParameters(
            host=get_variable('HOST'),
            port=int(get_variable('PORT')),
            credentials=pika.PlainCredentials(
                username=get_variable('USERNAME'),
                password=get_variable('PASSWORD')),
            virtual_host=get_variable('VIRTUAL_HOST', '/'),
            socket_timeout=int(get_variable('TIMEOUT', '10')),
            blocked_connection_timeout=int(get_variable('TIMEOUT', '10')),
            stack_timeout=int(get_variable('TIMEOUT', '10')),
            heartbeat=int(get_variable('TIMEOUT', '10')))

        try:
            self.connection = pika.BlockingConnection(parameters)
        except pika.exceptions.ProbableAuthenticationError as e:
            raise ConnectorError(
                f"Failed to authenticate at RabbitMQ server: {e}")
        except (socket.gaierror, pika.exceptions.AMQPError) as e:
            raise RecoverableConnectorError(
                f"Failed to connect to RabbitMQ: {e}")
Example No. 5
    def insert(self, data):
        """
        Insert the given data into the Kinesis stream

        :param dict data: a dictionary containing the data to be inserted
        """

        # Convert the input data dictionary to a single Kinesis record. Generate the partition key randomly.
        record = {
            'Data': json.dumps(data),
            'PartitionKey': ''.join(random.sample(
                string.ascii_uppercase + string.ascii_lowercase + string.digits, 16))
        }

        try:
            self.kinesis_client.put_record(StreamName=get_variable('STREAM'),
                                           Data=record['Data'],
                                           PartitionKey=record['PartitionKey'])

        except (botocore.exceptions.ClientError,
                botocore.exceptions.BotoCoreError) as e:
            # BotoCoreError has no 'response' attribute, so format the exception itself
            raise RecoverableConnectorError(f"Failed to insert record: {e}")

        logger.info("Data inserted successfully")
Example No. 6
    def setup(self):
        """
        Connect to Amazon Kinesis Stream

        :return: boto3.client client: the client representing the Kinesis stream
        """

        try:
            return boto3.client(
                service_name='kinesis',
                region_name=get_variable('REGION'),
                aws_access_key_id=get_variable('ACCESS_KEY'),
                aws_secret_access_key=get_variable('SECRET_KEY'))
        except (botocore.exceptions.ClientError,
                botocore.exceptions.BotoCoreError) as e:
            raise ConnectorError(f"Failed to initialise Kinesis client {e}")
Example No. 7
    def setup(self):
        """
        Setup the S3 client

        :return: boto3.client client: the client representing the S3 storage
        """

        try:
            return boto3.client(
                service_name='s3',
                region_name=get_variable('REGION'),
                aws_access_key_id=get_variable('ACCESS_KEY'),
                aws_secret_access_key=get_variable('SECRET_KEY')
            )
        except (botocore.exceptions.ClientError, botocore.exceptions.BotoCoreError) as e:
            raise ConnectorError(f"Failed to initialise S3 client {e}")
Example No. 8
    def connect(self):
        """
        Connect to the InfluxDB server

        :return: connection: a new connection object to the database
        """

        self.connection = None

        try:
            self.connection = influxdb_client.InfluxDBClient(
                url=get_variable('URL'),
                token=get_variable('TOKEN'),
                org=get_variable('ORGANIZATION'),
                debug=False).write_api(write_options=SYNCHRONOUS)

        except urllib3.exceptions.HTTPError as e:
            raise RecoverableConnectorError(
                f"Failed to connect to database: {e}")
Example No. 9
    def insert(self, data):
        """
        Insert the given data into the InfluxDB bucket. If the insert fails due to a lost database connection, retry the insert.

        :param dict data: a dictionary containing the data to be inserted
        """

        # Connect inside the insert method so that the connection attempt also benefits from retrying
        if not self.connection:
            self.connect()

        point_values = {}

        if 'time' in data:
            point_values['time'] = data['time']

        if 'tags' in data and data['tags']:
            point_values['tags'] = self._extract_elements(
                elements_csv=data['tags'])

        try:
            point_values['measurement'] = data['measurement']

            fields = self._extract_elements(elements_csv=data['fields'],
                                            elements_type='fields')
            if fields:
                point_values['fields'] = fields
            else:
                raise ConnectorError(
                    "Request input field named 'fields' must contain at least one element"
                )

            point = influxdb_client.Point.from_dict(
                dictionary=point_values,
                write_precision=self._match_write_precision(data))
            self.connection.write(get_variable('BUCKET'), record=point)

        except dateutil.parser.ParserError:
            self.connection = None
            raise ConnectorError(
                "Failed to insert data: request input field 'time' is incorrectly formatted"
            )

        except KeyError as e:
            self.connection = None
            raise ConnectorError(
                f"Failed to insert data: request data missing required input field named {e}"
            )

        except influxdb_client.rest.ApiException as e:
            self.connection = None
            raise ConnectorError(f"Failed to insert data: {e}")

        logger.info("Data inserted successfully")
Example No. 10
    def connect(self):
        """
        Connect to MSSQL database

        :return: connection: a new connection object to the database
        """

        self.connection = None

        username = get_variable('USERNAME')
        password = get_variable('PASSWORD')
        host = get_variable('HOST')
        port = get_variable('PORT', '1433')
        database = get_variable('DATABASE')
        timeout = get_variable('TIMEOUT', '60')

        try:
            connection_string = f"mssql+pyodbc://{username}:{password}@{host}:{port}/{database}?" \
                "driver=ODBC Driver 17 for SQL Server"
            # fast_executemany speeds up bulk inserts by up to 100-fold
            self.connection = sqlalchemy.create_engine(url=connection_string,
                                                       fast_executemany=True,
                                                       connect_args={
                                                           "timeout":
                                                           int(timeout)
                                                       }).connect()
        except (sqlalchemy.exc.OperationalError,
                sqlalchemy.exc.InterfaceError) as e:
            raise RecoverableConnectorError(
                f"Failed to connect to database: {e}")
Example No. 11
    def setup(self):
        """
        Setup the Azure Blob Storage client

        :return: azure.storage.blob.BlobServiceClient client: a blob service client
        """

        try:
            return azure.storage.blob.BlobServiceClient.from_connection_string(
                conn_str=get_variable('CONNECTION_STRING', ''))
        except azure.core.exceptions.AzureError as e:
            raise ConnectorError(
                f"Failed to initialise Azure Storage client: {e}")
Example No. 12
    def setup(self):
        """
        Setup the GCS client

        :return: storage.Client client: the client which is created with the retrieved JSON credentials
        """

        try:
            self.key_file.write(
                bytearray(get_variable('JSON_KEY_FILE'), 'utf-8'))
            self.key_file.flush()
            return storage.Client.from_service_account_json(
                json_credentials_path=self.key_file.name)
        except (json.decoder.JSONDecodeError, TypeError, ValueError) as e:
            raise ConnectorError(f"Failed to initialise GCS client: {e}")
Example No. 13
    def connect(self):
        """
        Connect to PostgreSQL database

        :return: connection: a new connection object to the database
        """

        self.connection = None

        try:
            self.connection = psycopg2.connect(
                host=get_variable('HOST'),
                port=int(get_variable('PORT', '5432')),
                user=get_variable('USERNAME'),
                password=get_variable('PASSWORD'),
                database=get_variable('DATABASE'),
                connect_timeout=int(get_variable('TIMEOUT', '10'))
            )
        except psycopg2.Error as e:
            raise RecoverableConnectorError(f"Failed to connect to database: {e}")
Example No. 14
    def connect(self):
        """
        Connect to MySQL database

        :return: connection: a new connection object to the database
        """

        self.connection = None

        try:
            self.connection = MySQLdb.connect(
                host=get_variable('HOST'),
                port=int(get_variable('PORT', '3306')),
                user=get_variable('USERNAME'),
                password=get_variable('PASSWORD'),
                database=get_variable('DATABASE'),
                connect_timeout=int(get_variable('TIMEOUT', '10')),
            )
        except (MySQLdb.OperationalError, MySQLdb.Error) as e:
            raise RecoverableConnectorError(
                f"Failed to connect to database: {e}")
Example No. 15
    def retrieve(self):
        """
        Retrieve data from RabbitMQ, at most MAX_OUTPUT messages at a time. Retry when failing due to a lost
        connection.

        :return dict|list: dictionary with the values expected as output of the deployment, or a list of those
            dictionaries
        """

        # Connect inside the retrieve method so that the connection attempt also benefits from retrying
        if not self.connection:
            self.connect()

        data = []

        # Collect MAX_OUTPUT messages in one call at most
        for _ in range(0, int(get_variable('MAX_OUTPUT', '50'))):
            try:
                channel = self.connection.channel()
                method_frame, header_frame, body = channel.basic_get(
                    'input_connector')
            except pika.exceptions.AMQPError as e:
                # If we already collected some messages that were acknowledged, we need to return those now.
                # Otherwise these will be lost
                if len(data) > 0:
                    logger.warning(
                        f"Failed to acknowledge message: {e}. Returning already collecting messages"
                    )
                    break

                # If nothing was collected yet, we raise an exception to retry
                self.connection = None
                raise RecoverableConnectorError(
                    f"Failed to retrieve message from RabbitMQ: {e}")

            if method_frame:
                try:
                    channel.basic_ack(method_frame.delivery_tag)
                except pika.exceptions.AMQPError as e:
                    # If we already collected some messages that were acknowledged, we need to return those now.
                    # Otherwise these will be lost
                    if len(data) > 0:
                        logger.warning(
                            f"Failed to acknowledge message: {e}. Returning already collecting messages"
                        )
                        break

                    # If nothing was collected yet, we raise an exception to retry
                    self.connection = None
                    raise RecoverableConnectorError(
                        f"Failed to acknowledge message: {e}")

                data.append(body.decode('utf-8'))

            else:
                # No more messages available, end the loop
                break

        logger.info(f"Retrieved {len(data)} rows")

        return data
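
For local testing, messages can be placed on the queue that retrieve() reads ('input_connector') with plain pika. A hedged sketch, assuming a RabbitMQ broker on localhost with default credentials:

import pika

connection = pika.BlockingConnection(pika.ConnectionParameters(host='localhost'))
channel = connection.channel()
channel.queue_declare(queue='input_connector')

# Publish a few JSON bodies; retrieve() will return them as UTF-8 strings
for i in range(3):
    channel.basic_publish(exchange='', routing_key='input_connector',
                          body=f'{{"message": {i}}}')

connection.close()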