Exemplo n.º 1
0
    def insert(self, data):
        """
        Upload a blob file to Google Cloud Storage.

        :param dict data: input payload; must contain the key 'blob' holding
            the local path of the file to upload
        :raises ConnectorError: when the 'blob' field is missing
        :raises RecoverableConnectorError: when the upload fails with a GCS error
        """

        # The 'blob' field carries the local path of the file to upload
        try:
            file_path = data['blob']
        except KeyError:
            raise ConnectorError("Field 'blob' is not given in the input")

        # Destination name: optional PATH_PREFIX followed by the bare filename
        object_path = os.path.join(get_variable('PATH_PREFIX', ''),
                                   os.path.basename(file_path))

        try:
            target_bucket = self.client.bucket(bucket_name=get_variable('BUCKET'))
            blob = target_bucket.blob(object_path, chunk_size=self.chunk_size)
            blob.upload_from_filename(filename=file_path)

        except exceptions.GoogleCloudError as e:
            raise RecoverableConnectorError(f"Failed to insert blob: {e}")

        logger.info("Blob inserted successfully")
Exemplo n.º 2
0
    def insert(self, data):
        """
        Insert given data to MySQL

        :param dict data: a dictionary containing the data to be inserted; keys
            are column names, values are the row values
        :raises RecoverableConnectorError: when the connection was lost (retryable)
        :raises ConnectorError: on any other database error
        """

        # Including the connect in the insert method such that it can benefit from retrying
        if not self.connection:
            self.connect()

        # Construct the insert query. NOTE(review): column names are interpolated
        # straight into the SQL text — safe only if the input keys are trusted.
        columns = ", ".join(data.keys())
        values = ", ".join(["%s" for _ in data])
        query = f"INSERT INTO {get_variable('TABLE')} ({columns}) " \
                f"VALUES ({values})"

        params = tuple(data.values())

        try:
            cursor = self.connection.cursor()
            try:
                cursor.execute(query, params)
                self.connection.commit()
            finally:
                # Close the cursor even when execute/commit fails (the original
                # leaked it on error); a close error on a broken connection is
                # still caught by the handlers below
                cursor.close()

        except MySQLdb.OperationalError as e:
            # Lost/broken connection: drop it so the next attempt reconnects
            self.connection = None
            raise RecoverableConnectorError(f"Failed to insert data: {e}")

        except MySQLdb.Error as e:
            self.connection = None
            raise ConnectorError(f"Failed to insert data: {e}")

        logger.info("Data inserted successfully")
Exemplo n.º 3
0
    def insert(self, data):
        """
        Insert given data to Kinesis stream

        :param dict data: a dictionary containing the data to be inserted
        :raises RecoverableConnectorError: when the put_record call fails
        """

        # Convert the input data dictionary to a single Kinesis record. Generate the partition key randomly
        # (16 mixed-case alphanumeric characters).
        record = {
            'Data':
            json.dumps(data),
            'PartitionKey':
            ''.join(
                random.sample(
                    string.ascii_uppercase + string.ascii_lowercase +
                    string.digits, 16))
        }

        try:
            self.kinesis_client.put_record(StreamName=get_variable('STREAM'),
                                           Data=record['Data'],
                                           PartitionKey=record['PartitionKey'])

        except botocore.exceptions.ClientError as e:
            # ClientError carries a structured AWS error payload
            raise RecoverableConnectorError(
                f"Failed to insert record: {e.response['Error']['Message']}")

        except botocore.exceptions.BotoCoreError as e:
            # BotoCoreError has no 'response' attribute; the original combined
            # handler accessed it and raised AttributeError, masking the real
            # failure — fall back to the exception's own string form
            raise RecoverableConnectorError(f"Failed to insert record: {e}")

        logger.info("Data inserted successfully")
Exemplo n.º 4
0
    def connect(self):
        """
        Open a blocking connection to RabbitMQ.

        :return: a new connection object to RabbitMQ
        :raises ConnectorError: when authentication is rejected
        :raises RecoverableConnectorError: on DNS/transport failures (retryable)
        """

        # A single TIMEOUT setting drives every pika timeout and the heartbeat
        timeout = int(get_variable('TIMEOUT', '10'))
        credentials = pika.PlainCredentials(username=get_variable('USERNAME'),
                                            password=get_variable('PASSWORD'))

        parameters = pika.ConnectionParameters(
            host=get_variable('HOST'),
            port=int(get_variable('PORT')),
            credentials=credentials,
            virtual_host=get_variable('VIRTUAL_HOST', '/'),
            socket_timeout=timeout,
            blocked_connection_timeout=timeout,
            stack_timeout=timeout,
            heartbeat=timeout)

        try:
            self.connection = pika.BlockingConnection(parameters)
        except pika.exceptions.ProbableAuthenticationError as e:
            raise ConnectorError(
                f"Failed to authenticate at RabbitMQ server: {e}")
        except (socket.gaierror, pika.exceptions.AMQPError) as e:
            raise RecoverableConnectorError(
                f"Failed to connect to RabbitMQ: {e}")
Exemplo n.º 5
0
    def insert(self, data):
        """
        Insert given data to Azure Blob Storage

        :param dict data: a dictionary containing the data to be inserted
        :raises ConnectorError: when the 'blob' field is missing
        :raises RecoverableConnectorError: when the upload fails with an Azure error
        """

        # The field 'blob' must be present in the data. It contains the blob file path to be uploaded.
        if 'blob' not in data:
            raise ConnectorError("Field 'blob' is not given in the input")

        file_path = data['blob']
        filename = os.path.basename(file_path)

        path_prefix = get_variable('PATH_PREFIX', '')

        # Generate the object path by concatenating (optional) path prefix and filename
        object_path = os.path.join(path_prefix, filename)

        try:
            blob_client = self.blob_service_client.get_blob_client(
                container=get_variable('CONTAINER'), blob=object_path)

            # Use a dedicated name for the file handle — the original shadowed
            # the 'data' parameter here, making the input inaccessible inside
            # the with-block and inviting bugs
            with open(file_path, "rb") as stream:
                blob_client.upload_blob(data=stream,
                                        timeout=int(
                                            get_variable('TIMEOUT', '10')))

        except azure.core.exceptions.AzureError as e:
            raise RecoverableConnectorError(f"Failed to insert blob: {e}")

        logger.info("Blob inserted successfully")
Exemplo n.º 6
0
    def insert(self, data):
        """
        Upload a blob file to S3 storage.

        :param dict data: input payload; must contain the key 'blob' holding
            the local path of the file to upload
        :raises ConnectorError: when the 'blob' field is missing or the upload
            itself fails
        :raises RecoverableConnectorError: on retryable client/transport errors
        """

        # The input must name the file to upload under the 'blob' key
        if 'blob' not in data:
            raise ConnectorError("Field 'blob' is not given in the input")

        source_path = data['blob']

        # Destination key: optional PATH_PREFIX followed by the bare filename
        destination_key = os.path.join(get_variable('PATH_PREFIX', ''),
                                       os.path.basename(source_path))

        try:
            # upload_file transparently splits large files into chunks and
            # uploads the chunks in parallel
            self.s3_client.upload_file(source_path, get_variable('BUCKET'),
                                       destination_key)

        except (botocore.exceptions.ClientError, botocore.exceptions.BotoCoreError) as e:
            raise RecoverableConnectorError(f"Failed to insert blob: {e}")
        except boto3.exceptions.S3UploadFailedError as e:
            raise ConnectorError(f"Failed to insert blob: {e}")

        logger.info("Blob inserted successfully")
Exemplo n.º 7
0
    def insert(self, data):
        """
        Insert given data to MsSQL

        :param dict data: a dictionary containing the data to be inserted; keys
            are column names, values are the row values
        :raises RecoverableConnectorError: on statement-level errors (retryable)
        :raises ConnectorError: on any other failure
        """

        # Including the connect in the insert method such that it can benefit from retrying
        if not self.connection:
            self.connect()

        # Construct the insert query with named bind parameters. NOTE(review):
        # column names are interpolated straight into the SQL text — safe only
        # if the input keys are trusted.
        columns = ", ".join(data.keys())
        values = ", ".join([f":{col}" for col in data.keys()])
        query = f"INSERT INTO {get_variable('SCHEMA')}.{get_variable('TABLE')} ({columns}) " \
                f"VALUES ({values})"

        try:
            self.connection.execute(sqlalchemy.text(query), **data)

        except (sqlalchemy.exc.ProgrammingError, sqlalchemy.exc.InvalidRequestError, sqlalchemy.exc.StatementError) \
                as e:
            self.connection = None
            raise RecoverableConnectorError(f"Failed to insert data: {e}")

        except Exception as e:
            # Catch-all: the original listed (DatabaseError, Exception), which
            # is equivalent to Exception alone since DatabaseError subclasses
            # it. Drop the connection so the next insert reconnects.
            self.connection = None
            raise ConnectorError(f"Failed to insert data: {e}")

        logger.info("Data inserted successfully")
Exemplo n.º 8
0
    def connect(self):
        """
        Connect to MSSQL database

        :return: connection: a new connection object to the database
        :raises RecoverableConnectorError: when the connection cannot be
            established (retryable)
        """

        self.connection = None

        # quote_plus escapes URL-reserved characters (e.g. '@', ':', '/') in
        # the credentials; without it such characters corrupt the SQLAlchemy
        # connection URL
        from urllib.parse import quote_plus

        username = quote_plus(get_variable('USERNAME'))
        password = quote_plus(get_variable('PASSWORD'))
        host = get_variable('HOST')
        port = get_variable('PORT', '1433')
        database = get_variable('DATABASE')
        timeout = get_variable('TIMEOUT', '60')

        try:
            connection_string = f"mssql+pyodbc://{username}:{password}@{host}:{port}/{database}?" \
                "driver=ODBC Driver 17 for SQL Server"
            # fast_executemany speeds up the insertion up to a 100-fold
            self.connection = sqlalchemy.create_engine(url=connection_string,
                                                       fast_executemany=True,
                                                       connect_args={
                                                           "timeout":
                                                           int(timeout)
                                                       }).connect()
        except (sqlalchemy.exc.OperationalError,
                sqlalchemy.exc.InterfaceError) as e:
            raise RecoverableConnectorError(
                f"Failed to connect to database: {e}")
Exemplo n.º 9
0
    def connect(self):
        """
        Connect to InfluxDB server database

        :return: connection: a new connection object to the database
        :raises RecoverableConnectorError: when the server cannot be reached
        """

        self.connection = None

        try:
            # Build the client first, then ask it for a synchronous write API;
            # the write API object is what the connector uses as "connection"
            client = influxdb_client.InfluxDBClient(
                url=get_variable('URL'),
                token=get_variable('TOKEN'),
                org=get_variable('ORGANIZATION'),
                debug=False)
            self.connection = client.write_api(write_options=SYNCHRONOUS)

        except urllib3.exceptions.HTTPError as e:
            raise RecoverableConnectorError(
                f"Failed to connect to database: {e}")
Exemplo n.º 10
0
    def connect(self):
        """
        Connect to PostgreSQL database

        :return: connection: a new connection object to the database
        :raises RecoverableConnectorError: when the connection cannot be
            established (retryable)
        """

        self.connection = None

        # Gather all connection settings from the environment in one place
        settings = {
            'host': get_variable('HOST'),
            'port': int(get_variable('PORT', '5432')),
            'user': get_variable('USERNAME'),
            'password': get_variable('PASSWORD'),
            'database': get_variable('DATABASE'),
            'connect_timeout': int(get_variable('TIMEOUT', '10')),
        }

        try:
            self.connection = psycopg2.connect(**settings)
        except psycopg2.Error as e:
            raise RecoverableConnectorError(f"Failed to connect to database: {e}")
Exemplo n.º 11
0
    def connect(self):
        """
        Connect to MySQL database

        :return: connection: a new connection object to the database
        :raises RecoverableConnectorError: when the connection cannot be
            established (retryable)
        """

        self.connection = None

        try:
            self.connection = MySQLdb.connect(
                host=get_variable('HOST'),
                port=int(get_variable('PORT', '3306')),
                user=get_variable('USERNAME'),
                password=get_variable('PASSWORD'),
                database=get_variable('DATABASE'),
                connect_timeout=int(get_variable('TIMEOUT', '10')),
            )
        except MySQLdb.Error as e:
            # MySQLdb.OperationalError subclasses MySQLdb.Error, so the
            # original two-member tuple was redundant — Error alone covers both
            raise RecoverableConnectorError(
                f"Failed to connect to database: {e}")
Exemplo n.º 12
0
    def insert(self, data):
        """
        Insert given data as a single row into BigQuery.

        :param dict data: a dictionary containing the data to be inserted
        :raises ConnectorError: when the table is missing or the request is invalid
        :raises RecoverableConnectorError: when BigQuery reports row-level
            insert errors (retryable)
        """

        # Fully-qualified target table: <dataset>.<table>
        table_name = f"{get_variable('DATASET')}.{get_variable('TABLE')}"

        try:
            errors = self.client.insert_rows(
                table=self.client.get_table(table_name), rows=[data])
        except NotFound:
            raise ConnectorError(f"Table {table_name} was not found")
        except (ValueError, TypeError, GoogleAPIError) as e:
            raise ConnectorError(f"Failed to insert data: {e}")

        # insert_rows returns a list of per-row error payloads; non-empty means
        # the insert (partially) failed
        if errors:
            message = errors[0]['errors'][0]['message']
            raise RecoverableConnectorError(
                f"Failed to insert data: {message}")

        logger.info("Data inserted successfully")
Exemplo n.º 13
0
    def retrieve(self):
        """
        Retrieve data from RabbitMQ, maximum MAX_OUTPUT messages at a time. Retry when failing due to a lost
        connection.

        :return dict|list: dictionary with the values expected as output of the deployment, or a list of those
            dictionaries
        :raises RecoverableConnectorError: when a fetch/ack fails before any
            message was collected (retryable)
        """

        # Including the connect in the retrieve method such that it can benefit from retrying
        if not self.connection:
            self.connect()

        data = []

        def _break_or_raise(error, action):
            """
            Shared AMQP failure handler. Return True when messages were already
            collected (and acknowledged) so the caller returns them instead of
            losing them; otherwise drop the connection and raise a recoverable
            error so the whole retrieve is retried.
            """
            if data:
                logger.warning(
                    f"Failed to {action}: {error}. Returning already collected messages"
                )
                return True

            self.connection = None
            raise RecoverableConnectorError(f"Failed to {action}: {error}")

        # Collect MAX_OUTPUT messages in one call at most
        for _ in range(int(get_variable('MAX_OUTPUT', '50'))):
            try:
                channel = self.connection.channel()
                method_frame, header_frame, body = channel.basic_get(
                    'input_connector')
            except pika.exceptions.AMQPError as e:
                # NOTE: the original logged "Failed to acknowledge message"
                # here — a copy-paste mistake, this is the retrieve path
                if _break_or_raise(e, "retrieve message from RabbitMQ"):
                    break

            if method_frame:
                try:
                    channel.basic_ack(method_frame.delivery_tag)
                except pika.exceptions.AMQPError as e:
                    if _break_or_raise(e, "acknowledge message"):
                        break

                data.append(body.decode('utf-8'))

            else:
                # No more messages available, end the loop
                break

        logger.info(f"Retrieved {len(data)} rows")

        return data