def insert(self, data): """ Insert given data to GCS :param dict data: a dictionary containing the data to be inserted """ # The field 'blob' must be present in the data. It contains the blob file path to be uploaded. if 'blob' not in data: raise ConnectorError("Field 'blob' is not given in the input") file_path = data['blob'] filename = os.path.basename(file_path) bucket = get_variable('BUCKET') path_prefix = get_variable('PATH_PREFIX', '') # Generate the destination blob name by concatenating (optional) path prefix and filename object_path = os.path.join(path_prefix, filename) try: bucket = self.client.bucket(bucket_name=bucket) blob = bucket.blob(object_path, chunk_size=self.chunk_size) blob.upload_from_filename(filename=file_path) except exceptions.GoogleCloudError as e: raise RecoverableConnectorError(f"Failed to insert blob: {e}") logger.info("Blob inserted successfully")
def insert(self, data): """ Insert given data to MySQL :param dict data: a dictionary containing the data to be inserted """ # Including the connect in the insert method such that it can benefit from retrying if not self.connection: self.connect() # Construct the insert query columns = ", ".join([f"{col}" for col in data.keys()]) values = ", ".join(["%s" for _ in data]) query = f"INSERT INTO {get_variable('TABLE')} ({columns}) " \ f"VALUES ({values})" params = tuple(data.values()) try: cursor = self.connection.cursor() cursor.execute(query, params) self.connection.commit() cursor.close() except MySQLdb.OperationalError as e: self.connection = None raise RecoverableConnectorError(f"Failed to insert data: {e}") except MySQLdb.Error as e: self.connection = None raise ConnectorError(f"Failed to insert data: {e}") logger.info("Data inserted successfully")
def insert(self, data): """ Insert given data to Kinesis stream :param dict data: a dictionary containing the data to be inserted """ # Convert the input data dictionary to a single Kinesis record. Generate the partition key randomly. record = { 'Data': json.dumps(data), 'PartitionKey': ''.join( random.sample( string.ascii_uppercase + string.ascii_lowercase + string.digits, 16)) } try: self.kinesis_client.put_record(StreamName=get_variable('STREAM'), Data=record['Data'], PartitionKey=record['PartitionKey']) except (botocore.exceptions.ClientError, botocore.exceptions.BotoCoreError) as e: raise RecoverableConnectorError( f"Failed to insert record: {e.response['Error']['Message']}") logger.info("Data inserted successfully")
def connect(self): """ Connect to RabbitMQ :return: a new connection object to RabbitMQ """ parameters = pika.ConnectionParameters( host=get_variable('HOST'), port=int(get_variable('PORT')), credentials=pika.PlainCredentials( username=get_variable('USERNAME'), password=get_variable('PASSWORD')), virtual_host=get_variable('VIRTUAL_HOST', '/'), socket_timeout=int(get_variable('TIMEOUT', '10')), blocked_connection_timeout=int(get_variable('TIMEOUT', '10')), stack_timeout=int(get_variable('TIMEOUT', '10')), heartbeat=int(get_variable('TIMEOUT', '10'))) try: self.connection = pika.BlockingConnection(parameters) except pika.exceptions.ProbableAuthenticationError as e: raise ConnectorError( f"Failed to authenticate at RabbitMQ server: {e}") except (socket.gaierror, pika.exceptions.AMQPError) as e: raise RecoverableConnectorError( f"Failed to connect to RabbitMQ: {e}")
def insert(self, data): """ Insert given data to Azure Blob Storage :param dict data: a dictionary containing the data to be inserted """ # The field 'blob' must be present in the data. It contains the blob file path to be uploaded. if 'blob' not in data: raise ConnectorError("Field 'blob' is not given in the input") file_path = data['blob'] filename = os.path.basename(file_path) path_prefix = get_variable('PATH_PREFIX', '') # Generate the object path by concatenating (optional) path prefix and filename object_path = os.path.join(path_prefix, filename) try: blob_client = self.blob_service_client.get_blob_client( container=get_variable('CONTAINER'), blob=object_path) with open(file_path, "rb") as data: blob_client.upload_blob(data=data, timeout=int( get_variable('TIMEOUT', '10'))) except azure.core.exceptions.AzureError as e: raise RecoverableConnectorError(f"Failed to insert blob: {e}") logger.info("Blob inserted successfully")
def insert(self, data): """ Insert given data to S3 storage :param dict data: a dictionary containing the data to be inserted """ # The field 'blob' must be present in the data. It contains the blob file path to be uploaded. if 'blob' not in data: raise ConnectorError("Field 'blob' is not given in the input") file_path = data['blob'] filename = os.path.basename(file_path) bucket = get_variable('BUCKET') path_prefix = get_variable('PATH_PREFIX', '') # Generate the object path by concatenating (optional) path prefix and filename object_path = os.path.join(path_prefix, filename) try: # The upload_file method accepts a file name, a bucket name, and an object name. The method handles large # files by splitting them into smaller chunks and uploading each chunk in parallel. self.s3_client.upload_file(file_path, bucket, object_path) except (botocore.exceptions.ClientError, botocore.exceptions.BotoCoreError) as e: raise RecoverableConnectorError(f"Failed to insert blob: {e}") except boto3.exceptions.S3UploadFailedError as e: raise ConnectorError(f"Failed to insert blob: {e}") logger.info("Blob inserted successfully")
def insert(self, data): """ Insert given data to MsSQL :param dict data: a dictionary containing the data to be inserted """ # Including the connect in the insert method such that it can benefit from retrying if not self.connection: self.connect() # Construct the insert query columns = ", ".join([f"{col}" for col in data.keys()]) values = ", ".join([f":{col}" for col in data.keys()]) query = f"INSERT INTO {get_variable('SCHEMA')}.{get_variable('TABLE')} ({columns}) " \ f"VALUES ({values})" try: self.connection.execute(sqlalchemy.text(query), **data) except (sqlalchemy.exc.ProgrammingError, sqlalchemy.exc.InvalidRequestError, sqlalchemy.exc.StatementError) \ as e: self.connection = None raise RecoverableConnectorError(f"Failed to insert data: {e}") except (sqlalchemy.exc.DatabaseError, Exception) as e: self.connection = None raise ConnectorError(f"Failed to insert data: {e}") logger.info("Data inserted successfully")
def connect(self): """ Connect to MSSQL database :return: connection: a new connection object to the database """ self.connection = None username = get_variable('USERNAME') password = get_variable('PASSWORD') host = get_variable('HOST') port = get_variable('PORT', '1433') database = get_variable('DATABASE') timeout = get_variable('TIMEOUT', '60') try: connection_string = f"mssql+pyodbc://{username}:{password}@{host}:{port}/{database}?" \ "driver=ODBC Driver 17 for SQL Server" # fast_executemany speeds up the insertion up to a 100-fold self.connection = sqlalchemy.create_engine(url=connection_string, fast_executemany=True, connect_args={ "timeout": int(timeout) }).connect() except (sqlalchemy.exc.OperationalError, sqlalchemy.exc.InterfaceError) as e: raise RecoverableConnectorError( f"Failed to connect to database: {e}")
def connect(self): """ Connect to InfluxDB server database :return: connection: a new connection object to the database """ self.connection = None try: self.connection = influxdb_client.InfluxDBClient( url=get_variable('URL'), token=get_variable('TOKEN'), org=get_variable('ORGANIZATION'), debug=False).write_api(write_options=SYNCHRONOUS) except urllib3.exceptions.HTTPError as e: raise RecoverableConnectorError( f"Failed to connect to database: {e}")
def connect(self): """ Connect to PostgreSQL database :return: connection: a new connection object to the database """ self.connection = None try: self.connection = psycopg2.connect( host=get_variable('HOST'), port=int(get_variable('PORT', '5432')), user=get_variable('USERNAME'), password=get_variable('PASSWORD'), database=get_variable('DATABASE'), connect_timeout=int(get_variable('TIMEOUT', '10')) ) except psycopg2.Error as e: raise RecoverableConnectorError(f"Failed to connect to database: {e}")
def connect(self): """ Connect to MySQL database :return: connection: a new connection object to the database """ self.connection = None try: self.connection = MySQLdb.connect( host=get_variable('HOST'), port=int(get_variable('PORT', '3306')), user=get_variable('USERNAME'), password=get_variable('PASSWORD'), database=get_variable('DATABASE'), connect_timeout=int(get_variable('TIMEOUT', '10')), ) except (MySQLdb.OperationalError, MySQLdb.Error) as e: raise RecoverableConnectorError( f"Failed to connect to database: {e}")
def insert(self, data): """ Insert given data to BigQuery :param dict data: a dictionary containing the data to be inserted """ # Initialize table table_name = f"{get_variable('DATASET')}.{get_variable('TABLE')}" try: table = self.client.get_table(table_name) error = self.client.insert_rows(table=table, rows=[data]) except NotFound: raise ConnectorError(f"Table {table_name} was not found") except (ValueError, TypeError, GoogleAPIError) as e: raise ConnectorError(f"Failed to insert data: {e}") if error: message = error[0]['errors'][0]['message'] raise RecoverableConnectorError( f"Failed to insert data: {message}") logger.info("Data inserted successfully")
def retrieve(self): """ Retrieve data from RabbitMQ, maximum MAX_OUTPUT messages at a time. Retry when failing due to a lost connection. :return dict|list: dictionary with the values expected as output of the deployment, or a list of those dictionaries """ # Including the connect in the retrieve method such that it can benefit from retrying if not self.connection: self.connect() data = [] # Collect MAX_OUTPUT messages in one call at most for _ in range(0, int(get_variable('MAX_OUTPUT', '50'))): try: channel = self.connection.channel() method_frame, header_frame, body = channel.basic_get( 'input_connector') except pika.exceptions.AMQPError as e: # If we already collected some messages that were acknowledged, we need to return those now. # Otherwise these will be lost if len(data) > 0: logger.warning( f"Failed to acknowledge message: {e}. Returning already collecting messages" ) break # If nothing was collected yet, we raise an exception to retry self.connection = None raise RecoverableConnectorError( f"Failed to retrieve message from RabbitMQ: {e}") if method_frame: try: channel.basic_ack(method_frame.delivery_tag) except pika.exceptions.AMQPError as e: # If we already collected some messages that were acknowledged, we need to return those now. # Otherwise these will be lost if len(data) > 0: logger.warning( f"Failed to acknowledge message: {e}. Returning already collecting messages" ) break # If nothing was collected yet, we raise an exception to retry self.connection = None raise RecoverableConnectorError( f"Failed to acknowledge message: {e}") data.append(body.decode('utf-8')) else: # No more messages available, end the loop break logger.info(f"Retrieved {len(data)} rows") return data