def create_config_table(self, big_query_client: BigQueryAdapter):
    big_query_client.set_data_set_ref('mysql_sync')
    big_query_client.set_table_ref('data_sources')
    # Return True if the table already exists.
    if big_query_client.check_table():
        return True
    schema = [
        bigquery.SchemaField('service', 'STRING', description="Service Name"),
        bigquery.SchemaField('data_set', 'STRING', description="Big Query Data Set"),
        bigquery.SchemaField('host', 'STRING', description="MySQL host connection"),
        bigquery.SchemaField('user', 'STRING', description="MySQL connection user"),
        bigquery.SchemaField('password', 'STRING', description="MySQL connection password"),
        bigquery.SchemaField('database', 'STRING', description="MySQL Database"),
        bigquery.SchemaField('last_run', 'TIMESTAMP', description="Last produce date"),
    ]
    result = big_query_client.create_table(schema)
    if result:
        self.sd_logger.info({'message': "Created Config table"}, {
            'class': 'PrefectInstallBigQuery',
            'method': 'create_config_table'
        })
    else:
        self.sd_logger.warning(big_query_client.errors, {
            'class': 'PrefectInstallBigQuery',
            'method': 'create_config_table'
        })
    return result
def create_tracking_table(self, big_query_client: BigQueryAdapter):
    big_query_client.set_data_set_ref(self.definitions['data_set'])
    big_query_client.set_table_ref('sync_tracking_table')
    # Return True if the table already exists.
    if big_query_client.check_table():
        return True
    schema = [
        bigquery.SchemaField(
            'table', 'STRING',
            description="Tracked Table Name"
        ),
        bigquery.SchemaField(
            'watched', 'STRING',
            description="Column to watch to minimize the number of records loaded per sync"
        ),
        bigquery.SchemaField(
            'primary_id', 'STRING',
            description="Primary Id Column(s)"
        ),
        bigquery.SchemaField(
            'synchronize', 'BOOLEAN',
            description="Flag to Synchronize the table"
        )
    ]
    result = big_query_client.create_table(schema)
    if result:
        self.sd_logger.info(
            {'message': "Created tracking table"},
            {'class': 'PrefectAddServiceBigQuery', 'method': 'create_tracking_table'}
        )
    else:
        self.sd_logger.warning(
            big_query_client.errors,
            {'class': 'PrefectAddServiceBigQuery', 'method': 'create_tracking_table'}
        )
    return result
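# Hedged usage sketch (not part of the original source): how the two install
# helpers above might be driven together. `install_task` and `add_service_task`
# stand for instances of PrefectInstallBigQuery and PrefectAddServiceBigQuery
# (the classes these methods log under), and `credentials` for the same
# service-account credentials used elsewhere in this codebase; all three names
# are assumptions for illustration.

def _example_install(install_task, add_service_task, credentials) -> bool:
    """Create the shared config table, then one service's tracking table."""
    big_query_client = BigQueryAdapter(credentials)
    big_query_client.get_client()
    # Each helper sets its own dataset/table refs and returns True when the
    # table already exists or was created successfully.
    return (install_task.create_config_table(big_query_client)
            and add_service_task.create_tracking_table(big_query_client))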
class Data(Task):
    def __init__(self, service: str, **kwargs):
        self.chunk_size = 250000
        self.table = None
        self.watched_column = None
        self.service_account = service_account.Credentials.from_service_account_file(
            os.environ['MYSQL_BIG_QUERY_GOOGLE_AUTH'])
        self.definitions = service_helpers.get_definitions(service)
        self.sd_logger = StackDriverAdapter(self.service_account)
        self.sd_logger.get_client()
        self.sd_logger.create_logger(f"{self.definitions['service']}-etl")
        self.pub_sub_client = PubSubAdapter(self.service_account)
        self.pub_sub_client.get_subscriber()
        self.pub_sub_client.set_subscription(
            f"{self.definitions['service']}-etl-data")
        self.big_query_client = BigQueryAdapter(self.service_account)
        self.big_query_client.get_client()
        self.big_query_client.set_data_set_ref(self.definitions['data_set'])
        self.my_sql_client = MySqlAdapter(service)
        super().__init__(**kwargs)

    def check_message(self, message):
        if 'table' in message:
            self.table = message['table']
        else:
            error_message = 'Table was not included in the message'
            self.sd_logger.error({'error': error_message}, {
                'class': 'Data',
                'method': 'check_message',
                'table': self.table
            })
            raise RuntimeError(error_message)
        if 'watched' in message:
            self.watched_column = message['watched']
        else:
            error_message = 'Watched was not included in the message'
            self.sd_logger.error({'error': error_message}, {
                'class': 'Data',
                'method': 'check_message',
                'table': self.table
            })
            raise RuntimeError(error_message)

    def get_schema_from_big_query(self) -> bool:
        self.big_query_client.set_table_ref(self.table)
        table_check = self.big_query_client.check_table()
        if table_check:
            return self.big_query_client.get_schema()
        return table_check

    def last_updated_data(self) -> Union[str, bool]:
        query = (f"SELECT MAX({self.watched_column}) as last_updated "
                 f"FROM {self.definitions['data_set']}.{self.table}")
        result = self.big_query_client.query(query)
        if result:
            for value in result:
                if value['last_updated']:
                    return value['last_updated']
                else:
                    # Return None to allow all records to be pulled at the start.
                    return None
        else:
            self.sd_logger.critical(
                self.big_query_client.errors, {
                    'class': 'Data',
                    'method': 'last_updated_data',
                    'table': self.table
                })
        return result

    def get_number_of_records_to_import(self, last_updated) -> Union[int, bool]:
        result = self.my_sql_client.count_items_to_sync(
            table=self.table,
            watched_column=self.watched_column,
            last_run=last_updated)
        if self.my_sql_client.errors:
            self.sd_logger.critical(
                self.my_sql_client.errors, {
                    'class': 'Data',
                    'method': 'get_number_of_records_to_import',
                    'table': self.table
                })
        return result

    def query_mysql_for_records(self, last_updated_date: str, limit: int,
                                offset: int) -> Union[list, bool]:
        results = self.my_sql_client.get_records(
            table=self.table,
            watched_column=self.watched_column,
            last_run=last_updated_date,
            limit=limit,
            offset=offset)
        if not results:
            self.sd_logger.critical(
                self.my_sql_client.errors, {
                    'class': 'Data',
                    'method': 'query_mysql_for_records',
                    'table': self.table
                })
        return results

    def load_mysql_data_into_data_frame(
            self, data: list, schema: dict) -> Union[DataFrame, bool]:
        # Use != rather than `is not`: identity comparison between integers
        # is unreliable and only worked by accident for small values.
        if len(data[0]) != len(schema.keys()):
            self.sd_logger.critical(
                {
                    'message': "Schema and data length mismatch",
                    'schema_length': len(schema.keys()),
                    'data_length': len(data[0])
                }, {
                    'class': 'Data',
                    'method': 'load_mysql_data_into_data_frame',
                    'table': self.table
                })
            return False
        df = pd.DataFrame.from_records(data, columns=schema.keys())
        del data
        return df
    def transform_data_frame_to_match_big_query_schema(
            self, data_frame: DataFrame, schema: dict) -> Union[DataFrame, bool]:
        try:
            df = service_helpers.data_frame_to_schema(data_frame, schema)
        except ValueError as e:
            self.sd_logger.critical({'message': f"Error: {e}"}, {
                'class': 'Data',
                'method': 'transform_data_frame_to_match_big_query_schema',
                'table': self.table
            })
            return False
        return df

    def append_data_frame_to_big_query(self, data_frame: DataFrame):
        result = self.big_query_client.upload_data_frame(data_frame)
        if result:
            message = f"table:{self.table} | Records written: {data_frame.shape[0]}"
            self.sd_logger.info({'message': message}, {
                'class': 'Data',
                'method': 'append_data_frame_to_big_query',
                'table': self.table
            })
        else:
            self.sd_logger.critical(
                self.big_query_client.errors, {
                    'class': 'Data',
                    'method': 'append_data_frame_to_big_query',
                    'table': self.table
                })
        return result

    def write_df_to_storage(self, df: DataFrame) -> bool:
        storage_client = StorageAdapter(self.service_account)
        storage_client.get_client()
        date_time_obj = datetime.datetime.utcnow()
        location = (f'error/csv/{self.table}/'
                    f'{date_time_obj.strftime("%m-%d-%Y_%H:%M:%S")}_UTC')
        result = storage_client.write_string(
            bucket=self.definitions['service'] + '-etl',
            destination=location,
            string=df.to_csv(),
            encoding='text/csv')
        if not result:
            self.sd_logger.error(
                storage_client.errors, {
                    'class': 'Data',
                    'method': 'write_df_to_storage',
                    'table': self.table
                })
            return False
        self.sd_logger.info({'message': f"Failed CSV added to {location}"}, {
            'class': 'Data',
            'method': 'write_df_to_storage',
            'table': self.table
        })
        return result
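# Hedged orchestration sketch (not part of the original source): one way the
# Data task's methods above could be chained for a single Pub/Sub sync
# message. The real Prefect flow may sequence these steps differently;
# `run_data_sync` is an illustrative name, and the error handling below
# follows the Union[..., bool] return conventions used in the class.

def run_data_sync(data_task: Data, message: dict) -> bool:
    """Pull new MySQL rows in chunks and append them to BigQuery."""
    data_task.check_message(message)  # raises RuntimeError on a bad message
    schema = data_task.get_schema_from_big_query()
    if not schema:
        return False
    last_updated = data_task.last_updated_data()  # None => full initial load
    total = data_task.get_number_of_records_to_import(last_updated)
    if total is False:
        return False
    if total == 0:
        return True  # nothing new to sync
    for offset in range(0, total, data_task.chunk_size):
        records = data_task.query_mysql_for_records(
            last_updated_date=last_updated,
            limit=data_task.chunk_size,
            offset=offset)
        if not records:
            return False
        df = data_task.load_mysql_data_into_data_frame(records, schema)
        if df is False:
            return False
        df = data_task.transform_data_frame_to_match_big_query_schema(df, schema)
        if df is False:
            return False
        if not data_task.append_data_frame_to_big_query(df):
            # Park the failed chunk in Cloud Storage for inspection.
            data_task.write_df_to_storage(df)
            return False
    return True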
class Schema(Task):
    def __init__(self, service: str, **kwargs):
        self.service_account = service_account.Credentials.from_service_account_file(
            os.environ['MYSQL_BIG_QUERY_GOOGLE_AUTH'])
        self.definitions = service_helpers.get_definitions(service)
        self.sd_logger = StackDriverAdapter(self.service_account)
        self.sd_logger.get_client()
        self.sd_logger.create_logger(f"{self.definitions['service']}-etl")
        self.pub_sub_client = PubSubAdapter(self.service_account)
        self.pub_sub_client.get_subscriber()
        self.pub_sub_client.set_subscription(
            f"{self.definitions['service']}-etl-schema")
        self.big_query_client = BigQueryAdapter(self.service_account)
        self.big_query_client.get_client()
        self.big_query_client.set_data_set_ref(self.definitions['data_set'])
        super().__init__(**kwargs)

    def get_mysql_schema(self, table: str) -> Union[list, bool]:
        mysql_client = MySqlAdapter(self.definitions['service'])
        columns = mysql_client.mysql_table_definition(table)
        if not columns:
            self.sd_logger.error(mysql_client.errors, {
                'class': 'Schema',
                'method': 'get_mysql_schema',
                'table': table
            })
            return False
        return columns

    @staticmethod
    def organized_mysql_schema(schema: list) -> list:
        organized_column_data = []
        for column in schema:
            organized_column_data.append(
                service_helpers.label_mysql_table_definitions(column))
        return organized_column_data

    @staticmethod
    def convert_mysql_to_big_query_schema(schema: list) -> list:
        return service_helpers.generate_bq_schema_from_mysql(schema)

    def store_mysql_schema(self, schema: list, table: str) -> bool:
        encoded_schema = json.dumps(schema)
        storage_client = StorageAdapter(self.service_account)
        storage_client.get_client()
        date_time_obj = datetime.utcnow()
        result = storage_client.write_string(
            bucket=self.definitions['service'] + '-etl',
            destination=f'schema/{table}/{date_time_obj.strftime("%m-%d-%Y_%H:%M:%S")}_UTC',
            string=encoded_schema)
        if not result:
            self.sd_logger.error(storage_client.errors, {
                'class': 'Schema',
                'method': 'store_mysql_schema',
                'table': table
            })
            return False
        return result

    def check_table_exists(self, table):
        self.big_query_client.set_table_ref(table)
        return self.big_query_client.check_table()

    def get_current_schema(self, table: str):
        self.big_query_client.set_table_ref(table)
        return self.big_query_client.get_schema()

    @staticmethod
    def compare_schema(new_schema, current_schema) -> bool:
        # First check: the total number of columns must match. Use != rather
        # than `is not`, which compares identity, not value, on integers.
        if len(new_schema) != len(current_schema):
            return False
        # Compare column names and types position by position; a column only
        # matches when both its name and its field type agree.
        schema_matches = True
        for x in range(len(new_schema)):
            matches = (current_schema[x].name == new_schema[x].name
                       and current_schema[x].field_type == new_schema[x].field_type)
            if not matches:
                schema_matches = False
        return schema_matches

    def create_table(self, table: str, schema: list) -> bool:
        self.big_query_client.set_table_ref(table)
        result = self.big_query_client.create_table(schema=schema, overwrite=True)
        if result:
            self.sd_logger.info({'message': f"Table {table} Created"}, {
                'class': 'Schema',
                'method': 'create_table',
                'table': table
            })
        else:
            self.sd_logger.error(self.big_query_client.errors, {
                'class': 'Schema',
                'method': 'create_table',
                'table': table
            })
        return result

    @staticmethod
    def acknowledge_message(message):
        message.ack()

    def copy_bq_table(self, table: str):
        self.big_query_client.set_table_ref(table)
        copy_table_ref = self.big_query_client.table_ref
        date_time_obj = datetime.utcnow()
        destination_str = f'{table}_{date_time_obj.strftime("%m_%d_%Y")}'
        self.big_query_client.set_table_ref(destination_str)
        destination_table_ref = self.big_query_client.table_ref
        result = self.big_query_client.copy_table(
            copy_table=copy_table_ref,
            destination_table=destination_table_ref)
        if result:
            self.sd_logger.warning(
                {'message': f"Table {table} copied to {destination_str}"}, {
                    'class': 'Schema',
                    'method': 'copy_bq_table',
                    'table': table
                })
        else:
            self.sd_logger.error(self.big_query_client.errors, {
                'class': 'Schema',
                'method': 'copy_bq_table',
                'table': table
            })
        return destination_str

    def backup_table_to_storage(self, table):
        self.big_query_client.set_table_ref(table)
        copy_table_ref = self.big_query_client.table_ref
        date_time_obj = datetime.utcnow()
        destination = (f'gs://{self.definitions["service"]}-etl/data/{table}/'
                       f'{date_time_obj.strftime("%m-%d-%Y_%H:%M:%S")}_UTC_*.avro')
        result = self.big_query_client.export_table_to_storage(
            table=copy_table_ref, destination=destination)
        if result:
            self.sd_logger.info(
                {'message': f"Table {table} exported to {destination}"}, {
                    'class': 'Schema',
                    'method': 'backup_table_to_storage'
                })
        else:
            self.sd_logger.error(self.big_query_client.errors, {
                'class': 'Schema',
                'method': 'backup_table_to_storage'
            })
        return result

    def delete_table(self, table):
        self.big_query_client.set_table_ref(table)
        result = self.big_query_client.delete_table()
        if result:
            self.sd_logger.info({'message': f"Table {table} deleted"}, {
                'class': 'Schema',
                'method': 'delete_table'
            })
        else:
            self.sd_logger.error(self.big_query_client.errors, {
                'class': 'Schema',
                'method': 'delete_table'
            })
        return result
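# Hedged orchestration sketch (not part of the original source): one way the
# Schema task's methods above could be sequenced when a schema message
# arrives. The ordering of copy/backup/delete before re-creating a drifted
# table is an assumption, and `run_schema_sync` is an illustrative name.

def run_schema_sync(schema_task: Schema, message, table: str) -> bool:
    """Detect schema drift and rebuild the BigQuery table when needed."""
    columns = schema_task.get_mysql_schema(table)
    if not columns:
        return False
    organized = schema_task.organized_mysql_schema(columns)
    new_schema = schema_task.convert_mysql_to_big_query_schema(organized)
    if schema_task.check_table_exists(table):
        current_schema = schema_task.get_current_schema(table)
        if schema_task.compare_schema(new_schema, current_schema):
            # No drift: nothing to rebuild, just acknowledge the message.
            schema_task.acknowledge_message(message)
            return True
        # Drift detected: keep a dated in-dataset copy and an Avro export in
        # Cloud Storage before dropping the old table.
        schema_task.copy_bq_table(table)
        if not schema_task.backup_table_to_storage(table):
            return False
        schema_task.delete_table(table)
    if not schema_task.create_table(table, new_schema):
        return False
    schema_task.store_mysql_schema(organized, table)
    schema_task.acknowledge_message(message)
    return True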