def get_definitions(service: str) -> dict:
    """Fetch a service's sync definitions from BigQuery and decrypt its MySQL credentials."""
    service_account_obj = service_account.Credentials.from_service_account_file(
        os.environ['MYSQL_BIG_QUERY_GOOGLE_AUTH'])
    storage = StorageAdapter(service_account_obj)
    storage.get_client()
    key = storage.get_file_as_string(bucket="mysql_sync_keys", file=service)
    if key is False:
        raise RuntimeError(storage.errors)
    crypto = Fernet(key)
    bqa = BigQueryAdapter(service_account_obj)
    bqa.get_client()
    query = f"SELECT * FROM mysql_sync.data_sources WHERE service = '{service}'"
    result = bqa.query(query)
    if result is False:
        raise RuntimeError(bqa.errors)
    definitions = {}
    for item in result:
        definitions['service'] = item['service']
        definitions['data_set'] = item['data_set']
        definitions['database'] = item['database']
        definitions['host'] = crypto.decrypt(
            bytes(item['host'], encoding='utf-8'))
        definitions['user'] = crypto.decrypt(
            bytes(item['user'], encoding='utf-8'))
        definitions['password'] = crypto.decrypt(
            bytes(item['password'], encoding='utf-8'))
    return definitions
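

# A minimal usage sketch, assuming a hypothetical 'my-service' row exists in
# mysql_sync.data_sources and that MYSQL_BIG_QUERY_GOOGLE_AUTH points at a valid
# service-account key file. Note that Fernet.decrypt() returns bytes, so the
# host/user/password values stay bytes until a caller decodes them.
def example_print_connection_target(service: str = 'my-service') -> None:
    definitions = get_definitions(service)
    host = definitions['host'].decode('utf-8')
    print(f"{definitions['service']} syncs {definitions['database']}@{host} "
          f"into the {definitions['data_set']} dataset")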


def set_last_run(self, service_name: str):
    big_query_client = BigQueryAdapter(self.service_account)
    big_query_client.get_client()
    query = f"UPDATE mysql_sync.data_sources SET `last_run` = CURRENT_TIMESTAMP where `service`= '{service_name}'"
    result = big_query_client.query(query)
    if not result:
        self.sd_logger.error(big_query_client.errors, {
            'class': 'PrefectProduceSchemaJobs',
            'method': 'set_last_run'
        })
    return result


def get_items_to_be_queued(self, big_query_client: BigQueryAdapter):
    query = [
        "SELECT `table`, `watched`, `primary_id`",
        f"FROM `{self.definitions['data_set']}`.`sync_tracking_table`",
        "WHERE `synchronize` is true",
        "ORDER BY `table` ASC"
    ]
    result = big_query_client.query(' '.join(query))
    if not result:
        self.sd_logger.error(
            big_query_client.errors, {
                'class': 'PrefectProduceSchemaJobs',
                'method': 'get_items_to_be_queued'
            })
    return result


def get_service_to_produce(self) -> str:
    """Return the service with the oldest `last_run` in mysql_sync.data_sources."""
    big_query_client = BigQueryAdapter(self.service_account)
    big_query_client.get_client()
    query = 'SELECT * from mysql_sync.data_sources order by `last_run` limit 1'
    result = big_query_client.query(query)
    if not result:
        self.sd_logger.error(
            big_query_client.errors, {
                'class': 'PrefectProduceSchemaJobs',
                'method': 'get_service_to_produce'
            })
        raise RuntimeError(big_query_client.errors)
    for item in result:
        if hasattr(item, 'service'):
            return item.service
        else:
            raise RuntimeError(
                'Service was not found in mysql_sync.data_sources')
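

# Illustrative sketch (not part of the class): one way the three producer
# helpers above might be chained. `producer` stands in for an instance of the
# class these methods belong to (logged as 'PrefectProduceSchemaJobs'), and
# `publish_job` is a hypothetical callable that publishes one Pub/Sub message
# per table to be synchronized.
def example_produce_jobs(producer, publish_job) -> bool:
    # pick the service that has gone the longest without a run
    service = producer.get_service_to_produce()

    big_query_client = BigQueryAdapter(producer.service_account)
    big_query_client.get_client()

    rows = producer.get_items_to_be_queued(big_query_client)
    if rows is False:
        return False

    for row in rows:
        # each row describes one sync target from sync_tracking_table
        publish_job({
            'table': row['table'],
            'watched': row['watched'],
            'primary_id': row['primary_id']
        })

    return bool(producer.set_last_run(service))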


class Data(Task):
    """Task that pulls new MySQL records for one table and appends them to BigQuery."""

    def __init__(self, service: str, **kwargs):
        self.chunk_size = 250000
        self.table = None
        self.watched_column = None
        self.service_account = service_account.Credentials.from_service_account_file(
            os.environ['MYSQL_BIG_QUERY_GOOGLE_AUTH'])
        self.definitions = service_helpers.get_definitions(service)
        self.sd_logger = StackDriverAdapter(self.service_account)
        self.sd_logger.get_client()
        self.sd_logger.create_logger(f"{self.definitions['service']}-etl")
        self.pub_sub_client = PubSubAdapter(self.service_account)
        self.pub_sub_client.get_subscriber()
        self.pub_sub_client.set_subscription(
            f"{self.definitions['service']}-etl-data")
        self.big_query_client = BigQueryAdapter(self.service_account)
        self.big_query_client.get_client()
        self.big_query_client.set_data_set_ref(self.definitions['data_set'])
        self.my_sql_client = MySqlAdapter(service)
        super().__init__(**kwargs)

    def check_message(self, message):
        if 'table' in message:
            self.table = message['table']
        else:
            error_message = 'Table was not included in the message'
            self.sd_logger.error({'error': error_message}, {
                'class': 'Data',
                'method': 'check_message',
                'table': self.table
            })
            raise RuntimeError(error_message)
        if 'watched' in message:
            self.watched_column = message['watched']
        else:
            error_message = 'Watched was not included in the message'
            self.sd_logger.error({'error': error_message}, {
                'class': 'Data',
                'method': 'check_message',
                'table': self.table
            })
            raise RuntimeError(error_message)

    def get_schema_from_big_query(self) -> bool:
        self.big_query_client.set_table_ref(self.table)
        table_check = self.big_query_client.check_table()
        if table_check:
            return self.big_query_client.get_schema()
        return table_check

    def last_updated_data(self) -> Union[str, bool]:
        query = f"SELECT MAX({self.watched_column}) as last_updated FROM {self.definitions['data_set']}.{self.table}"
        result = self.big_query_client.query(query)
        if result:
            for value in result:
                if value['last_updated']:
                    return value['last_updated']
                else:
                    # return None to allow all records to be pulled at the start
                    return None
        else:
            self.sd_logger.critical(
                self.big_query_client.errors, {
                    'class': 'Data',
                    'method': 'last_updated_data',
                    'table': self.table
                })
            return result

    def get_number_of_records_to_import(self,
                                        last_updated) -> Union[int, bool]:
        result = self.my_sql_client.count_items_to_sync(
            table=self.table,
            watched_column=self.watched_column,
            last_run=last_updated)
        if self.my_sql_client.errors:
            self.sd_logger.critical(
                self.my_sql_client.errors, {
                    'class': 'Data',
                    'method': 'get_number_of_records_to_import',
                    'table': self.table
                })
        return result

    def query_mysql_for_records(self, last_updated_date: str, limit: int,
                                offset: int) -> Union[list, bool]:
        results = self.my_sql_client.get_records(
            table=self.table,
            watched_column=self.watched_column,
            last_run=last_updated_date,
            limit=limit,
            offset=offset)
        if not results:
            self.sd_logger.critical(
                self.my_sql_client.errors, {
                    'class': 'Data',
                    'method': 'query_mysql_for_records',
                    'table': self.table
                })
        return results

    def load_mysql_data_into_data_frame(
            self, data: list, schema: dict) -> Union[DataFrame, bool]:
        if len(data[0]) != len(schema.keys()):
            self.sd_logger.critical(
                {
                    'message': "Schema and data length mismatch",
                    'schema_length': len(schema.keys()),
                    'data_length': len(data[0])
                }, {
                    'class': 'Data',
                    'method': 'load_mysql_data_into_data_frame',
                    'table': self.table
                })
            return False
        df = pd.DataFrame.from_records(data, columns=schema.keys())
        del data
        return df

    def transform_data_frame_to_match_big_query_schema(
            self, data_frame: DataFrame,
            schema: dict) -> Union[DataFrame, bool]:
        try:
            df = service_helpers.data_frame_to_schema(data_frame, schema)
        except ValueError as e:
            self.sd_logger.critical({'message': 'Error: {}'.format(e)}, {
                'class': 'Data',
                'method': 'transform_data_frame_to_match_big_query_schema',
                'table': self.table
            })
            return False
        return df

    def append_data_frame_to_big_query(self, data_frame: DataFrame):
        result = self.big_query_client.upload_data_frame(data_frame)
        if result:
            message = f"table:{self.table} | Records written: {data_frame.shape[0]}"
            self.sd_logger.info({'message': message}, {
                'class': 'Data',
                'method': 'append_data_frame_to_big_query',
                'table': self.table
            })
        else:
            self.sd_logger.critical(
                self.big_query_client.errors, {
                    'class': 'Data',
                    'method': 'append_data_frame_to_big_query',
                    'table': self.table
                })
        return result

    def write_df_to_storage(self, df: DataFrame) -> bool:
        storage_client = StorageAdapter(self.service_account)
        storage_client.get_client()
        date_time_obj = datetime.datetime.utcnow()
        location = f'error/csv/{self.table}/{date_time_obj.strftime("%m-%d-%Y_%H:%M:%S")}_UTC'
        result = storage_client.write_string(
            bucket=self.definitions['service'] + '-etl',
            destination=location,
            string=df.to_csv(),
            encoding='text/csv')
        if not result:
            self.sd_logger.error(
                storage_client.errors, {
                    'class': 'Data',
                    'method': 'write_df_to_storage',
                    'table': self.table
                })
            return False
        self.sd_logger.info({'message': f"Failed CSV added to {location}"}, {
            'class': 'Data',
            'method': 'write_df_to_storage',
            'table': self.table
        })
        return result
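

# Illustrative sketch (not the actual Prefect run method): roughly how one
# decoded Pub/Sub message might be walked through the Data methods above. It
# assumes service_helpers.convert_schema_to_dict is used to turn the BigQuery
# schema into the column dict the DataFrame helpers expect, as the Audit task
# below does.
def example_run_data_sync(task: Data, message: dict) -> bool:
    task.check_message(message)

    bq_schema = task.get_schema_from_big_query()
    if bq_schema is False:
        return False
    schema = service_helpers.convert_schema_to_dict(bq_schema)

    last_updated = task.last_updated_data()
    if last_updated is False:
        return False

    total = task.get_number_of_records_to_import(last_updated)
    if total is False:
        return False

    offset = 0
    while offset < total:
        rows = task.query_mysql_for_records(last_updated, task.chunk_size, offset)
        if rows is False:
            return False
        df = task.load_mysql_data_into_data_frame(rows, schema)
        if df is False:
            return False
        df = task.transform_data_frame_to_match_big_query_schema(df, schema)
        if df is False:
            return False
        if not task.append_data_frame_to_big_query(df):
            # keep the failed chunk around for later inspection
            task.write_df_to_storage(df)
            return False
        offset += task.chunk_size
    return True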


class Audit(Task):
    """Task that checks a synced BigQuery table against its MySQL source."""

    def __init__(self, service: str, **kwargs):
        self.table = None
        self.watched_column = None
        self.primary_id = None
        self.sample_size = 1000
        self.service_account = service_account.Credentials.from_service_account_file(
            os.environ['MYSQL_BIG_QUERY_GOOGLE_AUTH'])
        self.definitions = service_helpers.get_definitions(service)
        self.sd_logger = StackDriverAdapter(self.service_account)
        self.sd_logger.get_client()
        self.sd_logger.create_logger(f"{self.definitions['service']}-etl")
        self.pub_sub_client = PubSubAdapter(self.service_account)
        self.pub_sub_client.get_subscriber()
        self.pub_sub_client.set_subscription(
            f"{self.definitions['service']}-etl")
        self.big_query_client = BigQueryAdapter(self.service_account)
        self.big_query_client.get_client()
        self.big_query_client.set_data_set_ref(self.definitions['data_set'])
        self.my_sql_client = MySqlAdapter()
        super().__init__(**kwargs)

    def extract_message_values(self, message: dict) -> bool:
        if self.table is None:
            if 'table' in message:
                self.table = message['table']
            else:
                self.sd_logger.error(
                    {
                        'error': 'Key: table was not found in message',
                        'data': message
                    }, {
                        'class': 'Audit',
                        'method': 'extract_message_values'
                    })
                return False
        if 'watched' in message:
            self.watched_column = message['watched']
        else:
            self.sd_logger.error(
                {
                    'error': 'Key: watched was not found in message',
                    'data': message
                }, {
                    'class': 'Audit',
                    'method': 'extract_message_values',
                    "table": self.table
                })
            return False
        if 'primary_id' in message:
            self.primary_id = message['primary_id']
        else:
            self.sd_logger.error(
                {
                    'error': 'Key: primary_id was not found in message',
                    'data': message
                }, {
                    'class': 'Audit',
                    'method': 'extract_message_values',
                    "table": self.table
                })
            return False
        return True

    def big_query_last_updated_data(self) -> Union[datetime.datetime, bool]:
        query = f"SELECT MAX({self.watched_column}) as last_updated FROM {self.definitions['data_set']}.{self.table}"
        result = self.big_query_client.query(query)
        if result:
            for value in result:
                if value.last_updated:
                    return value.last_updated
                else:
                    # nothing synced yet, so let the caller fall back to comparing everything
                    return False
        else:
            self.sd_logger.critical(
                self.big_query_client.errors, {
                    'class': 'Audit',
                    'method': 'big_query_last_updated_data',
                    'table': self.table
                })
            return result

    def get_total_distinct_mysql(
            self, last_updated: datetime.datetime) -> Union[int, bool]:
        if last_updated:
            records = self.my_sql_client.count_distinct(
                table=self.table,
                index=self.primary_id,
                watched_column=self.watched_column,
                last_updated=last_updated)
        else:
            records = self.my_sql_client.count_distinct(
                table=self.table, index=self.primary_id)
        if records is False:
            self.sd_logger.error({'error': self.my_sql_client.errors}, {
                'class': 'Audit',
                'method': 'get_total_distinct_mysql',
                "table": self.table
            })
        return records

    def get_total_distinct_big_query(self) -> Union[int, bool]:
        query = [
            "SELECT COUNT(*) as total_count", "FROM (",
            f"select DISTINCT {self.primary_id}",
            f"FROM {self.definitions['data_set']}.{self.table}", ")"
        ]
        result = self.big_query_client.query(' '.join(query))
        if not result:
            self.sd_logger.error({'error': self.big_query_client.errors}, {
                'class': 'Audit',
                'method': 'get_total_distinct_big_query',
                "table": self.table
            })
            return False
        for item in result:
            return item.total_count

    def audit_from_totals(self, total_mysql: int,
                          total_big_query: int) -> bool:
        # check if the totals are the same
        if total_mysql == total_big_query:
            return True
        # the BigQuery total will be lower than the MySQL total most of the time,
        # so allow it to pass when it is within 1% of the MySQL count
        passing_total = int(round((total_mysql * .99), 0))
        if total_big_query >= passing_total:
            return True
        self.sd_logger.info(
            {
                'message': f"Audit totals did not match: {self.table}",
                "data": {
                    "MySQL Total": total_mysql,
                    "Big Query Total": total_big_query,
                    "passing total": passing_total
                }
            }, {
                'class': 'Audit',
                'method': 'audit_from_totals',
                "table": self.table
            })
        return False

    def get_random_mysql(self, last_run: int = None) -> Union[list, bool]:
        result = self.my_sql_client.get_random_records(
            table=self.table,
            limit=self.sample_size,
            index=self.primary_id,
            watched=self.watched_column,
            last_updated=last_run)
        if result is not False:
            return result
        self.sd_logger.error({'error': self.my_sql_client.errors}, {
            'class': 'Audit',
            'method': 'get_random_mysql',
            "table": self.table
        })
        return False

    def get_big_query_records_from_sample(self, sample: list) -> DataFrame:
        primary_ids = self.primary_id.split(',')
        or_statements = []
        # get the same items from the mysql random query in big query
        for item in sample:
            pk_statements = []
            for key in primary_ids:
                if type(item[key]) is int:
                    pk_statements.append(f"`{key}` = {item[key]}")
                else:
                    pk_statements.append(f"`{key}` = '{item[key]}'")
            if type(item[self.watched_column]) is int:
                watched_condition = f"AND `{self.watched_column}` = {item[self.watched_column]}"
            else:
                if item[self.watched_column] is None:
                    watched_condition = f"AND `{self.watched_column}` IS NULL"
                else:
                    watched_condition = f"AND `{self.watched_column}` = '{item[self.watched_column]}'"
            or_statements.append(
                f"({' AND '.join(pk_statements)} {watched_condition})")
        query = [
            "SELECT *",
            f"FROM {self.definitions['data_set']}.{self.table}",
            f"WHERE {' OR '.join(or_statements)}", "ORDER BY"
        ]
        order_by = []
        for key in primary_ids:
            order_by.append(f"{key} ASC")
        query.append(', '.join(order_by))
        result = self.big_query_client.query(' '.join(query))
        if not result:
            self.sd_logger.error({'error': self.big_query_client.errors}, {
                'class': 'Audit',
                'method': 'get_big_query_records_from_sample',
                "table": self.table
            })
            return False
        output = result.to_dataframe()
        return output

    def convert_mysql_to_data_frame(
            self, mysql_data: list) -> Union[DataFrame, bool]:
        self.big_query_client.set_table_ref(self.table)
        bq_schema = self.big_query_client.get_schema()
        schema = service_helpers.convert_schema_to_dict(bq_schema)
        if len(mysql_data[0]) != len(schema.keys()):
            self.sd_logger.critical(
                {
                    'message': "Schema and data length mismatch",
                    'schema_length': len(schema.keys()),
                    'data_length': len(mysql_data[0])
                }, {
                    'class': 'Audit',
                    'method': 'convert_mysql_to_data_frame',
                    'table': self.table
                })
            return False
        data_frame = pd.DataFrame.from_records(mysql_data,
                                               columns=schema.keys())
        try:
            df = service_helpers.data_frame_to_schema(data_frame,
                                                      schema,
                                                      utc=True)
        except ValueError as e:
            self.sd_logger.critical({'message': 'Error: {}'.format(e)}, {
                'class': 'Audit',
                'method': 'convert_mysql_to_data_frame',
                'table': self.table
            })
            return False
        return df

    def audit_from_sample(self, mysql_data: DataFrame,
                          big_query_data: DataFrame) -> bool:
        # test if the data frames are equal - this will fail in most cases due to
        # the poor data quality from the source systems
        result = mysql_data.equals(big_query_data)
        if result:
            return True
        # check if the number of returned rows is the same; this check is useful
        # because of the strictness of the queries that generate these DataFrames
        if mysql_data.shape[0] == big_query_data.shape[0]:
            return True
        self.sd_logger.info(
            {
                'message': f"Audit Failure: {self.table}",
                'DataFrame.equals': result,
                'mysql.shape': mysql_data.shape[0],
                'big_query.shape': big_query_data.shape[0]
            }, {
                'class': 'Audit',
                'method': 'audit_from_sample',
                'table': self.table
            })
        return False
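

# Illustrative sketch (not the actual Prefect run method): one plausible way the
# Audit methods above could be combined, comparing distinct counts first and
# falling back to a random-sample comparison when the totals disagree. `message`
# is assumed to be an already-decoded dict with table, watched, and primary_id.
def example_run_audit(task: Audit, message: dict) -> bool:
    if not task.extract_message_values(message):
        return False

    last_updated = task.big_query_last_updated_data()
    total_mysql = task.get_total_distinct_mysql(last_updated)
    total_big_query = task.get_total_distinct_big_query()
    if total_mysql is False or total_big_query is False:
        return False
    if task.audit_from_totals(total_mysql, total_big_query):
        return True

    # totals disagreed; compare a random sample of rows instead
    sample = task.get_random_mysql(last_updated or None)
    if sample is False:
        return False
    big_query_df = task.get_big_query_records_from_sample(sample)
    mysql_df = task.convert_mysql_to_data_frame(sample)
    if mysql_df is False or big_query_df is False:
        return False
    return task.audit_from_sample(mysql_df, big_query_df)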