import datetime
import os
from typing import Union

import pandas as pd
from google.oauth2 import service_account
from pandas import DataFrame

# The adapters and helpers referenced below (BigQueryAdapter, MySqlAdapter,
# PubSubAdapter, StackDriverAdapter, StorageAdapter, Task, service_helpers)
# are project-local modules; their import paths are not shown in this excerpt.


# From the PopulateTrackingTable class:
def write_to_big_query(self, data: pd.DataFrame) -> bool:
    """Replace the sync tracking table with the given DataFrame."""
    bq = BigQueryAdapter(self.service_account)
    bq.get_client()
    bq.set_data_set_ref(self.definitions['data_set'])
    bq.set_table_ref('sync_tracking_table')
    result = bq.upload_data_frame(data, 'replace')
    if result:
        # Log per-column non-null counts as a summary of what was written.
        self.sd_logger.info(
            {"Tracking table populated": data.count(axis=0).to_dict()}, {
                'class': 'PopulateTrackingTable',
                'method': 'write_to_big_query'
            })
    else:
        self.sd_logger.error(bq.errors, {
            'class': 'PopulateTrackingTable',
            'method': 'write_to_big_query'
        })
    return result
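# A minimal sketch of what the StackDriverAdapter's info/error/critical calls
# likely wrap, assuming google-cloud-logging's log_struct; the two-dict call
# shape used above (payload first, then class/method labels) maps naturally
# onto its `info` and `labels` arguments. The adapter's real implementation
# is not part of this excerpt, and `_log_struct_sketch` is illustrative only.
def _log_struct_sketch(logger, payload: dict, labels: dict,
                       severity: str = 'INFO'):
    # `logger` here is a google.cloud.logging Logger, e.g. one created via
    # logging.Client(credentials=...).logger('<service>-etl').
    logger.log_struct(payload, severity=severity, labels=labels)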
# From the AddService class:
def save_record(self):
    """Persist this service's connection details to mysql_sync.data_sources."""
    record = {
        'service': [self.service],
        'data_set': [self.data_set],
        'host': [self.host],
        'user': [self.user],
        'password': [self.password],
        'database': [self.database]
    }
    df = pd.DataFrame(record)
    big_query = BigQueryAdapter(self.service_account)
    big_query.get_client()
    big_query.set_data_set_ref('mysql_sync')
    big_query.set_table_ref('data_sources')
    result = big_query.upload_data_frame(df)
    if not result:
        self.sd_logger.error(big_query.errors, {
            'class': 'AddService',
            'method': 'save_record'
        })
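# For reference, a minimal sketch of the load that BigQueryAdapter's
# upload_data_frame presumably performs, assuming google-cloud-bigquery.
# `client` and `table_ref` mirror the adapter state set up above, and the
# 'replace'/'append' mode names mirror its call sites, but the adapter's real
# implementation is not shown in this excerpt.
def _upload_data_frame_sketch(client, table_ref, df, mode='append'):
    from google.cloud import bigquery

    # 'replace' truncates the destination table before loading; any other
    # mode appends, which matches the default used by the Data task below.
    disposition = (bigquery.WriteDisposition.WRITE_TRUNCATE
                   if mode == 'replace'
                   else bigquery.WriteDisposition.WRITE_APPEND)
    job = client.load_table_from_dataframe(
        df, table_ref,
        job_config=bigquery.LoadJobConfig(write_disposition=disposition))
    job.result()  # Block until the load job completes.
    return not job.errors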
class Data(Task):
    def __init__(self, service: str, **kwargs):
        self.chunk_size = 250000
        self.table = None
        self.watched_column = None
        self.service_account = service_account.Credentials.from_service_account_file(
            os.environ['MYSQL_BIG_QUERY_GOOGLE_AUTH'])
        self.definitions = service_helpers.get_definitions(service)
        self.sd_logger = StackDriverAdapter(self.service_account)
        self.sd_logger.get_client()
        self.sd_logger.create_logger(f"{self.definitions['service']}-etl")
        self.pub_sub_client = PubSubAdapter(self.service_account)
        self.pub_sub_client.get_subscriber()
        self.pub_sub_client.set_subscription(
            f"{self.definitions['service']}-etl-data")
        self.big_query_client = BigQueryAdapter(self.service_account)
        self.big_query_client.get_client()
        self.big_query_client.set_data_set_ref(self.definitions['data_set'])
        self.my_sql_client = MySqlAdapter(service)
        super().__init__(**kwargs)

    def check_message(self, message: dict):
        """Validate the PubSub message and store the table/watched column."""
        if 'table' in message:
            self.table = message['table']
        else:
            error_message = 'Table was not included in the message'
            self.sd_logger.error({'error': error_message}, {
                'class': 'Data',
                'method': 'check_message',
                'table': self.table
            })
            raise RuntimeError(error_message)
        if 'watched' in message:
            self.watched_column = message['watched']
        else:
            error_message = 'Watched was not included in the message'
            self.sd_logger.error({'error': error_message}, {
                'class': 'Data',
                'method': 'check_message',
                'table': self.table
            })
            raise RuntimeError(error_message)

    def get_schema_from_big_query(self) -> Union[dict, bool]:
        self.big_query_client.set_table_ref(self.table)
        table_check = self.big_query_client.check_table()
        if table_check:
            return self.big_query_client.get_schema()
        return table_check

    def last_updated_data(self) -> Union[str, bool, None]:
        query = (f"SELECT MAX({self.watched_column}) as last_updated "
                 f"FROM {self.definitions['data_set']}.{self.table}")
        result = self.big_query_client.query(query)
        if result:
            for value in result:
                if value['last_updated']:
                    return value['last_updated']
                # Return None to allow all records to be pulled at the start.
                return None
        else:
            self.sd_logger.critical(
                self.big_query_client.errors, {
                    'class': 'Data',
                    'method': 'last_updated_data',
                    'table': self.table
                })
        return result

    def get_number_of_records_to_import(self, last_updated) -> Union[int, bool]:
        result = self.my_sql_client.count_items_to_sync(
            table=self.table,
            watched_column=self.watched_column,
            last_run=last_updated)
        if self.my_sql_client.errors:
            self.sd_logger.critical(
                self.my_sql_client.errors, {
                    'class': 'Data',
                    'method': 'get_number_of_records_to_import',
                    'table': self.table
                })
        return result

    def query_mysql_for_records(self, last_updated_date: str, limit: int,
                                offset: int) -> Union[list, bool]:
        results = self.my_sql_client.get_records(
            table=self.table,
            watched_column=self.watched_column,
            last_run=last_updated_date,
            limit=limit,
            offset=offset)
        if not results:
            self.sd_logger.critical(
                self.my_sql_client.errors, {
                    'class': 'Data',
                    'method': 'query_mysql_for_records',
                    'table': self.table
                })
        return results

    def load_mysql_data_into_data_frame(
            self, data: list, schema: dict) -> Union[DataFrame, bool]:
        # The row width must match the BigQuery schema's column count, so
        # compare by value, not identity.
        if len(data[0]) != len(schema.keys()):
            self.sd_logger.critical(
                {
                    'message': "Schema and data length mismatch",
                    'schema_length': len(schema.keys()),
                    'data_length': len(data[0])
                }, {
                    'class': 'Data',
                    'method': 'load_mysql_data_into_data_frame',
                    'table': self.table
                })
            return False
        df = pd.DataFrame.from_records(data, columns=schema.keys())
        del data
        return df

    def transform_data_frame_to_match_big_query_schema(
            self, data_frame: DataFrame,
            schema: dict) -> Union[DataFrame, bool]:
        try:
            df = service_helpers.data_frame_to_schema(data_frame, schema)
        except ValueError as e:
            self.sd_logger.critical({'message': f'Error: {e}'}, {
                'class': 'Data',
                'method': 'transform_data_frame_to_match_big_query_schema',
                'table': self.table
            })
            return False
        return df

    def append_data_frame_to_big_query(self, data_frame: DataFrame) -> bool:
        result = self.big_query_client.upload_data_frame(data_frame)
        if result:
            message = (f"table:{self.table} | "
                       f"Records written: {data_frame.shape[0]}")
            self.sd_logger.info({'message': message}, {
                'class': 'Data',
                'method': 'append_data_frame_to_big_query',
                'table': self.table
            })
        else:
            self.sd_logger.critical(
                self.big_query_client.errors, {
                    'class': 'Data',
                    'method': 'append_data_frame_to_big_query',
                    'table': self.table
                })
        return result

    def write_df_to_storage(self, df: DataFrame) -> bool:
        """Write a failed chunk to Cloud Storage as CSV for later inspection."""
        storage_client = StorageAdapter(self.service_account)
        storage_client.get_client()
        date_time_obj = datetime.datetime.utcnow()
        location = (f'error/csv/{self.table}/'
                    f'{date_time_obj.strftime("%m-%d-%Y_%H:%M:%S")}_UTC')
        result = storage_client.write_string(
            bucket=self.definitions['service'] + '-etl',
            destination=location,
            string=df.to_csv(),
            encoding='text/csv')
        if not result:
            self.sd_logger.error(
                storage_client.errors, {
                    'class': 'Data',
                    'method': 'write_df_to_storage',
                    'table': self.table
                })
            return False
        self.sd_logger.info({'message': f"Failed CSV added to {location}"}, {
            'class': 'Data',
            'method': 'write_df_to_storage',
            'table': self.table
        })
        return result
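# A hypothetical driver showing how the Data task's methods compose into a
# single chunked sync pass. The real orchestration (PubSub message pull/ack,
# retries) is outside this excerpt, and the `run_sync` name is illustrative.
def run_sync(task: Data, message: dict) -> bool:
    task.check_message(message)  # Sets task.table and task.watched_column.
    schema = task.get_schema_from_big_query()
    if not schema:
        return False
    last_updated = task.last_updated_data()
    if last_updated is False:  # None is valid: it means a first full pull.
        return False
    total = task.get_number_of_records_to_import(last_updated)
    if total is False:
        return False
    if total == 0:
        return True  # Nothing new to sync.
    # Page through MySQL in chunk_size batches to bound memory use.
    for offset in range(0, total, task.chunk_size):
        records = task.query_mysql_for_records(
            last_updated, task.chunk_size, offset)
        if not records:
            return False
        df = task.load_mysql_data_into_data_frame(records, schema)
        if df is False:
            return False
        df = task.transform_data_frame_to_match_big_query_schema(df, schema)
        if df is False:
            return False
        if not task.append_data_frame_to_big_query(df):
            task.write_df_to_storage(df)  # Preserve the failed chunk as CSV.
            return False
    return True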