def vertica_compliant_schema(self):
    """Transform the MySQL table schema into a Vertica-compliant schema.

    Lazily populates ``self.table_schema`` with (quoted_field_name,
    vertica_type) tuples derived from a ``DESCRIBE`` of the table, then
    returns it.
    """
    if not self.table_schema:
        describe_query = 'describe {}'.format(self.table_name)
        for row in get_mysql_query_results(self.db_credentials, self.database, describe_query):
            name = row[0].strip()
            mysql_type = row[1].strip()
            nullable = row[2].strip()

            # Integer and datetime types carry display-width/precision
            # suffixes (e.g. int(11)) that Vertica does not accept; drop them.
            sized_prefixes = ('tinyint', 'smallint', 'int', 'bigint', 'datetime')
            if any(prefix in mysql_type for prefix in sized_prefixes):
                vertica_type = mysql_type.rsplit('(')[0]
            elif mysql_type == 'longtext':
                vertica_type = 'LONG VARCHAR'
            elif mysql_type == 'longblob':
                vertica_type = 'LONG VARBINARY'
            elif mysql_type == 'double':
                vertica_type = 'DOUBLE PRECISION'
            else:
                vertica_type = mysql_type

            if nullable == "NO":
                vertica_type += " NOT NULL"

            # Quote the column name so reserved words survive as identifiers.
            self.table_schema.append(("\"{}\"".format(name), vertica_type))

    return self.table_schema
def get_snowflake_schema(self):
    """
    Transform the MySQL table schema into a Snowflake-compliant schema.

    Lazily populates ``self.table_fields`` with (field_name, snowflake_type)
    tuples; excluded field names are recorded in ``self.deleted_fields``
    instead of being emitted.
    """
    if not self.table_fields:
        rows = get_mysql_query_results(
            self.db_credentials, self.database, 'describe {}'.format(self.table_name))
        for row in rows:
            name = row[0].strip()
            mysql_type = row[1].strip()
            nullable = row[2].strip()

            if self.should_exclude_field(self.table_name, name):
                self.deleted_fields.append(name)
                continue

            # Enclose any Snowflake-reserved keyword field names within double-quotes.
            if name.upper() in SNOWFLAKE_RESERVED_KEYWORDS:
                name = '"{}"'.format(name.upper())

            # Types whose size/precision suffix (e.g. int(11)) must be stripped.
            sized_types = ('smallint', 'int', 'bigint', 'datetime', 'varchar')
            if mysql_type == 'tinyint(1)':
                sf_type = 'BOOLEAN'
            elif any(prefix in mysql_type for prefix in sized_types):
                sf_type = mysql_type.rsplit('(')[0]
            elif mysql_type == 'longtext':
                sf_type = 'VARCHAR'
            elif mysql_type == 'longblob':
                sf_type = 'BINARY'
            else:
                sf_type = mysql_type

            if nullable == 'NO':
                sf_type += ' NOT NULL'

            self.table_fields.append((name, sf_type))

    return self.table_fields
def requires(self):
    """
    Determine the required tasks given the non-excluded tables in the MySQL schema.
    """
    if not self.table_list:
        # Fetch every table name from MySQL, then drop the excluded ones.
        rows = get_mysql_query_results(self.db_credentials, self.database, 'show tables')
        all_tables = (row[0].strip() for row in rows)
        self.table_list = [name for name in all_tables if not self.should_exclude_table(name)]

    if self.required_tasks is None:
        # One Snowflake load task per included table; built once and memoized.
        self.required_tasks = [
            LoadMysqlToSnowflakeTableTask(
                db_credentials=self.db_credentials,
                sf_database=self.sf_database,
                schema=self.schema,
                scratch_schema=self.scratch_schema,
                run_id=self.run_id,
                warehouse=self.warehouse,
                role=self.role,
                warehouse_path=self.warehouse_path,
                warehouse_subdirectory=self.warehouse_subdirectory,
                database=self.database,
                table_name=table_name,
                overwrite=self.overwrite,
                date=self.date,
                credentials=self.credentials,
                exclude_field=self.exclude_field,
            )
            for table_name in self.table_list
        ]

    return self.required_tasks
def vertica_compliant_schema(self):
    """Transforms mysql table schema into a vertica compliant schema.

    Lazily populates ``self.table_schema`` with (quoted_field_name,
    vertica_type) tuples derived from a ``DESCRIBE`` of the MySQL table.
    """
    if not self.table_schema:
        results = get_mysql_query_results(self.db_credentials, self.database,
                                          'describe {}'.format(self.table_name))
        for result in results:
            field_name = result[0].strip()
            field_type = result[1].strip()
            field_null = result[2].strip()
            # Vertica rejects MySQL display-width suffixes such as int(11); strip them.
            types_with_parentheses = ['tinyint', 'smallint', 'int', 'bigint', 'datetime']
            if any(_type in field_type for _type in types_with_parentheses):
                field_type = field_type.rsplit('(')[0]
            elif field_type == 'longtext':
                field_type = 'LONG VARCHAR'
            elif field_type == 'longblob':
                # Bug fix: 'longblob' previously fell through unmapped, producing
                # a type name Vertica does not recognize in DDL; map it to
                # LONG VARBINARY as the sibling implementation of this method does.
                field_type = 'LONG VARBINARY'
            elif field_type == 'double':
                field_type = 'DOUBLE PRECISION'
            if field_null == "NO":
                field_type = field_type + " NOT NULL"
            # Quote the column name so reserved words survive as identifiers.
            field_name = "\"{}\"".format(field_name)
            self.table_schema.append((field_name, field_type))
    return self.table_schema
def requires(self):
    """Require one BigQuery load task per non-excluded table in the MySQL database."""
    if not self.table_list:
        rows = get_mysql_query_results(self.db_credentials, self.database, 'show tables')
        candidates = [row[0].strip() for row in rows]
        self.table_list = [t for t in candidates if not self.should_exclude_table(t)]

    if self.required_tasks is None:
        # Build the per-table load tasks once and memoize them.
        self.required_tasks = [
            LoadMysqlToBigQueryTableTask(
                db_credentials=self.db_credentials,
                database=self.database,
                warehouse_path=self.warehouse_path,
                warehouse_subdirectory=self.warehouse_subdirectory,
                table_name=table_name,
                overwrite=self.overwrite,
                date=self.date,
                dataset_id=self.dataset_id,
                credentials=self.credentials,
                max_bad_records=self.max_bad_records,
                skip_clear_marker=self.overwrite,
                exclude_field=self.exclude_field,
            )
            for table_name in self.table_list
        ]

    return self.required_tasks
def requires(self):
    """Yield the pre-import task, one load task per included table, then the post-import task."""
    if not self.table_list:
        rows = get_mysql_query_results(self.db_credentials, self.database, 'show tables')
        self.table_list = [row[0].strip() for row in rows]

    pre_import = PreImportDatabaseTask(
        date=self.date,
        schema=self.schema,
        credentials=self.credentials,
        marker_schema=self.marker_schema,
        overwrite=self.overwrite,
    )
    yield pre_import

    for table in self.table_list:
        if self.should_exclude_table(table):
            continue
        # Each table is loaded into the pre-import task's loading schema.
        yield LoadMysqlToVerticaTableTask(
            credentials=self.credentials,
            schema=pre_import.schema_loading,
            db_credentials=self.db_credentials,
            database=self.database,
            warehouse_path=self.warehouse_path,
            table_name=table,
            overwrite=self.overwrite,
            date=self.date,
            marker_schema=self.marker_schema,
        )

    yield PostImportDatabaseTask(
        date=self.date,
        schema=self.schema,
        credentials=self.credentials,
        marker_schema=self.marker_schema,
        overwrite=self.overwrite,
    )
def requires(self):
    """Build (once) and return the list of per-table BigQuery load tasks."""
    if not self.table_list:
        query_results = get_mysql_query_results(self.db_credentials, self.database, 'show tables')
        stripped_names = (result[0].strip() for result in query_results)
        self.table_list = [
            name for name in stripped_names if not self.should_exclude_table(name)
        ]

    if self.required_tasks is None:
        self.required_tasks = []
        for name in self.table_list:
            task = LoadMysqlToBigQueryTableTask(
                db_credentials=self.db_credentials,
                database=self.database,
                warehouse_path=self.warehouse_path,
                warehouse_subdirectory=self.warehouse_subdirectory,
                table_name=name,
                overwrite=self.overwrite,
                date=self.date,
                dataset_id=self.dataset_id,
                credentials=self.credentials,
                max_bad_records=self.max_bad_records,
                skip_clear_marker=self.overwrite,
                exclude_field=self.exclude_field,
            )
            self.required_tasks.append(task)

    return self.required_tasks
def requires(self):
    """Yield the pre-import, per-table Vertica load, and post-import tasks, in order."""
    if not self.table_list:
        results = get_mysql_query_results(self.db_credentials, self.database, 'show tables')
        self.table_list = [result[0].strip() for result in results]

    pre_import_task = PreImportDatabaseTask(
        date=self.date,
        schema=self.schema,
        credentials=self.credentials,
        marker_schema=self.marker_schema,
        overwrite=self.overwrite,
    )
    yield pre_import_task

    included_tables = (t for t in self.table_list if not self.should_exclude_table(t))
    for table_name in included_tables:
        # Each table is loaded into the pre-import task's loading schema.
        yield LoadMysqlToVerticaTableTask(
            credentials=self.credentials,
            schema=pre_import_task.schema_loading,
            db_credentials=self.db_credentials,
            database=self.database,
            warehouse_path=self.warehouse_path,
            table_name=table_name,
            overwrite=self.overwrite,
            date=self.date,
            marker_schema=self.marker_schema,
        )

    yield PostImportDatabaseTask(
        date=self.date,
        schema=self.schema,
        credentials=self.credentials,
        marker_schema=self.marker_schema,
        overwrite=self.overwrite,
    )
def get_bigquery_schema(self):
    """Transform the MySQL table schema into a BigQuery schema.

    Lazily populates ``self.table_schema`` with BigQuery ``SchemaField``
    objects built from a ``DESCRIBE`` of the MySQL table; excluded field
    names are recorded in ``self.deleted_fields`` instead.

    NOTE(review): the original docstring said "vertica compliant schema",
    which appears to be a copy/paste leftover — this method emits BigQuery
    ``SchemaField`` objects.
    """
    if not self.table_schema:
        results = get_mysql_query_results(
            self.db_credentials, self.database, 'describe {}'.format(self.table_name))
        for result in results:
            field_name = result[0].strip()
            field_type = result[1].strip()
            field_null = result[2].strip()
            # Strip off size information from any type except booleans.
            # tinyint(1) is left intact so the type map can treat it
            # specially — presumably as a boolean; verify against the map.
            if field_type != 'tinyint(1)':
                field_type = field_type.rsplit('(')[0]
            # NOTE(review): .get() yields None for a type missing from the
            # map — presumably every expected MySQL type is present; confirm.
            bigquery_type = MYSQL_TO_BIGQUERY_TYPE_MAP.get(field_type)
            mode = 'REQUIRED' if field_null == 'NO' else 'NULLABLE'
            description = ''
            # NOTE(review): a sibling implementation in this file also passes
            # the table name (should_exclude_field(table_name, field_name)) —
            # confirm this one-argument call is the intended signature here.
            if self.should_exclude_field(field_name):
                self.deleted_fields.append(field_name)
            else:
                self.table_schema.append(
                    SchemaField(field_name, bigquery_type, description=description, mode=mode))
    return self.table_schema
def requires(self):
    """Require one S3 export task per non-excluded table; the task list is memoized."""
    if self.creation_time is None:
        # Stamp the run once, on first invocation.
        self.creation_time = datetime.datetime.utcnow().isoformat()

    if self.required_tasks is None:
        self.required_tasks = []
        if not self.table_includes_list:
            rows = get_mysql_query_results(self.db_credentials, self.database, 'show tables')
            names = [row[0].strip() for row in rows]
            self.table_includes_list = [
                name for name in names if not self.should_exclude_table(name)
            ]
        for table_name in self.table_includes_list:
            self.required_tasks.append(
                ExportMysqlTableToS3Task(
                    date=self.date,
                    database=self.database,
                    db_credentials=self.db_credentials,
                    exclude_field=self.exclude_field,
                    table_name=table_name,
                ))

    return self.required_tasks
def rows(self):
    """Run the configured insert query against MySQL and yield its result rows."""
    result_rows = get_mysql_query_results(
        credentials=self.credentials,
        database=self.database,
        query=self.insert_query,
    )
    log.info('query_sql = [{}]'.format(self.insert_query))
    for result_row in result_rows:
        yield result_row
def mysql_compliant_schema(self):
    """Return the raw MySQL schema as (name, type, nullability) tuples, minus excluded fields.

    Lazily built from a ``DESCRIBE`` of the table; excluded field names are
    recorded in ``self.deleted_fields`` instead.
    """
    if not self.mysql_table_schema:
        describe_query = 'describe {}'.format(self.table_name)
        for row in get_mysql_query_results(self.db_credentials, self.database, describe_query):
            name = row[0].strip()
            column_type = row[1].strip()
            nullable = row[2].strip()
            if self.should_exclude_field(self.table_name, name):
                self.deleted_fields.append(name)
            else:
                self.mysql_table_schema.append((name, column_type, nullable))
    return self.mysql_table_schema
def run(self):
    """Dynamically schedule the Vertica import: pre-import, per-table loads, post-import.

    Yields inside run() serve as dynamic dependencies; the scheduler reruns
    this method each time it yields a job.
    """
    if not self.table_list:
        rows = get_mysql_query_results(self.db_credentials, self.database, 'show tables')
        self.table_list = [row[0].strip() for row in rows]

    pre_import = PreImportDatabaseTask(
        date=self.date,
        schema=self.schema,
        credentials=self.credentials,
        marker_schema=self.marker_schema,
        overwrite=self.overwrite,
    )
    yield pre_import

    # Track the tables actually imported; the list is handed to the
    # post-import task via its `tables` argument.
    included_tables = []
    for table_name in self.table_list:
        if self.should_exclude_table(table_name):
            continue
        included_tables.append(table_name)
        yield LoadMysqlToVerticaTableTask(
            credentials=self.credentials,
            schema=pre_import.schema_loading,
            db_credentials=self.db_credentials,
            database=self.database,
            warehouse_path=self.warehouse_path,
            warehouse_subdirectory=self.warehouse_subdirectory,
            table_name=table_name,
            overwrite=self.overwrite,
            date=self.date,
            marker_schema=self.marker_schema,
            exclude_field=self.exclude_field,
        )

    yield PostImportDatabaseTask(
        date=self.date,
        schema=self.schema,
        credentials=self.credentials,
        marker_schema=self.marker_schema,
        overwrite=self.overwrite,
        tables=included_tables,
    )
    self.is_complete = True
def run(self):
    """Dynamically schedule the Vertica import: pre-import, per-table loads, post-import.

    Yields inside run() act as dynamic dependencies; the scheduler reruns
    this method each time a yielded job completes.
    """
    # Add yields of tasks in run() method, to serve as dynamic dependencies.
    # This method should be rerun each time it yields a job.
    if not self.table_list:
        results = get_mysql_query_results(self.db_credentials, self.database, 'show tables')
        self.table_list = [result[0].strip() for result in results]
    pre_import_task = PreImportDatabaseTask(
        date=self.date,
        schema=self.schema,
        credentials=self.credentials,
        marker_schema=self.marker_schema,
        overwrite=self.overwrite,
    )
    yield pre_import_task
    # Collect the tables actually imported; the list is handed to the
    # post-import task via its `tables` argument.
    table_white_list = []
    for table_name in self.table_list:
        if not self.should_exclude_table(table_name):
            table_white_list.append(table_name)
            # Per-table loads target the pre-import task's loading schema.
            yield LoadMysqlToVerticaTableTask(
                credentials=self.credentials,
                schema=pre_import_task.schema_loading,
                db_credentials=self.db_credentials,
                database=self.database,
                warehouse_path=self.warehouse_path,
                warehouse_subdirectory=self.warehouse_subdirectory,
                table_name=table_name,
                overwrite=self.overwrite,
                date=self.date,
                marker_schema=self.marker_schema,
                exclude_field=self.exclude_field,
            )
    yield PostImportDatabaseTask(
        date=self.date,
        schema=self.schema,
        credentials=self.credentials,
        marker_schema=self.marker_schema,
        overwrite=self.overwrite,
        tables=table_white_list
    )
    # Custom completion flag — presumably read by a complete() override
    # elsewhere in the class; confirm against the class definition.
    self.is_complete = True
def get_bigquery_schema(self):
    """Transform the MySQL table schema into a BigQuery schema.

    Lazily populates ``self.table_schema`` with BigQuery ``SchemaField``
    objects built from a ``DESCRIBE`` of the MySQL table; excluded field
    names are recorded in ``self.deleted_fields`` instead.

    NOTE(review): the original docstring said "vertica compliant schema",
    which appears to be a copy/paste leftover — this method emits BigQuery
    ``SchemaField`` objects.
    """
    if not self.table_schema:
        results = get_mysql_query_results(self.db_credentials, self.database,
                                          'describe {}'.format(self.table_name))
        for result in results:
            field_name = result[0].strip()
            field_type = result[1].strip()
            field_null = result[2].strip()
            # Strip off size information from any type except booleans.
            # tinyint(1) is left intact so the type map can treat it
            # specially — presumably as a boolean; verify against the map.
            if field_type != 'tinyint(1)':
                field_type = field_type.rsplit('(')[0]
            # NOTE(review): .get() yields None for a type missing from the
            # map — presumably every expected MySQL type is present; confirm.
            bigquery_type = MYSQL_TO_BIGQUERY_TYPE_MAP.get(field_type)
            mode = 'REQUIRED' if field_null == 'NO' else 'NULLABLE'
            description = ''
            if self.should_exclude_field(self.table_name, field_name):
                self.deleted_fields.append(field_name)
            else:
                self.table_schema.append(SchemaField(field_name, bigquery_type, description=description, mode=mode))
    return self.table_schema
def load_data(self):
    """Log the configured query, execute it against MySQL, and return the result set."""
    log.info('query_sql = [{}]'.format(self.query))
    return get_mysql_query_results(
        credentials=self.credentials,
        database=self.database,
        query=self.query,
    )