def copy_s3_to_staging(self, table_name,
                       bucket='ichain-sync-machine',
                       db_name='DEAHINPHSDB',
                       iam_role='arn:aws:iam::265991248033:role/myRedShiftforKin'):
    '''
    Copy the change file for `table_name` from S3 into its Redshift
    staging table (<table_name>_staging) via the COPY command.

    The S3 bucket, source database name, and IAM role used to be
    hardcoded inline (the old comments called this out); they are now
    keyword parameters whose defaults preserve the previous behavior,
    so existing callers are unaffected.

    Returns True on success, False if the COPY fails.

    NOTE(review): table_name is interpolated directly into the SQL;
    only trusted identifiers may be passed (COPY does not support
    parameterized identifiers).
    '''
    redshift_params = self.con_params
    command = (
        "COPY " + table_name + "_staging "
        "FROM 's3://" + bucket + "/" + db_name + "/" + table_name + "/"
        + table_name + "-changes.csv' "
        "IAM_ROLE '" + iam_role + "' "
        "delimiter ',' IGNOREHEADER 1 removequotes emptyasnull "
        "blanksasnull maxerror 1 COMPUPDATE OFF STATUPDATE OFF;"
    )
    with open_redshift_db_connection(redshift_params) as c:
        try:
            c.execute(command)
        except Exception as e:
            # Report the failure instead of silently swallowing it.
            print(e)
            return False
    return True
def init_staging_tables(self):
    '''
    Create a <table>_staging table in Redshift for every source table
    that has a primary key.

    Each staging table gets the three CDC bookkeeping columns
    (__$start_lsn, __$operation, __$update_mask) plus one column per
    source column. Every column is created as varchar(255).

    TODO: query the source db for the real column datatypes instead of
    forcing varchar(255) on everything.
    TODO: check the staging table is fully delserted into production
    before recreating it.

    Returns True when all staging tables have been created.
    '''
    for table in self.src_db.tables_with_pks:
        columns_info_list = self.src_db.get_column_names(table)
        # CDC bookkeeping columns come first.
        cols_and_types = ('__$start_lsn varchar(255), '
                          '__$operation varchar(255), '
                          '__$update_mask varchar(255)')
        # row[3] is the column name; datatype forced to varchar(255).
        for row in columns_info_list:
            cols_and_types += ',\n' + row[3] + ' varchar(255)'
        pk = self.src_db.get_table_primary_key(table)
        if not pk:
            pk = ''
        command = ('CREATE TABLE ' + table + '_staging(\n'
                   + cols_and_types + ',\nPRIMARY KEY (' + str(pk) + ')\n);')
        print(command)
        # BUG FIX: `redshift_params` was an undefined name here; the
        # connection parameters live on self.con_params.
        with open_redshift_db_connection(self.con_params) as c:
            c.execute(command)
    return True
def redshift_delsert(self, table, pk, column_name_list):
    '''
    "Delsert" CDC changes from <table>_staging into the production
    table:

    1. DELETE production rows whose pk appears in staging marked as
       deleted or updated.
    2. INSERT staging rows carrying the new image of an insert/update.

    Params:
        table: target production table name (staging is <table>_staging).
        pk: primary-key column name used for the delete join.
        column_name_list: column names shared by staging and production.

    Returns True on completion.

    NOTE(review): the LIKE patterns assume SQL Server CDC operation
    codes (1=delete, 2=insert, 3=update-before, 4=update-after) stored
    as text -- confirm against the staging loader.
    '''
    con_params = self.con_params
    # Lower-cased, comma-separated column list used by the INSERT.
    cols = ','.join(name.lower() for name in column_name_list)
    with open_redshift_db_connection(con_params) as c:
        # Delete rows that were deleted (1) or updated (4) in source.
        command = (
            'DELETE FROM ' + table + ' USING ' + table + '_staging a '
            'WHERE ' + table + '.' + pk + '=a.' + pk + ' '
            "AND (__$operation LIKE '%4%' OR __$operation LIKE '%1%');"
        )
        c.execute(command)
        # Insert rows that were inserted (2) or updated (4) in source.
        # BUG FIX: the second pattern was '%2' (missing trailing %),
        # inconsistent with every other operation match in this method.
        command = (
            'INSERT INTO ' + table.upper() + ' (' + cols + ') '
            'SELECT ' + cols + ' FROM ' + table.upper() + '_staging s '
            "WHERE __$operation LIKE '%4%' OR __$operation LIKE '%2%';"
        )
        c.execute(command)
    return True
def test2(redshift_params):
    '''
    Smoke test: dump every row of DEAA0_staging.

    Returns the fetched rows, or the sentinel string 'yis' when the
    query returns nothing (preserves the original behavior).
    '''
    res = 'yis'
    with open_redshift_db_connection(redshift_params) as c:
        c.execute("SELECT * FROM DEAA0_staging;")
        # BUG FIX: fetchall() was called twice; the first call drained
        # the cursor so the second always returned []. Fetch once.
        rows = c.fetchall()
        if rows:
            res = rows
    print(res)
    return res
def create_staging_table(redshift_params):
    '''
    Experimental stub for creating a staging table.

    NOTE(review): this looks unfinished as written: "CREATE TABLE
    sometable" has no column list (not valid DDL), and fetchall()
    after a CREATE statement has no result set to read -- presumably
    this raises with the DB driver in use; confirm before relying on
    it.
    '''
    with open_redshift_db_connection(redshift_params) as c:
        c.execute("CREATE TABLE sometable")
        rows = c.fetchall()
        for i in rows:
            print i  # Python 2 print statement, kept byte-identical
    return
def redshift_test(self, rs_params):
    '''
    Quick sanity check against the Redshift dev database: fetch and
    print ten rows from toytable.

    Returns True once the rows have been printed.
    '''
    with open_redshift_db_connection(rs_params) as c:
        c.execute("SELECT * FROM toytable LIMIT 10;")
        for row in c.fetchall():
            print(row)
    return True
def truncate_redshift_staging_table(self, table_name):
    '''
    Empty <table_name>_staging via TRUNCATE, leaving the table
    definition (its columns) intact.

    Returns True on success, False if the TRUNCATE fails.
    '''
    sql = "Truncate " + table_name + "_staging;"
    with open_redshift_db_connection(self.con_params) as conn:
        try:
            conn.execute(sql)
        except Exception as err:
            print(err)
            return False
    return True
def insert_rs_rows(con_params, table_name):
    '''
    Insert changed rows from <table_name>_staging into the production
    table in Redshift.

    BUG FIX: the original body referenced two undefined names -- the
    SQL lived only inside the docstring (so `command` was never bound)
    and the connection used `redshift_params` instead of the
    `con_params` parameter -- so every call raised NameError. It also
    called fetchall() after an INSERT, which produces no result set.

    Returns True on success, False if the INSERT fails.

    TODO(review): the staging table carries extra CDC columns
    (__$start_lsn, __$operation, __$update_mask), so SELECT * will not
    line up with production; derive an explicit column list as
    redshift_delsert() does. Confirm intended operation codes.
    '''
    # CDC codes: '2' = insert, '4' = update (new image).
    command = (
        'INSERT INTO ' + table_name +
        ' SELECT * FROM ' + table_name + '_staging'
        " WHERE __$operation = '2' OR __$operation = '4';"
    )
    with open_redshift_db_connection(con_params) as c:
        try:
            c.execute(command)
        except Exception as e:
            print(e)
            return False
    return True
def test(redshift_params):
    '''
    Smoke test: print ten rows from toytable in the Redshift dev db.

    Returns the (df, cols) placeholder pair; both are the `object`
    builtin -- the DataFrame/column extraction was never finished, and
    its dead commented-out code has been removed (it would have failed
    anyway: the cursor is already drained by the print).
    '''
    df = object
    cols = object
    with open_redshift_db_connection(redshift_params) as c:
        c.execute("SELECT * FROM toytable LIMIT 10;")
        print(c.fetchall())
    return df, cols
def test3(redshift_params):
    '''
    List the distinct public-schema table names in the Redshift db.

    Returns the fetched rows, or the sentinel string 'yis' when the
    query returns nothing (preserves the original behavior).
    '''
    res = 'yis'
    print(res)
    with open_redshift_db_connection(redshift_params) as c:
        c.execute("SELECT DISTINCT tablename FROM PG_TABLE_DEF "
                  "WHERE schemaname = 'public';")
        # BUG FIX: fetchall() was called twice; the first call drained
        # the cursor so `res` was always reassigned to []. Fetch once.
        # Also dropped the duplicated debug print of each row.
        rows = c.fetchall()
        if rows:
            res = rows
        for row in rows:
            print(row)
    print(res)
    print("okay")
    return res
def del_rs_rows(con_params, table_name):
    '''
    Delete processed CDC rows from <table_name>_staging in Redshift.

    BUG FIX: the original body referenced two undefined names -- the
    SQL lived only inside the docstring (so `command` was never bound)
    and the connection used `redshift_params` instead of the
    `con_params` parameter -- so every call raised NameError.

    TODO(review): the garbled docstring hinted at a DELETE ... USING
    join against the production table; without the primary-key column
    available here, this deletes the matching rows straight from
    staging. Confirm the intended semantics.
    '''
    # Operation codes '3' and '4' per the original docstring fragment.
    command = (
        'DELETE FROM ' + table_name + '_staging'
        " WHERE __$operation = '3' OR __$operation = '4';"
    )
    with open_redshift_db_connection(con_params) as c:
        try:
            c.execute(command)
        except Exception as e:
            print(e)
    return
def init_rs_dummy_tables(self):
    '''
    Create an empty dummy table in Redshift for every source table with
    a primary key, so the delsert command can be tested before the
    Redshift db is populated with real data.

    Every column is created as varchar(255) -- dummy tables only.

    Returns True when all tables have been created.
    '''
    for table in self.src_db.tables_with_pks:
        columns_info_list = self.src_db.get_column_names(table)
        # row[3] is the column name; datatype forced to varchar(255).
        cols_and_types = ',\n'.join(
            row[3] + ' varchar(255)' for row in columns_info_list)
        pk = self.src_db.get_table_primary_key(table)
        if not pk:
            pk = ''
        command = ('CREATE TABLE ' + table.lower() + '(\n'
                   + cols_and_types + ',\nPRIMARY KEY (' + str(pk) + ')\n);')
        print(command)
        # BUG FIX: `redshift_params` was an undefined name here; the
        # connection parameters live on self.con_params.
        with open_redshift_db_connection(self.con_params) as c:
            c.execute(command)
    return True
def vacuum_redshift_db(self, table=None):
    '''
    Run Redshift VACUUM. From the AWS docs: "Reclaims space and resorts
    rows in either a specified table or all tables in the current
    database."
    SEE: https://docs.aws.amazon.com/redshift/latest/dg/r_VACUUM_command.html

    Params:
        table: optional table name; when falsy, every table in the
               current database is vacuumed.

    Returns True on success, False if the VACUUM fails.
    '''
    command = ('vacuum ' + table + ';') if table else 'vacuum;'
    with open_redshift_db_connection(self.con_params) as conn:
        try:
            conn.execute(command)
        except Exception as err:
            print(err)
            return False
    return True
def analyze_redshift_db(self, table=None):
    '''
    Run Redshift ANALYZE. From the AWS docs: "Updates table statistics
    for use by the query planner."
    SEE: https://docs.aws.amazon.com/redshift/latest/dg/r_ANALYZE.html

    Params:
        table: optional table name; when falsy, every table in the
               current database is analyzed (verbosely).

    Returns True on success, False if the ANALYZE fails.
    '''
    if table:
        # BUG FIX: the original built 'analyze<table>;' with no space
        # between the keyword and the table name -- a guaranteed syntax
        # error whenever a table was supplied.
        command = 'analyze ' + table + ';'
    else:
        command = 'analyze verbose;'
    with open_redshift_db_connection(self.con_params) as c:
        try:
            c.execute(command)
        except Exception as e:
            print(e)
            return False
    return True