def to_redshift(self, table_name, s3_bucket, s3_key, engine=None, schema=None,
                if_exists="fail", index=False, compress=True, primary_key=None,
                aws_access_key_id=None, aws_secret_access_key=None, **kwargs):
    """Write this DataFrame to a Redshift table by staging it as a CSV in S3.

    The frame is uploaded to ``s3://{s3_bucket}/{s3_key}`` (gzipped when
    ``compress`` is true) and then loaded into Redshift with COPY.

    :param table_name: unqualified Redshift table name
    :param s3_bucket: S3 bucket used as the staging area
    :param s3_key: S3 key for the staged file
    :param engine: SQLAlchemy engine/URL; built from the environment when None
    :param schema: optional Redshift schema
    :param if_exists: one of 'fail', 'append', 'replace', 'update'
                      ('update' upserts on ``primary_key``)
    :param index: include the DataFrame index in the upload
    :param compress: gzip the staged file
    :param primary_key: required for if_exists='update'
    :param aws_access_key_id / aws_secret_access_key: default to the
        AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables
    :raises ValueError: table exists with if_exists='fail', unknown
        if_exists value, or 'update' without a primary key
    """
    if not engine:
        engine = generate_redshift_engine_string()
    if not aws_access_key_id:
        aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
    if not aws_secret_access_key:
        aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")

    # Pandas SQLTable provides exists() and sql_schema() for DDL generation.
    table = SQLTable(
        table_name,
        pandasSQL_builder(engine, schema=schema),
        self,
        if_exists=if_exists,
        schema=schema,
        index=index,
    )

    def quote(s):
        return '"' + str(s) + '"'

    # Full table name with schema
    if schema:
        full_table_name = quote(schema) + "." + quote(table_name)
    else:
        full_table_name = quote(table_name)

    s3_location = "s3://{}/{}".format(s3_bucket, s3_key)

    def make_copy(target):
        # One place to build the COPY so the branches below stay in sync.
        return CopyCommand(
            to=target,
            data_location=s3_location,
            access_key_id=aws_access_key_id,
            secret_access_key=aws_secret_access_key,
            format="CSV",
            compression="GZIP" if compress else None,
        )

    if table.exists():
        if if_exists == "fail":
            raise ValueError("Table {} already exists.".format(table_name))
        elif if_exists == "append":
            queue = [make_copy(table)]
        elif if_exists == "replace":
            queue = [
                "drop table {};".format(full_table_name),
                table.sql_schema() + ";",
                make_copy(table),
            ]
        elif if_exists == "update":
            if not primary_key:
                raise ValueError(
                    "Expected a primary key to update existing table")
            staging_table = "{}_staging".format(table_name)
            # BUG FIX: the original issued CopyCommand(to=table), loading the
            # incoming rows straight into the target table. The staging table
            # stayed empty, so the delete matched nothing and the insert added
            # nothing -- i.e. 'update' behaved as a duplicate-producing append.
            # The COPY must target the staging table; a raw COPY statement is
            # used because CopyCommand expects a table construct.
            staging_copy = (
                "copy {} from '{}' "
                "credentials 'aws_access_key_id={};aws_secret_access_key={}' "
                "format as csv {};"
            ).format(
                staging_table,
                s3_location,
                aws_access_key_id,
                aws_secret_access_key,
                "gzip" if compress else "",
            )
            queue = [
                "begin;",
                "drop table if exists {};".format(staging_table),
                "create temporary table {} (like {});".format(
                    staging_table, full_table_name),
                staging_copy,
                "delete from {full_table_name} where {primary_key} in "
                "(select {primary_key} from {staging_table});".format(
                    full_table_name=full_table_name,
                    primary_key=primary_key,
                    staging_table=staging_table,
                ),
                "insert into {} (select * from {});".format(
                    full_table_name, staging_table),
                "end;",
            ]
        else:
            raise ValueError("{} is not valid for if_exists".format(if_exists))
    else:
        queue = [
            table.sql_schema() + ";",
            make_copy(table),
        ]

    # Save DataFrame to S3 before running the queued statements so the COPY
    # has data to read.
    self.to_s3(bucket=s3_bucket, key=s3_key, index=index, compress=compress)

    # Execute queued statements inside one transaction.
    engine = _engine_builder(engine)
    with engine.begin() as con:
        for stmt in queue:
            con.execute(stmt)
def write(self, data_frame, routine_name, table_name, bucketname=None,
          if_exists='replace', sub_routine=None):
    """Write a DataFrame to Redshift by staging a gzipped CSV in S3.

    The frame is serialised as a '|'-delimited CSV, gzipped in memory,
    uploaded to a temporary S3 object, COPYed into Redshift, and the
    temporary object is deleted afterwards.

    :param data_frame: dataframe
    :param routine_name: routine name (used as the table-name prefix)
    :param table_name: table name
    :param bucketname: bucket name
    :param if_exists: method if exists ('fail', 'append', 'replace')
    :param sub_routine: sub routine (extra path segment in the table name)
    :return: None
    """
    # todo bucketname should always be project_name, redshift should know its own project_name
    # todo when table is new, write metadata, but give an option to skip metadata
    self.bucket = bucketname
    # 'meta_database' is stored unprefixed; everything else is namespaced
    # under routine (and optionally sub_routine).
    if (table_name != 'meta_database') & (sub_routine is None):
        table_name = routine_name + '/' + table_name
    elif (table_name == 'meta_database') & (sub_routine is None):
        table_name = table_name
    else:
        table_name = routine_name + '/' + sub_routine + '/' + table_name
    logging.info('Writing table %s', table_name)

    s3 = boto3.resource('s3')
    bucket = s3.Bucket(bucketname)
    con = psycopg2.connect(self.redshift_path)
    con.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
    cur = con.cursor()

    # Write DF to string stream, then gzip it into a binary stream.
    csv_buffer = StringIO()
    data_frame.to_csv(csv_buffer, index=None, header=None, sep='|')
    csv_buffer.seek(0)
    gz_buffer = BytesIO()
    with gzip.GzipFile(mode='w', fileobj=gz_buffer) as gz_file:
        gz_file.write(bytes(csv_buffer.getvalue(), 'utf-8'))

    # Stage the compressed file in S3 under a timestamped temporary key.
    timestamp = datetime.datetime.strftime(
        datetime.datetime.now(), '%Y%m%d%H%M%S')
    tmp_key = 'tmp_' + timestamp + '.gz'
    bucket.put_object(Key=tmp_key, Body=gz_buffer.getvalue())
    logging.info('Saved staging file s3://%s/%s', bucketname, tmp_key)

    s3_path_tmp_file = 's3://{0}/{1}'.format(bucketname, tmp_key)

    logging.info('Preparing table %s', table_name)
    table = SQLTable(table_name, pandasSQL_builder(self.engine, schema=None),
                     data_frame, if_exists=if_exists, index=None)
    statements = []
    if table.exists():
        if if_exists == 'fail':
            raise ValueError("Table Exists")
        elif if_exists == 'append':
            statements = []
        elif if_exists == 'replace':
            statements = [
                """ truncate "{}"; rollback; drop table "{}";""".format(
                    table_name, table_name)
            ]
        else:
            raise ValueError("Bad option for `if_exists`")
    # NOTE(review): sql_schema() (a CREATE TABLE) is appended even on the
    # 'append' path for an existing table -- looks suspicious; confirm
    # whether Redshift tolerates it before changing.
    statements.append(table.sql_schema() + ';')

    # SECURITY FIX: credentials were hard-coded in the source; read them
    # from the environment instead (matches the to_redshift convention).
    statement = """
        copy "{0}" from '{1}'
        delimiter '{2}' region 'us-east-1'
        CREDENTIALS 'aws_access_key_id={3};aws_secret_access_key={4}'
        FORMAT AS CSV NULL AS '@NULL@' GZIP TRUNCATECOLUMNS """.format(
        table_name, s3_path_tmp_file, '|',
        os.getenv('AWS_ACCESS_KEY_ID'),
        os.getenv('AWS_SECRET_ACCESS_KEY'))
    statements.append(statement)

    try:
        logging.info('Executing statements')
        for stmt in statements:
            logging.debug(stmt)
            cur.execute(stmt)
        logging.info('Finished executing statements')
    except Exception:
        # Log the failure, undo the partial work, and re-raise.
        traceback.print_exc(file=sys.stdout)
        con.rollback()
        raise

    # Remove the temporary staging object now that the COPY has completed.
    s3.Object(bucketname, tmp_key).delete()
    logging.info('FILLING THE TABLE IN REDSHIFT')
    logging.info('\n--------------- write already -----------------')
def to_redshift(self, table_name, engine, bucket, keypath=None, schema=None,
                if_exists='fail', index=True, index_label=None,
                aws_access_key_id=None, aws_secret_access_key=None,
                columns=None, null_as=None, emptyasnull=True):
    """
    Write a DataFrame to redshift via S3

    Parameters
    =========
    table_name : str. (unqualified) name in redshift
    engine : SQLA engine
    bucket : str; s3 bucket
    keypath : str; keypath in s3 (without bucket name)
    schema : redshift schema
    if_exists : str; {'fail', 'append', 'replace'}
    index : bool; include DataFrames index
    index_label : bool; label for the index
    aws_access_key_id / aws_secret_access_key : from ~/.boto by default
    columns : subset of columns to include
    null_as : treat these as null
    emptyasnull : bool; whether '' is null
    """
    # Stage the frame in S3 first; the COPY below reads from this location.
    url = self.to_s3(keypath, engine, bucket=bucket, index=index,
                     index_label=index_label)
    qualname = resolve_qualname(table_name, schema)
    table = SQLTable(table_name, pandasSQL_builder(engine, schema=schema),
                     self, if_exists=if_exists, index=index)
    if columns is None:
        columns = ''
    else:
        # BUG FIX: the original used '()'.format(...), which has no {}
        # placeholder and silently discarded the column list, emitting a
        # bare '()' in the COPY statement.
        columns = '({})'.format(','.join(columns))
    print("Creating table {}".format(qualname))
    if table.exists():
        if if_exists == 'fail':
            raise ValueError("Table Exists")
        elif if_exists == 'append':
            queue = []
        elif if_exists == 'replace':
            queue = ['drop table {}'.format(qualname), table.sql_schema()]
        else:
            raise ValueError("Bad option for `if_exists`")
    else:
        queue = [table.sql_schema()]

    with engine.begin() as con:
        for stmt in queue:
            con.execute(stmt)

    s3conn = boto.connect_s3(aws_access_key_id=aws_access_key_id,
                             aws_secret_access_key=aws_secret_access_key)
    conn = psycopg2.connect(database=engine.url.database,
                            user=engine.url.username,
                            password=engine.url.password,
                            host=engine.url.host,
                            port=engine.url.port,
                            sslmode='require')
    try:
        cur = conn.cursor()
        if null_as is not None:
            null_as = "NULL AS '{}'".format(null_as)
        else:
            null_as = ''
        if emptyasnull:
            emptyasnull = "EMPTYASNULL"
        else:
            emptyasnull = ''
        full_keypath = 's3://' + url
        print("COPYing")
        # BUG FIX: the original concatenated "{emptyasnull}" directly with
        # "CSV;", producing the invalid token 'EMPTYASNULLCSV;' whenever
        # emptyasnull was set; a separating space is required.
        stmt = ("copy {qualname} {columns} from '{keypath}' "
                "credentials 'aws_access_key_id={key};"
                "aws_secret_access_key={secret}' "
                "GZIP "
                "{null_as} "
                "{emptyasnull} "
                "CSV;".format(qualname=qualname,
                              columns=columns,
                              keypath=full_keypath,
                              key=s3conn.aws_access_key_id,
                              secret=s3conn.aws_secret_access_key,
                              null_as=null_as,
                              emptyasnull=emptyasnull))
        cur.execute(stmt)
        conn.commit()
    finally:
        # Ensure the connection is released even if the COPY fails
        # (the original leaked it on error).
        conn.close()