Exemplo n.º 1
0
def to_redshift(self,
                table_name,
                s3_bucket,
                s3_key,
                engine=None,
                schema=None,
                if_exists="fail",
                index=False,
                compress=True,
                primary_key=None,
                aws_access_key_id=None,
                aws_secret_access_key=None,
                **kwargs):

    if not engine:
        engine = generate_redshift_engine_string()

    if not aws_access_key_id:
        aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
    if not aws_secret_access_key:
        aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")

    # Get Pandas SQLTable object
    table = SQLTable(
        table_name,
        pandasSQL_builder(engine, schema=schema),
        self,
        if_exists=if_exists,
        schema=schema,
        index=index,
    )

    def quote(s):
        return '"' + str(s) + '"'

    # Full table name with schema
    if schema:
        full_table_name = quote(schema) + "." + quote(table_name)
    else:
        full_table_name = quote(table_name)

    # Check table
    if table.exists():
        if if_exists == "fail":
            raise ValueError("Table {} already exists.".format(table_name))
        elif if_exists == "append":
            queue = [
                CopyCommand(
                    to=table,
                    data_location="s3://{}/{}".format(s3_bucket, s3_key),
                    access_key_id=aws_access_key_id,
                    secret_access_key=aws_secret_access_key,
                    format="CSV",
                    compression="GZIP" if compress else None,
                )
            ]
        elif if_exists == "replace":
            queue = [
                "drop table {};".format(full_table_name),
                table.sql_schema() + ";",
                CopyCommand(
                    to=table,
                    data_location="s3://{}/{}".format(s3_bucket, s3_key),
                    access_key_id=aws_access_key_id,
                    secret_access_key=aws_secret_access_key,
                    format="CSV",
                    compression="GZIP" if compress else None,
                ),
            ]
        elif if_exists == "update":
            staging_table = "{}_staging".format(table_name)

            if not primary_key:
                raise ValueError(
                    "Expected a primary key to update existing table")

            queue = [
                "begin;",
                "drop table if exists {};".format(staging_table),
                "create temporary table {} (like {});".format(
                    staging_table, full_table_name),
                CopyCommand(
                    to=table,
                    data_location="s3://{}/{}".format(s3_bucket, s3_key),
                    access_key_id=aws_access_key_id,
                    secret_access_key=aws_secret_access_key,
                    format="CSV",
                    compression="GZIP" if compress else None,
                ),
                "delete from {full_table_name} where {primary_key} in (select {primary_key} from {staging_table});"
                .format(
                    full_table_name=full_table_name,
                    primary_key=primary_key,
                    staging_table=staging_table,
                ),
                "insert into {} (select * from {});".format(
                    full_table_name, staging_table),
                "end;",
            ]
        else:
            raise ValueError("{} is not valid for if_exists".format(if_exists))
    else:
        queue = [
            table.sql_schema() + ";",
            CopyCommand(
                to=table,
                data_location="s3://{}/{}".format(s3_bucket, s3_key),
                access_key_id=aws_access_key_id,
                secret_access_key=aws_secret_access_key,
                format="CSV",
                compression="GZIP" if compress else None,
            ),
        ]

    # Save DataFrame to S3
    self.to_s3(bucket=s3_bucket, key=s3_key, index=index, compress=compress)

    # Execute queued statements
    engine = _engine_builder(engine)
    with engine.begin() as con:
        for stmt in queue:
            con.execute(stmt)
Exemplo n.º 2
0
    def write(self,
              data_frame,
              routine_name,
              table_name,
              bucketname=None,
              if_exists='replace',
              sub_routine=None):
        """Write data table

        :param data_frame: dataframe
        :param routine_name: routine name
        :param table_name: table name
        :param bucketname: bucket name
        :param if_exists: method if exists
        :param sub_routine: sub routine
        :return: None
        """
        # todo this function is pretty verbose as it is, please use logger instead of print
        # todo make sure log statement is understandable for outside observer
        # todo bucketname should always be project_name, redshift should know its own project_name
        # todo when table is new, write metadata, but give an option to skip metadata

        self.bucket = bucketname
        if (table_name != 'meta_database') & (sub_routine is None):
            table_name = routine_name + '/' + table_name
        elif (table_name == 'meta_database') & (sub_routine is None):
            table_name = table_name
        else:
            table_name = routine_name + '/' + sub_routine + '/' + table_name
        print(table_name)
        logging.info('Writing table {} :'.format(table_name))

        s3 = boto3.resource('s3')
        bucket = s3.Bucket(bucketname)

        con = psycopg2.connect(self.redshift_path)
        con.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
        cur = con.cursor()

        # write DF to string stream
        csv_buffer = StringIO()
        data_frame.to_csv(csv_buffer, index=None, header=None, sep='|')

        # reset stream position
        csv_buffer.seek(0)
        # create binary stream
        gz_buffer = BytesIO()

        # compress string stream using gzip
        with gzip.GzipFile(mode='w', fileobj=gz_buffer) as gz_file:
            gz_file.write(bytes(csv_buffer.getvalue(), 'utf-8'))

        # write stream to S3
        timestamp = datetime.datetime.strftime(datetime.datetime.now(),
                                               '%Y%m%d%H%M%S')
        bucket.put_object(Key='tmp_' + timestamp + '.gz',
                          Body=gz_buffer.getvalue())
        print('saved file ')

        # CREATE THE COPY STATEMENT TO SEND FROM S3 TO THE TABLE IN REDSHIFT
        s3_path_tmp_file = 's3://{0}/{1}'.format(bucketname,
                                                 'tmp_' + timestamp + '.gz')

        print('create table')
        table = SQLTable(table_name,
                         pandasSQL_builder(self.engine, schema=None),
                         data_frame,
                         if_exists=if_exists,
                         index=None)

        statements = []
        if table.exists():
            if if_exists == 'fail':
                raise ValueError("Table Exists")
            elif if_exists == 'append':
                statements = []
            elif if_exists == 'replace':
                statements = [
                    """ truncate "{}"; rollback; drop table "{}";""".format(
                        table_name, table_name)
                ]
            else:
                raise ValueError("Bad option for `if_exists`")
        statements.append(table.sql_schema() + ';')

        statement = """
                copy "{0}"
                from '{1}'
                delimiter '{2}'
                region 'us-east-1'
                CREDENTIALS 'aws_access_key_id={3};aws_secret_access_key={4}'
                FORMAT AS CSV NULL AS '@NULL@'
                GZIP
                TRUNCATECOLUMNS
                """.format(table_name, s3_path_tmp_file, '|',
                           'AKIAIVCDQREXD2TPPRAQ',
                           'SCemMCgkq1rUruSrIDbFdjorHthnvY6E4j8/UEfg')
        statements.append(statement)

        try:
            logging.info('excucte statement')
            for stmt in statements:
                print(stmt)
                cur.execute(stmt)
                # con.commit()
            logging.info('finish execute')

        except Exception as e:
            print(e)
            traceback.print_exc(file=sys.stdout)
            con.rollback()
            raise

        s3.Object(bucketname, 'tmp_' + timestamp + '.gz').delete()
        logging.info('FILLING THE TABLE IN REDSHIFT')
        logging.info('\n--------------- write already -----------------')
Exemplo n.º 3
0
def to_redshift(self, table_name, engine, bucket, keypath=None,
                schema=None, if_exists='fail', index=True, index_label=None,
                aws_access_key_id=None, aws_secret_access_key=None,
                columns=None, null_as=None, emptyasnull=True):
    """
    Write a DataFrame to redshift via S3

    Parameters
    =========

    table_name : str. (unqualified) name in redshift
    engine : SQLA engine
    bucket : str; s3 bucket
    keypath : str; keypath in s3 (without bucket name)
    schema : redshift schema
    if_exits : str; {'fail', 'append', 'replace'}
    index : bool; include DataFrames index
    index_label : bool; label for the index
    aws_access_key_id / aws_secret_access_key : from ~/.boto by default
    columns : subset of columns to include
    null_as : treat these as null
    emptyasnull bool; whether '' is null
    """
    url = self.to_s3(keypath, engine, bucket=bucket, index=index,
                     index_label=index_label)
    qualname = resolve_qualname(table_name, schema)
    table = SQLTable(table_name, pandasSQL_builder(engine, schema=schema),
                     self, if_exists=if_exists, index=index)
    if columns is None:
        columns = ''
    else:
        columns = '()'.format(','.join(columns))
    print("Creating table {}".format(qualname))

    if table.exists():
        if if_exists == 'fail':
            raise ValueError("Table Exists")
        elif if_exists == 'append':
            queue = []
        elif if_exists == 'replace':
            queue = ['drop table {}'.format(qualname), table.sql_schema()]
        else:
            raise ValueError("Bad option for `if_exists`")

    else:
        queue = [table.sql_schema()]

    with engine.begin() as con:
        for stmt in queue:
            con.execute(stmt)

    s3conn = boto.connect_s3(aws_access_key_id=aws_access_key_id,
                             aws_secret_access_key=aws_secret_access_key)

    conn = psycopg2.connect(database=engine.url.database,
                            user=engine.url.username,
                            password=engine.url.password,
                            host=engine.url.host,
                            port=engine.url.port,
                            sslmode='require')
    cur = conn.cursor()
    if null_as is not None:
        null_as = "NULL AS '{}'".format(null_as)
    else:
        null_as = ''

    if emptyasnull:
        emptyasnull = "EMPTYASNULL"
    else:
        emptyasnull = ''

    full_keypath = 's3://' + url

    print("COPYing")
    stmt = ("copy {qualname} {columns} from '{keypath}' "
            "credentials 'aws_access_key_id={key};aws_secret_access_key={secret}' "
            "GZIP "
            "{null_as} "
            "{emptyasnull}"
            "CSV;".format(qualname=qualname,
                          columns=columns,
                          keypath=full_keypath,
                          key=s3conn.aws_access_key_id,
                          secret=s3conn.aws_secret_access_key,
                          null_as=null_as,
                          emptyasnull=emptyasnull))
    cur.execute(stmt)
    conn.commit()
    conn.close()
Exemplo n.º 4
0
def to_redshift(self, table_name, engine, bucket, keypath=None,
                schema=None, if_exists='fail', index=True, index_label=None,
                aws_access_key_id=None, aws_secret_access_key=None,
                columns=None, null_as=None, emptyasnull=True):
    """
    Write a DataFrame to redshift via S3
    Parameters
    =========
    table_name : str. (unqualified) name in redshift
    engine : SQLA engine
    bucket : str; s3 bucket
    keypath : str; keypath in s3 (without bucket name)
    schema : redshift schema
    if_exits : str; {'fail', 'append', 'replace'}
    index : bool; include DataFrames index
    index_label : bool; label for the index
    aws_access_key_id / aws_secret_access_key : from ~/.boto by default
    columns : subset of columns to include
    null_as : treat these as null
    emptyasnull bool; whether '' is null
    """
    url = self.to_s3(keypath, engine, bucket=bucket, index=index,
                     index_label=index_label)
    qualname = resolve_qualname(table_name, schema)
    table = SQLTable(table_name, pandasSQL_builder(engine, schema=schema),
                     self, if_exists=if_exists, index=index)
    if columns is None:
        columns = ''
    else:
        columns = '()'.format(','.join(columns))
    print("Creating table {}".format(qualname))

    if table.exists():
        if if_exists == 'fail':
            raise ValueError("Table Exists")
        elif if_exists == 'append':
            queue = []
        elif if_exists == 'replace':
            queue = ['drop table {}'.format(qualname), table.sql_schema()]
        else:
            raise ValueError("Bad option for `if_exists`")

    else:
        queue = [table.sql_schema()]

    with engine.begin() as con:
        for stmt in queue:
            con.execute(stmt)

    s3conn = boto.connect_s3(aws_access_key_id=aws_access_key_id,
                             aws_secret_access_key=aws_secret_access_key)

    conn = psycopg2.connect(database=engine.url.database,
                            user=engine.url.username,
                            password=engine.url.password,
                            host=engine.url.host,
                            port=engine.url.port,
                            sslmode='require')
    cur = conn.cursor()
    if null_as is not None:
        null_as = "NULL AS '{}'".format(null_as)
    else:
        null_as = ''

    if emptyasnull:
        emptyasnull = "EMPTYASNULL"
    else:
        emptyasnull = ''

    full_keypath = 's3://' + url

    print("COPYing")
    stmt = ("copy {qualname} {columns} from '{keypath}' "
            "credentials 'aws_access_key_id={key};aws_secret_access_key={secret}' "
            "GZIP "
            "{null_as} "
            "{emptyasnull}"
            "CSV;".format(qualname=qualname,
                          columns=columns,
                          keypath=full_keypath,
                          key=s3conn.aws_access_key_id,
                          secret=s3conn.aws_secret_access_key,
                          null_as=null_as,
                          emptyasnull=emptyasnull))
    cur.execute(stmt)
    conn.commit()
    conn.close()