Example #1
def _content_table(self):
    if self._content is not None:
        # Build the table from the in-memory frame, then append the
        # surrogate-key and metadata columns before it is created.
        sql_table = SQLTable(
            name=self.schema_table_name,
            pandas_sql_engine=self._sql_engine,
            frame=self._content,
            if_exists="replace",
            index=False,
            schema=DATA_FRAME_CONTENT_SCHEMA,
        )
        index_col = Column(DATA_FRAME_CONTENT_INDEX_HEADER,
                           INTEGER,
                           primary_key=True,
                           autoincrement=True)
        sql_table.table.append_column(index_col)
        metadata_col = Column(METADATA_HEADER,
                              JSONB,
                              nullable=False,
                              server_default="{}")
        sql_table.table.append_column(metadata_col)
    else:
        # Without a frame, SQLTable refers to the existing table reflected
        # from the database instead of deriving columns from a DataFrame.
        sql_table = SQLTable(
            name=self.schema_table_name,
            pandas_sql_engine=self._sql_engine,
            if_exists="replace",
            index=True,
            index_label=DATA_FRAME_CONTENT_INDEX_HEADER,
            schema=DATA_FRAME_CONTENT_SCHEMA,
        )
    return sql_table
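
A minimal usage sketch of the pattern above, assuming a local PostgreSQL engine. `SQLTable` and `SQLDatabase` are pandas-internal classes whose signatures shift between versions, and the table, column and constant names below are placeholders rather than the module's real `DATA_FRAME_CONTENT_*` values.

# Hypothetical sketch: create and fill a table with extra appended columns.
# All names and the DSN are placeholders.
import pandas as pd
from pandas.io.sql import SQLDatabase, SQLTable
from sqlalchemy import create_engine, Column, INTEGER
from sqlalchemy.dialects.postgresql import JSONB

engine = create_engine("postgresql://user:pass@localhost/db")  # placeholder DSN
frame = pd.DataFrame({"value": [1, 2, 3]})

sql_table = SQLTable(
    name="content",
    pandas_sql_engine=SQLDatabase(engine),
    frame=frame,
    if_exists="replace",
    index=False,
)
# Append the surrogate-key and metadata columns before create(), as above.
sql_table.table.append_column(
    Column("content_index", INTEGER, primary_key=True, autoincrement=True))
sql_table.table.append_column(
    Column("metadata", JSONB, nullable=False, server_default="{}"))
sql_table.create()   # CREATE TABLE includes the appended columns
sql_table.insert()   # bulk-insert the DataFrame rows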
Example #2
def to_sql(df, name, schema, con, index, if_exists, mode='default', **kwargs):
    """
    Override the default `pandas.to_sql` method to allow for insertion of
    multiple rows of data at once. This is derived from the upstream patch at
    https://github.com/pandas-dev/pandas/pull/21401, and can be deprecated
    once it is merged and released in a new version of `pandas`.
    """
    assert mode in ('default',
                    'multi'), 'unexpected `to_sql` mode {}'.format(mode)
    if mode == 'default':
        return df.to_sql(name=name,
                         schema=schema,
                         con=con,
                         index=index,
                         if_exists=if_exists,
                         **kwargs)
    else:
        nrows = len(df)
        if nrows == 0:
            return

        chunksize = kwargs.get('chunksize', nrows)
        if chunksize == 0:
            raise ValueError('chunksize argument should be non-zero')
        chunks = int(nrows / chunksize) + 1

        pd_sql = SQLDatabase(con)
        pd_table = SQLTable(name,
                            pd_sql,
                            frame=df,
                            index=index,
                            if_exists=if_exists,
                            index_label=kwargs.get('index_label'),
                            schema=schema,
                            dtype=kwargs.get('dtype'))
        pd_table.create()
        keys, data_list = pd_table.insert_data()

        with pd_sql.run_transaction() as conn:
            for i in range(chunks):
                start_i = i * chunksize
                end_i = min((i + 1) * chunksize, nrows)
                if start_i >= end_i:
                    break

                chunk_iter = zip(*[arr[start_i:end_i] for arr in data_list])
                data = [{k: v
                         for k, v in zip(keys, row)} for row in chunk_iter]
                conn.execute(pd_table.table.insert(data))  # multivalues insert
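
A hedged call-site sketch for the wrapper above; the engine URL, schema and table name are placeholders, and the function itself expects `SQLDatabase`/`SQLTable` to have been imported from `pandas.io.sql`.

import pandas as pd
from sqlalchemy import create_engine

engine = create_engine("postgresql://user:pass@localhost/db")  # placeholder DSN
df = pd.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})

# mode='multi' batches the rows into multi-value INSERTs, one execute per chunk.
to_sql(df, name="events", schema="public", con=engine,
       index=False, if_exists="replace", mode="multi", chunksize=2)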
Example #3
def gen_sql_table(self, df):
    from pandas.io.sql import SQLTable
    from sqlalchemy import Column, DateTime

    # Derive the table definition from the DataFrame, then copy it into the
    # shared metadata so extra timestamp columns can be appended.
    self.table = SQLTable(self.table_name,
                          self.pd_sql,
                          df,
                          index=False,
                          schema=self.schema).table.tometadata(self.pd_sql.meta)
    if self.update_timestamp_field and self.update_timestamp_field not in self.table.columns:
        self.table.append_column(
            Column(self.update_timestamp_field, DateTime))
    if self.insert_timestamp_field and self.insert_timestamp_field not in self.table.columns:
        self.table.append_column(
            Column(self.insert_timestamp_field, DateTime))
Example #4
def _create_table(schema: str, table_name: str, creds: SqlCreds,
                  df: pd.DataFrame, if_exists: str):
    """use pandas' own code to create the table and schema"""

    sql_db = SQLDatabase(engine=creds.engine, schema=schema)
    table = SQLTable(
        table_name,
        sql_db,
        frame=df,
        index=False,  # already set as new col earlier if index=True
        if_exists=if_exists,
        index_label=None,
        schema=schema,
        dtype=None,
    )
    table.create()
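
For illustration, a sketch of calling `_create_table`. `SqlCreds` belongs to the surrounding project, so a plain object exposing an `.engine` attribute stands in for it here, and the connection string is a placeholder.

import pandas as pd
from types import SimpleNamespace
from sqlalchemy import create_engine

creds = SimpleNamespace(engine=create_engine("mssql+pyodbc://..."))  # stand-in for SqlCreds
df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})

_create_table(schema="dbo", table_name="my_table", creds=creds,
              df=df, if_exists="replace")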
Example #5
    def to_sql_set_primary_key_and_not_null(self,
                                            frame,
                                            name,
                                            con,
                                            keys,
                                            sql_table,
                                            schema=None,
                                            if_exists='fail',
                                            index=True,
                                            index_label=None,
                                            chunksize=None,
                                            dtype=None):
        # ref: https://github.com/pandas-dev/pandas/blob/master/pandas/io/sql.py#L437
        if if_exists not in ('fail', 'replace', 'append'):
            raise ValueError(
                "'{0}' is not valid for if_exists".format(if_exists))

        # ref: https://github.com/pandas-dev/pandas/blob/master/pandas/io/sql.py#L508
        pandas_sql = SQLDatabase(con, schema=schema)

        if isinstance(frame, pd.Series):
            frame = frame.to_frame()
        elif not isinstance(frame, pd.DataFrame):
            raise NotImplementedError(
                "'frame' argument should be either a Series or a DataFrame")

        if dtype is not None:
            from sqlalchemy.types import to_instance, TypeEngine
            for col, my_type in dtype.items():
                if not isinstance(to_instance(my_type), TypeEngine):
                    raise ValueError(
                        'The type of {} is not a SQLAlchemy type '.format(col))

        table = SQLTable(name,
                         pandas_sql,
                         frame=frame,
                         index=index,
                         if_exists=if_exists,
                         index_label=index_label,
                         schema=schema,
                         keys=keys,
                         dtype=dtype)
        table.table = sql_table
        table.create()
        table.insert(chunksize)
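
The point of this helper is that the caller builds its own SQLAlchemy Table (with primary keys and NOT NULL constraints) and the helper swaps it in before create()/insert(). A hedged sketch follows, where `obj` stands for whatever class the method lives on and all names are placeholders.

import pandas as pd
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String

engine = create_engine("postgresql://user:pass@localhost/db")  # placeholder DSN
frame = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})

# Hand-built table definition carrying the constraints pandas would not add.
custom_table = Table(
    "users", MetaData(),
    Column("id", Integer, primary_key=True),
    Column("name", String(64), nullable=False),
)

obj.to_sql_set_primary_key_and_not_null(frame, "users", engine,
                                        keys="id", sql_table=custom_table,
                                        if_exists="replace", index=False)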
Example #6
def get_sa_table_for_dataframe(dataframe, tablename, schemaname):
    sa_engine = get_engine()
    # get max lengths for strings and use it to set dtypes
    dtypes = {}
    object_types = get_dataframe_column_object_types(dataframe)

    for c in object_types:
        if dataframe[c].dtype == np.dtype('O'):
            n = dataframe[c].map(lambda v: len(str(v)) if v else None).max()
            # use 10 times the longest observed value, capped at 65535 characters
            dtypes[c] = VARCHAR(min([n * 10, 65535]))

    table = SQLTable(tablename,
                     pandasSQL_builder(sa_engine, schema=schemaname),
                     dataframe,
                     if_exists='fail',  # must be 'fail', 'replace' or 'append'
                     index=False,
                     dtype=dtypes)

    return table
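
If desired, the returned SQLTable can then be created and populated; a brief, hedged follow-up assuming `get_engine()` points at the target database and the table/schema names are placeholders.

table = get_sa_table_for_dataframe(dataframe, "measurements", "public")
table.create()                   # CREATE TABLE with the sized VARCHAR columns
table.insert(chunksize=10_000)   # bulk-insert the DataFrame in chunks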
Example #7
def to_redshift(self,
                table_name,
                s3_bucket,
                s3_key,
                engine=None,
                schema=None,
                if_exists="fail",
                index=False,
                compress=True,
                primary_key=None,
                aws_access_key_id=None,
                aws_secret_access_key=None,
                **kwargs):

    if not engine:
        engine = generate_redshift_engine_string()

    if not aws_access_key_id:
        aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
    if not aws_secret_access_key:
        aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")

    # Get Pandas SQLTable object
    table = SQLTable(
        table_name,
        pandasSQL_builder(engine, schema=schema),
        self,
        if_exists=if_exists,
        schema=schema,
        index=index,
    )

    def quote(s):
        return '"' + str(s) + '"'

    # Full table name with schema
    if schema:
        full_table_name = quote(schema) + "." + quote(table_name)
    else:
        full_table_name = quote(table_name)

    # Check table
    if table.exists():
        if if_exists == "fail":
            raise ValueError("Table {} already exists.".format(table_name))
        elif if_exists == "append":
            queue = [
                CopyCommand(
                    to=table,
                    data_location="s3://{}/{}".format(s3_bucket, s3_key),
                    access_key_id=aws_access_key_id,
                    secret_access_key=aws_secret_access_key,
                    format="CSV",
                    compression="GZIP" if compress else None,
                )
            ]
        elif if_exists == "replace":
            queue = [
                "drop table {};".format(full_table_name),
                table.sql_schema() + ";",
                CopyCommand(
                    to=table,
                    data_location="s3://{}/{}".format(s3_bucket, s3_key),
                    access_key_id=aws_access_key_id,
                    secret_access_key=aws_secret_access_key,
                    format="CSV",
                    compression="GZIP" if compress else None,
                ),
            ]
        elif if_exists == "update":
            staging_table = "{}_staging".format(table_name)

            if not primary_key:
                raise ValueError(
                    "Expected a primary key to update existing table")

            queue = [
                "begin;",
                "drop table if exists {};".format(staging_table),
                "create temporary table {} (like {});".format(
                    staging_table, full_table_name),
                CopyCommand(
                    to=table,
                    data_location="s3://{}/{}".format(s3_bucket, s3_key),
                    access_key_id=aws_access_key_id,
                    secret_access_key=aws_secret_access_key,
                    format="CSV",
                    compression="GZIP" if compress else None,
                ),
                "delete from {full_table_name} where {primary_key} in (select {primary_key} from {staging_table});"
                .format(
                    full_table_name=full_table_name,
                    primary_key=primary_key,
                    staging_table=staging_table,
                ),
                "insert into {} (select * from {});".format(
                    full_table_name, staging_table),
                "end;",
            ]
        else:
            raise ValueError("{} is not valid for if_exists".format(if_exists))
    else:
        queue = [
            table.sql_schema() + ";",
            CopyCommand(
                to=table,
                data_location="s3://{}/{}".format(s3_bucket, s3_key),
                access_key_id=aws_access_key_id,
                secret_access_key=aws_secret_access_key,
                format="CSV",
                compression="GZIP" if compress else None,
            ),
        ]

    # Save DataFrame to S3
    self.to_s3(bucket=s3_bucket, key=s3_key, index=index, compress=compress)

    # Execute queued statements
    engine = _engine_builder(engine)
    with engine.begin() as con:
        for stmt in queue:
            con.execute(stmt)
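
A hypothetical call of the loader above. It is written as a DataFrame method (`self` is the frame), so it is shown bound to `df`; the bucket, key, schema and table names are placeholders, and the engine falls back to generate_redshift_engine_string() from the surrounding project.

df.to_redshift(
    table_name="sales",
    s3_bucket="my-bucket",
    s3_key="staging/sales.csv.gz",
    schema="analytics",
    if_exists="update",    # the upsert path, which requires a primary key
    primary_key="sale_id",
    compress=True,
)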
Example #8
def to_redshift(self, table_name, engine, bucket, keypath=None,
                schema=None, if_exists='fail', index=True, index_label=None,
                aws_access_key_id=None, aws_secret_access_key=None,
                columns=None, null_as=None, emptyasnull=True):
    """
    Write a DataFrame to redshift via S3

    Parameters
    =========

    table_name : str. (unqualified) name in redshift
    engine : SQLA engine
    bucket : str; s3 bucket
    keypath : str; keypath in s3 (without bucket name)
    schema : redshift schema
    if_exists : str; {'fail', 'append', 'replace'}
    index : bool; include DataFrames index
    index_label : bool; label for the index
    aws_access_key_id / aws_secret_access_key : from ~/.boto by default
    columns : subset of columns to include
    null_as : treat these as null
    emptyasnull : bool; whether '' is treated as null
    """
    url = self.to_s3(keypath, engine, bucket=bucket, index=index,
                     index_label=index_label)
    qualname = resolve_qualname(table_name, schema)
    table = SQLTable(table_name, pandasSQL_builder(engine, schema=schema),
                     self, if_exists=if_exists, index=index)
    if columns is None:
        columns = ''
    else:
        columns = '({})'.format(','.join(columns))
    print("Creating table {}".format(qualname))

    if table.exists():
        if if_exists == 'fail':
            raise ValueError("Table Exists")
        elif if_exists == 'append':
            queue = []
        elif if_exists == 'replace':
            queue = ['drop table {}'.format(qualname), table.sql_schema()]
        else:
            raise ValueError("Bad option for `if_exists`")

    else:
        queue = [table.sql_schema()]

    with engine.begin() as con:
        for stmt in queue:
            con.execute(stmt)

    s3conn = boto.connect_s3(aws_access_key_id=aws_access_key_id,
                             aws_secret_access_key=aws_secret_access_key)

    conn = psycopg2.connect(database=engine.url.database,
                            user=engine.url.username,
                            password=engine.url.password,
                            host=engine.url.host,
                            port=engine.url.port,
                            sslmode='require')
    cur = conn.cursor()
    if null_as is not None:
        null_as = "NULL AS '{}'".format(null_as)
    else:
        null_as = ''

    if emptyasnull:
        emptyasnull = "EMPTYASNULL"
    else:
        emptyasnull = ''

    full_keypath = 's3://' + url

    print("COPYing")
    stmt = ("copy {qualname} {columns} from '{keypath}' "
            "credentials 'aws_access_key_id={key};aws_secret_access_key={secret}' "
            "GZIP "
            "{null_as} "
            "{emptyasnull}"
            "CSV;".format(qualname=qualname,
                          columns=columns,
                          keypath=full_keypath,
                          key=s3conn.aws_access_key_id,
                          secret=s3conn.aws_secret_access_key,
                          null_as=null_as,
                          emptyasnull=emptyasnull))
    cur.execute(stmt)
    conn.commit()
    conn.close()
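
And a hypothetical call of this variant, again assuming the method has been attached to DataFrame and that ~/.boto (or the explicit key arguments) supplies AWS credentials; bucket, keypath and table names are placeholders.

df.to_redshift("events", engine, bucket="my-bucket", keypath="tmp/events.csv.gz",
               schema="analytics", if_exists="replace", index=False,
               null_as="@NULL@", emptyasnull=True)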
Example #9
def to_sql(name, engine, frame, chunksize=None, **kwargs):
    pandas_sql_engine = pandasSQL_builder(engine)
    table = SQLTable(name, pandas_sql_engine, frame=frame, **kwargs)
    table.create()
    table.insert(chunksize)
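
A short usage sketch of the helper above; the SQLite URL and table name are placeholders.

import pandas as pd
from sqlalchemy import create_engine
from pandas.io.sql import SQLTable, pandasSQL_builder  # as used by the helper

engine = create_engine("sqlite:///example.db")  # placeholder
to_sql("numbers", engine, pd.DataFrame({"n": range(5)}),
       index=False, if_exists="replace")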
Example #10
def to_sql(name, engine, frame, chunksize=None, **kwargs):
    # Note: SQLTable's second argument must be a pandas SQL wrapper (e.g. the
    # result of pandasSQL_builder(engine)), not a raw SQLAlchemy engine.
    table = SQLTable(name, engine, frame=frame, **kwargs)
    table.create()
    table.insert(chunksize)
Example #11
    def write(self,
              data_frame,
              routine_name,
              table_name,
              bucketname=None,
              if_exists='replace',
              sub_routine=None):
        """Write data table

        :param data_frame: dataframe
        :param routine_name: routine name
        :param table_name: table name
        :param bucketname: bucket name
        :param if_exists: method if exists
        :param sub_routine: sub routine
        :return: None
        """
        # todo this function is pretty verbose as it is, please use logger instead of print
        # todo make sure log statement is understandable for outside observer
        # todo bucketname should always be project_name, redshift should know its own project_name
        # todo when table is new, write metadata, but give an option to skip metadata

        self.bucket = bucketname
        if sub_routine is None:
            if table_name != 'meta_database':
                table_name = routine_name + '/' + table_name
            # 'meta_database' keeps its original name
        else:
            table_name = routine_name + '/' + sub_routine + '/' + table_name
        print(table_name)
        logging.info('Writing table {} :'.format(table_name))

        s3 = boto3.resource('s3')
        bucket = s3.Bucket(bucketname)

        con = psycopg2.connect(self.redshift_path)
        con.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
        cur = con.cursor()

        # write DF to string stream
        csv_buffer = StringIO()
        data_frame.to_csv(csv_buffer, index=None, header=None, sep='|')

        # reset stream position
        csv_buffer.seek(0)
        # create binary stream
        gz_buffer = BytesIO()

        # compress string stream using gzip
        with gzip.GzipFile(mode='w', fileobj=gz_buffer) as gz_file:
            gz_file.write(bytes(csv_buffer.getvalue(), 'utf-8'))

        # write stream to S3
        timestamp = datetime.datetime.strftime(datetime.datetime.now(),
                                               '%Y%m%d%H%M%S')
        bucket.put_object(Key='tmp_' + timestamp + '.gz',
                          Body=gz_buffer.getvalue())
        print('saved file ')

        # CREATE THE COPY STATEMENT TO SEND FROM S3 TO THE TABLE IN REDSHIFT
        s3_path_tmp_file = 's3://{0}/{1}'.format(bucketname,
                                                 'tmp_' + timestamp + '.gz')

        print('create table')
        table = SQLTable(table_name,
                         pandasSQL_builder(self.engine, schema=None),
                         data_frame,
                         if_exists=if_exists,
                         index=None)

        statements = []
        if table.exists():
            if if_exists == 'fail':
                raise ValueError("Table Exists")
            elif if_exists == 'append':
                statements = []
            elif if_exists == 'replace':
                statements = [
                    """ truncate "{}"; rollback; drop table "{}";""".format(
                        table_name, table_name)
                ]
            else:
                raise ValueError("Bad option for `if_exists`")
        statements.append(table.sql_schema() + ';')

        statement = """
                copy "{0}"
                from '{1}'
                delimiter '{2}'
                region 'us-east-1'
                CREDENTIALS 'aws_access_key_id={3};aws_secret_access_key={4}'
                FORMAT AS CSV NULL AS '@NULL@'
                GZIP
                TRUNCATECOLUMNS
                """.format(table_name, s3_path_tmp_file, '|',
                           'AKIAIVCDQREXD2TPPRAQ',
                           'SCemMCgkq1rUruSrIDbFdjorHthnvY6E4j8/UEfg')
        statements.append(statement)

        try:
            logging.info('execute statements')
            for stmt in statements:
                print(stmt)
                cur.execute(stmt)
                # con.commit()
            logging.info('finish execute')

        except Exception as e:
            print(e)
            traceback.print_exc(file=sys.stdout)
            con.rollback()
            raise

        s3.Object(bucketname, 'tmp_' + timestamp + '.gz').delete()
        logging.info('FILLING THE TABLE IN REDSHIFT')
        logging.info('\n--------------- write already -----------------')
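
A hypothetical call of the writer above, assuming the surrounding class supplies self.engine and self.redshift_path; the bucket and table names are placeholders.

writer.write(
    data_frame=df,
    routine_name="daily_load",
    table_name="orders",
    bucketname="project-staging-bucket",
    if_exists="replace",
)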
Example #12
def read_sql_table(engine,
                   table_name,
                   index_col=None,
                   columns=None,
                   select_from=None,
                   limit=None,
                   order_by=None,
                   where=None,
                   coerce_types=None,
                   raise_on_missing=True):
    """ Load a table from a SQL database.
    
    Parameters
    ----------
    engine : SQLAlchemy engine
        The SQL database to load from.
    
    table_name : str
        The name of the table to load.
    
    index_col : str, optional
        Column name to use as index for the returned data frame.
    
    columns : sequence of str, optional
        Columns to select from the table. By default, all columns are selected.

    select_from : str or SQLAlchemy clause, optional
        A FROM clause to use for the select statement. Defaults to the
        table name.
    
    limit : int, optional
        Limit the number of rows selected.
    
    order_by : str or SQLAlchemy clause, optional
        An ORDER BY clause to sort the selected rows.
    
    where : str or SQLAlchemy clause, optional
        A WHERE clause used to filter the selected rows.
    
    coerce_types : dict(str : dtype or Python type), optional
        Override pandas type inference for specific columns.

    raise_on_missing : bool, optional
        If True (the default), raise a ValueError when the table is not found;
        otherwise return None.

    Returns
    -------
    A pandas DataFrame.
    """
    # Pandas does not expose many of these options, so we pull out some of
    # Pandas' internals.
    #
    # An alternative approach would be to use `pandas.read_sql_query` with an
    # appropriate (dialect-specific) query. However, this approach would not
    # utilize Pandas' logic for column type inference (performed by
    # `_harmonize_columns()` below), and would hence produce inferior results.

    from sqlalchemy.schema import MetaData
    from pandas.io.sql import SQLDatabase, SQLTable

    # From pandas.io.sql.read_sql_table
    # and  pandas.io.sql.SQLDatabase.read_table:
    meta = MetaData(engine)
    try:
        meta.reflect(only=[table_name])
    except sqlalchemy.exc.InvalidRequestError:
        if raise_on_missing:
            raise ValueError("Table %s not found" % table_name)
        else:
            return None

    pd_db = SQLDatabase(engine, meta=meta)
    pd_tbl = SQLTable(table_name, pd_db, index=None)

    # Adapted from pandas.io.SQLTable.read:
    if columns is not None and len(columns) > 0:
        if index_col is not None and index_col not in columns:
            columns = [index_col] + columns

        cols = [pd_tbl.table.c[n] for n in columns]
    else:
        cols = pd_tbl.table.c

    if pd_tbl.index is not None:
        for idx in pd_tbl.index[::-1]:
            cols.insert(0, pd_tbl.table.c[idx])

    # Strip the table name from each of the column names to allow for more
    # general FROM clauses.
    sql_select = sqlalchemy.select([
        sqlalchemy.column(str(c).replace('{}.'.format(table_name), '', 1))
        for c in cols
    ])

    if select_from is not None:
        sql_select = sql_select.select_from(select_from)
    else:
        sql_select = sql_select.select_from(sqlalchemy.table(table_name))

    if where is not None:
        if isinstance(where, str):
            where = sqlalchemy.text(where)
        sql_select = sql_select.where(where)
    if limit is not None:
        sql_select = sql_select.limit(limit)
    if order_by is not None:
        if isinstance(order_by, str):
            order_by = sqlalchemy.sql.column(order_by)
        sql_select = sql_select.order_by(order_by)

    result = pd_db.execute(sql_select)
    data = result.fetchall()
    column_names = result.keys()

    pd_tbl.frame = pandas.DataFrame.from_records(data,
                                                 index=index_col,
                                                 columns=column_names)

    # This line has caused issues with incorrect type inference -- add it
    # back with caution.
    # pd_tbl._harmonize_columns()

    # Coerce column types if requested.
    if coerce_types:
        frame = pd_tbl.frame
        for col, dtype in coerce_types.items():
            frame[col] = frame[col].astype(dtype, copy=False)

    return pd_tbl.frame
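
A hedged usage example for read_sql_table; the engine URL, table and column names are placeholders.

from sqlalchemy import create_engine

engine = create_engine("postgresql://user:pass@localhost/db")  # placeholder DSN
df = read_sql_table(
    engine, "measurements",
    index_col="id",
    columns=["sensor", "value"],
    where="value > 0",
    order_by="sensor",
    limit=1000,
    coerce_types={"value": "float64"},
)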