예제 #1
0
    def to_redshift(self, table_name, username=None, password=None, host=None,
                    db=None, port=None, **copy_args):
        """
        Write a table to a Redshift database. Note, this requires you to pass
        AWS S3 credentials or store them as environmental variables.

        Args:
            table_name: str
                The destination table name (e.g. ``my_schema.my_table``).
            username: str
                Required if env variable ``REDSHIFT_USERNAME`` not populated
            password: str
                Required if env variable ``REDSHIFT_PASSWORD`` not populated
            host: str
                Required if env variable ``REDSHIFT_HOST`` not populated
            db: str
                Required if env variable ``REDSHIFT_DB`` not populated
            port: int
                Required if env variable ``REDSHIFT_PORT`` not populated. Port 5439 is typical.
            \**copy_args: kwargs
                See :func:`~parsons.databases.Redshift.copy` for options.

        Returns:
            ``None``
        """  # noqa: W605

        # Deferred import: parsons is only needed when this method is called.
        from parsons import Redshift
        rs = Redshift(
            username=username, password=password, host=host, db=db, port=port)
        rs.copy(self, table_name, **copy_args)
예제 #2
0
    def from_redshift(cls, query, username=None, password=None, host=None,
                      db=None, port=None):
        """
        Build a ``parsons table`` from the results of a Redshift query.

        To pull an entire Redshift table, use a query like ``SELECT * FROM tablename``.

        `Args:`
            query: str
                A valid SQL statement
            username: str
                Required if env variable ``REDSHIFT_USERNAME`` not populated
            password: str
                Required if env variable ``REDSHIFT_PASSWORD`` not populated
            host: str
                Required if env variable ``REDSHIFT_HOST`` not populated
            db: str
                Required if env variable ``REDSHIFT_DB`` not populated
            port: int
                Required if env variable ``REDSHIFT_PORT`` not populated. Port 5439 is typical.

        `Returns:`
            Parsons Table
                See :ref:`parsons-table` for output options.
        """

        # Deferred import: parsons is only needed when this method is called.
        from parsons import Redshift

        connection = Redshift(
            username=username,
            password=password,
            host=host,
            db=db,
            port=port,
        )
        return connection.query(query)
예제 #3
0
    def setUp(self):
        """Provision a scratch schema (table + view) and shared test fixtures."""

        self.temp_schema = TEMP_SCHEMA

        self.rs = Redshift()

        # Small sample table used by the individual tests.
        self.tbl = Table([
            ['ID', 'Name'],
            [1, 'Jim'],
            [2, 'John'],
            [3, 'Sarah'],
        ])

        # Rebuild the scratch schema from a clean slate, then add a table
        # and a view on top of it.
        schema_sql = f"""
                    drop schema if exists {self.temp_schema} cascade;
                    create schema {self.temp_schema};
                    """

        object_sql = f"""
                    create table {self.temp_schema}.test (id smallint,name varchar(5));
                    create view {self.temp_schema}.test_view as (select * from {self.temp_schema}.test);
                    """ # noqa: E501

        self.rs.query(schema_sql)
        self.rs.query(object_sql)

        # S3 staging location used by the copy-based tests.
        self.s3 = S3()
        self.temp_s3_bucket = os.environ['S3_TEMP_BUCKET']
        self.temp_s3_prefix = 'test/'
예제 #4
0
    def setUp(self):
        """Build an offline Redshift connector plus sample tables and their
        inferred data-type mappings (with and without bool parsing)."""

        # Placeholder credentials: generate_data_types runs locally, so no
        # live database connection is ever made here.
        self.rs = Redshift(username='******', password='******', host='test', db='test', port=123)

        # Minimal two-column table.
        self.tbl = Table([['ID', 'Name'],
                          [1, 'Jim'],
                          [2, 'John'],
                          [3, 'Sarah']])

        # Wider table mixing strings, blanks, ints and floats per column to
        # exercise type widening during inference.
        self.tbl2 = Table([
            ["c1", "c2", "c3", "c4", "c5", "c6", "c7"],
            ["a", "", 1, "NA", 1.4, 1, 2],
            ["b", "", 2, "NA", 1.4, 1, 2],
            ["c", "", 3.4, "NA", "", "", "a"],
            ["d", "", 5, "NA", 1.4, 1, 2],
            ["e", "", 6, "NA", 1.4, 1, 2],
            ["f", "", 7.8, "NA", 1.4, 1, 2],
            ["g", "", 9, "NA", 1.4, 1, 2],
        ])

        # NOTE: order matters below — DO_PARSE_BOOLS is toggled between calls
        # so mapping2 is generated with bool parsing ON and mapping3 with it OFF.
        self.mapping = self.rs.generate_data_types(self.tbl)
        self.rs.DO_PARSE_BOOLS = True
        self.mapping2 = self.rs.generate_data_types(self.tbl2)
        self.rs.DO_PARSE_BOOLS = False
        self.mapping3 = self.rs.generate_data_types(self.tbl2)
예제 #5
0
def main():
    """Merge the stage-2 GVA CSVs into one date-sorted CSV and load it to Redshift.

    Reads every file matching ``stage2.*.csv`` in the working directory,
    sorts incidents by date, writes ``stage3.csv``, and copies the merged
    table to ``cjaf_gvp.gva_2019_data`` (dropping any existing table).
    """
    setup_environment()

    # Create an instance of S3 and Redshift. The S3 connector is kept even
    # though it is unused below, to preserve the original script's behavior
    # (its constructor reads AWS credentials from the environment).
    s3 = S3()
    rs = Redshift()

    STAGE2_GLOB = 'stage2.*.csv'

    # Force the district/count columns to float64 so files where they are
    # blank (NaN) concatenate cleanly with files where they are numeric.
    SCHEMA = {
        'congressional_district': np.float64,
        'state_house_district': np.float64,
        'state_senate_district': np.float64,
        'n_guns_involved': np.float64,
    }

    def load_csv(csv_fname):
        # Load one stage-2 file with the shared schema and parsed dates.
        return pd.read_csv(csv_fname,
                           dtype=SCHEMA,
                           parse_dates=['date'],
                           encoding='utf-8')

    def inner_sort(dfs):
        # Sort each dataframe by ascending date, in place.
        for df in dfs:
            assert all(~df['date'].isna())
            df.sort_values('date', inplace=True)

    def outer_sort(dfs):
        # If the first incident in one file took place earlier than the first incident in another,
        # we assume all incidents in the former took place earlier than all incidents in the latter.
        dfs.sort(key=lambda df: df.loc[0].date)

    # BUG FIX: this pipeline was previously wrapped in a nested ``def main():``
    # that was never called, so ``dfs`` was undefined when ``pd.concat`` ran.
    # Sort the dataframes by ascending date, then sort by ascending date
    # *within* each dataframe, then merge into 1 giant CSV.
    dfs = [load_csv(fname) for fname in glob(STAGE2_GLOB)]
    inner_sort(dfs)
    outer_sort(dfs)

    giant_df = pd.concat(dfs, ignore_index=True)
    giant_df.to_csv('stage3.csv',
                    index=False,
                    float_format='%g',
                    encoding='utf-8')

    # Convert dataframe to a parsons table
    final_table = Table.from_dataframe(giant_df)

    # Push table to Redshift
    final_table.to_redshift('cjaf_gvp.gva_2019_data', if_exists='drop')

    logger.info("Successfully created GVA 2019 Data Table")
예제 #6
0
    def setUp(self):
        """Build an offline Redshift connector, a sample table, and its type map."""

        # Placeholder credentials — nothing here touches a live database.
        self.rs = Redshift(
            username='******', password='******', host='test', db='test', port=123)

        self.tbl = Table([
            ['ID', 'Name'],
            [1, 'Jim'],
            [2, 'John'],
            [3, 'Sarah'],
        ])

        # Inferred column data types for the sample table.
        self.mapping = self.rs.generate_data_types(self.tbl)
예제 #7
0
    def setUp(self):
        """Stand up a scratch schema, seed a source table, and build a DBSync."""

        self.temp_schema = TEMP_SCHEMA
        self.db = Redshift()

        # Rebuild the scratch schema from a clean slate.
        schema_ddl = f"""
                     DROP SCHEMA IF EXISTS {self.temp_schema} CASCADE;
                     CREATE SCHEMA {self.temp_schema};
                     """
        self.db.query(schema_ddl)

        # Load dummy data from CSV fixtures into parsons tables.
        self.table1 = Table.from_csv(f'{_dir}/test_data/sample_table_1.csv')
        self.table2 = Table.from_csv(f'{_dir}/test_data/sample_table_2.csv')

        # Seed the source table that the sync tests copy from.
        self.db.copy(self.table1, f'{self.temp_schema}.source')

        # Sync object pointing the database at itself (source == destination).
        self.db_sync = DBSync(self.db, self.db)
예제 #8
0
import os
import logging
from parsons import Redshift, S3, utilities

# Redshift setup - this assumes a Civis Platform parameter called "REDSHIFT"
#
# BUG FIX: ``set_env_var`` takes (name, value) — see the AWS calls below —
# but these calls previously passed only a value, dropping the variable name.
# The Civis credential variables (``REDSHIFT_CREDENTIAL_*``) are mapped onto
# the names the parsons Redshift connector reads (``REDSHIFT_USERNAME`` /
# ``REDSHIFT_PASSWORD``). NOTE(review): confirm this mapping against the
# platform's parameter naming.

set_env_var('REDSHIFT_PORT', os.environ['REDSHIFT_PORT'])
set_env_var('REDSHIFT_DB', os.environ['REDSHIFT_DB'])
set_env_var('REDSHIFT_HOST', os.environ['REDSHIFT_HOST'])
set_env_var('REDSHIFT_USERNAME', os.environ['REDSHIFT_CREDENTIAL_USERNAME'])
set_env_var('REDSHIFT_PASSWORD', os.environ['REDSHIFT_CREDENTIAL_PASSWORD'])
rs = Redshift()

# AWS setup - this assumes a Civis Platform parameter called "AWS"

set_env_var('S3_TEMP_BUCKET', 'parsons-tmc')
set_env_var('AWS_ACCESS_KEY_ID', os.environ['AWS_ACCESS_KEY_ID'])
set_env_var('AWS_SECRET_ACCESS_KEY', os.environ['AWS_SECRET_ACCESS_KEY'])
s3 = S3()

# Logging: plain "<LEVEL> <message>" lines to stderr at INFO and above.

logger = logging.getLogger(__name__)
_handler = logging.StreamHandler()
_formatter = logging.Formatter('%(levelname)s %(message)s')
_handler.setFormatter(_formatter)
logger.addHandler(_handler)
logger.setLevel('INFO')

# Script-level configuration pulled from the environment.
bucket = os.environ['BUCKET']
schema = os.environ['SCHEMA']
예제 #9
0
}

### CODE

from parsons import Table, Redshift, VAN
from parsons import logger
import os

# Setup

# Promote any values hard-coded in this script into the environment,
# skipping blanks so existing environment variables are not clobbered
# with empty strings.
for name, value in config_vars.items():
    if value.strip() != "":
        os.environ[name] = value

# Just create Redshift() - VAN connector is created dynamically below.
rs = Redshift()

# Create dictionary of VAN states and API keys from multiline Civis credential

myv_states = {}
for row in os.environ['VAN_PASSWORD'].split("\r\n"):
    # Each line is "<state>,<api_key>"; extra fields beyond the second
    # are intentionally ignored, matching the original indexing.
    myv_states[row.split(",")[0]] = row.split(",")[1]

# One VAN connector per state, all pointing at the same VAN database.
myv_keys = {}
for state, api_key in myv_states.items():
    myv_keys[state] = VAN(api_key=api_key, db=os.environ['VAN_DB_NAME'])

# Create simple set of states for insertion into SQL
states = "','".join(myv_keys)