Example #1
def pull_and_insert(**kwargs):

    #Pull query from sql directory
    query = kwargs['templates_dict']['query']

    #Initialize snql hook and pull data
    snql_hook = PostgresHook('snql')
    results = snql_hook.get_records(query)

    #Initialize staging hook and insert data to staging table
    staging_hook = PostgresHook('snql_staging')
    staging_hook.insert_rows('dim_sneakers', results)
Example #2
def create_staging():

    create_sql = """CREATE TABLE IF NOT EXISTS dim_sneakers(
                sneaker_id INT,
                sneaker_name VARCHAR,
                color VARCHAR,
                created_at TIMESTAMP,
                is_owned BOOLEAN,
                sold_at TIMESTAMP,
                trashed_at TIMESTAMP,
                given_at TIMESTAMP,
                manufacturer_name VARCHAR,
                collaborator_name VARCHAR,
                count_wears INT,
                count_cleans INT,
                count_walks INT,
                updated_at TIMESTAMP);"""

    #is_current BOOLEAN

    #Initialize staging hook
    staging_hook = PostgresHook('snql_staging')

    #Run create table query
    staging_hook.run(create_sql)

    #Run delete records query
    staging_hook.run("""DELETE FROM dim_sneakers""")
Example #3
def create_import_table(**kwargs):
    conn_id = kwargs.get('conn_id')
    pg_hook = PostgresHook(conn_id)
    sql = """
        CREATE TABLE IF NOT EXISTS trips (
            trip_id SERIAL8 PRIMARY KEY,
            city VARCHAR(16),
            duration_sec INT4,
            start_time TIMESTAMP,
            end_time TIMESTAMP,
            start_station_id FLOAT4,
            start_station_name VARCHAR(256),
            start_location_latitude FLOAT8,
            start_location_longitude FLOAT8,
            end_station_id FLOAT4,
            end_station_name VARCHAR(256),
            end_location_latitude FLOAT8,
            end_location_longitude FLOAT8,
            bike_id INT4,
            user_type VARCHAR(16),
            member_birth_year FLOAT4,
            member_gender VARCHAR(16)
            );
        """
    pg_hook.run(sql)
Example #4
def create_target():

    create_sql = """CREATE TABLE IF NOT EXISTS dim_sneakers(
                sneaker_id INT,
                sneaker_name VARCHAR,
                color VARCHAR,
                created_at TIMESTAMP,
                is_owned BOOLEAN,
                sold_at TIMESTAMP,
                trashed_at TIMESTAMP,
                given_at TIMESTAMP,
                manufacturer_name VARCHAR,
                collaborator_name VARCHAR,
                count_wears INT,
                count_cleans INT,
                count_walks INT,
                updated_at TIMESTAMP);"""

    #is_current BOOLEAN

    #Initialize target hook
    snql_hook = PostgresHook('snql')

    #Run create sql
    snql_hook.run(create_sql)
Example #5
    def execute(self, context):

        if self.dag_dependencies:
            self.check_for_dependencies()

        tables_sql = \
        """
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = '{0}'
        """.format(self.db_schema)

        hook = PostgresHook(self.db_conn_id)

        records = [record[0] for record in hook.get_records(tables_sql)]

        for record in records:
            if self.key in record:
                logging.info('Dropping: {}.{}'.format(str(self.db_schema),
                                                      str(record)))
                drop_sql = \
                """
                DROP TABLE {0}.{1}
                """.format(self.db_schema, record)

                hook.run(drop_sql)
Example #6
def get_news_meta(**kwargs):
    '''
    Retrieve metadata for news articles published on our set
    of sources during the time period between our last crawl
    and the current execution time.
    '''
    # 'ds' is a 'YYYY-MM-DD' string in Airflow, so parse it before doing date math
    exec_date = kwargs.get('ds')
    exec_date = datetime.strptime(exec_date, '%Y-%m-%d') if exec_date else datetime.now()
    start_date = exec_date - timedelta(hours=3)

    api_key = kwargs.get('api_key')
    api = NewsAPI(api_key)

    query_args = {
        'sources': SOURCE_LIST,
        'page_size': 100,
        'from': convert_time(start_date),
        'to': convert_time(exec_date),
        'language': 'en'
    }

    pg_hook = PostgresHook(postgres_conn_id='sumzero')
    pagination = api.paginate_articles(**query_args)

    meta_list = []  # ensure a defined return value even if pagination is empty
    for resp in pagination:
        meta_list = resp.articles
        if meta_list:
            rows = (meta_to_row(d) for d in meta_list)
            pg_hook.insert_rows('news_meta1',
                                rows=rows,
                                target_fields=('title', 'description', 'url',
                                               'published_on'),
                                commit_every=1000)

    return meta_list
Example #7
def get_rates(ds, **kwargs):
    pg_hook = PostgresHook(postgres_conn_id='rates')
    api_hook = HttpHook(http_conn_id='openexchangerates', method='GET')

    # If either of these raises an exception then we'll be notified via
    # Airflow
    resp = api_hook.run('')
    resp = json.loads(resp.content)

    # These are the only valid pairs the DB supports at the moment. Anything
    # else that turns up will be ignored.
    valid_pairs = (
        'AED', 'AFN', 'ALL', 'AMD', 'ANG', 'AOA', 'ARS',
        'AUD', 'AWG', 'AZN', 'BAM', 'BBD', 'BDT', 'BGN',
        'BHD', 'BIF', 'BMD', 'BND', 'BOB', 'BRL', 'BSD',
        'BTC', 'BTN', 'BWP', 'BYN', 'BYR', 'BZD', 'CAD',
        'CDF', 'CHF', 'CLF', 'CLP', 'CNY', 'COP', 'CRC',
        'CUC', 'CUP', 'CVE', 'CZK', 'DJF', 'DKK', 'DOP',
        'DZD', 'EEK', 'EGP', 'ERN', 'ETB', 'EUR', 'FJD',
        'FKP', 'GBP', 'GEL', 'GGP', 'GHS', 'GIP', 'GMD',
        'GNF', 'GTQ', 'GYD', 'HKD', 'HNL', 'HRK', 'HTG',
        'HUF', 'IDR', 'ILS', 'IMP', 'INR', 'IQD', 'IRR',
        'ISK', 'JEP', 'JMD', 'JOD', 'JPY', 'KES', 'KGS',
        'KHR', 'KMF', 'KPW', 'KRW', 'KWD', 'KYD', 'KZT',
        'LAK', 'LBP', 'LKR', 'LRD', 'LSL', 'LTL', 'LVL',
        'LYD', 'MAD', 'MDL', 'MGA', 'MKD', 'MMK', 'MNT',
        'MOP', 'MRO', 'MTL', 'MUR', 'MVR', 'MWK', 'MXN',
        'MYR', 'MZN', 'NAD', 'NGN', 'NIO', 'NOK', 'NPR',
        'NZD', 'OMR', 'PAB', 'PEN', 'PGK', 'PHP', 'PKR',
        'PLN', 'PYG', 'QAR', 'RON', 'RSD', 'RUB', 'RWF',
        'SAR', 'SBD', 'SCR', 'SDG', 'SEK', 'SGD', 'SHP',
        'SLL', 'SOS', 'SRD', 'STD', 'SVC', 'SYP', 'SZL',
        'THB', 'TJS', 'TMT', 'TND', 'TOP', 'TRY', 'TTD',
        'TWD', 'TZS', 'UAH', 'UGX', 'USD', 'UYU', 'UZS',
        'VEF', 'VND', 'VUV', 'WST', 'XAF', 'XAG', 'XAU',
        'XCD', 'XDR', 'XOF', 'XPD', 'XPF', 'XPT', 'YER',
        'ZAR', 'ZMK', 'ZMW', 'ZWL')

    rates_insert = """INSERT INTO rates (pair, valid_until, rate)
                      VALUES (%s, %s, %s);"""

    # If this raises an exception then we'll be notified via Airflow
    valid_until = datetime.fromtimestamp(resp['timestamp'])

    for (iso2, rate) in resp['rates'].items():
        # If converting the rate to a float fails for whatever reason then
        # just move on.
        try:
            rate = float(rate)
        except (TypeError, ValueError):
            continue

        iso2 = iso2.upper().strip()

        if iso2 not in valid_pairs or rate < 0:
            continue

        pg_hook.run(rates_insert, parameters=(iso2,
                                              valid_until,
                                              rate))
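The loop above runs one INSERT per currency pair. As a sketch only (not part of the original), the tail of get_rates could instead collect the filtered rows and write them in batches with PostgresHook.insert_rows, which other examples in this collection already use; the rates table and its (pair, valid_until, rate) columns come from the INSERT statement above:

    # Sketch only: batch the filtered rates instead of running one INSERT per pair.
    rows = []
    for (iso2, rate) in resp['rates'].items():
        try:
            rate = float(rate)
        except (TypeError, ValueError):
            continue
        iso2 = iso2.upper().strip()
        if iso2 not in valid_pairs or rate < 0:
            continue
        rows.append((iso2, valid_until, rate))

    # insert_rows commits every commit_every rows.
    pg_hook.insert_rows('rates', rows,
                        target_fields=('pair', 'valid_until', 'rate'),
                        commit_every=1000)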
Example #8
def insert_events(**kwargs):
    # airflow format 2020-10-22T00:00:00+00:00 '%Y-%m-%dT%H:%M:%S%z'
    # json file format 2020-10-22 20:36:00.178590 '%Y-%m-%d %H:%M:%S.%f'
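    # For illustration (not in the original), those formats parse as, e.g.:
    #   datetime.strptime('2020-10-22T00:00:00+00:00', '%Y-%m-%dT%H:%M:%S%z')  # ':' in the offset needs Python 3.7+
    #   datetime.strptime('2020-10-22 20:36:00.178590', '%Y-%m-%d %H:%M:%S.%f')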
    execution_time = kwargs['execution_date']
    execution_time_str = datetime.strftime(execution_time,
                                           '%Y-%m-%d %H:%M:%S.%f')
    print('execution_time: ', execution_time_str)
    next_execution_time = kwargs['next_execution_date']
    next_execution_time_str = datetime.strftime(next_execution_time,
                                                '%Y-%m-%d %H:%M:%S.%f')
    print('next_execution_time: ', next_execution_time_str)

    params = {
        'time_from': execution_time_str,
        'time_to': next_execution_time_str
    }

    events = requests.get('http://nginx:80/events', params)
    print(events.json())

    pg_hook = PostgresHook(postgres_conn_id='pg_dw')
    query = """
        INSERT INTO events_stg (event, time, unique_visitor_id, ha_user_id, browser, os, country_code)
        VALUES ('{}', '{}', NULLIF('{}', 'None'), NULLIF('{}', 'None'), NULLIF('{}', 'None'), NULLIF('{}', 'None'), NULLIF('{}', 'None'))
    """
    for event in events.json():
        clean_event = process_event(event)
        pg_hook.run(
            query.format(clean_event['event'], clean_event['time'],
                         clean_event['unique_visitor_id'],
                         clean_event['ha_user_id'], clean_event['browser'],
                         clean_event['os'], clean_event['country_code']))
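The INSERT above interpolates values straight into the SQL string with str.format, which breaks as soon as a field contains a quote. A minimal sketch of an alternative (assuming the same events_stg columns and process_event helper as above) passes the values through the parameters argument of PostgresHook.run so the driver handles quoting; the NULLIF(..., 'None') handling is kept:

    # Sketch only: parameterised version of the insert loop above.
    insert_sql = """
        INSERT INTO events_stg (event, time, unique_visitor_id, ha_user_id, browser, os, country_code)
        VALUES (%s, %s, NULLIF(%s, 'None'), NULLIF(%s, 'None'), NULLIF(%s, 'None'), NULLIF(%s, 'None'), NULLIF(%s, 'None'))
    """
    for event in events.json():
        clean_event = process_event(event)
        pg_hook.run(insert_sql, parameters=(
            clean_event['event'], clean_event['time'],
            clean_event['unique_visitor_id'], clean_event['ha_user_id'],
            clean_event['browser'], clean_event['os'],
            clean_event['country_code']))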
Example #9
    def execute(self, context):
        postgres_hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
        s3_hook = S3Hook(aws_conn_id=self.s3_conn_id)
        res = self.query_db(self.query, postgres_hook)
        res.seek(0)
        s3_hook.load_file_obj(res, key="egress/sources.airflow.csv", bucket_name="demo-bucket-temp-977338899", replace=True)
        return True
Example #10
def create_import_table(**kwargs):
    conn_id = kwargs.get('conn_id')
    pg_hook = PostgresHook(conn_id)
    commands = ("""
        CREATE TABLE IF NOT EXISTS trips (
            trip_id SERIAL8 PRIMARY KEY,
            city VARCHAR(16),
            duration_sec INT4,
            start_time TIMESTAMP,
            end_time TIMESTAMP,
            start_station_id FLOAT4,
            start_station_name VARCHAR(256),
            start_location_latitude FLOAT8,
            start_location_longitude FLOAT8,
            end_station_id FLOAT4,
            end_station_name VARCHAR(256),
            end_location_latitude FLOAT8,
            end_location_longitude FLOAT8,
            bike_id INT4,
            user_type VARCHAR(16),
            member_birth_year FLOAT4,
            member_gender VARCHAR(16)
            );
        """, """
        CREATE TABLE IF NOT EXISTS acs (
        fips VARCHAR(19) PRIMARY KEY,
        bach_edu FLOAT8,
        tot_trips FLOAT8,
        trips_u10 FLOAT8,
        trips_1014 FLOAT8,
        trips_1519 FLOAT8,
        med_age FLOAT8,
        med_age_m FLOAT8,
        med_age_f FLOAT8,
        pop_tot FLOAT8,
        race_white FLOAT8,
        race_black FLOAT8,
        asian FLOAT8,
        smartphone FLOAT8,
        car FLOAT8,
        transit FLOAT8,
        bike FLOAT8,
        walk FLOAT8,
        male_tot FLOAT8,
        fmale_tot FLOAT8,
        units_tot FLOAT8,
        owned FLOAT8,
        rented FLOAT8,
        med_hh_inc FLOAT8,
        med_rent FLOAT8
        );

        COPY acs FROM '/home/ubuntu/bikeiq/data/acs.csv' DELIMITER ',' CSV HEADER;
        
        ALTER TABLE acs ADD COLUMN bg_id VARCHAR(12);
        UPDATE acs SET bg_id=substring(fips, 8);
        """)
    for command in commands:
        pg_hook.run(command)
Example #11
def populate_target(**kwargs):

    #Pull data from staging
    staging_hook = PostgresHook('snql_staging')
    #Pull query from sql directory
    query = kwargs['templates_dict']['query']
    #Execute query
    staging_results = staging_hook.get_records(query)

    #Initialize hook to snql
    snql_hook = PostgresHook('snql')

    #Delete current rows in target table
    snql_hook.run("""DELETE FROM dim_sneakers""")

    #Insert new rows into target table
    snql_hook.insert_rows('dim_sneakers', staging_results)
Example #12
def process_customers_order_dim(**kwargs):
    conn_id = kwargs.get('conn_id')
    pg_hook = PostgresHook(conn_id)
    sql = "select distinct customer.cust_name, order_info.order_id from customer, order_info where customer.customer_id=order_info.customer_id;"

    records = pg_hook.get_records(sql)
    print(records)

    return records
Example #13
def cache_latest_rates(ds, **kwargs):
    redis_conn = redis.StrictRedis(host='redis')
    pg_hook = PostgresHook(postgres_conn_id='rates')
    latest_rates = """SELECT DISTINCT ON (pair)
                             pair, rate
                      FROM   rates
                      ORDER  BY pair, valid_until DESC;"""

    for iso2, rate in pg_hook.get_records(latest_rates):
        redis_conn.set(iso2, rate)
Example #14
def cache_latest_stocks(ds, **kwargs):
    redis_conn = redis.StrictRedis(host='redis')
    pg_hook = PostgresHook(postgres_conn_id='stocks')
    latest_stocks = """SELECT DISTINCT ON (symbol)
                             symbol, price
                      FROM   stocks
                      ORDER  BY symbol, valid_until DESC;"""

    for symbol, price in pg_hook.get_records(latest_stocks):
        redis_conn.set(symbol, price)
Example #15
    def execute(self, context):
        logging.info('Executing: ' + str(self.sql))
        if self.create_tunnel is True:
            self.create_ssh_tunnel()

        print("EXECUTING QUERY!")

        time.sleep(10)
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                                 schema=self.database)
        self.hook.run(self.sql, self.autocommit, parameters=self.parameters)
Example #16
    def execute(self, context):
        self.log.info('Executing: %s', self.sql)
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                                 schema=self.database)
        self.hook.run(self.sql, self.autocommit)
        for output in self.hook.conn.notices:
            self.log.info(output)
        logging.info('Executing: ' + self.sql)
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
        self.hook.run(self.sql, self.autocommit)
        records = self.hook.get_records(f"SELECT COUNT(*) FROM {self.table}")
        if len(records) < 1 or len(records[0]) < 1:
            raise ValueError(
                f"Data quality check failed. {self.table} returned no results")
        num_records = records[0][0]
        if num_records < 1:
            raise ValueError(
                f"Data quality check failed. {self.table} contained 0 rows")
        self.log.info(
            f'Data quality on table {self.table} check passed with {records[0][0]} records'
        )
Example #17
def insert_table(conn_id, schema, src_table, dest_table):

    pg_hook = PostgresHook(postgres_conn_id=conn_id)
    select_cmd = """SELECT relname, n_live_tup from {} where schemaname = '{}'
    and relname != '{}';""".format(src_table, schema, dest_table)
    src_conn = pg_hook.get_conn()
    src_cursor = src_conn.cursor()
    src_cursor.execute(select_cmd)
    records = src_cursor.fetchall()
    dest_conn = pg_hook.get_conn()
    dest_cursor = dest_conn.cursor()
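    # execute_values comes from psycopg2.extras; this snippet assumes
    # "from psycopg2.extras import execute_values" at module scope.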
    execute_values(dest_cursor, "INSERT INTO {} VALUES %s".format(dest_table),
                   records)
    dest_conn.commit()
Example #18
def get_event_status(**kwargs):
    cur = PostgresHook('airflow_db').get_cursor()

    sql = """SELECT
              task_id,
              dag_id,
              to_Char(dttm, 'HH24:MI:SS')
             FROM public.log
             WHERE owner = 'airflow' 
                   AND event = 'failed'
                   AND dttm >= current_date - interval '5 minutes'"""

    cur.execute(sql)
    result = cur.fetchall()
    return result
Example #19
def get_news_articles(**kwargs):
    '''
    Python callable for article scraping task.
    This will get executed after news metadata has been
    collected and validated for the current execution period.

    Target URLs will be retrieved from Postgres for articles
    published after the last run date of this task. Articles
    will be written to file.
    '''
    # TODO:
    # Decide how to store articles in FS (method and schema)
    # Decide how to name log files
    # Decide whether to implement compression task

    pg_hook = PostgresHook(postgres_conn_id='sumzero')
    ds = kwargs.get('ds') or datetime.now().isoformat()
    '''
    query_results = pg_hook.get_records(

        SELECT URL FROM news_meta
            WHERE published_on > %s;
        ,
        parameters=[ds]
    )
    urls = [record[0] for record in query_results]
    '''
    urls = kwargs.get('urls')

    # Download articles and write them to a csv.
    time_str = datetime.now().isoformat()
    log_fname = 'test_{}.csv'.format(time_str)
    failed_log_fname = 'failed_test_{}.csv'.format(time_str)

    with open(log_fname, 'w+') as log_file, \
            open(failed_log_fname, 'w+') as failed_log:
        log_writer = csv.writer(log_file, delimiter=',', quotechar='|')
        failed_writer = csv.writer(failed_log, delimiter=',', quotechar='|')

        for url in urls:
            loader = Article(url)
            try:
                loader.download()
                loader.parse()
                log_writer.writerow((time_str, loader.text))
            except Exception as error:
                failed_writer.writerow((time_str, str(error), url))
Example #20
def load_data(ds, **kwargs):
    """
	Processes the json data, checks the types and enters into the
	Postgres database.
	"""

    pg_hook = PostgresHook(postgres_conn_id='weather_id')

    file_name = str(datetime.now().date()) + '.json'
    tot_name = os.path.join(os.path.dirname(__file__), 'src/data', file_name)

    # open the json datafile and read it in
    with open(tot_name, 'r') as inputfile:
        doc = json.load(inputfile)

    # transform the data to the correct types and convert temp to celsius
    city = str(doc['name'])
    country = str(doc['sys']['country'])
    lat = float(doc['coord']['lat'])
    lon = float(doc['coord']['lon'])
    humid = float(doc['main']['humidity'])
    press = float(doc['main']['pressure'])
    min_temp = float(doc['main']['temp_min']) - 273.15
    max_temp = float(doc['main']['temp_max']) - 273.15
    temp = float(doc['main']['temp']) - 273.15
    weather = str(doc['weather'][0]['description'])
    todays_date = datetime.now().date()

    # check for nan's in the numeric values and then enter into the database
    valid_data = not np.isnan(
        [lat, lon, humid, press, min_temp, max_temp, temp]).any()

    row = (city, country, lat, lon, todays_date, humid, press, min_temp,
           max_temp, temp, weather)

    insert_cmd = """INSERT INTO weather_table 
					(city, country, latitude, longitude,
					todays_date, humidity, pressure, 
					min_temp, max_temp, temp, weather)
					VALUES
					(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"""

    if valid_data is True:
        pg_hook.run(insert_cmd, parameters=row)
Example #21
def create_table(conn_id, dest_table):

    pg_hook = PostgresHook(postgres_conn_id=conn_id)
    delete_cmd = """
                DROP TABLE IF EXISTS {};
                 """.format(dest_table)
    create_cmd = """
                 CREATE table {} (
                 table_name VARCHAR(50),
                 row_count INTEGER
                 )
                 """.format(dest_table)
    dest_conn = pg_hook.get_conn()
    delete_cursor = dest_conn.cursor()
    delete_cursor.execute(delete_cmd)
    dest_cursor = dest_conn.cursor()
    dest_cursor.execute(create_cmd)
    dest_conn.commit()
Example #22
def api_json_to_db(**kwargs):
    base_dir = Variable.get("tfl_park_base_dir")

    json_files = [
        f for f in listdir(base_dir) if isfile(join(base_dir, f))
        if '.json' in f
    ]

    db_tuples = []

    for current_file in json_files:
        time_pattern = re.compile('[0-9]+')
        pattern_match = time_pattern.match(current_file)

        if pattern_match is not None:
            timestamp = pattern_match.group()

            with open(join(base_dir, current_file), 'r') as f:
                car_parks = json.loads(f.read())

            for car_park in car_parks:
                park_id = car_park['id']
                park_name = car_park['name']

                for bay in car_park['bays']:
                    park_bay_type = bay['bayType']
                    park_bay_occupied = bay['occupied']
                    park_bay_free = bay['free']
                    park_bay_count = bay['bayCount']

                    db_tuples.append(
                        (timestamp, park_id, park_name, park_bay_type,
                         park_bay_occupied, park_bay_free, park_bay_count))

    target_fields = [
        't_stamp', 'park_id', 'park_name', 'park_bay_type',
        'park_bay_occupied', 'park_bay_free', 'park_bay_count'
    ]

    db_hook = PostgresHook(postgres_conn_id='tfl_db')
    db_hook.insert_rows('tfl_data', db_tuples, target_fields)

    for current_file in json_files:
        remove(join(base_dir, current_file))
Example #23
    def execute(self, context):
        self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        conn = self.hook.get_conn()
        log.info("Connected with " + self.redshift_conn_id)

        table = self.table
        s3_path = self.s3_path
        s3_role = self.s3_role
        s3_region = self.s3_region
        delimiter = self.delimiter
        encoding = self.encoding

        copy_statement = f"""
        COPY {table}
        FROM '{s3_path}'
        IAM_ROLE '{s3_role}'
        REGION '{s3_region}'
        DELIMITER '{delimiter}'
        ENCODING '{encoding}'
        """
        cursor = conn.cursor()
        cursor.execute(copy_statement)
        cursor.close()

        load_datetime = self.load_datetime
        record_source = self.record_source

        log.info("Loaded completed with load_datetime " + load_datetime)

        meta_statement = f"""
        UPDATE {table}
        SET LOAD_DATETIME = '{load_datetime}',
        RECORD_SOURCE = '{record_source}'
        WHERE LOAD_DATETIME IS NULL
        AND RECORD_SOURCE IS NULL
        """
        cursor = conn.cursor()
        cursor.execute(meta_statement)
        cursor.close()
        log.info("Metadata injection completed with load_datetime " +
                 load_datetime)

        conn.commit()
Example #24
def monthly_spent(TaskName, ds, prev_ds, **kwargs):
    facebook = Variable.get("facebook_auth", deserialize_json=True)
    access_token = facebook["access_token"]
    ad_account_id = facebook["ad_account_id"]
    FacebookAdsApi.init(access_token=access_token)

    fromDate = prev_ds
    toDate = (datetime.strptime(ds, '%Y-%m-%d') + timedelta(days=-1)).strftime('%Y-%m-%d')

    fields = ['spend', 'account_id']
    params = {'level': 'account', 'time_range': {'since': fromDate, 'until': toDate}}
    spendrows = AdAccount(ad_account_id).get_insights(fields=fields, params=params)

    total = 0.0
    for row in spendrows:
        total = total + float(row["spend"])
    row = (total, ds)

    connDB = PostgresHook(postgres_conn_id='airflow')
    FB_json_conn = BaseHook.get_connection('FB_json_conn').extra_dejson
    scommand = "insert into monthly_cost values( %s ,%s) "
    connDB.run(scommand, parameters=row)
    print("done, Cost from " + prev_ds + " to " + ds)
Example #25
def get_stocks(ds, **context):
    symbol = context['params']['symbol']

    pg_hook = PostgresHook(postgres_conn_id='stocks')
    api_hook = HttpHook(http_conn_id='alphavantage', method='GET')

    # If either of these raises an exception then we'll be notified via
    # Airflow
    # The response is parsed as JSON below, so do not request datatype=csv
    resp = api_hook.run(f'query?function=TIME_SERIES_DAILY_ADJUSTED&symbol={symbol}&apikey=537201H9R203WT4C')
    resp = json.loads(resp.content)

    # These are the only valid stocks the DB supports at the moment. Anything
    # else that turns up will be ignored.

    stocks_insert = """INSERT INTO stocks (symbol, valid_until, price)
                      VALUES (%s, %s, %s);"""

    # If this raises an exception then we'll be notified via Airflow
    valid_until = datetime.fromtimestamp(resp['timestamp'])

    for iso2, price in resp['stocks'].items():
        # If converting the price to a float fails for whatever reason then
        # just move on.
        try:
            price = float(price)
        except (TypeError, ValueError):
            continue

        iso2 = iso2.upper().strip()

        if iso2 not in stocks or price < 0:
            continue

        pg_hook.run(stocks_insert, parameters=(iso2,
                                               valid_until,
                                               price))
Example #26
    def execute(self, context):
        logging.info('Executing: ' + str(self.sql))
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
        self.hook.run(self.sql, self.autocommit, parameters=self.parameters)
Example #27
def create_tables():
    conn = PostgresHook(postgres_conn_id='sumzero')
    conn.run(CREATE_TABLE_QUERY)
Example #28
def create_tres(**kwargs):
    conn_id = kwargs.get('conn_id')
    pg_hook = PostgresHook(conn_id)
    sql = """CREATE TABLE IF NOT EXISTS tres (id_id INT8 PRIMARY KEY) DISTRIBUTE BY HASH(id_id);"""
    pg_hook.run(sql)
Example #29
def delete_from_staging():

    staging_hook = PostgresHook('snql_staging')
    staging_hook.run('DELETE FROM dim_sneakers;')
Example #30
def ETL_run(**kwargs):
    conn_id = kwargs.get('conn_id')
    pg_hook = PostgresHook(conn_id)

    # get s3 iterators
    def get_matching_s3_objects(bucket, prefix='', suffix=''):
        """
        Generate objects in an S3 bucket.

        :param bucket: Name of the S3 bucket.
        :param prefix: Only fetch objects whose key starts with
            this prefix (optional).
        :param suffix: Only fetch objects whose keys end with
            this suffix (optional).
        """
        s3 = boto3.client('s3')
        kwargs = {'Bucket': bucket}

        # If the prefix is a single string (not a tuple of strings), we can
        # do the filtering directly in the S3 API.
        if isinstance(prefix, str):
            kwargs['Prefix'] = prefix

        while True:

            # The S3 API response is a large blob of metadata.
            # 'Contents' contains information about the listed objects.
            resp = s3.list_objects_v2(**kwargs)

            try:
                contents = resp['Contents']
            except KeyError:
                return

            for obj in contents:
                key = obj['Key']
                if key.startswith(prefix) and key.endswith(suffix):
                    yield obj

            # The S3 API is paginated, returning up to 1000 keys at a time.
            # Pass the continuation token into the next response, until we
            # reach the final page (when this field is missing).
            try:
                kwargs['ContinuationToken'] = resp['NextContinuationToken']
            except KeyError:
                break

    def get_matching_s3_keys(bucket, prefix='', suffix=''):
        """
        Generate the keys in an S3 bucket.

        :param bucket: Name of the S3 bucket.
        :param prefix: Only fetch keys that start with this prefix (optional).
        :param suffix: Only fetch keys that end with this suffix (optional).
        """
        for obj in get_matching_s3_objects(bucket, prefix, suffix):
            yield obj['Key']

    s3_bucket = "s3://sharedbikedata/"

    # load cities
    city_lookup = {
        "SanFrancisco/": "San Francisco",
        "NewYork/": "New York",
        "Boston/": "Boston",
        "Washington/": "Washington",
        "Chicago/": "Chicago"
    }

    def load_city(prefix):
        commands = ("""
            COPY trips (city,
                            duration_sec,
                            start_time,
                            end_time,
                            start_station_id,
                            start_station_name,
                            start_location_latitude,
                            start_location_longitude,
                            end_station_id,
                            end_station_name,
                            end_location_latitude,
                            end_location_longitude,
                            bike_id,
                            user_type,
                            member_birth_year,
                            member_gender) 
            FROM '/tmp/city_trip_data.csv' DELIMITER ',' CSV HEADER;
            """, """
            ALTER TABLE trips ADD COLUMN start_station_point geometry(Point, 4269);
            ALTER TABLE trips ADD COLUMN end_station_point geometry(Point, 4269);
            UPDATE trips SET start_station_point=st_SetSrid(st_MakePoint(start_location_longitude, start_location_latitude), 4269);
            UPDATE trips SET end_station_point=st_SetSrid(st_MakePoint(end_location_longitude, end_location_latitude), 4269);
            """, """
            ALTER TABLE trips ADD COLUMN start_fips VARCHAR(12);
            ALTER TABLE trips ADD COLUMN end_fips VARCHAR(12);

            UPDATE trips SET start_fips=fips.bg_id
            FROM
            (SELECT trips.trip_id, bg.bg_id
            FROM trips, bg
            WHERE ST_Contains(bg.the_geom,trips.start_station_point)
            ) AS fips
            WHERE trips.trip_id = fips.trip_id;
            
            UPDATE trips SET end_fips=fips.bg_id
            FROM
            (SELECT trips.trip_id, bg.bg_id
            FROM trips, bg
            WHERE ST_Contains(bg.the_geom,trips.end_station_point)
            ) AS fips
            WHERE trips.trip_id = fips.trip_id;
            """)

        #iterate over files (time, city)
        for key in get_matching_s3_keys(bucket='sharedbikedata',
                                        prefix=prefix,
                                        suffix=('.zip', '.csv')):
            print(key)
            s3_path = [s3_bucket, key]
            df = pd.read_csv("".join(s3_path))  # get dataset from s3 to pd.df
            df.columns = df.columns.str.strip().str.lower().str.replace(
                ' ', '_').str.replace('(', '').str.replace(')', '')

            df.insert(loc=0, column='city', value=city_lookup[prefix])
            if "bike_share_for_all_trip" in df:
                df = df.drop(["bike_share_for_all_trip"], axis=1)

            if "NewYork" in key:
                df.birth_year = pd.to_numeric(df.birth_year, errors='coerce')
            if "Boston" in key:
                df.birth_year = pd.to_numeric(df.birth_year, errors='coerce')
                df.end_station_id = pd.to_numeric(df.end_station_id,
                                                  errors='coerce')
                df.end_station_latitude = pd.to_numeric(
                    df.end_station_latitude, errors='coerce')
                df.end_station_longitude = pd.to_numeric(
                    df.end_station_longitude, errors='coerce')

            #Streaming the data directly to the remote server would reduce IO and network cost
            df.to_csv("/tmp/city_trip_data.csv", index=False)
            os.system("scp /tmp/city_trip_data.csv [email protected]:/tmp/")

            for command in commands:
                pg_hook.run(command)

            break

    load_city(prefix="SanFrancisco/")