Example #1
class PostgresOperator(BaseOperator):
    """
    Executes sql code in a specific Postgres database

    :param postgres_conn_id: reference to a specific postgres database
    :type postgres_conn_id: string
    :param sql: the sql code to be executed
    :type sql: string or string pointing to a template file. File must have
        a '.sql' extension.
    """

    __mapper_args__ = {'polymorphic_identity': 'PostgresOperator'}
    template_fields = ('sql', )
    template_ext = ('.sql', )
    ui_color = '#ededed'

    @apply_defaults
    def __init__(self,
                 sql,
                 postgres_conn_id='postgres_default',
                 autocommit=False,
                 *args,
                 **kwargs):
        super(PostgresOperator, self).__init__(*args, **kwargs)

        self.sql = sql
        self.postgres_conn_id = postgres_conn_id
        self.autocommit = autocommit

    def execute(self, context):
        logging.info('Executing: ' + self.sql)
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
        self.hook.run(self.sql, self.autocommit)
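For context, a minimal sketch of how an operator like this might be wired into a DAG; the dag_id, task_id, SQL, and schedule below are illustrative, not from the original source:

from datetime import datetime
from airflow import DAG

dag = DAG('postgres_example',
          start_date=datetime(2020, 1, 1),
          schedule_interval='@daily')

create_table = PostgresOperator(
    task_id='create_table',
    sql='CREATE TABLE IF NOT EXISTS example (id INT);',
    postgres_conn_id='postgres_default',
    dag=dag)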
Example #2
class PostgresOperator(BaseOperator):
    """
    Executes sql code in a specific Postgres database

    :param postgres_conn_id: reference to a specific postgres database
    :type postgres_conn_id: string
    :param sql: the sql code to be executed
    :type sql: string or string pointing to a template file. File must have
        a '.sql' extension.
    """

    template_fields = ('sql',)
    template_ext = ('.sql',)
    ui_color = '#ededed'

    @apply_defaults
    def __init__(
            self, sql,
            postgres_conn_id='postgres_default', autocommit=False,
            *args, **kwargs):
        super(PostgresOperator, self).__init__(*args, **kwargs)

        self.sql = sql
        self.postgres_conn_id = postgres_conn_id
        self.autocommit = autocommit

    def execute(self, context):
        logging.info('Executing: ' + self.sql)
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
        self.hook.run(self.sql, self.autocommit)
Example #3
    def execute(self, context):

        if self.dag_dependencies:
            self.check_for_dependencies()

        tables_sql = \
        """
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = '{0}'
        """.format(self.db_schema)

        hook = PostgresHook(self.db_conn_id)

        records = [record[0] for record in hook.get_records(tables_sql)]

        for record in records:
            if self.key in record:
                logging.info('Dropping: {}.{}'.format(str(self.db_schema),
                                                      str(record)))
                drop_sql = \
                """
                DROP TABLE {0}.{1}
                """.format(self.db_schema, record)

                hook.run(drop_sql)
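Because the DROP statement above splices the schema and table name in with str.format, an unusual identifier (or a hostile key) could break or alter the SQL. A hedged alternative sketch using psycopg2's identifier quoting, assuming the hook exposes a psycopg2 connection via get_conn():

from psycopg2 import sql

def drop_table_safely(hook, schema, table):
    # Quote the identifiers properly instead of interpolating raw strings.
    conn = hook.get_conn()
    with conn.cursor() as cur:
        cur.execute(sql.SQL('DROP TABLE {}.{}').format(
            sql.Identifier(schema), sql.Identifier(table)))
    conn.commit()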
Example #4
def create_target():

    create_sql = """CREATE TABLE IF NOT EXISTS dim_sneakers(
                sneaker_id INT,
                sneaker_name VARCHAR,
                color VARCHAR,
                created_at TIMESTAMP,
                is_owned BOOLEAN,
                sold_at TIMESTAMP,
                trashed_at TIMESTAMP,
                given_at TIMESTAMP,
                manufacturer_name VARCHAR,
                collaborator_name VARCHAR,
                count_wears INT,
                count_cleans INT,
                count_walks INT,
                updated_at TIMESTAMP);"""

    #is_current BOOLEAN

    #Initialize target hook
    snql_hook = PostgresHook('snql')

    #Run create sql
    snql_hook.run(create_sql)
Example #5
def get_news_meta(**kwargs):
    '''
    Retrieve metadata for news articles published on our set
    of sources during the time period between our last crawl
    and the current execution time.
    '''
    ds = kwargs.get('ds')
    exec_date = datetime.strptime(ds, '%Y-%m-%d') if ds else datetime.now()
    start_date = exec_date - timedelta(hours=3)

    api_key = kwargs.get('api_key')
    api = NewsAPI(api_key)

    query_args = {
        'sources': SOURCE_LIST,
        'page_size': 100,
        'from': convert_time(start_date),
        'to': convert_time(exec_date),
        'language': 'en'
    }

    pg_hook = PostgresHook(postgres_conn_id='sumzero')
    pagination = api.paginate_articles(**query_args)

    all_meta = []
    for resp in pagination:
        meta_list = resp.articles
        if meta_list:
            all_meta.extend(meta_list)
            rows = (meta_to_row(d) for d in meta_list)
            pg_hook.insert_rows('news_meta1',
                                rows=rows,
                                target_fields=('title', 'description', 'url',
                                               'published_on'),
                                commit_every=1000)

    return all_meta
Example #6
def create_import_table(**kwargs):
    conn_id = kwargs.get('conn_id')
    pg_hook = PostgresHook(conn_id)
    sql = """
        CREATE TABLE IF NOT EXISTS trips (
            trip_id SERIAL8 PRIMARY KEY,
            city VARCHAR(16),
            duration_sec INT4,
            start_time TIMESTAMP,
            end_time TIMESTAMP,
            start_station_id FLOAT4,
            start_station_name VARCHAR(256),
            start_location_latitude FLOAT8,
            start_location_longitude FLOAT8,
            end_station_id FLOAT4,
            end_station_name VARCHAR(256),
            end_location_latitude FLOAT8,
            end_location_longitude FLOAT8,
            bike_id INT4,
            user_type VARCHAR(16),
            member_birth_year FLOAT4,
            member_gender VARCHAR(16)
            );
        """
    pg_hook.run(sql)
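A sketch of how this callable might be attached to a task so that kwargs.get('conn_id') receives a value; the task id, connection name, and dag object are assumptions:

from airflow.operators.python_operator import PythonOperator

create_table_task = PythonOperator(
    task_id='create_import_table',
    python_callable=create_import_table,
    op_kwargs={'conn_id': 'postgres_default'},  # lands in kwargs.get('conn_id')
    dag=dag)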
Example #7
class PostgresOperator(BaseOperator):
    """
    Executes sql code in a specific Postgres database

    :param postgres_conn_id: reference to a specific postgres database
    :type postgres_conn_id: string
    :param sql: the sql code to be executed
    :type sql: Can receive a str representing a sql statement,
        a list of str (sql statements), or reference to a template file.
        Template references are recognized by str ending in '.sql'
    """

    template_fields = ('sql', )
    template_ext = ('.sql', )
    ui_color = '#ededed'

    @apply_defaults
    def __init__(self,
                 sql,
                 postgres_conn_id='postgres_default',
                 autocommit=False,
                 parameters=None,
                 *args,
                 **kwargs):
        super(PostgresOperator, self).__init__(*args, **kwargs)
        self.sql = sql
        self.postgres_conn_id = postgres_conn_id
        self.autocommit = autocommit
        self.parameters = parameters

    def execute(self, context):
        logging.info('Executing: ' + str(self.sql))
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
        self.hook.run(self.sql, self.autocommit, parameters=self.parameters)
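With the parameters pass-through, callers can keep values out of the SQL string entirely; a usage sketch with an invented table and cutoff value, assuming an existing dag object:

delete_old_rows = PostgresOperator(
    task_id='delete_old_rows',
    sql='DELETE FROM events WHERE created_at < %s;',
    parameters=('2020-01-01',),
    postgres_conn_id='postgres_default',
    dag=dag)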
Example #8
def get_rates(ds, **kwargs):
    pg_hook = PostgresHook(postgres_conn_id='rates')
    api_hook = HttpHook(http_conn_id='openexchangerates', method='GET')

    # If either of these raises an exception then we'll be notified via
    # Airflow
    resp = api_hook.run('')
    resp = json.loads(resp.content)

    # These are the only valid pairs the DB supports at the moment. Anything
    # else that turns up will be ignored.
    valid_pairs = (
        'AED', 'AFN', 'ALL', 'AMD', 'ANG', 'AOA', 'ARS',
        'AUD', 'AWG', 'AZN', 'BAM', 'BBD', 'BDT', 'BGN',
        'BHD', 'BIF', 'BMD', 'BND', 'BOB', 'BRL', 'BSD',
        'BTC', 'BTN', 'BWP', 'BYN', 'BYR', 'BZD', 'CAD',
        'CDF', 'CHF', 'CLF', 'CLP', 'CNY', 'COP', 'CRC',
        'CUC', 'CUP', 'CVE', 'CZK', 'DJF', 'DKK', 'DOP',
        'DZD', 'EEK', 'EGP', 'ERN', 'ETB', 'EUR', 'FJD',
        'FKP', 'GBP', 'GEL', 'GGP', 'GHS', 'GIP', 'GMD',
        'GNF', 'GTQ', 'GYD', 'HKD', 'HNL', 'HRK', 'HTG',
        'HUF', 'IDR', 'ILS', 'IMP', 'INR', 'IQD', 'IRR',
        'ISK', 'JEP', 'JMD', 'JOD', 'JPY', 'KES', 'KGS',
        'KHR', 'KMF', 'KPW', 'KRW', 'KWD', 'KYD', 'KZT',
        'LAK', 'LBP', 'LKR', 'LRD', 'LSL', 'LTL', 'LVL',
        'LYD', 'MAD', 'MDL', 'MGA', 'MKD', 'MMK', 'MNT',
        'MOP', 'MRO', 'MTL', 'MUR', 'MVR', 'MWK', 'MXN',
        'MYR', 'MZN', 'NAD', 'NGN', 'NIO', 'NOK', 'NPR',
        'NZD', 'OMR', 'PAB', 'PEN', 'PGK', 'PHP', 'PKR',
        'PLN', 'PYG', 'QAR', 'RON', 'RSD', 'RUB', 'RWF',
        'SAR', 'SBD', 'SCR', 'SDG', 'SEK', 'SGD', 'SHP',
        'SLL', 'SOS', 'SRD', 'STD', 'SVC', 'SYP', 'SZL',
        'THB', 'TJS', 'TMT', 'TND', 'TOP', 'TRY', 'TTD',
        'TWD', 'TZS', 'UAH', 'UGX', 'USD', 'UYU', 'UZS',
        'VEF', 'VND', 'VUV', 'WST', 'XAF', 'XAG', 'XAU',
        'XCD', 'XDR', 'XOF', 'XPD', 'XPF', 'XPT', 'YER',
        'ZAR', 'ZMK', 'ZMW', 'ZWL')

    rates_insert = """INSERT INTO rates (pair, valid_until, rate)
                      VALUES (%s, %s, %s);"""

    # If this raises an exception then we'll be notified via Airflow
    valid_until = datetime.fromtimestamp(resp['timestamp'])

    for (iso2, rate) in resp['rates'].items():
        # If converting the rate to a float fails for whatever reason then
        # just move on.
        try:
            rate = float(rate)
        except (TypeError, ValueError):
            continue

        iso2 = iso2.upper().strip()

        if iso2 not in valid_pairs or rate < 0:
            continue

        pg_hook.run(rates_insert, parameters=(iso2,
                                              valid_until,
                                              rate))
Example #9
def insert_events(**kwargs):
    # airflow format 2020-10-22T00:00:00+00:00 '%Y-%m-%dT%H:%M:%S%z'
    # json file format 2020-10-22 20:36:00.178590 '%Y-%m-%d %H:%M:%S.%f'
    execution_time = kwargs['execution_date']
    execution_time_str = datetime.strftime(execution_time,
                                           '%Y-%m-%d %H:%M:%S.%f')
    print('execution_time: ', execution_time_str)
    next_execution_time = kwargs['next_execution_date']
    next_execution_time_str = datetime.strftime(next_execution_time,
                                                '%Y-%m-%d %H:%M:%S.%f')
    print('next_execution_time: ', next_execution_time_str)

    params = {
        'time_from': execution_time_str,
        'time_to': next_execution_time_str
    }

    events = requests.get('http://nginx:80/events', params)
    print(events.json())

    pg_hook = PostgresHook(postgres_conn_id='pg_dw')
    query = """
        INSERT INTO events_stg (event, time, unique_visitor_id, ha_user_id, browser, os, country_code)
        VALUES ('{}', '{}', NULLIF('{}', 'None'), NULLIF('{}', 'None'), NULLIF('{}', 'None'), NULLIF('{}', 'None'), NULLIF('{}', 'None'))
    """
    for event in events.json():
        clean_event = process_event(event)
        pg_hook.run(
            query.format(clean_event['event'], clean_event['time'],
                         clean_event['unique_visitor_id'],
                         clean_event['ha_user_id'], clean_event['browser'],
                         clean_event['os'], clean_event['country_code']))
Example #10
class PostgresOperator(BaseOperator):
    """
    Executes sql code in a specific Postgres database

    :param postgres_conn_id: reference to a specific postgres database
    :type postgres_conn_id: string
    :param sql: the sql code to be executed
    :type sql: Can receive a str representing a sql statement,
        a list of str (sql statements), or reference to a template file.
        Template references are recognized by str ending in '.sql'
    """

    template_fields = ('sql',)
    template_ext = ('.sql',)
    ui_color = '#ededed'

    @apply_defaults
    def __init__(
            self, sql,
            postgres_conn_id='postgres_default', autocommit=False,
            *args, **kwargs):
        super(PostgresOperator, self).__init__(*args, **kwargs)

        self.sql = sql
        self.postgres_conn_id = postgres_conn_id
        self.autocommit = autocommit

    def execute(self, context):
        logging.info('Executing: ' + str(self.sql))
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
        self.hook.run(self.sql, self.autocommit)
Example #11
def create_import_table(**kwargs):
    conn_id = kwargs.get('conn_id')
    pg_hook = PostgresHook(conn_id)
    commands = ("""
        CREATE TABLE IF NOT EXISTS trips (
            trip_id SERIAL8 PRIMARY KEY,
            city VARCHAR(16),
            duration_sec INT4,
            start_time TIMESTAMP,
            end_time TIMESTAMP,
            start_station_id FLOAT4,
            start_station_name VARCHAR(256),
            start_location_latitude FLOAT8,
            start_location_longitude FLOAT8,
            end_station_id FLOAT4,
            end_station_name VARCHAR(256),
            end_location_latitude FLOAT8,
            end_location_longitude FLOAT8,
            bike_id INT4,
            user_type VARCHAR(16),
            member_birth_year FLOAT4,
            member_gender VARCHAR(16)
            );
        """, """
        CREATE TABLE IF NOT EXISTS acs (
        fips VARCHAR(19) PRIMARY KEY,
        bach_edu FLOAT8,
        tot_trips FLOAT8,
        trips_u10 FLOAT8,
        trips_1014 FLOAT8,
        trips_1519 FLOAT8,
        med_age FLOAT8,
        med_age_m FLOAT8,
        med_age_f FLOAT8,
        pop_tot FLOAT8,
        race_white FLOAT8,
        race_black FLOAT8,
        asian FLOAT8,
        smartphone FLOAT8,
        car FLOAT8,
        transit FLOAT8,
        bike FLOAT8,
        walk FLOAT8,
        male_tot FLOAT8,
        fmale_tot FLOAT8,
        units_tot FLOAT8,
        owned FLOAT8,
        rented FLOAT8,
        med_hh_inc FLOAT8,
        med_rent FLOAT8
        );

        COPY acs FROM '/home/ubuntu/bikeiq/data/acs.csv' DELIMITER ',' CSV HEADER;
        
        ALTER TABLE acs ADD COLUMN bg_id VARCHAR(12);
        UPDATE acs SET bg_id=substring(fips, 8);
        """)
    for command in commands:
        pg_hook.run(command)
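Since DbApiHook.run also accepts a list of statements, the loop could be collapsed; each element then executes in order on the same connection:

pg_hook.run(list(commands))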
Example #12
def process_customers_order_dim(**kwargs):
    conn_id = kwargs.get('conn_id')
    pg_hook = PostgresHook(conn_id)
    sql = "select distinct customer.cust_name, order_info.order_id from customer, order_info where customer.customer_id=order_info.customer_id;"

    records = pg_hook.get_records(sql)
    print(records)

    return records
Example #13
def cache_latest_rates(ds, **kwargs):
    redis_conn = redis.StrictRedis(host='redis')
    pg_hook = PostgresHook(postgres_conn_id='rates')
    latest_rates = """SELECT DISTINCT ON (pair)
                             pair, rate
                      FROM   rates
                      ORDER  BY pair, valid_until DESC;"""

    for iso2, rate in pg_hook.get_records(latest_rates):
        redis_conn.set(iso2, rate)
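Downstream consumers can then read the cached values straight from Redis; a small sketch (the 'USD' key is illustrative):

import redis

redis_conn = redis.StrictRedis(host='redis')
usd_rate = redis_conn.get('USD')  # returns bytes, or None if never cached
if usd_rate is not None:
    print(float(usd_rate))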
Example #14
def cache_latest_stocks(ds, **kwargs):
    redis_conn = redis.StrictRedis(host='redis')
    pg_hook = PostgresHook(postgres_conn_id='stocks')
    latest_stocks = """SELECT DISTINCT ON (symbol)
                             symbol, price
                      FROM   stocks
                      ORDER  BY symbol, valid_until DESC;"""

    for iso2, stock in pg_hook.get_records(latest_stocks):
        redis_conn.set(iso2, stock)
Example #15
    def execute(self, context):
        logging.info('Executing: ' + str(self.sql))
        if self.create_tunnel is True:
            self.create_ssh_tunnel()

        print("EXECUTING QUERY!")

        time.sleep(10)
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                                 schema=self.database)
        self.hook.run(self.sql, self.autocommit, parameters=self.parameters)
Example #16
def insert_table(conn_id, schema, src_table, dest_table):

    pg_hook = PostgresHook(postgres_conn_id=conn_id)
    select_cmd = """SELECT relname, n_live_tup from {} where schemaname = '{}'
    and relname != '{}';""".format(src_table, schema, dest_table)
    src_conn = pg_hook.get_conn()
    src_cursor = src_conn.cursor()
    src_cursor.execute(select_cmd)
    records = src_cursor.fetchall()
    dest_conn = pg_hook.get_conn()
    dest_cursor = dest_conn.cursor()
    # execute_values is provided by psycopg2.extras
    execute_values(dest_cursor, "INSERT INTO {} VALUES %s".format(dest_table),
                   records)
    dest_conn.commit()
Example #17
def get_event_status(**kwargs):
    cur = PostgresHook('airflow_db').get_cursor()

    sql = """SELECT
              task_id,
              dag_id,
              to_Char(dttm, 'HH24:MI:SS')
             FROM public.log
             WHERE owner = 'airflow' 
                   AND event = 'failed'
                   AND dttm >= current_date - interval '5 minutes'"""

    cur.execute(sql)
    result = cur.fetchall()
    return result
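Because the callable returns its result, Airflow stores it as an XCom; a downstream task could pull it like this (the task ids are assumptions):

def report_failures(**kwargs):
    failures = kwargs['ti'].xcom_pull(task_ids='get_event_status')
    for task_id, dag_id, failed_at in failures or []:
        print('{}.{} failed at {}'.format(dag_id, task_id, failed_at))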
Example #18
    def execute(self, context):
        postgres_hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
        s3_hook = S3Hook(aws_conn_id=self.s3_conn_id)
        res = self.query_db(self.query, postgres_hook)
        res.seek(0)
        s3_hook.load_file_obj(res,
                              key="egress/sources.airflow.csv",
                              bucket_name="demo-bucket-temp-977338899",
                              replace=True)
        return True
Example #19
def load_data(ds, **kwargs):
    """
	Processes the json data, checks the types and enters into the
	Postgres database.
	"""

    pg_hook = PostgresHook(postgres_conn_id='weather_id')

    file_name = str(datetime.now().date()) + '.json'
    tot_name = os.path.join(os.path.dirname(__file__), 'src/data', file_name)

    # open the json datafile and read it in
    with open(tot_name, 'r') as inputfile:
        doc = json.load(inputfile)

    # transform the data to the correct types and convert temp to celsius
    city = str(doc['name'])
    country = str(doc['sys']['country'])
    lat = float(doc['coord']['lat'])
    lon = float(doc['coord']['lon'])
    humid = float(doc['main']['humidity'])
    press = float(doc['main']['pressure'])
    min_temp = float(doc['main']['temp_min']) - 273.15
    max_temp = float(doc['main']['temp_max']) - 273.15
    temp = float(doc['main']['temp']) - 273.15
    weather = str(doc['weather'][0]['description'])
    todays_date = datetime.now().date()

    # check for NaNs in the numeric values before entering into the database
    valid_data = not np.isnan(
        [lat, lon, humid, press, min_temp, max_temp, temp]).any()

    row = (city, country, lat, lon, todays_date, humid, press, min_temp,
           max_temp, temp, weather)

    insert_cmd = """INSERT INTO weather_table 
					(city, country, latitude, longitude,
					todays_date, humidity, pressure, 
					min_temp, max_temp, temp, weather)
					VALUES
					(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"""

    if valid_data is True:
        pg_hook.run(insert_cmd, parameters=row)
Example #20
class PostgresOperator(BaseOperator):

    template_fields = ('sql', )
    template_ext = ('.sql', )
    ui_color = '#ededed'

    @apply_defaults
    def __init__(self,
                 sql,
                 table="public.new",
                 postgres_conn_id='postgres_default',
                 database=None,
                 autocommit=False,
                 *args,
                 **kwargs):
        super(PostgresOperator, self).__init__(*args, **kwargs)

        # self.sql = sql
        self.postgres_conn_id = postgres_conn_id
        self.autocommit = autocommit
        self.table = table
        self.database = database
        self.sql = sql

    def execute(self, context):
        self.log.info('Executing: %s', self.sql)
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                                 schema=self.database)
        self.hook.run(self.sql, self.autocommit)
        for output in self.hook.conn.notices:
            self.log.info(output)
        records = self.hook.get_records(f"SELECT COUNT(*) FROM {self.table}")
        if len(records) < 1 or len(records[0]) < 1:
            raise ValueError(
                f"Data quality check failed. {self.table} returned no results")
        num_records = records[0][0]
        if num_records < 1:
            raise ValueError(
                f"Data quality check failed. {self.table} contained 0 rows")
        self.log.info(
            f'Data quality check on table {self.table} passed with {num_records} records'
        )
Example #21
def create_table(conn_id, dest_table):

    pg_hook = PostgresHook(postgres_conn_id=conn_id)
    delete_cmd = """
                DROP TABLE IF EXISTS {};
                 """.format(dest_table)
    create_cmd = """
                 CREATE TABLE {} (
                 table_name VARCHAR(50),
                 row_count INTEGER
                 )
                 """.format(dest_table)
    dest_conn = pg_hook.get_conn()
    delete_cursor = dest_conn.cursor()
    delete_cursor.execute(delete_cmd)
    dest_cursor = dest_conn.cursor()
    dest_cursor.execute(create_cmd)
    dest_conn.commit()
Example #22
def api_json_to_db(**kwargs):
    base_dir = Variable.get("tfl_park_base_dir")

    json_files = [
        f for f in listdir(base_dir) if isfile(join(base_dir, f))
        if '.json' in f
    ]

    db_tuples = []

    for current_file in json_files:
        time_pattern = re.compile('[0-9]+')
        pattern_match = time_pattern.match(current_file)

        if pattern_match is not None:
            timestamp = pattern_match.group()

            with open(join(base_dir, current_file), 'r') as f:
                car_parks = json.loads(f.read())

            for car_park in car_parks:
                park_id = car_park['id']
                park_name = car_park['name']

                for bay in car_park['bays']:
                    park_bay_type = bay['bayType']
                    park_bay_occupied = bay['occupied']
                    park_bay_free = bay['free']
                    park_bay_count = bay['bayCount']

                    db_tuples.append(
                        (timestamp, park_id, park_name, park_bay_type,
                         park_bay_occupied, park_bay_free, park_bay_count))

    target_fields = [
        't_stamp', 'park_id', 'park_name', 'park_bay_type',
        'park_bay_occupied', 'park_bay_free', 'park_bay_count'
    ]

    db_hook = PostgresHook(postgres_conn_id='tfl_db')
    db_hook.insert_rows('tfl_data', db_tuples, target_fields)

    for current_file in json_files:
        remove(join(base_dir, current_file))
Example #23
    def execute(self, context):
        self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        conn = self.hook.get_conn()
        log.info("Connected with " + self.redshift_conn_id)

        table = self.table
        s3_path = self.s3_path
        s3_role = self.s3_role
        s3_region = self.s3_region
        delimiter = self.delimiter
        encoding = self.encoding

        copy_statement = f"""
        COPY {table}
        FROM '{s3_path}'
        IAM_ROLE '{s3_role}'
        REGION '{s3_region}'
        DELIMITER '{delimiter}'
        ENCODING '{encoding}'
        """
        cursor = conn.cursor()
        cursor.execute(copy_statement)
        cursor.close()

        load_datetime = self.load_datetime
        record_source = self.record_source

        log.info("Loaded completed with load_datetime " + load_datetime)

        meta_statement = f"""
        UPDATE {table}
        SET LOAD_DATETIME = '{load_datetime}'',
        RECORD_SOURCE = '{record_source}''
        WHERE LOAD_DATETIME IS NULL 
        AND RECORD_SOURCE IS NULL
        """
        cursor = conn.cursor()
        cursor.execute(meta_statement)
        cursor.close()
        log.info("Metadata injection completed with load_datetime " +
                 load_datetime)

        conn.commit()
Example #24
def populate_target(**kwargs):

    #Pull data from staging
    staging_hook = PostgresHook('snql_staging')
    #Pull query from sql directory
    query = kwargs['templates_dict']['query']
    #Execute query
    staging_results = staging_hook.get_records(query)

    #Initialize hook to snql
    snql_hook = PostgresHook('snql')

    #Delete current rows in target table
    snql_hook.run("""DELETE FROM dim_sneakers""")

    #Insert new rows into target table
    snql_hook.insert_rows('dim_sneakers', staging_results)
Example #25
    def execute(self, context):
        self.log.info('Executing: %s', self.sql)
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                                 schema=self.database)
        self.hook.run(self.sql, self.autocommit)
        for output in self.hook.conn.notices:
            self.log.info(output)
        records = self.hook.get_records(f"SELECT COUNT(*) FROM {self.table}")
        if len(records) < 1 or len(records[0]) < 1:
            raise ValueError(
                f"Data quality check failed. {self.table} returned no results")
        num_records = records[0][0]
        if num_records < 1:
            raise ValueError(
                f"Data quality check failed. {self.table} contained 0 rows")
        self.log.info(
            f'Data quality check on table {self.table} passed with {num_records} records'
        )
Example #26
def create_staging():

    create_sql = """CREATE TABLE IF NOT EXISTS dim_sneakers(
                sneaker_id INT,
                sneaker_name VARCHAR,
                color VARCHAR,
                created_at TIMESTAMP,
                is_owned BOOLEAN,
                sold_at TIMESTAMP,
                trashed_at TIMESTAMP,
                given_at TIMESTAMP,
                manufacturer_name VARCHAR,
                collaborator_name VARCHAR,
                count_wears INT,
                count_cleans INT,
                count_walks INT,
                updated_at TIMESTAMP);"""

    #is_current BOOLEAN

    #Initialize staging hook
    staging_hook = PostgresHook('snql_staging')

    #Run create table query
    staging_hook.run(create_sql)

    #Run delete records query
    staging_hook.run("""DELETE FROM dim_sneakers""")
Example #27
class SSHPostgresOperator(PostgresOperator):
    """

    """
    def __init__(self,
                 sql,
                 postgres_conn_id='postgres_default',
                 ssh_conn_id='ssh_default',
                 autocommit=False,
                 parameters=None,
                 database=None,
                 create_tunnel=True,
                 *args,
                 **kwargs):
        super(PostgresOperator, self).__init__(*args, **kwargs)
        self.sql = sql
        self.ssh_conn_id = ssh_conn_id
        self.postgres_conn_id = postgres_conn_id
        self.autocommit = autocommit
        self.parameters = parameters
        self.database = database
        self.create_tunnel = create_tunnel

    def execute(self, context):
        logging.info('Executing: ' + str(self.sql))
        if self.create_tunnel is True:
            self.create_ssh_tunnel()

        print("EXECUTING QUERY!")

        time.sleep(10)
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                                 schema=self.database)
        self.hook.run(self.sql, self.autocommit, parameters=self.parameters)

    def create_ssh_tunnel(self):
        print("CREATING TUNNEL")
        if self.create_tunnel is True:
            tunnel = AstroSSHHook(ssh_conn_id=self.ssh_conn_id).create_tunnel()
Example #28
def pull_and_insert(**kwargs):

    #Pull query from sql directory
    query = kwargs['templates_dict']['query']

    #Initialize snql hook and pull data
    snql_hook = PostgresHook('snql')
    results = snql_hook.get_records(query)

    #Initialize staging hook and insert data to staging table
    staging_hook = PostgresHook('snql_staging')
    staging_hook.insert_rows('dim_sneakers', results)
Example #29
def monthly_spent(TaskName, ds, prev_ds, **kwargs):
    facebook = Variable.get("facebook_auth", deserialize_json=True)
    access_token = facebook["access_token"]
    ad_account_id = facebook["ad_account_id"]
    FacebookAdsApi.init(access_token=access_token)

    fromDate = prev_ds
    toDate = (datetime.strptime(ds, '%Y-%m-%d') +
              timedelta(days=-1)).strftime('%Y-%m-%d')

    fields = ['spend', 'account_id']
    params = {'level': 'account',
              'time_range': {'since': fromDate, 'until': toDate}}
    spendrows = AdAccount(ad_account_id).get_insights(fields=fields,
                                                      params=params)

    total = 0.0
    for row in spendrows:
        total = total + float(row["spend"])
    row = (total, ds)

    connDB = PostgresHook(postgres_conn_id='airflow')
    FB_json_conn = BaseHook.get_connection('FB_json_conn').extra_dejson
    scommand = "insert into monthly_cost values(%s, %s)"
    connDB.run(scommand, parameters=row)
    print("done, Cost from " + prev_ds + " to " + ds)
Example #30
def get_stocks(ds, **context):
    symbol = context['params']['symbol']

    pg_hook = PostgresHook(postgres_conn_id='stocks')
    api_hook = HttpHook(http_conn_id='alphavantage', method='GET')

    # If either of these raises an exception then we'll be notified via
    # Airflow
    resp = api_hook.run('query?function=TIME_SERIES_DAILY_ADJUSTED'
                        f'&symbol={symbol}&apikey=537201H9R203WT4C'
                        '&datatype=json')
    resp = json.loads(resp.content)

    # Only the requested symbol is accepted here; anything else that turns up
    # in the response will be ignored.
    valid_stocks = (symbol.upper(),)

    stocks_insert = """INSERT INTO stocks (symbol, valid_until, price)
                       VALUES (%s, %s, %s);"""

    # If this raises an exception then we'll be notified via Airflow
    valid_until = datetime.fromtimestamp(resp['timestamp'])

    for symbol_key, price in resp['stocks'].items():
        # If converting the price to a float fails for whatever reason then
        # just move on.
        try:
            price = float(price)
        except (TypeError, ValueError):
            continue

        symbol_key = symbol_key.upper().strip()

        if symbol_key not in valid_stocks or price < 0:
            continue

        pg_hook.run(stocks_insert, parameters=(symbol_key,
                                               valid_until,
                                               price))
Example #31
def get_news_articles(**kwargs):
    '''
    Python callable for article scraping task.
    This will get executed after news metadata has been
    collected and validated for the current execution period.

    Target URLs will be retrieved from Postgres for articles
    published after the last run date of this task. Articles
    will be written to file.
    '''
    # TODO:
    # Decide how to store articles in FS (method and schema)
    # Decide how to name log files
    # Decide whether to implement compression task

    pg_hook = PostgresHook(postgres_conn_id='sumzero')
    ds = kwargs.get('ds') or datetime.now().isoformat()
    '''
    query_results = pg_hook.get_records(

        SELECT URL FROM news_meta
            WHERE published_on > %s;
        ,
        parameters=[ds]
    )
    urls = [record[0] for record in query_results]
    '''
    urls = kwargs.get('urls')

    # Download articles and write them to a csv.
    time_str = datetime.now().isoformat()
    log_fname = 'test_{}.csv'.format(time_str)
    failed_log_fname = 'failed_test_{}.csv'.format(time_str)

    with open(log_fname, 'w+') as log_file, \
            open(failed_log_fname, 'w+') as failed_log:
        log_writer = csv.writer(log_file, delimiter=',', quotechar='|')
        failed_writer = csv.writer(failed_log, delimiter=',', quotechar='|')

        for url in urls:
            loader = Article(url)
            try:
                loader.download()
                loader.parse()
                log_writer.writerow((time_str, loader.text))
            except Exception as error:
                failed_writer.writerow((time_str, str(error), url))
Example #32
def redshift_call(sql, type_='run'):
    pghook = PostgresHook(postgres_conn_id="naveen_redshift")
    if type_ == 'run':
        return pghook.run(sql)
    else:
        return pghook.get_records(sql)
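Usage is then a one-liner for either mode; the SQL below is illustrative:

redshift_call('TRUNCATE TABLE staging.events;')
row_count = redshift_call('SELECT COUNT(*) FROM staging.events;',
                          type_='records')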
Example #33
    def execute(self, context):
        logging.info('Executing: ' + str(self.sql))
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
        self.hook.run(self.sql, self.autocommit, parameters=self.parameters)
Example #34
    def execute(self, context):
        logging.info('Executing: ' + self.sql)
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
        self.hook.run(self.sql, self.autocommit)
Example #35
def postgres_call(sql):
    pghook = PostgresHook(postgres_conn_id="naveen_ngrok_postgres")
    return pghook.get_records(sql)