Example #1
    def execute(self, context):
        postgres_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        s3_hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)

        credentials = s3_hook.get_credentials()
        unload_options = '\n\t\t\t'.join(self.unload_options)
        s3_key = '{}/{}_'.format(
            self.s3_key,
            self.table) if self.table_as_file_name else self.s3_key
        select_query = "SELECT * FROM {schema}.{table}".format(
            schema=self.schema, table=self.table)
        unload_query = """
                    UNLOAD ('{select_query}')
                    TO 's3://{s3_bucket}/{s3_key}'
                    with credentials
                    'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
                    {unload_options};
                    """.format(select_query=select_query,
                               s3_bucket=self.s3_bucket,
                               s3_key=s3_key,
                               access_key=credentials.access_key,
                               secret_key=credentials.secret_key,
                               unload_options=unload_options)

        self.log.info('Executing UNLOAD command...')
        postgres_hook.run(unload_query, self.autocommit)
        self.log.info("UNLOAD command complete...")
Example #2
    def execute(self, context):
        """
        Data Quality Checks:
        1. Check the target table has a positive number of rows
        2. Check the target table has no duplicate primary key
        Args:
            context:

        Returns:
            None
        """
        hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        qf_row_count = self.q_row_count.format(schema=self.schema,
                                               table=self.table)
        self.log.info('Starting Data Quality Checks')
        # Test for presence of any records
        records = hook.get_records(qf_row_count)
        # Use `or` so records[0] is never evaluated when records is empty;
        # any([...]) would build the whole list first and raise IndexError
        if len(records) < 1 or len(records[0]) < 1 or records[0][0] < 1:
            self.log.error("{} returned no rows".format(self.table))
            raise ValueError("{} returned no rows".format(self.table))
        del records
        qf_dupes = self.q_dupes.format(schema=self.schema,
                                       table=self.table,
                                       pkey=self.pkey)
        # Test for no duplicates
        records = hook.get_records(qf_dupes)
        if records[0][0] > 1:
            self.log.error("{} returned duplicates".format(self.table))
            raise ValueError("{} returned duplicates".format(self.table))
        self.log.info("Data Quality checks passed on {}".format(self.table))
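The q_row_count and q_dupes templates themselves are not shown; a plausible pair of class attributes matching the .format() placeholders above (the exact SQL is an assumption):

# Hypothetical query templates; only the placeholder names come from the snippet
q_row_count = "SELECT COUNT(*) FROM {schema}.{table}"
q_dupes = """
    SELECT MAX(pkey_count) FROM (
        SELECT {pkey}, COUNT(*) AS pkey_count
        FROM {schema}.{table}
        GROUP BY {pkey}
    ) AS counts
"""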
Example #3
 def __datasource_to_csv(self, execution_date):
     final_query = self.extract_query.\
         replace("$EXECUTION_DATE", """'%s'""" % execution_date)
     logging.info("QUERY : %s" % final_query)
     cursor = PostgresHook(self.connection).get_conn().cursor()
     cursor.execute(final_query)
     result = cursor.fetchall()
     # Write to CSV file
     temp_path = self.file_path + \
         self.table_name + \
         '_' + execution_date + '.csv'
     with open(temp_path, 'w') as fp:
         a = csv.writer(fp, quoting=csv.QUOTE_MINIMAL, delimiter='|')
         a.writerow([i[0] for i in cursor.description])
         a.writerows(result)
     # Read CSV file
     full_path = temp_path + '.gz'
     with open(temp_path, 'rb') as f:
         data = f.read()
     # Compress CSV file; the with-statement closes the gzip handle automatically
     with gzip.open(full_path, 'wb') as output:
         output.write(data)
     # Delete the uncompressed CSV file
     os.remove(temp_path)
     # Change access mode
     os.chmod(full_path, 0o777)
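Loading the entire CSV into memory before compressing is avoidable; a streaming variant using shutil.copyfileobj (an alternative sketch, not the original author's code):

import gzip
import shutil

# Stream the CSV into the gzip file in chunks instead of read()-ing it whole
with open(temp_path, 'rb') as src, gzip.open(full_path, 'wb') as dst:
    shutil.copyfileobj(src, dst)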
Example #4
 def execute(self, context):
     self.log.info('Executing: %s', self.sql)
     self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                              schema=self.database)
     self.hook.run(self.sql, self.autocommit, parameters=self.parameters)
     for output in self.hook.conn.notices:
         self.log.info(output)
Example #5
    def execute(self, context):
        self._postgres_hook = PostgresHook(
            postgres_conn_id=self.redshift_conn_id)
        self._s3_hook = S3Hook(aws_conn_id=self.aws_conn_id,
                               verify=self.verify)
        credentials = self._s3_hook.get_credentials()
        copy_options = '\n\t\t\t'.join(self.copy_options)

        copy_query = """
            COPY {schema}.{table}
            FROM 's3://{s3_bucket}/{s3_key}/{table}'
            with credentials
            'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
            {copy_options};
        """.format(schema=self.schema,
                   table=self.table,
                   s3_bucket=self.s3_bucket,
                   s3_key=self.s3_key,
                   access_key=credentials.access_key,
                   secret_key=credentials.secret_key,
                   copy_options=copy_options)

        self.log.info('Executing COPY command...')
        self._postgres_hook.run(copy_query, self.autocommit)
        self.log.info("COPY command complete...")
Example #6
 def execute(self, context: 'Context'):
     self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                              schema=self.database)
     if self.runtime_parameters:
         final_sql = []
         sql_param = {}
         for param in self.runtime_parameters:
             set_param_sql = f"SET {{}} TO %({param})s;"
             dynamic_sql = SQL(set_param_sql).format(Identifier(f"{param}"))
             final_sql.append(dynamic_sql)
         for param, val in self.runtime_parameters.items():
             sql_param.update({f"{param}": f"{val}"})
         if self.parameters:
             sql_param.update(self.parameters)
         if isinstance(self.sql, str):
             final_sql.append(SQL(self.sql))
         else:
             final_sql.extend(list(map(SQL, self.sql)))
         self.hook.run(final_sql, self.autocommit, parameters=sql_param)
     else:
         self.hook.run(self.sql,
                       self.autocommit,
                       parameters=self.parameters)
     for output in self.hook.conn.notices:
         self.log.info(output)
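SQL and Identifier here come from psycopg2.sql, so each runtime parameter becomes a safely quoted, parameterized "SET <name> TO %(name)s;" statement. A usage sketch, assuming a postgres provider version that still accepts runtime_parameters:

from airflow.providers.postgres.operators.postgres import PostgresOperator

opr_with_timeout = PostgresOperator(
    task_id='set_timeout_then_query',
    postgres_conn_id='postgres_default',
    sql='SELECT * FROM pet;',  # table name is illustrative
    runtime_parameters={'statement_timeout': '3000ms'},
)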
Example #7
    def execute(self, context, testing=False):
        """
        Query data from Postgresql master database and
        then write into a json file.

        The default json file name is the table name + the utc time.
        """
        self.log.info('LoadToJsonOperator Starting...')
        try:
            self.log.info("Initializing Postgres Master DB Connection...")
            psql_hook = PostgresHook(postgres_conn_id=self._postgres_conn_id)
            psql_conn = psql_hook.get_conn()
            psql_cursor = psql_conn.cursor(cursor_factory=RealDictCursor)
            execution_date = context.get('execution_date')
            execution_date = execution_date.strftime(self._date_format)
            for table in self._tables:
                ports_select_all = self._query.format(table=table)
                psql_cursor.execute(ports_select_all)
                if table not in self._filenames:
                    self._filenames[table] = f"{table}_{execution_date}.json"
                self.save_to_json(cursor=psql_cursor,
                                  key=table,
                                  file_name=self._filenames[table])
        except (UndefinedTable, OperationalError):
            self.log.error("LoadToJsonOperator FAILED.")
            raise Exception("LoadToJsonOperator FAILED. OperationalError")
        except Exception:
            self.log.error(traceback.format_exc())
            raise Exception("LoadToJsonOperator FAILED.")
        finally:
            if not testing:
                self.log.info('Closing database connections...')
                psql_conn.close()
        self.log.info('LoadToJsonOperator SUCCESS!')
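save_to_json is referenced but not shown; a plausible implementation given the RealDictCursor rows and the keyword arguments used above (the body is an assumption):

import json

def save_to_json(self, cursor, key, file_name):
    """Hypothetical helper: dump all fetched rows under `key` into a JSON file."""
    rows = cursor.fetchall()  # RealDictCursor returns dict-like rows
    with open(file_name, 'w') as fp:
        json.dump({key: rows}, fp, default=str)  # default=str handles dates/decimals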
Example #8
 def grower_export_wrap(ds, **kwargs):
     from lib.utils import print_time
     db = PostgresHook(postgres_conn_id=postgresConnId)
     conn = db.get_conn()
     try:
         date = datetime.now().strftime("%Y-%m-%d")
         CKAN_DOMAIN = Variable.get("CKAN_DOMAIN")
         # check if CKAN_DOMAIN exists
         assert CKAN_DOMAIN
         CKAN_DATASET_NAME_GROWER_DATA = Variable.get(
             "CKAN_DATASET_NAME_GROWER_DATA")
         assert CKAN_DATASET_NAME_GROWER_DATA
         CKAN_API_KEY = Variable.get("CKAN_API_KEY")
         assert CKAN_API_KEY
         ckan_config = {
             "CKAN_DOMAIN": CKAN_DOMAIN,
             "CKAN_DATASET_NAME_GROWER_DATA": CKAN_DATASET_NAME_GROWER_DATA,
             "CKAN_API_KEY": CKAN_API_KEY,
         }
         print("ckan_config:", ckan_config)
         grower_export(conn, date, 178, ckan_config)
         return 0
     except Exception as e:
         print("error while exporting:", e)
         # Chain the exception so the original traceback is preserved
         raise ValueError('Error executing query') from e
Example #9
    def execute(self, context):
        aws_hook = AwsHook(self.aws_credentials_id, client_type="redshift")
        credentials = aws_hook.get_credentials()
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        s3_path = "s3://{}/{}/{}".format(self.s3_bucket, self.s3_directory,
                                         self.table["s3"]["key"])
        files_format = self.table["s3"]["format"]
        delimiter = self.table["s3"]["delimiter"]
        ignoreheader = self.table["s3"]["ignoreheader"]
        delimiter_text = ''
        if delimiter:
            delimiter_text = f"delimiter '{delimiter}'"

        self.log.info(
            f"Start copying data from {s3_path} to table {self.table['name']}"
        )

        redshift.run(f"""
        COPY {self.table["name"]}
        FROM '{s3_path}'
        ACCESS_KEY_ID '{credentials.access_key}'
        SECRET_ACCESS_KEY '{credentials.secret_key}'
        {files_format}
        {delimiter_text}
        {ignoreheader}
        """)
Example #10
def process_song_file(filepath):
    """
    Reads from song datasource and loads data into song and artist tables
    :param filepath: Source data location
    """

    postgres_hook = PostgresHook(postgres_conn_id='local_postgres',
                                 schema='postgres')
    conn = postgres_hook.get_conn()
    cur = conn.cursor()

    # open song file
    df = pd.read_json(filepath, lines=True)

    # insert song record
    song_data = df[['song_id', 'title', 'artist_id', 'year',
                    'duration']].values[0].tolist()
    cur.execute(song_table_insert, song_data)

    # insert artist record
    artist_data = df[[
        'artist_id', 'artist_name', 'artist_location', 'artist_latitude',
        'artist_longitude'
    ]].values[0].tolist()
    cur.execute(artist_table_insert, artist_data)

    conn.commit()
    cur.close()
Example #11
 def tearDownClass(cls):
     postgres = PostgresHook()
     with postgres.get_conn() as conn:
         with conn.cursor() as cur:
             for table in TABLES:
                 cur.execute(
                     "DROP TABLE IF EXISTS {} CASCADE;".format(table))
Example #12
 def query(self):
     """Queries Postgres and returns a cursor to the results."""
     hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
     conn = hook.get_conn()
     cursor = conn.cursor()
     cursor.execute(self.sql, self.parameters)
     return cursor
Example #13
 def test_get_conn_from_connection_with_schema(self, mock_connect):
     conn = Connection(login='******', password='******', host='host', schema='schema')
     hook = PostgresHook(connection=conn, schema='schema-override')
     hook.get_conn()
     mock_connect.assert_called_once_with(
         user='******', password='******', host='host', dbname='schema-override', port=None
     )
Example #14
    def execute(self, context):
        self.log.info('Connecting to redshift!')
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        for table in self.tables:
            records = redshift.get_records(f"SELECT COUNT(*) FROM {table}")
            if len(records) < 1 or len(records[0]) < 1:
                raise ValueError(
                    f"Data quality check failed. {table} returned no results")
            num_records = records[0][0]
            if num_records < 1:
                raise ValueError(
                    f"Data quality check failed. {table} contained 0 rows")

            # Use a separate name for the NULL counts so num_records still
            # holds the table's row count for the final log line
            for col in self.columns[table]:
                null_records = redshift.get_records(
                    f"SELECT COUNT(*) FROM {table} WHERE {col} IS NULL")
                null_count = null_records[0][0]
                if null_count > 0:
                    raise ValueError(
                        f"The column {col} in table {table} had a NULL value!")

            self.log.info(
                f"Data quality on table {table} check passed with {num_records} records"
            )
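A wiring sketch for this check, assuming a hypothetical DataQualityOperator class exposing the attributes execute() reads; all names are assumptions:

# Hypothetical instantiation; class name and argument names are assumptions
run_quality_checks = DataQualityOperator(
    task_id='run_quality_checks',
    redshift_conn_id='redshift',
    tables=['songplays', 'users'],
    columns={'songplays': ['playid'], 'users': ['userid']},
)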
Example #15
    def execute(self, context) -> None:
        postgres_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        s3_hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
        credentials = s3_hook.get_credentials()
        copy_options = '\n\t\t\t'.join(self.copy_options)

        copy_statement = f"""
            COPY {self.schema}.{self.table}
            FROM 's3://{self.s3_bucket}/{self.s3_key}'
            with credentials
            'aws_access_key_id={credentials.access_key};aws_secret_access_key={credentials.secret_key}'
            {copy_options};
        """

        if self.truncate_table:
            truncate_statement = f'TRUNCATE TABLE {self.schema}.{self.table};'
            sql = f"""
            BEGIN;
            {truncate_statement}
            {copy_statement}
            COMMIT
            """
        else:
            sql = copy_statement

        self.log.info('Executing COPY command...')
        postgres_hook.run(sql, self.autocommit)
        self.log.info("COPY command complete...")
 def execute(self, **kwargs) -> None:
     """Execute the operator."""
     hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
     hook.bulk_load(
         table=self.target_table,
         tmp_file=self.source_file,
     )
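PostgresHook.bulk_load issues a COPY ... FROM STDIN, so the source file must be in Postgres's text format (tab-delimited, one row per line). A roughly equivalent call via copy_expert, for reference; the file path and table name are illustrative:

from airflow.providers.postgres.hooks.postgres import PostgresHook

hook = PostgresHook(postgres_conn_id='postgres_default')
# Approximately what bulk_load does under the hood
hook.copy_expert('COPY my_table FROM STDIN', '/tmp/my_table.tsv')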
Example #17
def check_if_table_exists(table_name, connection):
    hook = PostgresHook(postgres_conn_id=connection)
    # Parameterized query avoids interpolating table_name into the SQL string
    query = hook.get_first(
        """SELECT * FROM information_schema.tables
           WHERE table_name = %s AND table_schema = 'public'""",
        parameters=(table_name,))
    if query:
        return 'insert_new_row'
    return 'create_table'
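The returned strings are downstream task ids, so this callable is meant for a BranchPythonOperator; a wiring sketch (task, table, and connection names are assumptions):

from airflow.operators.python import BranchPythonOperator

branch = BranchPythonOperator(
    task_id='check_if_table_exists',
    python_callable=check_if_table_exists,
    op_args=['my_table', 'postgres_default'],  # illustrative table and conn id
)
# Downstream tasks must be named 'insert_new_row' and 'create_table'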
Example #18
 def check_table_exist(sql_to_get_schema, sql_to_check_table_exist, table_name):
     """Callable to check whether the table exists."""
     hook = PostgresHook()
     query = hook.get_first(sql=sql_to_check_table_exist.format(table_name.lower()))

     if query:
         return "insert_row"
     return "create_table"
Example #19
 def insert_row(sql_query, table_name, custom_id, dt_now, **kwargs):
     """ postgres hook to insert a new row: | id | user | timestamp | """
     hook = PostgresHook()
     connection = hook.get_conn()
     cursor = connection.cursor()
     cursor.execute(
         sql_query, (custom_id, kwargs["ti"].xcom_pull(task_ids="getting_current_user"), dt_now)
     )
     connection.commit()
Example #20
 def execute(self, context):
     hook = PostgresHook()
     connection = hook.get_conn()
     cursor = connection.cursor()    
     cursor.execute(f"SELECT COUNT(*) FROM {self.table_name.lower()};")
     count_r = cursor.fetchall()
     context["ti"].xcom_push(
         key="{db_name}_rows_count".format(db_name=self.table_name), value=count_r,
     )
Example #21
    def create_new_person_records(ds, **kwargs):
        db = PostgresHook(postgres_conn_id=postgresConnId)
        conn = db.get_conn()  
        cursor = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        try:
            cursor.execute("""
              SELECT *
              FROM planter
              JOIN
              (
                SELECT regexp_replace((trim(lower(first_name)) ||  trim(lower(last_name))), '[ .-]', '', 'g') as name_key, count(*)
                FROM planter
                WHERE
                planter.organization_id IN (
                  select entity_id from getEntityRelationshipChildren(178)
                )
                GROUP BY name_key
                HAVING count(*) = 1
                ORDER BY name_key
              ) eligible_records
              ON regexp_replace((trim(lower(first_name)) ||  trim(lower(last_name))), '[ .-]', '', 'g') = eligible_records.name_key
              WHERE planter.organization_id IN (
                select entity_id from getEntityRelationshipChildren(178)
              )
              AND person_id IS NULL
            """);
            print("SQL result:", cursor.query)
            for row in cursor:
                #do something with every single row here
                #optionally print the row
                print(row)

                updateCursor = conn.cursor()
                updateCursor.execute("""
                  INSERT INTO entity
                  (type, first_name, last_name, email, phone)
                  values
                  ('p', %s, %s, %s, %s)
                  RETURNING *
                """, ( row['first_name'], row['last_name'], row['email'], row['phone'] ) );

                personId = updateCursor.fetchone()[0]
                print(personId)
                updateCursor.execute("""
                  UPDATE planter
                  SET person_id = %s
                  WHERE id = %s
                """, (personId, row['id']) )

            conn.commit()
            return 0
        except Exception as e:
            print("error while executing SQL:", e)
            # updateCursor.query is not printed here: the cursor may not exist
            # yet if the initial SELECT was what failed
            raise ValueError('Error executing query') from e
Example #22
 def execute(self, context):
     es = ElasticHook(conn_id=self.elastic_conn_id)
     pg = PostgresHook(postgres_conn_id=self.postgres_conn_id)
     with closing(pg.get_conn()) as conn:
         with closing(conn.cursor()) as cursor:
             # itersize only takes effect for named (server-side) cursors;
             # a default client-side cursor fetches the whole result set at once
             cursor.itersize = 1000
             cursor.execute(self.sql)
             for row in cursor:
                 doc = json.dumps(row, indent=2)
                 es.add_doc(index=self.index, doc_type='external', doc=doc)
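For large result sets a named (server-side) cursor makes itersize effective; a sketch of that variant (the cursor name is arbitrary):

with closing(pg.get_conn()) as conn:
    # A named cursor is server-side in psycopg2: rows arrive in itersize batches
    with closing(conn.cursor(name='es_load_cursor')) as cursor:
        cursor.itersize = 1000
        cursor.execute(self.sql)
        for row in cursor:
            es.add_doc(index=self.index, doc_type='external', doc=json.dumps(row))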
Example #23
 def query(self):
     """Queries Postgres and returns a cursor to the results."""
     hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
     conn = hook.get_conn()
     cursor = conn.cursor(name=self._unique_name())
     cursor.execute(self.sql, self.parameters)
     if self.use_server_side_cursor:
         cursor.itersize = self.cursor_itersize
         return _PostgresServerSideCursorDecorator(cursor)
     return cursor
Example #24
 def execute(self, context):
     self.log.info('Executing: %s', self.sql)
     self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id, schema=self.database)
     result = self.hook.get_first(self.sql, parameters=self.parameters)
     if not result:
         raise AirflowException("The query returned None")
     record = result[0]
     self.log.info('First record: %s', record)
     for output in self.hook.conn.notices:
         self.log.info(output)
     return record
Example #25
    def test_rowcount(self):
        hook = PostgresHook()
        input_data = ["foo", "bar", "baz"]

        with hook.get_conn() as conn:
            with conn.cursor() as cur:
                cur.execute(f"CREATE TABLE {self.table} (c VARCHAR)")
                values = ",".join(f"('{data}')" for data in input_data)
                cur.execute(f"INSERT INTO {self.table} VALUES {values}")
                conn.commit()
                assert cur.rowcount == len(input_data)
Example #26
    def execute(self, context):
        self.log.info('Start DataQualityOperator')
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        error_messages = "Data quality check failed"
        
        for table_dict in self.tables_list:
            if 'data_quality' in table_dict:
                table = table_dict['name']
                task_name = f"Check for records in table {table}"
                min_records = 0
                records_count = 0

                if 'minimum_records' in table_dict['data_quality']:
                    min_records = table_dict['data_quality']['minimum_records']

                self.log.info(task_name)

                records = redshift.get_records(f"SELECT COUNT(*) FROM {table}")

                if len(records) >= 1 and len(records[0]) >= 1:
                    records_count = records[0][0]
                    if records_count < min_records:
                        raise ValueError(
                            f"""
                            {error_messages}
                            for {task_name}
                            found {records_count},
                            Expected a minimum of {min_records} records.
                            """
                            )
                else:
                    raise ValueError(f"{error_messages}. No result for {task_name}")
                
                if 'not_null_columns' in table_dict['data_quality'] and records_count > 0:
                    not_null_columns = table_dict['data_quality']['not_null_columns']

                    for column in not_null_columns:
                        check_null_task_name = f"Check for null values in {table}.{column}"
                        self.log.info(check_null_task_name)
                        count_nulls = redshift.get_records(f"SELECT COUNT(*) FROM {table} WHERE {column} is null")
                        
                        if len(count_nulls) >= 1 and len(count_nulls[0]) >= 1:
                            null_values = count_nulls[0][0]
                            if null_values > 0:
                                raise ValueError(
                                    f"""
                                    {error_messages}
                                    for {check_null_task_name},
                                    Found {null_values} null records in {table}.{column}.
                                    """
                                    )
                        else:
                            raise ValueError(f"{error_messages}. No result for {check_null_task_name}")
Example #27
    def test_rowcount(self):
        hook = PostgresHook()
        input_data = ["foo", "bar", "baz"]

        with hook.get_conn() as conn:
            with conn.cursor() as cur:
                cur.execute("CREATE TABLE {} (c VARCHAR)".format(self.table))
                values = ",".join("('{}')".format(data) for data in input_data)
                cur.execute("INSERT INTO {} VALUES {}".format(
                    self.table, values))
                conn.commit()
                self.assertEqual(cur.rowcount, len(input_data))
Example #28
def process_log_file(filepath):
    """
    Reads from log datasource and loads data into time, user and songplays tables
    :param filepath: Source data location
    """
    
    postgres_hook = PostgresHook(postgres_conn_id='local_postgres', schema='postgres')
    conn = postgres_hook.get_conn()
    cur = conn.cursor()

    # open log file
    df = pd.read_json(filepath, lines=True)

    # filter by NextSong action
    df = df[df['page'] == 'NextSong']

    # convert timestamp column to datetime
    t = pd.to_datetime(df['ts'], unit='ms')
    
    # insert time data records
    # Series.dt.week was removed in pandas 2.0; isocalendar().week replaces it
    time_data = [t, t.dt.hour, t.dt.day, t.dt.isocalendar().week,
                 t.dt.month, t.dt.year, t.dt.weekday]
    column_labels = ['start_time', 'hour', 'day', 'week', 'month', 'year', 'weekday']
    time_df = pd.DataFrame.from_dict(dict(zip(column_labels, time_data)))

    for i, row in time_df.iterrows():
        cur.execute(time_table_insert, list(row))

    # load user table
    user_df = df[['userId', 'firstName', 'lastName', 'gender', 'level']]

    # insert user records
    for i, row in user_df.iterrows():
        cur.execute(user_table_insert, row)

    # insert songplay records
    for index, row in df.iterrows():
        
        # get songid and artistid from song and artist tables
        cur.execute(song_select, (row.song, row.artist, row.length))
        results = cur.fetchone()
        
        if results:
            songid, artistid = results
        else:
            songid, artistid = None, None

        # insert songplay record
        songplay_data = (pd.to_datetime(row['ts'], unit='ms'), row['userId'], row['level'], songid, artistid, row['sessionId'], 
                     row['location'], row['userAgent'])
        cur.execute(songplay_table_insert, songplay_data)

    conn.commit()
    cur.close()
Example #29
    def execute(self, context):
        """
        Description: Runs each data quality check passed in the
                     data_quality_checks list: every check pairs a SQL query
                     with an expected result, and a mismatch raises a ValueError.

        Arguments:
            self: Instance of the class
            context: Context dictionary

        Returns:
            None
        """

        # Build connection
        postgres = PostgresHook(postgres_conn_id=self.postgres_conn_id)

        # If no quality checks were specified, the function is terminated
        if not self.data_quality_checks:
            self.log.info(
                'No data quality checks were specified. Data quality checks canceled.'
            )
            return

        # Run each quality check: execute its SQL and compare the first value
        # of the result against the expected value.
        for check in self.data_quality_checks:
            sql_query = check.get('sql_query')
            expected_result = check.get('expected_result')

            try:
                self.log.info(
                    'Starting SQL query for data check - {}'.format(sql_query))
                records = postgres.get_records(sql_query)
                num_records = records[0][0]

                if num_records != expected_result:
                    raise ValueError(
                        'Data quality check failed. {} entries expected. {} given'
                        .format(expected_result, num_records))
                else:
                    self.log.info(
                        'Data Check passed for query - {}. Result: {}'.format(
                            sql_query, num_records))

            except ValueError as v:
                self.log.error(v.args)
                raise
            except Exception as e:
                self.log.error(
                    'SQL query for data check failed - {}. Exception: {}'.
                    format(sql_query, e))
                raise
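Each check is a plain dict with sql_query and expected_result keys; an illustrative list (the queries and table names are assumptions):

# Illustrative checks matching the keys read in execute()
data_quality_checks = [
    {'sql_query': 'SELECT COUNT(*) FROM users WHERE userid IS NULL',
     'expected_result': 0},
    {'sql_query': 'SELECT COUNT(*) FROM songs WHERE title IS NULL',
     'expected_result': 0},
]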
Example #30
 def ingest_file_into_db(self, request_id):
     pg_hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                            schema=self.database)
     with closing(pg_hook.get_conn()) as pg_conn:
         with closing(pg_conn.cursor()) as pg_cursor:
             csv_file = self.unzip_csv(
                 self.get_file_data(request_id, pg_cursor))
             rowcount = self.upload_local_file_into_table(
                 csv_file, pg_cursor, table.weather_daily)
             csv_file.close()
         pg_conn.commit()
     return rowcount