def test_snowflake_operator(self, mock_get_hook):
     sql = """
     CREATE TABLE IF NOT EXISTS test_airflow (
         dummy VARCHAR(50)
     );
     """
     t = SnowflakeOperator(task_id='basic_snowflake', sql=sql, dag=self.dag)
     t.run(start_date=DEFAULT_DATE,
           end_date=DEFAULT_DATE,
           ignore_ti_state=True)
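The test above receives a mock_get_hook argument, so the snippet presumably sits under a mock.patch decorator that was cut off. A minimal sketch of the assumed surrounding test class follows; the patch target, DEFAULT_DATE, and the class name are assumptions, not the original source.

import datetime
from unittest import mock

from airflow.models import DAG
from airflow.providers.snowflake.operators.snowflake import SnowflakeOperator

DEFAULT_DATE = datetime.datetime(2021, 1, 1)


class TestSnowflakeOperator:
    def setup_method(self):
        self.dag = DAG('test_dag_id', start_date=DEFAULT_DATE)

    # Assumed patch target: the hook factory is mocked so the task never
    # opens a real Snowflake connection when run() executes it in place.
    @mock.patch('airflow.providers.snowflake.operators.snowflake.SnowflakeOperator.get_db_hook')
    def test_snowflake_operator(self, mock_get_hook):
        sql = "CREATE TABLE IF NOT EXISTS test_airflow (dummy VARCHAR(50));"
        t = SnowflakeOperator(task_id='basic_snowflake', sql=sql, dag=self.dag)
        t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)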
Example #3
        def upload_to_snowflake(task_id):
            sql_statements = []

            # look for CSVs to ingest into Snowflake
            for output_file in glob.glob(output_file_glob + ".csv"):
                s3_file_name = os.path.basename(output_file)
                tablename = os.path.splitext(s3_file_name)[0].replace("-", "_")
                snowflake_stage = Variable.get("SNOWFLAKE_STAGE",
                                               default_var="COVID_PROD")

                truncate_st = f'TRUNCATE TABLE {tablename}'
                insert_st = f'copy into {tablename} from @{snowflake_stage}/{s3_file_name} file_format = (type = "csv" field_delimiter = "," NULL_IF = (\'NULL\', \'null\',\'\') EMPTY_FIELD_AS_NULL = true FIELD_OPTIONALLY_ENCLOSED_BY=\'"\' skip_header = 1)'
                sql_statements.append(truncate_st)
                sql_statements.append(insert_st)

            sql_statements.append("COMMIT")

            create_insert_task = SnowflakeOperator(
                task_id=task_id,
                sql=sql_statements,
                autocommit=False,
                snowflake_conn_id=Variable.get("SNOWFLAKE_CONNECTION",
                                               default_var="SNOWFLAKE"),
            )

            return create_insert_task
Example #4
    def load_raw_ingester_table(job_args, defaults, table, beid_list,
                                parent_dag_name, start_date, schedule_interval,
                                sf_conn_id, sf_role, sf_warehouse, sf_database,
                                env):
        dag = DAG("{}.{}".format(parent_dag_name, table),
                  default_args=defaults,
                  start_date=start_date,
                  schedule_interval=schedule_interval,
                  catchup=False,
                  concurrency=32)

        sql_path = os.path.join(job_args["ingester_sql_path"], table)

        for raw_beid in beid_list:
            beid = re.search(r'.*client=([0-9]+)/', raw_beid).group(1)

            raw_ingester_query = SqlUtils.load_query(sql_path).replace(
                "[beid_param]", raw_beid).split("---")

            per_beid_task = SnowflakeOperator(task_id=beid,
                                              snowflake_conn_id=sf_conn_id,
                                              warehouse=sf_warehouse,
                                              database=sf_database,
                                              sql=raw_ingester_query,
                                              params={
                                                  "env": env,
                                                  "table": table
                                              },
                                              autocommit=True,
                                              dag=dag)

        return dag
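Example #4 (and several later examples) calls SqlUtils.load_query, a project-specific helper that is not shown anywhere on this page. A minimal sketch of what such a helper might look like, assuming it simply reads a .sql script whose statements are separated by '---':

import os


class SqlUtils:
    """Assumed helper: returns the raw text of a SQL script so callers can
    split it on '---' into a list of statements for SnowflakeOperator."""

    @staticmethod
    def load_query(sql_path):
        # Accept either a path to a .sql file or a directory holding one.
        if os.path.isdir(sql_path):
            scripts = sorted(name for name in os.listdir(sql_path)
                             if name.endswith(".sql"))
            sql_path = os.path.join(sql_path, scripts[0])
        elif not sql_path.endswith(".sql"):
            sql_path += ".sql"
        with open(sql_path) as handle:
            return handle.read()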
Example #5
 def execute_script(sql_file):
     script = env.get_template(sql_file).render(**template_params)
     sql_statements = script.split(";")
     return SnowflakeOperator(
         task_id="execute_script_" + sql_file,
         sql=sql_statements,
         autocommit=False,
         snowflake_conn_id=Variable.get("SNOWFLAKE_CONNECTION",
                                        default_var="SNOWFLAKE"),
     )
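The execute_script helper above depends on an env and template_params defined elsewhere in the DAG file. A minimal sketch of the assumed Jinja setup; the loader path and parameter names are assumptions:

import jinja2

# Assumed: the SQL templates live in a "sql" directory next to the DAG file.
env = jinja2.Environment(loader=jinja2.FileSystemLoader("sql"))

# Assumed: values substituted into each template before the rendered script
# is split on ";" and handed to SnowflakeOperator as a list of statements.
template_params = {
    "database": "DEMO_DB",
    "schema": "PUBLIC",
}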
Example #6
        def upload_to_snowflake(task_id):

            insert_st = f'copy into {basename.replace("-","_")} from @COVID_PROD/{basename}.csv file_format = (type = "csv" field_delimiter = ","  FIELD_OPTIONALLY_ENCLOSED_BY=\'"\' skip_header = 1)'

            create_insert_task = SnowflakeOperator(
                task_id=task_id,
                sql=insert_st,
                snowflake_conn_id="OJ10999_COVID19",
            )

            return create_insert_task
Example #7
        def upload_to_snowflake(task_id):

            truncate_st = f'TRUNCATE TABLE {basename.replace("-", "_")}'
            insert_st = f'copy into {basename.replace("-","_")} from @COVID_PROD/{basename}.csv file_format = (type = "csv" field_delimiter = ","  FIELD_OPTIONALLY_ENCLOSED_BY=\'"\' skip_header = 1)'

            create_insert_task = SnowflakeOperator(
                task_id=task_id,
                sql=[truncate_st, insert_st, "COMMIT"],
                autocommit=False,
                snowflake_conn_id=Variable.get("SNOWFLAKE_CONNECTION",
                                               default_var="SNOWFLAKE"),
            )

            return create_insert_task
Example #8
        def upload_to_snowflake(task_id):
            sql_statements = []

            for output_file in glob.glob(output_file_glob):
                s3_file_name = os.path.basename(output_file)
                tablename = os.path.splitext(s3_file_name)[0].replace("-", "_")

                truncate_st = f'TRUNCATE TABLE {tablename}'
                insert_st = f'copy into {tablename} from @COVID_PROD/{s3_file_name} file_format = (type = "csv" field_delimiter = ","  FIELD_OPTIONALLY_ENCLOSED_BY=\'"\' skip_header = 1)'
                sql_statements.append(truncate_st)
                sql_statements.append(insert_st)

            sql_statements.append("COMMIT")

            create_insert_task = SnowflakeOperator(
                task_id=task_id,
                sql=sql_statements,
                autocommit=False,
                snowflake_conn_id=Variable.get("SNOWFLAKE_CONNECTION",
                                               default_var="SNOWFLAKE"),
            )

            return create_insert_task
Example #9
    'depends_on_past': False,
    'start_date': days_ago(7),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}
DAG_DESCRIPTION = 'Email discounts to customers that have experienced order delays daily'

DAG = dag = DAG(DAG_ID,
                schedule_interval='@weekly',
                default_args=DAG_DEFAULT_ARGS,
                description=DAG_DESCRIPTION)

TASK_ID = 'select'
TASK = SnowflakeOperator(task_id=TASK_ID,
                         snowflake_conn_id=CONN_ID,
                         sql=SQL,
                         dag=DAG)


@mock.patch(
    'openlineage.airflow.extractors.snowflake_extractor.SnowflakeExtractor._get_table_schemas'
)  # noqa
@mock.patch('openlineage.airflow.extractors.postgres_extractor.get_connection')
def test_extract(get_connection, mock_get_table_schemas):
    mock_get_table_schemas.side_effect = \
        [[DB_TABLE_SCHEMA], NO_DB_TABLE_SCHEMA]

    conn = Connection()
    conn.parse_from_uri(uri=CONN_URI)
    get_connection.return_value = conn
Example #10
from airflow import DAG
from datetime import datetime, timedelta

from airflow.contrib.operators.snowflake_operator import SnowflakeOperator

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2021, 5, 14),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=60),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('sr_etl2', default_args=default_args, schedule_interval='@once')

snowflake = SnowflakeOperator(
    task_id='snowflake_task',
    sql="select * from CURO_CLAIMS.curo_medclaims LIMIT 5",
    snowflake_conn_id='ocudm',
    dag=dag)
Example #11
    body = """
    Hi Everyone, <br>
    <br>
    There's been an error in the {task_name} job.<br>
    <br>
    Forever yours,<br>
    Airflow bot <br>
    """.format(**contextDict)

    send_email_smtp('*****@*****.**', title, body)


with dag:
    create_table = SnowflakeOperator(
        task_id="snowflake_create",
        sql=query_for_creation,
        snowflake_conn_id="snowflake_airflow",
    )

with dag:
    load_table = SnowflakeOperator(task_id='snowflake_load',
                                   sql=query_for_loading,
                                   snowflake_conn_id="snowflake_airflow")
with dag:
    update_data = SnowflakeOperator(
        task_id='snowflake_update',
        sql=query_for_update,
        snowflake_conn_id="snowflake_airflow",
    )

with dag:
Example #12
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(
    'snowflake_analytics',
    default_args=default_args,
    description='Snowflake analytics pipeline',
    schedule_interval='@daily',
)


t1 = SnowflakeOperator(
    task_id='create_analytics_calendar_date',
    sql='sql/create_analytics_calendar_date.sql',
    snowflake_conn_id='snowflake_default',
    warehouse='load_wh',
    database='analytics',
    autocommit=True,
    dag=dag)

t2 = SnowflakeOperator(
    task_id='create_analytics_airline',
    sql='sql/create_analytics_airline.sql',
    snowflake_conn_id='snowflake_default',
    warehouse='load_wh',
    database='analytics',
    autocommit=True,
    dag=dag)

t3 = SnowflakeOperator(
    task_id='create_analytics_nyc_taxi',
Example #13
)

stage_finish = DummyOperator(task_id="adlogs_snowflake_staging_finish")

# staging ad logs hourly
for table in JOB_ARGS["tables"]:

    stage_sql_path = os.path.join(
        JOB_ARGS["stage_sql_path"],
        table
        )

    query_log = SqlUtils.load_query(stage_sql_path).split("---")
    
    stage_adlogs_hourly_job = SnowflakeOperator(
        task_id="stage_logs_{}_hourly".format(table),
        snowflake_conn_id=SF_CONN_ID,
        warehouse=SF_WAREHOUSE,
        database=SF_DATABASE,
        sql=query_log,
        params={
            "env": ENV,
            "team_name": TEAM_NAME
        },
        autocommit=True,
        trigger_rule='all_done',
        dag=DAG
    )

    stage_adlogs_hourly_job >> stage_finish
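Example #13 and the later ad-log staging snippets share module-level names (JOB_ARGS, SF_CONN_ID, SF_WAREHOUSE, SF_DATABASE, ENV, TEAM_NAME) that are defined outside the snippets. A minimal sketch of what that setup could look like; every value below is an assumption:

import os
import yaml

# Assumed: the original project loads its job configuration (tables,
# SQL paths, schedule) from a YAML file that lives next to the DAG.
with open(os.path.join(os.path.dirname(__file__), "adlogs_job.yaml")) as f:
    JOB_ARGS = yaml.safe_load(f)

SF_CONN_ID = "snowflake_default"   # Airflow connection id for Snowflake
SF_WAREHOUSE = "load_wh"
SF_DATABASE = "adlogs_db"
ENV = "prod"
TEAM_NAME = "adlogs"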
Example #14

def get_row_count(**context):
    dwh_hook = SnowflakeHook(snowflake_conn_id="snowflake_common")
    result = dwh_hook.get_first(
        "select count(*) from private.{}".format(table_name))
    logging.info("Number of rows in `private.%s`  - %s", table_name, result[0])


with dag:
    create_table = SnowflakeOperator(
        task_id="create_table",
        sql=create_table_queries,
        snowflake_conn_id="snowflake_common",
        outlets={
            "datasets": [
                SnowflakeTable(
                    table_alias="cy25812.ap-southeast-1/demo_db/private/{}".
                    format(table_name),
                    name=table_name)
            ]
        })

    count_rows = PythonOperator(
        task_id="count_rows",
        python_callable=get_row_count,
        inlets={
            "datasets": [
                SnowflakeTable(
                    table_alias="cy25812.ap-southeast-1/demo_db/private/{}".
                    format(table_name),
                    name=table_name)
Example #15
    result = snowflake_hook.get_first(
        "SELECT COUNT(*) FROM @AZURE_DEMO_STAGE/superstore_sample_data.csv (FILE_FORMAT => CSV_FILE_FORMAT);"
    )
    logging.info("Number of rows in raw file  - %s", result[0])


#Using a with statement to register the following tasks on the dag
with dag:

    #Using a python operator to run a row count against our raw data in Azure via the Snowflake hook
    check_raw = PythonOperator(task_id="check_raw", python_callable=check_raw)

    #Using a snowflake operator to execute our create command in Snowflake
    create = SnowflakeOperator(
        task_id="snowflake_create",
        sql=create_query,
        snowflake_conn_id="snowflake_conn",
    )

    #Using a snowflake operator to execute our insert command in Snowflake
    insert = SnowflakeOperator(
        task_id="snowflake_insert",
        sql=insert_query,
        snowflake_conn_id="snowflake_conn",
    )

    #Using a python operator to execute a user-defined python function
    check_table = PythonOperator(task_id="check_table",
                                 python_callable=check_table)

#Using python bitwise shift operators to determine the workflow of our dag (the order in which our tasks will be executed)
Example #16
dag = DAG(
    dag_id="sample_snowflake", 
    default_args=default_args,
    start_date=days_ago(1),
    schedule_interval=None
)

status_sql = """
insert into public.stage_status values (101, '{}', 'create_file', 'success');
""".format(str(datetime.now()))

def row_count(**context):
    dwh_hook = SnowflakeHook(snowflake_conn_id="snowflake_sql")
    result = dwh_hook.get_first("select count(*) from public.stage_status")
    print("Number of rows in `public.test_table`  - %s", result[0])


with dag:
    create_insert = SnowflakeOperator(
        task_id="snowflake_insert",
        sql=status_sql,
        snowflake_conn_id="snowflake_sql",
    )

    get_count = PythonOperator(
        task_id="get_count", 
        python_callable=row_count
    )

create_insert >> get_count
Example #17
from airflow.contrib.operators.snowflake_operator import SnowflakeOperator

from atlan.lineage.assets import SnowflakeTable

args = {"owner": "Atlan", "start_date": airflow.utils.dates.days_ago(2)}

dag = DAG(dag_id="customer_distribution_apac", default_args=args, schedule_interval=None)

with dag:
    ## task definition
  customer_nation_join = SnowflakeOperator(
    task_id = "customer_nation_join",
    sql = """CREATE TABLE biw.private.customer_enriched AS
             SELECT c.c_custkey,
                c.c_acctbal,
                c.c_mktsegment,
                n.n_nationkey,
                n.n_name
             FROM biw.raw.customer c
             INNER JOIN biw.raw.nation n ON c.c_nationkey = n.n_nationkey""",
    snowflake_conn_id = "snowflake_common",
    inlets = {
      "datasets": [SnowflakeTable(table_alias = "mi04151.ap-south-1/biw/raw/customer", 
                                  name = "customer"), 
                   SnowflakeTable(table_alias = "mi04151.ap-south-1/biw/raw/nation", 
                                  name = "nation")]
    },
    outlets = {
      "datasets": [SnowflakeTable(table_alias = "mi04151.ap-south-1/biw/private/customer_enriched", 
                                  name = "customer_enriched")]
    }
  )
Example #18
DAG = DAG(  #creating DAG
    "stage_ad_logs_to_snowflake",  #goal of DAG is to stage ad logs to Snowflake
    default_args=DEFAULTS,
    start_date=datetime(2018, 1, 1),
    schedule_interval=JOB_ARGS["schedule_interval"],
    catchup=False)

stage_int_sql_path = os.path.join(
    JOB_ARGS["stage_sql_path"],  #stage_sql_path = adlogs/load_raw_logs
    "int")
stage_int_hourly_query = SqlUtils.load_query(stage_int_sql_path).split("---")
stage_int_hourly_job = SnowflakeOperator(task_id="stage_adlogs_int_hourly",
                                         snowflake_conn_id=SF_CONN_ID,
                                         warehouse=SF_WAREHOUSE,
                                         database=SF_DATABASE,
                                         sql=stage_int_hourly_query,
                                         params={"env": ENV},
                                         autocommit=True,
                                         dag=DAG)

stage_int_tables = DummyOperator(task_id="finish_int_rl_staging")

stage_onetag_sql_path = os.path.join(JOB_ARGS["stage_sql_path"], "onetag")
stage_onetag_hourly_query = SqlUtils.load_query(stage_onetag_sql_path).split(
    "---")
stage_onetag_hourly_job = SnowflakeOperator(
    task_id="stage_adlogs_onetag_hourly",
    snowflake_conn_id=SF_CONN_ID,
    warehouse=SF_WAREHOUSE,
    database=SF_DATABASE,
    sql=stage_onetag_hourly_query,
Example #19
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(
    'snowflake_raw',
    default_args=default_args,
    description='Snowflake raw pipeline',
    schedule_interval='0 */6 * * *',
)

t1 = SnowflakeOperator(task_id='copy_raw_airline',
                       sql='sql/copy_raw_airline.sql',
                       snowflake_conn_id='snowflake_default',
                       warehouse='load_wh',
                       database='raw',
                       autocommit=True,
                       dag=dag)

t2 = SnowflakeOperator(task_id='copy_raw_nyc_taxi',
                       sql='sql/copy_raw_nyc_taxi.sql',
                       snowflake_conn_id='snowflake_default',
                       warehouse='load_wh',
                       database='raw',
                       autocommit=True,
                       dag=dag)

t1 >> t2
Example #20
        sql_path = os.path.join(
            JOB_ARGS[
                "stage_sql_path"],  #stage_sql_path = adlogs/log_process/filename.sql
            process)
        sql_query = SqlUtils.load_query(sql_path).split(
            "---"
        )  # sql_query is a list of all the queries in a given .sql file, separated by '---'
        query_log += sql_query

        #for i in range(len(sql_query)):
        #     query_log.append(sql_query[i]) # using .append() instead of +=

    #create task with appropriate sql query
    transform_task = SnowflakeOperator(
        task_id="{}_transform".format(table),
        snowflake_conn_id=SF_CONN_ID,
        warehouse=SF_WAREHOUSE,
        database=SF_DATABASE,
        sql=query_log,  # here is where we pass in the query log
        params={  #set strings to reference in the .sql scripts
            "env": ENV,
            "table": table,
            "database": SF_DATABASE,  #ft_db_dev
            "transform_schema": TRANSFORM_SCHEMA,  #transform_schema
            "business_schema": BUSINESS_SCHEMA  #ssc_business
        },
        autocommit=True,
        dag=DAG)

    adlogs_sensor >> transform_task >> transform_finish
Example #21
                                        #op_kwargs = {"file_name" : file}
                                        )

load_to_s3 = PythonOperator(dag=dag,
                            task_id=f"load_to_s3",
                            python_callable=load_to_s3,
                            provide_context=True,
                            op_kwargs={"file_name": "contacts"})

s3_to_snowflake = PythonOperator(dag=dag,
                                 task_id=f"s3_to_snowflake",
                                 python_callable=s3_to_snowflake,
                                 provide_context=True,
                                 op_kwargs={"file_name": "contacts"})

Contacts_merge_snowflake = SnowflakeOperator(
    dag=dag,
    task_id=f"Contacts_merge_snowflake",
    sql=merge_query,
    snowflake_conn_id="snowflake_conn")

completed_flag = PythonOperator(dag=dag,
                                task_id=f"completed_flag",
                                python_callable=completed_flag,
                                provide_context=True
                                #op_kwargs = {"file_name" : file}
                                )

dummy = DummyOperator(task_id="dummy", dag=dag)

dummy >> get_latest_Created_contacts >> get_latest_Updated_contacts >> get_contact_properties >> load_to_s3 >> s3_to_snowflake >> Contacts_merge_snowflake >> completed_flag
Example #22
dag = DAG(dag_id="snowflake_connector_test",
          default_args=args,
          schedule_interval='@once')

create_insert_query = [
    """create table public.test_table (amount number);""",
    """insert into public.test_table values(1),(2),(3);"""
]

#def row_count(**kwargs):
#    sleep(120)
#    dwh_hook = SnowflakeHook(snowflake_conn_id="ocudm")
#    result = dwh_hook.get_first("select count(*) from curo_medclaims")
#    print("hello")
#    sleep(120)
#    logging.info("Number of rows in `curo_medclaims`  - %s", result[0])

with dag:
    create_insert = SnowflakeOperator(
        task_id="snowfalke_create",
        sql=create_insert_query,
        snowflake_conn_id="ocudm",
    )
    #get_count = PythonOperator(task_id="get_count", python_callable=row_count,provide_context=True)
    sleep = BashOperator(task_id='sleep', bash_command='sleep 30')
    print_hello = BashOperator(task_id='print_hello',
                               bash_command='echo "hello"')

print_hello >> sleep >> create_insert
Example #23
import logging
import airflow
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.contrib.hooks.snowflake_hook import SnowflakeHook
from airflow.contrib.operators.snowflake_operator import SnowflakeOperator

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

args = {"owner": "Airflow", "start_date": airflow.utils.dates.days_ago(2)}

dag = DAG(dag_id="snowflake_conn", default_args=args, schedule_interval=None)

create_query = [
    """create table PUBLIC.TEST_TABLE;""",
]

with dag:
    create = SnowflakeOperator(
        task_id="snowflake_create",
        sql=create_query,
        snowflake_conn_id="snowflake_conn",
    )

# create_query
Example #24
    'email_on_retry': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('demo',
          max_active_runs=1,
          schedule_interval='@daily',
          default_args=default_args,
          catchup=False)

create_discounts_by_quarter = """
    create or replace table {{ var.value.database }}.{{ var.value.spa_schema }}.discounts_by_fiscal_quarter as
    select c_start.fiscal_quarter, c_start.fiscal_year, count(*) as discounts
        from {{ var.value.database }}.{{ var.value.ods_schema }}.discount d 
        join {{ var.value.database }}.{{ var.value.ods_schema }}.calendar c_start on d.start_date=c_start.date
    group by 1,2
    order by 2,1
"""

with dag:
    start = DummyOperator(task_id='start', dag=dag)

    discount_by_quarter = SnowflakeOperator(
        task_id='discount_by_quarter',
        snowflake_conn_id='snowflake_default',
        sql=create_discounts_by_quarter,
        warehouse='ANALYTICS_LARGE',
        dag=dag)

    start >> discount_by_quarter
Example #25
from airflow import DAG
from airflow.contrib.operators.snowflake_operator import SnowflakeOperator
from datetime import datetime, timedelta

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

with DAG('paramaterized_query',
         start_date=datetime(2020, 6, 1),
         max_active_runs=3,
         schedule_interval='@daily',
         default_args=default_args,
         template_searchpath='/usr/local/airflow/include',
         catchup=False) as dag:

    opr_param_query = SnowflakeOperator(task_id='param_query',
                                        snowflake_conn_id='snowflake',
                                        sql='param-query.sql')
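In the last example, sql='param-query.sql' works because sql is a templated field on SnowflakeOperator: a value ending in .sql is looked up under template_searchpath ('/usr/local/airflow/include' here) and rendered with Jinja before it is sent to Snowflake. A minimal sketch of what such a file could contain, shown as a Python string; the table and column names are assumptions:

# Assumed contents of /usr/local/airflow/include/param-query.sql; Airflow
# renders {{ ds }} to the DAG run's logical date before execution.
PARAM_QUERY_SQL = """
SELECT *
FROM demo_db.public.orders
WHERE order_date = '{{ ds }}';
"""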