Example #1
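    # Renders the operator's templated fields and checks that application_args
    # and name resolve to the expected values for DEFAULT_DATE.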
    def test_render_template(self):
        # Given
        operator = SparkSubmitOperator(task_id='spark_submit_job',
                                       dag=self.dag,
                                       **self._config)
        ti = TaskInstance(operator, DEFAULT_DATE)

        # When
        ti.render_templates()

        # Then
        expected_application_args = [
            '-f',
            'foo',
            '--bar',
            'bar',
            '--start',
            (DEFAULT_DATE - timedelta(days=1)).strftime("%Y-%m-%d"),
            '--end',
            DEFAULT_DATE.strftime("%Y-%m-%d"),
            '--with-spaces',
            'args should keep embdedded spaces',
        ]
        expected_name = 'spark_submit_job'
        self.assertListEqual(expected_application_args,
                             getattr(operator, '_application_args'))
        self.assertEqual(expected_name, getattr(operator, '_name'))
Example #2
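# Wait for the Cassandra table, then submit a Spark job that exports the videos
# table to Avro. The plain-Python load/write tasks below are kept for reference
# but are currently commented out in favor of the Spark job.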
def cassandra_to_avro():
    # @task
    def load_from_cassandra() -> List[Tuple[str, str]]:
        conn: Connection = Connection.get_connection_from_secrets('local_cassandra')
        auth_provider = PlainTextAuthProvider(username=conn.login, password=conn.password)
        cluster: Cluster = Cluster([conn.host], conn.port, auth_provider=auth_provider)
        session: Session = cluster.connect(conn.schema)
        rows: ResultSet = session.execute("SELECT title, description FROM videos")
        result = list(map(lambda row: (row[0], row[1]), rows))
        print(result)
        return result
    
    # @task
    def write_to_hdfs(rows: List[Tuple[str, str]]):
        conn: Connection = Connection.get_connection_from_secrets('local_hdfs')
        uri = conn.get_uri()
        # Strip any embedded user:password@ credentials from the connection URI.
        pat = re.compile(r"http://(\w+(:\w+)?)?@")
        print(conn.get_uri())

        uri = pat.sub("http://", uri)
        print(uri)
        print(conn.login)
        client = InsecureClient(uri, user=conn.login)
        sch = avro.schema.make_avsc_object({
            'type':'record',
            'name':'Video',
            'fields': [
                {'type': {'type': 'string', 'avro.java.string': 'String'}, 'name': 'title'},
                {'type': ["null", {'type': 'string', 'avro.java.string': 'String'}], 'name': 'description'},
            ]
        })
        local_file_name = 'videos.avro'
        writer = DataFileWriter(open(local_file_name, "wb"), DatumWriter(), sch)
        for row in rows:
            print(row)
            writer.append({"title":row[0], "description":row[1]})
        writer.close()
        client.upload('/tmp/videos.avro', local_file_name)

    load_and_save_using_spark = SparkSubmitOperator(
        task_id="cassandra_to_avro_spark",
        conn_id="spark_local",
        name="cassandra_to_avro_spark",
        application="dags/cassandra_to_avro_spark.py",
        packages="org.apache.spark:spark-avro_2.12:3.1.1,com.datastax.spark:spark-cassandra-connector_2.12:3.0.0",
    )
        
    # ctx = get_current_context()
    table_sensor = CassandraTableSensor(
        task_id="cassandra_table_sensor",
        cassandra_conn_id='local_cassandra',
        table="killrvideo.videos",
    )

    # load = load_from_cassandra()
    # write_to_hdfs(load)
    table_sensor >> load_and_save_using_spark
Example #3
def covid_per_popgroup_subdag(parent_dag_id, child_dag_id, args):
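    # Subdag: read the most recent processed date, stage newer CDC case data
    # via Spark, then insert per-population-group counts into Postgres.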
    with DAG(
            dag_id=f'{parent_dag_id}.{child_dag_id}',
            default_args=args,
            #start_date= datetime(2021,5,2), #days_ago(2), #datetime.datetime.now(), #days_ago(2),
            #schedule_interval = '@once',
            #tags=['covid'],
    ) as dag:

        last_date_popgroup_task = get_last_date_popgroup(
            task_id="last_date_popgroup_task")

        current_dir = os.path.dirname(os.path.abspath(__file__))
        root_dir = os.path.dirname(current_dir)
        download_recent_cdc_task = SparkSubmitOperator(
            task_id="download_recent_cdc_task",
            conn_id="spark_default",
            application=os.path.join(root_dir, "python",
                                     "stage_recent_cdc.py"),
            application_args=[
                "--apptoken",
                Variable.get("socrata_apptoken"),
                "--last_date",
                "{{ti.xcom_pull( task_ids = 'last_date_popgroup_task', key = 'last_cdc_date')}}",
            ],
        )

        insert_covid_pergroup_task = PostgresOperator(
            task_id="insert_covid_pergroup_task",
            postgres_conn_id="postgres_default",
            sql="""
                    INSERT INTO covid_per_popgroup(cdc_case_earliest_dt, sex_id, age_group_id, race_ethnicity_id, count)
                    SELECT cdc_case_earliest_dt, sex_id, age_group_id, race_ethnicity_id, count
                    FROM recent_cdc AS n
                    JOIN  dim_age_group AS a ON a.age_group = n.age_group
                    JOIN dim_sex AS s ON s.sex = n.sex
                    JOIN dim_race_ethnicity AS e ON e.race = n.race_ethnicity_combined
                    ;
                """)
        last_date_popgroup_task >> download_recent_cdc_task >> insert_covid_pergroup_task
        return dag
Example #4
"""
from airflow.models import DAG
from airflow.providers.apache.spark.operators.spark_jdbc import SparkJDBCOperator
from airflow.providers.apache.spark.operators.spark_sql import SparkSqlOperator
from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator
from airflow.utils.dates import days_ago

args = {'owner': 'Airflow', 'start_date': days_ago(2)}

with DAG(dag_id='example_spark_operator',
         default_args=args,
         schedule_interval=None,
         tags=['example']) as dag:
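    # Provider how-to examples: submit the bundled pi.py application, and copy
    # JDBC table "foo" into metastore table "bar" with SparkJDBCOperator.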
    # [START howto_operator_spark_submit]
    submit_job = SparkSubmitOperator(
        application="${SPARK_HOME}/examples/src/main/python/pi.py",
        task_id="submit_job")
    # [END howto_operator_spark_submit]

    # [START howto_operator_spark_jdbc]
    jdbc_to_spark_job = SparkJDBCOperator(
        cmd_type='jdbc_to_spark',
        jdbc_table="foo",
        spark_conf={},
        spark_jars="${SPARK_HOME}/jars/postgresql-42.2.12.jar",
        jdbc_driver="org.postgresql.Driver",
        metastore_table="bar",
        save_mode="overwrite",
        save_format="JSON",
        task_id="jdbc_to_spark_job")
Example #5
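    # Builds the operator from self._config and verifies that every constructor
    # argument is stored unchanged on the corresponding attribute.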
    def test_execute(self):

        # Given / When
        conn_id = 'spark_default'
        operator = SparkSubmitOperator(task_id='spark_submit_job',
                                       spark_binary="sparky",
                                       dag=self.dag,
                                       **self._config)

        # Then expected results
        expected_dict = {
            'conf': {'parquet.compression': 'SNAPPY'},
            'files': 'hive-site.xml',
            'py_files': 'sample_library.py',
            'archives': 'sample_archive.zip#SAMPLE',
            'driver_class_path': 'parquet.jar',
            'jars': 'parquet.jar',
            'packages': 'com.databricks:spark-avro_2.11:3.2.0',
            'exclude_packages': 'org.bad.dependency:1.0.0',
            'repositories': 'http://myrepo.org',
            'total_executor_cores': 4,
            'executor_cores': 4,
            'executor_memory': '22g',
            'keytab': 'privileged_user.keytab',
            'principal': 'user/[email protected]',
            'proxy_user': '******',
            'name': '{{ task_instance.task_id }}',
            'num_executors': 10,
            'status_poll_interval': 30,
            'verbose': True,
            'application': 'test_application.py',
            'driver_memory': '3g',
            'java_class': 'com.foo.bar.AppMain',
            'application_args': [
                '-f',
                'foo',
                '--bar',
                'bar',
                '--start',
                '{{ macros.ds_add(ds, -1)}}',
                '--end',
                '{{ ds }}',
                '--with-spaces',
                'args should keep embdedded spaces',
            ],
            'spark_binary': 'sparky',
        }

        self.assertEqual(conn_id, operator._conn_id)
        self.assertEqual(expected_dict['application'], operator._application)
        self.assertEqual(expected_dict['conf'], operator._conf)
        self.assertEqual(expected_dict['files'], operator._files)
        self.assertEqual(expected_dict['py_files'], operator._py_files)
        self.assertEqual(expected_dict['archives'], operator._archives)
        self.assertEqual(expected_dict['driver_class_path'],
                         operator._driver_class_path)
        self.assertEqual(expected_dict['jars'], operator._jars)
        self.assertEqual(expected_dict['packages'], operator._packages)
        self.assertEqual(expected_dict['exclude_packages'],
                         operator._exclude_packages)
        self.assertEqual(expected_dict['repositories'], operator._repositories)
        self.assertEqual(expected_dict['total_executor_cores'],
                         operator._total_executor_cores)
        self.assertEqual(expected_dict['executor_cores'],
                         operator._executor_cores)
        self.assertEqual(expected_dict['executor_memory'],
                         operator._executor_memory)
        self.assertEqual(expected_dict['keytab'], operator._keytab)
        self.assertEqual(expected_dict['principal'], operator._principal)
        self.assertEqual(expected_dict['proxy_user'], operator._proxy_user)
        self.assertEqual(expected_dict['name'], operator._name)
        self.assertEqual(expected_dict['num_executors'],
                         operator._num_executors)
        self.assertEqual(expected_dict['status_poll_interval'],
                         operator._status_poll_interval)
        self.assertEqual(expected_dict['verbose'], operator._verbose)
        self.assertEqual(expected_dict['java_class'], operator._java_class)
        self.assertEqual(expected_dict['driver_memory'],
                         operator._driver_memory)
        self.assertEqual(expected_dict['application_args'],
                         operator._application_args)
        self.assertEqual(expected_dict['spark_binary'], operator._spark_binary)
Example #6
                )
            COMMENT 'Main Table'
            ROW FORMAT DELIMITED
            FIELDS TERMINATED BY ','
            TBLPROPERTIES ("skip.header.line.count"="1");
        """)

    populate_hive_table = HiveOperator(task_id='populate_hive_table',
                                       hive_cli_conn_id='hive_conn',
                                       hql="""
            LOAD DATA INPATH '/covidData/owid-covid-data.csv' INTO TABLE cov_data
        """)

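    # Run the PySpark transformation, copy /IndiaCOVID from HDFS into the local
    # files directory, and upload the result to S3.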
    processing = SparkSubmitOperator(
        task_id='processing',
        application="/opt/airflow/dags/scripts/spark_processing.py",
        conn_id='spark_conn',
        verbose=True)

    copy_to_files = BashOperator(task_id='copy_to_files',
                                 bash_command="""
        hdfs dfs -get -f /IndiaCOVID /opt/airflow/dags/files
        """)

    task_uploads3 = PythonOperator(task_id='task_uploads3',
                                   python_callable=uploads3,
                                   op_kwargs={
                                       'filename':
                                       '/opt/airflow/dags/files/IndiaCOVID',
                                       'key': 'f',
                                       'bucket_name': 'c19-backups-airflow'
Example #7
                nzd DOUBLE,
                gbp DOUBLE,
                jpy DOUBLE,
                cad DOUBLE
                )
            ROW FORMAT DELIMITED
            FIELDS TERMINATED BY ','
            STORED AS TEXTFILE
        """
    )

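    # Process the forex rates with a small Spark job (two executors, 256M each).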
    forex_processing = SparkSubmitOperator(
        task_id='forex_processing',
        conn_id='spark_conn',
        application="/User/arunraja/airflow/dags/scripts/forex_processing.py",
        verbose=False,
        executor_cores=2,
        num_executors=2,
        executor_memory='256M',
        driver_memory='1G'
    )

    sending_email_notification = EmailOperator(
            task_id="sending_email",
            to="*****@*****.**",
            subject="forex_data_pipeline",
            html_content="""
                <h3>forex_data_pipeline succeeded</h3>
            """
        )
    
Example #8
    "applications.sink.prometheusServlet.path":
    "/metrics/applications/prometheus",
    "spark.kubernetes.executor.label.metrics-exposed": "true",
    "spark.kubernetes.driver.label.metrics-exposed": "true"
}

with DAG(dag_id="ddt-ingestion",
         schedule_interval="@hourly",
         default_args=default_args,
         catchup=False) as dag:
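    # Two-stage ingestion: stream from Kafka into MinIO, then enrich the resulting
    # Parquet data; both stages share the same connector packages and Spark conf.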

    stage_1 = SparkSubmitOperator(
        task_id="stage1",
        application="/opt/airflow/dags/repo/from_kafka_to_minio_streaming.py",
        conn_id="k8s_cluster",
        name="stage1",
        packages=("org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.1,"
                  "org.apache.kafka:kafka-clients:2.7.0,"
                  "org.apache.hadoop:hadoop-aws:3.1.2,"
                  "org.elasticsearch:elasticsearch-spark-30_2.12:7.13.1"),
        conf=spark_conf,
        verbose=False)

    stage_2 = SparkSubmitOperator(
        task_id="stage2",
        application="/opt/airflow/dags/repo/parquet_enrichment.py",
        conn_id="k8s_cluster",
        name="stage2",
        packages=("org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.1,"
                  "org.apache.kafka:kafka-clients:2.7.0,"
                  "org.apache.hadoop:hadoop-aws:3.1.2,"
                  "org.elasticsearch:elasticsearch-spark-30_2.12:7.13.1"),
        conf=spark_conf,
        verbose=False)
    stage_1 >> stage_2
Example #9
from airflow.utils.dates import days_ago
from airflow.utils.trigger_rule import TriggerRule

dag = DAG(
    dag_id='datalake_dag',
    start_date=days_ago(2),
    schedule_interval='@once'
)

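# Both ingestion tasks run the same packaged Spark job (DeltaLakeFileIngestorKt);
# application_args supply the input file, its format, and the output name.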
ingest_country_info_task = SparkSubmitOperator(
    application="/jars/spark-task-1.0-SNAPSHOT-all.jar",
    task_id="ingest_country_info_task",
    conn_id="spark_cluster",
    dag=dag,
    java_class="com.genestack.tasks.DeltaLakeFileIngestorKt",
    conf={"spark.standalone.submit.waitAppCompletion": "true"},
    application_args=[
        "s3a://genestack-spark-test/country-info.json",  # input file
        "json",  # input file format
        "country-info"  # output file
    ]
)

ingest_covid_deaths_task = SparkSubmitOperator(
    application="/jars/spark-task-1.0-SNAPSHOT-all.jar",
    task_id="ingest_covid_deaths_task",
    conn_id="spark_cluster",
    dag=dag,
    java_class="com.genestack.tasks.DeltaLakeFileIngestorKt",
    conf={"spark.standalone.submit.waitAppCompletion": "true"},
    application_args=[
Example #10
    schedule_interval='0 0 * * *',
    start_date=days_ago(2),
    dagrun_timeout=timedelta(minutes=60),
    tags=['spark'],
    params={"param": "value"},
)

run_this_last = DummyOperator(
    task_id='run_this_last',
    dag=dag,
)

# TODO - Here we are running a pi.py script. Change the path to your location.
flight_search_ingestion = SparkSubmitOperator(
    task_id='flight_search_ingestion',
    conn_id='spark_default',
    application=
    '/home/ubuntu/anaconda3/envs/airflow/lib/python3.6/site-packages/pyspark/examples/src/main/python/pi.py',
    total_executor_cores=4,
    executor_cores=2,
    executor_memory='1g',
    driver_memory='1g',
    name='flight_search_ingestion',
    execution_timeout=timedelta(seconds=100000),
    dag=dag)

run_this_last >> flight_search_ingestion

if __name__ == "__main__":
    dag.cli()
Example #11
                    eur DOUBLE,
                    usd DOUBLE,
                    nzd DOUBLE,
                    gbp DOUBLE,
                    jpy DOUBLE,
                    cad DOUBLE
                    )
                ROW FORMAT DELIMITED
                FIELDS TERMINATED BY ','
                STORED AS TEXTFILE
            """)

    #Spark operator
    forex_processing = SparkSubmitOperator(
        task_id="forex_processing",
        application="/home/enes/airflow2/dags/scripts/forex_processing.py",
        conn_id="spark_conn",
        verbose=False)

    #Email operator
    send_email_notification = EmailOperator(
        task_id="send_email_notification",
        to="*****@*****.**",
        subject="forex_data_pipeline",
        html_content="<h3>forex_data_pipeline</h3>")

    #Slack notification operator
    send_slack_notification = SlackWebhookOperator(
        task_id="send_slack_notification",
        http_conn_id="slack_conn",
        message=_get_message(),
Example #12
    python_callable=load_s3,
    op_kwargs={'bucket_name': 'mybucket-test2', 'source_file_path': 'source_data/user_info.csv', 'dest_aws_file_name':'users_info/user_info.csv'},
    dag=mydag,
)

task3 = PythonOperator(
    task_id='load_s3_3',
    python_callable=load_s3,
    op_kwargs={'bucket_name': 'mybucket-test2', 'source_file_path': 'source_data/prices_1.csv', 'dest_aws_file_name':'prices/prices_1.csv'},
    dag=mydag,
)

task4 = PythonOperator(
    task_id='load_s3_4',
    python_callable=load_s3,
    op_kwargs={'bucket_name': 'mybucket-test2', 'source_file_path': 'source_data/sales_per_user.csv', 'dest_aws_file_name':'sales/sales_per_user.csv'},
    dag=mydag,
)

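# Spark job (s3redshift.py) that loads the uploaded S3 data into Redshift; the
# packages list pulls in the Redshift JDBC driver and the Hadoop/AWS connectors.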
task5 = SparkSubmitOperator(
    task_id='task_aws_s3_pyspark',
    application='s3redshift.py',
    dag=mydag,
    packages='com.amazon.redshift:redshift-jdbc42-no-awssdk:1.2.45.1069,com.amazonaws:aws-java-sdk:1.7.4,org.apache.hadoop:hadoop-auth:2.7.4,org.apache.hadoop:hadoop-common:2.7.4,com.google.code.findbugs:jsr305:3.0.2,asm:asm:3.2,org.slf4j:slf4j-api:1.7.30,org.xerial.snappy:snappy-java:1.1.7.5,org.slf4j:slf4j-log4j12:1.7.30,org.apache.hadoop:hadoop-aws:2.7.3',
    conn_id= 'my_spark_standalone'
)

#----dependencies

[task1, task2, task3, task4] >> task5
Example #13
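    # Build the Splice connection settings from environment variables, merge any
    # extra Spark config from the mounted conf file, and submit the pipeline.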
    with dag:
        env = {
            'SPLICE_JUPYTER_USER': env_vars.get('SPLICE_JUPYTER_USER') or env_vars.get('DB_USER'),
            'SPLICE_JUPYTER_PASSWORD': env_vars.get('SPLICE_JUPYTER_PASSWORD') or env_vars.get('DB_PASSWORD'),
            'SPLICE_DB_HOST': env_vars.get('SPLICE_DB_HOST') or env_vars.get('DB_HOST'),
            'SPLICE_KAFKA_HOST': env_vars.get('SPLICE_KAFKA_HOST'),
        }

        conf_path = '/mnt/airflow-conf/extra_spark_config.json'
        if path.exists(conf_path):
            with open(conf_path) as f:
                extra_conf = json.load(f)
        else:
            extra_conf = {}

        calculate_statistics_task = SparkSubmitOperator(
            application="/opt/airflow/spark_apps/pipeline.py",
            task_id="run_pipeline",
            conn_id="splice_spark",
            env_vars=env,
            application_args=[fset],
            **spark_defaults,
            **extra_conf)

    globals()[dag_id] = dag