Example #1
    def test_read_spec_from_file(self):
        open_mock = mock.mock_open(read_data='{"some": "json"}')
        with mock.patch('airflow.providers.apache.druid.operators.druid.open',
                        open_mock,
                        create=True):
            druid = DruidOperator(task_id='druid_indexing_job',
                                  json_index_file='index_spec.json',
                                  dag=self.dag)

            open_mock.assert_called_once_with('index_spec.json')
            self.assertEqual(druid.index_spec_str, '{\n    "some": "json"\n}')
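The assertion above works because the operator parses the spec file and re-serializes it. A minimal sketch of that behavior (hedged; the provider's actual implementation may differ):

import json

# Sketch: read the spec file, then pretty-print it with sorted keys and
# 4-space indentation -- which is why '{"some": "json"}' comes back as
# '{\n    "some": "json"\n}'.
def load_index_spec(json_index_file):
    with open(json_index_file) as data_file:
        index_spec = json.load(data_file)
    return json.dumps(index_spec, indent=4, sort_keys=True)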
Example #2
    def test_render_template(self):
        operator = DruidOperator(
            task_id='spark_submit_job',
            json_index_file=self.json_index_str,
            params={'index_type': 'index_hadoop', 'datasource': 'datasource_prd'},
            dag=self.dag,
        )
        ti = TaskInstance(operator, DEFAULT_DATE)
        ti.render_templates()

        assert self.rendered_index_str == operator.json_index_file
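Note: self.json_index_str and self.rendered_index_str are fixtures defined elsewhere in the test class (presumably in setUp). Judging from Example #5 below, a plausible hypothetical reconstruction:

    def setUp(self):
        # Hypothetical fixtures: a templated spec and its expected rendering
        # for DEFAULT_DATE = 2017-01-01; the real values are not shown here.
        self.json_index_str = '{"datasource": "{{ params.datasource }}", ' \
                              '"intervals": ["{{ ds }}/{{ macros.ds_add(ds, 1) }}"]}'
        self.rendered_index_str = '{"datasource": "datasource_prd", ' \
                                  '"intervals": ["2017-01-01/2017-01-02"]}'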
Example #3
    def test_render_template_from_file(self):
        with NamedTemporaryFile("w", suffix='.json') as f:
            f.write(self.json_index_str)
            f.flush()

            self.dag.template_searchpath = os.path.dirname(f.name)

            operator = DruidOperator(
                task_id='spark_submit_job',
                json_index_file=f.name,
                params={'index_type': 'index_hadoop', 'datasource': 'datasource_prd'},
                dag=self.dag,
            )
            ti = TaskInstance(operator, DEFAULT_DATE)
            ti.render_templates()

            assert self.rendered_index_str == operator.json_index_file
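This test passes because '.json' is listed in DruidOperator's template_ext: instead of rendering the file *name*, Airflow locates the file under the DAG's template_searchpath and renders its *contents*. A simplified sketch of that mechanism (this mirrors, but is not, Airflow's actual loader):

import os
from jinja2 import Template

TEMPLATE_EXT = ('.json',)  # extensions the operator treats as template files

def resolve_templated_field(value, searchpath, context):
    # If the field value looks like a template file, load it from the
    # search path and render the file contents instead of the name.
    if value.endswith(TEMPLATE_EXT):
        with open(os.path.join(searchpath, os.path.basename(value))) as f:
            value = f.read()
    return Template(value).render(**context)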
Example #4
    def test_render_template(self):
        json_str = '''
            {
                "type": "{{ params.index_type }}",
                "datasource": "{{ params.datasource }}",
                "spec": {
                    "dataSchema": {
                        "granularitySpec": {
                            "intervals": ["{{ ds }}/{{ macros.ds_add(ds, 1) }}"]
                        }
                    }
                }
            }
        '''
        open_mock = mock.mock_open(read_data=json_str)
        with mock.patch('airflow.providers.apache.druid.operators.druid.open',
                        open_mock,
                        create=True):
            operator = DruidOperator(task_id='spark_submit_job',
                                     json_index_file='index_spec.json',
                                     params={
                                         'index_type': 'index_hadoop',
                                         'datasource': 'datasource_prd'
                                     },
                                     dag=self.dag)
            ti = TaskInstance(operator, DEFAULT_DATE)
            ti.render_templates()

            open_mock.assert_called_once_with('index_spec.json')
            expected = '''{
    "datasource": "datasource_prd",
    "spec": {
        "dataSchema": {
            "granularitySpec": {
                "intervals": [
                    "2017-01-01/2017-01-02"
                ]
            }
        }
    },
    "type": "index_hadoop"
}'''
            self.assertEqual(expected, operator.index_spec_str)
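Under the hood, ti.render_templates() walks the operator's template_fields and renders each attribute against the task's Jinja context (which supplies ds, params, macros, and so on). A simplified sketch of the idea, not Airflow's actual implementation:

from jinja2 import Template

def render_fields(operator, context):
    # For DruidOperator, template_fields includes 'json_index_file'.
    for field in operator.template_fields:
        raw = getattr(operator, field)
        setattr(operator, field, Template(raw).render(**context))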
Example #5
    def test_render_template(self):
        json_str = '''
            {
                "type": "{{ params.index_type }}",
                "datasource": "{{ params.datasource }}",
                "spec": {
                    "dataSchema": {
                        "granularitySpec": {
                            "intervals": ["{{ ds }}/{{ macros.ds_add(ds, 1) }}"]
                        }
                    }
                }
            }
        '''
        operator = DruidOperator(
            task_id='spark_submit_job',
            json_index_file=json_str,
            params={
                'index_type': 'index_hadoop',
                'datasource': 'datasource_prd'
            },
            dag=self.dag
        )
        ti = TaskInstance(operator, DEFAULT_DATE)
        ti.render_templates()
        expected = '''
            {
                "type": "index_hadoop",
                "datasource": "datasource_prd",
                "spec": {
                    "dataSchema": {
                        "granularitySpec": {
                            "intervals": ["2017-01-01/2017-01-02"]
                        }
                    }
                }
            }
        '''
        self.assertEqual(expected, operator.json_index_file)
Example #6
"""
Example Airflow DAG to submit an Apache Druid JSON index file using `DruidOperator`
"""
from airflow.models import DAG
from airflow.providers.apache.druid.operators.druid import DruidOperator
from airflow.utils.dates import days_ago

with DAG(
        dag_id='example_druid_operator',
        schedule_interval=None,
        start_date=days_ago(2),
        tags=['example'],
) as dag:
    # [START howto_operator_druid_submit]
    submit_job = DruidOperator(
        task_id='spark_submit_job',
        json_index_file='json_index.json',
        druid_ingest_conn_id='druid_ingest_default',
    )
    # Example content of json_index.json:
    JSON_INDEX_STR = """
        {
            "type": "index_hadoop",
            "datasource": "datasource_prd",
            "spec": {
                "dataSchema": {
                    "granularitySpec": {
                        "intervals": ["2021-09-01/2021-09-02"]
                    }
                }
            }
        }
    """
    # [END howto_operator_druid_submit]
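At execution time the operator hands the rendered spec to DruidHook, which submits it to the Druid Overlord and polls until the indexing task finishes. A rough sketch of that flow against Druid's indexer API (simplified; the real hook also handles auth, timeouts, and error reporting):

import time

import requests

def submit_indexing_job(overlord_url, index_spec_str):
    # POST the index spec to the Overlord's task endpoint.
    task_id = requests.post(
        f'{overlord_url}/druid/indexer/v1/task',
        data=index_spec_str,
        headers={'Content-Type': 'application/json'},
    ).json()['task']
    # Poll the task status until it leaves the RUNNING state.
    while True:
        status = requests.get(
            f'{overlord_url}/druid/indexer/v1/task/{task_id}/status'
        ).json()['status']['statusCode']
        if status != 'RUNNING':
            return status  # SUCCESS or FAILED
        time.sleep(5)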
Example #7
from datetime import datetime, timedelta

from airflow.models import DAG
from airflow.operators.python import PythonOperator
from airflow.providers.apache.druid.operators.druid import DruidOperator

# Default settings applied to all tasks
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

with DAG('druid-ingest',
         start_date=datetime(2021, 5, 5),
         max_active_runs=3,
         schedule_interval=None,
         default_args=default_args,
         ) as dag:

    # A dummy processor that pushes the URIs and intervals to the backend DB via
    # XCom; this could be any task that sinks processed data to S3 or GCS and
    # publishes the links through XCom (a hypothetical callable is sketched
    # after this example).
    dummy_processor = PythonOperator(task_id='dummy_processor',
                                     python_callable=dummy_processor)

    # A sample Druid operator that pulls the URIs and intervals from upstream
    # tasks (e.g. dummy_processor) and substitutes them into
    # wikipedia-index.json dynamically via templating.
    ingest_data = DruidOperator(task_id='druid_ingest',
                                json_index_file='wikipedia-index.json',
                                druid_ingest_conn_id='druid_ingest_conn_id')

    dummy_processor >> ingest_data
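The dummy_processor callable referenced by python_callable is not shown; it must be defined before the DAG block (and note the name is then shadowed by the task variable). A minimal hypothetical version, with key names, bucket, and values invented for illustration:

def dummy_processor(**context):
    # Push the locations of the processed data and the ingestion interval,
    # so wikipedia-index.json can pull them back with
    # {{ ti.xcom_pull(task_ids='dummy_processor', key='uris') }} etc.
    ti = context['ti']
    ti.xcom_push(key='uris', value=['s3://my-bucket/wikipedia/part-0000.json'])
    ti.xcom_push(key='intervals', value=['2021-05-05/2021-05-06'])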
Example #8
"""
Example Airflow DAG to submit an Apache Druid JSON index file using `DruidOperator`
"""
from datetime import datetime

from airflow.models import DAG
from airflow.providers.apache.druid.operators.druid import DruidOperator

with DAG(
        dag_id='example_druid_operator',
        schedule_interval=None,
        start_date=datetime(2021, 1, 1),
        catchup=False,
        tags=['example'],
) as dag:
    # [START howto_operator_druid_submit]
    submit_job = DruidOperator(task_id='spark_submit_job',
                               json_index_file='json_index.json')
    # Example content of json_index.json:
    JSON_INDEX_STR = """
        {
            "type": "index_hadoop",
            "datasource": "datasource_prd",
            "spec": {
                "dataSchema": {
                    "granularitySpec": {
                        "intervals": ["2021-09-01/2021-09-02"]
                    }
                }
            }
        }
    """
    # [END howto_operator_druid_submit]