Example No. 1
    def setUp(self):
        self.dataflow = DataFlowPythonOperator(
            task_id=TASK_ID,
            py_file=PY_FILE,
            py_options=PY_OPTIONS,
            dataflow_default_options=DEFAULT_OPTIONS,
            options=ADDITIONAL_OPTIONS)
Example No. 2
class DataFlowPythonOperatorTest(unittest.TestCase):
    def setUp(self):
        self.dataflow = DataFlowPythonOperator(
            task_id=TASK_ID,
            py_file=PY_FILE,
            py_options=PY_OPTIONS,
            dataflow_default_options=DEFAULT_OPTIONS,
            options=ADDITIONAL_OPTIONS)

    def test_init(self):
        """Test DataFlowPythonOperator instance is properly initialized."""
        self.assertEqual(self.dataflow.task_id, TASK_ID)
        self.assertEqual(self.dataflow.py_file, PY_FILE)
        self.assertEqual(self.dataflow.py_options, PY_OPTIONS)
        self.assertEqual(self.dataflow.dataflow_default_options,
                         DEFAULT_OPTIONS)
        self.assertEqual(self.dataflow.options, ADDITIONAL_OPTIONS)

    @mock.patch('airflow.contrib.operators.dataflow_operator.DataFlowHook')
    def test_exec(self, dataflow_mock):
        """Test DataFlowHook is created and the right args are passed to
        start_python_workflow.

        """
        start_python_hook = dataflow_mock.return_value.start_python_dataflow
        self.dataflow.execute(None)
        assert dataflow_mock.called
        expected_options = {
            'project': 'test',
            'staging_location': 'gs://test/staging',
            'output': 'gs://test/output'
        }
        start_python_hook.assert_called_once_with(TASK_ID, expected_options,
                                                  PY_FILE, PY_OPTIONS)
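
These test snippets reference module-level constants that were dropped during extraction. A plausible reconstruction is sketched below: the project, staging_location and output values follow from expected_options in test_exec, while the remaining names and the split between DEFAULT_OPTIONS and ADDITIONAL_OPTIONS are assumptions.

# Sketch of the fixtures the tests assume; values marked "assumed" are guesses.
TASK_ID = 'test-dataflow-operator'            # assumed
PY_FILE = 'gs://my-bucket/my-object.py'       # assumed (Example No. 9 downloads my-bucket/my-object.py)
PY_OPTIONS = ['-m']                           # assumed
DEFAULT_OPTIONS = {
    'project': 'test',                        # from expected_options
    'staging_location': 'gs://test/staging',  # from expected_options
}
ADDITIONAL_OPTIONS = {
    'output': 'gs://test/output',             # from expected_options
}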
Example No. 3
    def setUp(self):
        self.dataflow = DataFlowPythonOperator(
            task_id=TASK_ID,
            py_file=PY_FILE,
            py_options=PY_OPTIONS,
            dataflow_default_options=DEFAULT_OPTIONS_PYTHON,
            options=ADDITIONAL_OPTIONS,
            poll_sleep=POLL_SLEEP)
Example No. 4
    def setUp(self):
        self.dataflow = DataFlowPythonOperator(
            task_id=TASK_ID,
            py_file=PY_FILE,
            py_options=PY_OPTIONS,
            dataflow_default_options=DEFAULT_OPTIONS,
            options=ADDITIONAL_OPTIONS)
Example No. 5
    def setUp(self):
        self.dataflow = DataFlowPythonOperator(
            task_id=TASK_ID,
            py_file=PY_FILE,
            py_options=PY_OPTIONS,
            dataflow_default_options=DEFAULT_OPTIONS_PYTHON,
            options=ADDITIONAL_OPTIONS,
            poll_sleep=POLL_SLEEP)
Example No. 6
class DataFlowPythonOperatorTest(unittest.TestCase):
    def setUp(self):
        self.dataflow = DataFlowPythonOperator(
            task_id=TASK_ID,
            py_file=PY_FILE,
            job_name=JOB_NAME,
            py_options=PY_OPTIONS,
            dataflow_default_options=DEFAULT_OPTIONS_PYTHON,
            options=ADDITIONAL_OPTIONS,
            poll_sleep=POLL_SLEEP)

    def test_init(self):
        """Test DataFlowPythonOperator instance is properly initialized."""
        self.assertEqual(self.dataflow.task_id, TASK_ID)
        self.assertEqual(self.dataflow.job_name, JOB_NAME)
        self.assertEqual(self.dataflow.py_file, PY_FILE)
        self.assertEqual(self.dataflow.py_options, PY_OPTIONS)
        self.assertEqual(self.dataflow.poll_sleep, POLL_SLEEP)
        self.assertEqual(self.dataflow.dataflow_default_options,
                         DEFAULT_OPTIONS_PYTHON)
        self.assertEqual(self.dataflow.options, EXPECTED_ADDITIONAL_OPTIONS)

    @mock.patch('airflow.contrib.operators.dataflow_operator.DataFlowHook')
    @mock.patch(GCS_HOOK_STRING.format('GoogleCloudBucketHelper'))
    def test_exec(self, gcs_hook, dataflow_mock):
        """Test DataFlowHook is created and the right args are passed to
        start_python_workflow.

        """
        start_python_hook = dataflow_mock.return_value.start_python_dataflow
        gcs_download_hook = gcs_hook.return_value.google_cloud_to_local
        self.dataflow.execute(None)
        self.assertTrue(dataflow_mock.called)
        expected_options = {
            'project': 'test',
            'staging_location': 'gs://test/staging',
            'output': 'gs://test/output',
            'labels': {
                'foo': 'bar',
                'airflow-version': TEST_VERSION
            }
        }
        gcs_download_hook.assert_called_once_with(PY_FILE)
        start_python_hook.assert_called_once_with(JOB_NAME, expected_options,
                                                  mock.ANY, PY_OPTIONS)
        self.assertTrue(self.dataflow.py_file.startswith('/tmp/dataflow'))
Example No. 7
class DataFlowPythonOperatorTest(unittest.TestCase):

    def setUp(self):
        self.dataflow = DataFlowPythonOperator(
            task_id=TASK_ID,
            py_file=PY_FILE,
            job_name=JOB_NAME,
            py_options=PY_OPTIONS,
            dataflow_default_options=DEFAULT_OPTIONS_PYTHON,
            options=ADDITIONAL_OPTIONS,
            poll_sleep=POLL_SLEEP)

    def test_init(self):
        """Test DataFlowPythonOperator instance is properly initialized."""
        self.assertEqual(self.dataflow.task_id, TASK_ID)
        self.assertEqual(self.dataflow.job_name, JOB_NAME)
        self.assertEqual(self.dataflow.py_file, PY_FILE)
        self.assertEqual(self.dataflow.py_options, PY_OPTIONS)
        self.assertEqual(self.dataflow.poll_sleep, POLL_SLEEP)
        self.assertEqual(self.dataflow.dataflow_default_options,
                         DEFAULT_OPTIONS_PYTHON)
        self.assertEqual(self.dataflow.options,
                         EXPECTED_ADDITIONAL_OPTIONS)

    @mock.patch('airflow.contrib.operators.dataflow_operator.DataFlowHook')
    @mock.patch(GCS_HOOK_STRING.format('GoogleCloudBucketHelper'))
    def test_exec(self, gcs_hook, dataflow_mock):
        """Test DataFlowHook is created and the right args are passed to
        start_python_workflow.

        """
        start_python_hook = dataflow_mock.return_value.start_python_dataflow
        gcs_download_hook = gcs_hook.return_value.google_cloud_to_local
        self.dataflow.execute(None)
        self.assertTrue(dataflow_mock.called)
        expected_options = {
            'project': 'test',
            'staging_location': 'gs://test/staging',
            'output': 'gs://test/output',
            'labels': {'foo': 'bar', 'airflow-version': TEST_VERSION}
        }
        gcs_download_hook.assert_called_once_with(PY_FILE)
        start_python_hook.assert_called_once_with(JOB_NAME, expected_options, mock.ANY,
                                                  PY_OPTIONS)
        self.assertTrue(self.dataflow.py_file.startswith('/tmp/dataflow'))
Example No. 8
def storage_to_bq_task(filename):
    """Cria a task que executa o pipeline do Dataflow"""

    opt_dict = {
        'file_path': "{}/{}.csv".format(raw_files_path, filename),
        'header': getattr(headers, filename),
        'destination_table_id': "{}.{}".format(bq_dataset_landing, filename)
    }

    return DataFlowPythonOperator(task_id='load_{}'.format(filename),
                                  py_file=DATAFLOW_PIPELINE_FILE,
                                  job_name=re.sub('_', '-', filename),
                                  options=opt_dict)
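
A hedged usage sketch for storage_to_bq_task: the surrounding DAG, raw_files_path, the headers module and bq_dataset_landing are not shown in this snippet, so the file names below are purely illustrative.

# Illustrative only -- one load task per (assumed) CSV file name.
load_customers = storage_to_bq_task('customers')   # hypothetical filename
load_orders = storage_to_bq_task('orders')         # hypothetical filename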
Example No. 9
class DataFlowPythonOperatorTest(unittest.TestCase):
    def setUp(self):
        self.dataflow = DataFlowPythonOperator(
            task_id=TASK_ID,
            py_file=PY_FILE,
            py_options=PY_OPTIONS,
            dataflow_default_options=DEFAULT_OPTIONS,
            options=ADDITIONAL_OPTIONS)

    def test_init(self):
        """Test DataFlowPythonOperator instance is properly initialized."""
        self.assertEqual(self.dataflow.task_id, TASK_ID)
        self.assertEqual(self.dataflow.py_file, PY_FILE)
        self.assertEqual(self.dataflow.py_options, PY_OPTIONS)
        self.assertEqual(self.dataflow.dataflow_default_options,
                         DEFAULT_OPTIONS)
        self.assertEqual(self.dataflow.options, ADDITIONAL_OPTIONS)

    @mock.patch('airflow.contrib.operators.dataflow_operator.DataFlowHook')
    @mock.patch(GCS_HOOK_STRING.format('GoogleCloudStorageHook'))
    def test_exec(self, gcs_hook, dataflow_mock):
        """Test DataFlowHook is created and the right args are passed to
        start_python_workflow.

        """
        start_python_hook = dataflow_mock.return_value.start_python_dataflow
        gcs_download_hook = gcs_hook.return_value.download
        self.dataflow.execute(None)
        self.assertTrue(dataflow_mock.called)
        expected_options = {
            'project': 'test',
            'staging_location': 'gs://test/staging',
            'output': 'gs://test/output'
        }
        gcs_download_hook.assert_called_once_with('my-bucket', 'my-object.py',
                                                  mock.ANY)
        start_python_hook.assert_called_once_with(TASK_ID, expected_options,
                                                  mock.ANY, PY_OPTIONS)
        self.assertTrue(self.dataflow.py_file.startswith('/tmp/dataflow'))
Example No. 10
               'project': project_id,
               'region': 'europe-west1',
               'staging_location': 'gs://' + project_id + '-dataflow/staging/',
               'temp_location': 'gs://' + project_id + '-dataflow/temp/'
               }
}

# Define a DAG (directed acyclic graph) of tasks.
# Any task you create within the context manager is automatically added to the
# DAG object.
with models.DAG(
        'hourly_data_transfer',
        schedule_interval='0 * * * *',
        default_args=default_dag_args) as dag:

    run_dataflow = DataFlowPythonOperator(
        py_file='gs://' + project_id + '-dataflow/pipelines/batch_pipeline.py',
        task_id='run_Dataflow_from_BQ_to_Datastore',
        dataflow_default_options=default_dag_args['dataflow_default_options']
    )

    bq_to_gcs = BigQueryToCloudStorageOperator(
        task_id='export_stream_data_from_BQ',
        source_project_dataset_table='my_dataset.stream_data',
        destination_cloud_storage_uris=['gs://' + project_id + '-data-export/stream_data.csv'],
        export_format='CSV')

    # Define DAG dependencies. Note that the bare references below do not
    # create any ordering, so run_dataflow and bq_to_gcs run independently.
    run_dataflow
    bq_to_gcs
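
If the intent were for the Dataflow job to consume the BigQuery export, the ordering would need to be declared with the bitshift operators; a minimal sketch of that assumption (the snippet itself does not state the intended order):

    # Hypothetical ordering: export from BigQuery first, then run Dataflow.
    bq_to_gcs >> run_dataflow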
Example No. 11
    dag=dag,
)

dataproc_delete_cluster = DataprocClusterDeleteOperator(
    task_id="delete_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id="airflowbolcom-fc205e26bebb44fa",
    trigger_rule=TriggerRule.ALL_DONE,
    dag=dag,
)

load_into_bigquery = DataFlowPythonOperator(
    task_id="land_registry_prices_to_bigquery",
    dataflow_default_options={
        'region': "europe-west1",
        'input': "gs://airflow-emgsilva/land_registry_price/{{ ds }}/*.json",
        'table': 'emgsilva',
        'dataset': 'emgsilva',
        'project': 'airflowbolcom-fc205e26bebb44fa',
        'bucket': 'europe-west1-training-airfl-46f2603e-bucket',
        'job_name': '{{ task_instance_key_str }}'
    },
    py_file=
    "gs://europe-west1-training-airfl-46f2603e-bucket/dags/dataflow_job.py",
    dag=dag,
)

[pgsl_to_gcs, http_to_gcs_op] >> load_into_bigquery
[pgsl_to_gcs, http_to_gcs_op] >> dataproc_create_cluster >> \
    compute_aggregates >> dataproc_delete_cluster
Example No. 12
# Define a DAG (directed acyclic graph) of tasks.
# Any task you create within the context manager is automatically added to the
# DAG object.
with models.DAG('Projet_SMART_GCP',
                schedule_interval=datetime.timedelta(days=1),
                default_args=default_dag_args) as dag:

    dataflow_Plans = DataFlowPythonOperator(
        py_file='/home/airflow/gcs/dags/xmlload.py',
        task_id='dataflow_plans',
        options={
            'input': 'Plans.xml',
            'output': 'studied-client-307710:SMT_STG.Plans',
            'setup_file': '/home/airflow/gcs/dags/setup.py'
        },
        dataflow_default_options={
            'project': 'studied-client-307710',
            'staging_location': 'gs://projet_smart_gcp/tmp',
            'temp_location': 'gs://projet_smart_gcp/tmp'
        },
        requirements=['google-cloud-storage==1.36.1', 'xmltodict==0.12.0'],
        dag=dag)

    dataflow_ConcernedFunctions = DataFlowPythonOperator(
        py_file='/home/airflow/gcs/dags/xmlload.py',
        task_id='dataflow_concernedfunctions',
        options={
            'input': 'CS_ConcernedFunctions.xml',
            'output': 'studied-client-307710:SMT_STG.CS_ConcernedFunctions',
            'setup_file': '/home/airflow/gcs/dags/setup.py'
Example No. 13
dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="create_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id="gdd-ea393e48abe0a85089b6b551da",
    num_workers=2,
    zone="europe-west4-a",
    dag=dag,
    auto_delete_ttl=5 * 60,  # Autodelete after 5 minutes
)


df_to_bq = DataFlowPythonOperator(
    task_id="land_registry_prices_to_bigquery",
    dataflow_default_options={
        "project": "gdd-ea393e48abe0a85089b6b551da",
        "region": "europe-west1",
    },
    py_file="gs://airflow-training-knab-jochem/dataflow_job.py",
    dag=dag,
)


for currency in {'EUR', 'USD'}:
    s = HttpToGcsOperator(
        task_id="get_currency_" + currency,
        method="GET",
        endpoint="airflow-training-transform-valutas?date={{ ds }}&from=GBP&to=" + currency,
        http_conn_id="airflow-training-currency-http",
        gcs_conn_id="airflow-training-storage-bucket",
        bucket="airflow-training-knab-jochem",
        gcs_path="currency/{{ ds }}-" + currency + ".json",
Example No. 14
    dag=dag,
)

load_into_bigquery = DataFlowPythonOperator(
    task_id="load-into-bq",
    dataflow_default_options={
        "project": project_id,
        "region": "europe-west1",
        "staging_location": "gs://{bucket}/airflow-training-data/dataflow-staging".format(
            bucket=bucket_name),
        "temp_location": "gs://{bucket}/airflow-training-data/dataflow-temp".format(
            bucket=bucket_name),
    },
    py_file="gs://{bucket}/dataflow_job.py".format(bucket=bucket_name),
    options={
        "input": ("gs://{bucket}/airflow-training-data/"
                  "land_registry_price_paid_uk/{{{{ ds }}}}/").format(bucket=bucket_name),
        "table": "first_result_table",
        "dataset": "airflow_dataset",
    },
    dag=dag,
)

results_to_bigquery = GoogleCloudStorageToBigQueryOperator(
    task_id='results_to_bigquery',
Example No. 15
                          cluster_name='mydataproc2',
                          region='us-central1',
                          task_id='create_table_in_hive_2_cols',
                          dag=dag)

t3 = DataProcHiveOperator(query=query_all,
                          cluster_name='mydataproc2',
                          region='us-central1',
                          task_id='create_table_in_hive_all_cols',
                          dag=dag)

t4 = DataFlowPythonOperator(
    py_file='gs://mysqlnosql/beam_gcs_bt.py',
    task_id='loadfrom-gcs-to-bt',
    dataflow_default_options={'project': '<yourprojectid>'},
    options={
        'avro_input': 'gs://mysqldataflow/avro/customer/',
        'json_input': 'gs://mysqldataflow/json/customer/'
    },
    dag=dag)

t5 = DataFlowPythonOperator(
    py_file='gs://mysqlnosql/beam_gcs_datastore.py',
    task_id='loadfrom-gcs-to-datastore',
    dataflow_default_options={'project': '<yourprojectid>'},
    options={'json_input': 'gs://mysqldataflow/json/customer/'},
    dag=dag)

t1 >> t2
t1 >> t3
t1 >> t4
Example No. 16
def create_evaluate_ops(task_prefix,
                        project_id,
                        job_id,
                        region,
                        data_format,
                        input_paths,
                        prediction_path,
                        metric_fn_and_keys,
                        validate_fn,
                        dataflow_options,
                        model_uri=None,
                        model_name=None,
                        version_name=None,
                        dag=None):
    """
    Creates the Operators needed for model evaluation and returns them.

    It gets predictions over the inputs via the Cloud ML Engine BatchPrediction
    API by calling CloudMLBatchPredictionOperator, then summarizes and validates
    the result via Cloud Dataflow using DataFlowPythonOperator.

    For details and pricing about Batch prediction, please refer to the website
    https://cloud.google.com/ml-engine/docs/how-tos/batch-predict
    and for Cloud Dataflow, https://cloud.google.com/dataflow/docs/

    It returns three chained operators for prediction, summary, and validation,
    named as <prefix>-prediction, <prefix>-summary, and <prefix>-validation,
    respectively.
    (<prefix> should contain only alphanumeric characters or hyphen.)

    Upstream and downstream dependencies can be set accordingly, for example:
      pred, _, val = create_evaluate_ops(...)
      pred.set_upstream(upstream_op)
      ...
      downstream_op.set_upstream(val)

    Callers will provide two python callables, metric_fn and validate_fn, in
    order to customize the evaluation behavior as they wish.
    - metric_fn receives a dictionary per instance derived from json in the
      batch prediction result. The keys might vary depending on the model.
      It should return a tuple of metrics.
    - validate_fn receives a dictionary of the metrics that metric_fn generated,
      averaged over all instances.
      The keys/values of the dictionary match what is given by the
      metric_fn_and_keys arg.
      The dictionary contains an additional metric, 'count', representing the
      total number of instances received for evaluation.
      The function should raise an exception to mark the task as failed when the
      validation result is not good enough to proceed (i.e. to set the trained
      version as the default).

    Typical examples:

    def get_metric_fn_and_keys():
        import math  # imports should be outside of the metric_fn below.
        def error_and_squared_error(inst):
            label = float(inst['input_label'])
            classes = float(inst['classes'])  # 0 or 1
            err = abs(classes-label)
            squared_err = math.pow(classes-label, 2)
            return (err, squared_err)  # returns a tuple.
        return error_and_squared_error, ['err', 'mse']  # key order must match.

    def validate_err_and_count(summary):
        if summary['err'] > 0.2:
            raise ValueError('Too high err>0.2; summary=%s' % summary)
        if summary['mse'] > 0.05:
            raise ValueError('Too high mse>0.05; summary=%s' % summary)
        if summary['count'] < 1000:
            raise ValueError('Too few instances<1000; summary=%s' % summary)
        return summary

    For the details on the other BatchPrediction-related arguments (project_id,
    job_id, region, data_format, input_paths, prediction_path, model_uri),
    please refer to CloudMLBatchPredictionOperator too.

    :param task_prefix: a prefix for the tasks. Only alphanumeric characters and
        hyphen are allowed (no underscores), since this will be used as dataflow
        job name, which doesn't allow other characters.
    :type task_prefix: string

    :param model_uri: GCS path of the model exported by Tensorflow using
        tensorflow.estimator.export_savedmodel(). It cannot be used with
        model_name or version_name below. See CloudMLBatchPredictionOperator for
        more detail.
    :type model_uri: string

    :param model_name: Used to indicate a model to use for prediction. Can be
        used in combination with version_name, but cannot be used together with
        model_uri. See CloudMLBatchPredictionOperator for more detail.
    :type model_name: string

    :param version_name: Used to indicate a model version to use for prediction,
        in combination with model_name. Cannot be used together with model_uri.
        See CloudMLBatchPredictionOperator for more detail.
    :type version_name: string

    :param data_format: one of 'TEXT', 'TF_RECORD', 'TF_RECORD_GZIP'
    :type data_format: string

    :param input_paths: a list of input paths to be sent to BatchPrediction.
    :type input_paths: list of strings

    :param prediction_path: GCS path to put the prediction results in.
    :type prediction_path: string

    :param metric_fn_and_keys: a tuple of metric_fn and metric_keys:
        - metric_fn is a function that accepts a dictionary (for an instance),
          and returns a tuple of metric(s) that it calculates.
        - metric_keys is a list of strings to denote the key of each metric.
    :type metric_fn_and_keys: tuple of a function and a list of strings

    :param validate_fn: a function to validate whether the averaged metric(s) is
        good enough to push the model.
    :type validate_fn: function

    :param dataflow_options: options to run Dataflow jobs.
    :type dataflow_options: dictionary

    :returns: a tuple of three operators, (prediction, summary, validation)
    :rtype: tuple(DataFlowPythonOperator, DataFlowPythonOperator,
                  PythonOperator)
    """

    # Verify that task_prefix doesn't have any special characters except hyphen
    # '-', which is the only allowed non-alphanumeric character by Dataflow.
    if not re.match(r"^[a-zA-Z][-A-Za-z0-9]*$", task_prefix):
        raise AirflowException(
            "Malformed task_id for DataFlowPythonOperator (only alphanumeric "
            "and hyphens are allowed but got: " + task_prefix)

    metric_fn, metric_keys = metric_fn_and_keys
    if not callable(metric_fn):
        raise AirflowException("`metric_fn` param must be callable.")
    if not callable(validate_fn):
        raise AirflowException("`validate_fn` param must be callable.")

    evaluate_prediction = CloudMLBatchPredictionOperator(
        task_id=(task_prefix + "-prediction"),
        project_id=project_id,
        job_id=_normalize_cloudml_job_id(job_id),
        region=region,
        data_format=data_format,
        input_paths=input_paths,
        output_path=prediction_path,
        uri=model_uri,
        model_name=model_name,
        version_name=version_name,
        dag=dag)

    metric_fn_encoded = base64.b64encode(dill.dumps(metric_fn, recurse=True))
    evaluate_summary = DataFlowPythonOperator(
        task_id=(task_prefix + "-summary"),
        py_options=["-m"],
        py_file="airflow.contrib.operators.cloudml_prediction_summary",
        dataflow_default_options=dataflow_options,
        options={
            "prediction_path": prediction_path,
            "metric_fn_encoded": metric_fn_encoded,
            "metric_keys": ','.join(metric_keys)
        },
        dag=dag)
    # TODO: "options" is not template_field of DataFlowPythonOperator (not sure
    # if intended or by mistake); consider fixing in the DataFlowPythonOperator.
    evaluate_summary.template_fields.append("options")
    evaluate_summary.set_upstream(evaluate_prediction)

    def apply_validate_fn(*args, **kwargs):
        prediction_path = kwargs["templates_dict"]["prediction_path"]
        scheme, bucket, obj, _, _ = urlsplit(prediction_path)
        if scheme != "gs" or not bucket or not obj:
            raise ValueError("Wrong format prediction_path: %s",
                             prediction_path)
        summary = os.path.join(obj.strip("/"), "prediction.summary.json")
        gcs_hook = GoogleCloudStorageHook()
        summary = json.loads(gcs_hook.download(bucket, summary))
        return validate_fn(summary)

    evaluate_validation = PythonOperator(
        task_id=(task_prefix + "-validation"),
        python_callable=apply_validate_fn,
        provide_context=True,
        templates_dict={"prediction_path": prediction_path},
        dag=dag)
    evaluate_validation.set_upstream(evaluate_summary)

    return evaluate_prediction, evaluate_summary, evaluate_validation
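
A minimal usage sketch for create_evaluate_ops, following the docstring's pattern; every path, ID and option value here is an assumption, and get_metric_fn_and_keys / validate_err_and_count are the docstring's own example callables.

metric_fn, metric_keys = get_metric_fn_and_keys()        # docstring example
pred, summary, val = create_evaluate_ops(
    task_prefix='eval-model',                            # alphanumerics/hyphens only
    project_id='my-project',                             # assumed
    job_id='eval_model_job',                             # assumed
    region='us-central1',                                # assumed
    data_format='TEXT',
    input_paths=['gs://my-bucket/eval/*.json'],          # assumed
    prediction_path='gs://my-bucket/eval-output/',       # assumed
    metric_fn_and_keys=(metric_fn, metric_keys),
    validate_fn=validate_err_and_count,                  # docstring example
    dataflow_options={'project': 'my-project',           # assumed
                      'tempLocation': 'gs://my-bucket/tmp'},
    model_uri='gs://my-bucket/saved_model/',             # assumed
    dag=dag)                                             # assumed DAG object
pred.set_upstream(training_op)                           # hypothetical upstream task
downstream_op.set_upstream(val)                          # hypothetical downstream task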
Example No. 17
with airflow.DAG('asl_ml_pipeline',
                 catchup=False,
                 default_args=default_args,
                 schedule_interval=datetime.timedelta(days=1)) as dag:

    # Print the dag_run id from the Airflow logs
    start = DummyOperator(task_id='start')

    post_success_slack = SlackAPIPostOperator(
        task_id='post-success-to-slack',
        token=Variable.get('slack_access_token'),
        text='Hello Airflow',
        channel='#feed')

    post_fail_slack = SlackAPIPostOperator(
        task_id='post-fail-to-slack',
        token=Variable.get('slack_access_token'),
        trigger_rule=TriggerRule.ONE_FAILED,
        text='Hello World!',
        channel='#feed')

    job_args = {'output': 'gs://dev-recommend/preprocess'}
    data_flow = DataFlowPythonOperator(task_id='submit-job-data-flow',
                                       py_file=DATAFLOW_FILE,
                                       options=job_args)

    end = DummyOperator(task_id='end')

    start >> data_flow >> [post_success_slack, post_fail_slack] >> end
Example No. 18
# endpoint = "https://europe-west1-gdd-airflow-training.cloudfunctions.net/airflow-training-transform-valutas?date=1970-01-01&from=GBP&to=EUR"  # noqa: E501
# bla = HttpToGcsOperator(
#     task_id="blaat",
#     endpoint=endpoint,
#     bucket="airflow_training_data",
#     bucket_path="currencies/{{ds_nodash}}_GB_EUR.json",
#     dag=dag
# )

load_into_bigquery = DataFlowPythonOperator(
    task_id="dataflow_to_bq",
    dataflow_default_options={
        "region": "europe-west1",
        "input": "gs://airflow_training_data/data_{{ds_nodash}}/*.json",
        "bucket": "airflow_training_data",
        "project": "airflowbolcom-b9aabd6971d488d9",
        "dataset": "airflow_training_dataset",
        "table": "dataflow_output",
        "name": "write-to-bq-{{ ds }}"
    },
    py_file="gs://airflow-training-data/dataflow_job.py",
    dag=dag)

options = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
    'Sunday'
]


def get_day(**context):
    return context['execution_date'].strftime('%A')
Example No. 19
dag = DAG('run_DF_file',
          default_args=default_args,
          schedule_interval=timedelta(days=1))
t1 = BashOperator(
    task_id='copy_files',
    bash_command='gsutil -m cp [gs path to py file] /home/airflow/gcs/data/',
    dag=dag)


t3 = DataFlowPythonOperator(
    task_id='dataflow_runner',
    py_file='/home/airflow/gcs/data/main.py',
    gcp_conn_id='google_cloud_default',
    dataflow_default_options={
        "project": "[project_id]",
        "job_name": 'my_job',
        "temp_location": "[gs path to temp storage folder]",
        "staging_location": "[gs path to staging storage folder]"
    },
    options={
        'input': "[gs path to input data]",
        'output': "[gs path for output data]"
    },
    dag=dag)


t1 >> t3
Example No. 20
    dag=dag,
)

dataproc_delete_cluster = DataprocClusterDeleteOperator(
    task_id="dataproc_delete_cluster",
    cluster_name="analyse-pricing-{{ ds }}",
    dag=dag,
    project_id=PROJECT_ID,
    trigger_rule=TriggerRule.ALL_DONE,
)

write_prices_to_bq = DataFlowPythonOperator(
    task_id="write_prices_to_bq",
    dataflow_default_options={
        "project": PROJECT_ID,
        "region": "europe-west1"
    },
    py_file="gs://" + BUCKET + "/scripts/dataflow_job.py",
    dag=dag,
)

dataproc_compute_aggregates = DataProcPySparkOperator(
    task_id="dataproc_compute_aggregates",
    main="gs://airflow-training-data-tim/scripts/build_statistics.py",
    cluster_name="analyse-pricing-{{ ds }}",
    arguments=["{{ ds }}"],
    dag=dag,
)

dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="dataproc_create_cluster",
Example No. 21
yesterday = datetime.datetime.combine(
    datetime.datetime.today() - datetime.timedelta(1),
    datetime.datetime.min.time())

default_args = {
    'start_date': yesterday,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': datetime.timedelta(minutes=1),
    'dataflow_default_options': {
       'project': PROJECT,
       'tempLocation': 'gs://{}/temp/'.format(BUCKET),
       'stagingLocation': 'gs://{}/staging/'.format(BUCKET),
       'inputFile': 'gs://dataflow-samples/shakespeare/kinglear.txt',
       'output': 'gs://{}/output'.format(BUCKET)
   }
}

with models.DAG(
        'dataflow_python2',
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_args) as dag:

        task = DataFlowPythonOperator(
            py_file='/home/airflow/gcs/data/wordcount.py',
            task_id=JOB_NAME,
            dag=dag)

        task
Example No. 22
    postgres_conn_id="airflow-training-postgres")

dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="create_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id=PROJECT_ID,
    num_workers=2,
    zone="europe-west4-a",
    dag=dag,
)

land_registry_prices_to_bigquery = DataFlowPythonOperator(
    task_id="land_registry_prices_to_bigquery",
    dataflow_default_options={
        "project": "gdd-990fd90d0db6efbabdc6b70f1c",
        "region": "europe-west1",
    },
    py_file="gs://airflow-training-knab-asv/dataflow_job.py",
    dag=dag,
)

for currency in {'EUR', 'USD'}:
    currency_task = HttpToGcsOperator(
        task_id="get_currency_" + currency,
        method="GET",
        endpoint="airflow-training-transform-valutas?date={{ ds }}&from=GBP&to="
        + currency,
        http_conn_id="airflow-training-currency-http",
        gcs_conn_id="airflow-training-storage-bucket",
        gcs_path="currency/{{ ds }}-" + currency + ".json",
        dag=dag,
Example No. 23
from airflow import models
from airflow.contrib.operators.dataflow_operator import DataFlowPythonOperator
from airflow.operators import BashOperator
yesterday = datetime.datetime.combine(
    datetime.datetime.today() - datetime.timedelta(1),
    datetime.datetime.min.time())
 
 
default_args = {
    'start_date':yesterday
}
 
with models.DAG(
    'dataflow_python_gcp_conn_id',
    schedule_interval=None,
    default_args=default_args) as dag:
    
    bash_nothing = BashOperator(task_id='nothing_2',
                                bash_command='echo nothing')

    run_dataflow_python = DataFlowPythonOperator(
        task_id='df-conn-gcp-id-from-json',
        py_file='/home/airflow/gcs/data/wordcount.py',
        options={'runner': 'DataflowRunner',
                 'output': 'gs://staging-bucket-hijo-project/out',
                 'temp_location': 'gs://staging-bucket-hijo-project/teemp',
                 'staging_location': 'gs://staging-bucket-hijo-project/staging',
                 'project': 'hijo-project'},
        gcp_conn_id='cloud-dataflow-hijo-project-from-location')
    bash_nothing >> run_dataflow_python
Example No. 24
def create_evaluate_ops(task_prefix,
                        data_format,
                        input_paths,
                        prediction_path,
                        metric_fn_and_keys,
                        validate_fn,
                        batch_prediction_job_id=None,
                        project_id=None,
                        region=None,
                        dataflow_options=None,
                        model_uri=None,
                        model_name=None,
                        version_name=None,
                        dag=None):
    """
    Creates the Operators needed for model evaluation and returns them.

    It gets predictions over the inputs via the Cloud ML Engine BatchPrediction
    API by calling MLEngineBatchPredictionOperator, then summarizes and validates
    the result via Cloud Dataflow using DataFlowPythonOperator.

    For details and pricing about Batch prediction, please refer to the website
    https://cloud.google.com/ml-engine/docs/how-tos/batch-predict
    and for Cloud Dataflow, https://cloud.google.com/dataflow/docs/

    It returns three chained operators for prediction, summary, and validation,
    named as <prefix>-prediction, <prefix>-summary, and <prefix>-validation,
    respectively.
    (<prefix> should contain only alphanumeric characters or hyphen.)

    Upstream and downstream dependencies can be set accordingly, for example:
      pred, _, val = create_evaluate_ops(...)
      pred.set_upstream(upstream_op)
      ...
      downstream_op.set_upstream(val)

    Callers will provide two python callables, metric_fn and validate_fn, in
    order to customize the evaluation behavior as they wish.
    - metric_fn receives a dictionary per instance derived from json in the
      batch prediction result. The keys might vary depending on the model.
      It should return a tuple of metrics.
    - validate_fn receives a dictionary of the metrics that metric_fn generated,
      averaged over all instances.
      The keys/values of the dictionary match what is given by the
      metric_fn_and_keys arg.
      The dictionary contains an additional metric, 'count', representing the
      total number of instances received for evaluation.
      The function should raise an exception to mark the task as failed when the
      validation result is not good enough to proceed (i.e. to set the trained
      version as the default).

    Typical examples:

    def get_metric_fn_and_keys():
        import math  # imports should be outside of the metric_fn below.
        def error_and_squared_error(inst):
            label = float(inst['input_label'])
            classes = float(inst['classes'])  # 0 or 1
            err = abs(classes-label)
            squared_err = math.pow(classes-label, 2)
            return (err, squared_err)  # returns a tuple.
        return error_and_squared_error, ['err', 'mse']  # key order must match.

    def validate_err_and_count(summary):
        if summary['err'] > 0.2:
            raise ValueError('Too high err>0.2; summary=%s' % summary)
        if summary['mse'] > 0.05:
            raise ValueError('Too high mse>0.05; summary=%s' % summary)
        if summary['count'] < 1000:
            raise ValueError('Too few instances<1000; summary=%s' % summary)
        return summary

    For the details on the other BatchPrediction-related arguments (project_id,
    job_id, region, data_format, input_paths, prediction_path, model_uri),
    please refer to MLEngineBatchPredictionOperator too.

    :param task_prefix: a prefix for the tasks. Only alphanumeric characters and
        hyphen are allowed (no underscores), since this will be used as dataflow
        job name, which doesn't allow other characters.
    :type task_prefix: string

    :param data_format: one of 'TEXT', 'TF_RECORD', 'TF_RECORD_GZIP'
    :type data_format: string

    :param input_paths: a list of input paths to be sent to BatchPrediction.
    :type input_paths: list of strings

    :param prediction_path: GCS path to put the prediction results in.
    :type prediction_path: string

    :param metric_fn_and_keys: a tuple of metric_fn and metric_keys:
        - metric_fn is a function that accepts a dictionary (for an instance),
          and returns a tuple of metric(s) that it calculates.
        - metric_keys is a list of strings to denote the key of each metric.
    :type metric_fn_and_keys: tuple of a function and a list of strings

    :param validate_fn: a function to validate whether the averaged metric(s) is
        good enough to push the model.
    :type validate_fn: function

    :param batch_prediction_job_id: the id to use for the Cloud ML Batch
        prediction job. Passed directly to the MLEngineBatchPredictionOperator as
        the job_id argument.
    :type batch_prediction_job_id: string

    :param project_id: the Google Cloud Platform project id in which to execute
        Cloud ML Batch Prediction and Dataflow jobs. If None, then the `dag`'s
        `default_args['project_id']` will be used.
    :type project_id: string

    :param region: the Google Cloud Platform region in which to execute Cloud ML
        Batch Prediction and Dataflow jobs. If None, then the `dag`'s
        `default_args['region']` will be used.
    :type region: string

    :param dataflow_options: options to run Dataflow jobs. If None, then the
        `dag`'s `default_args['dataflow_default_options']` will be used.
    :type dataflow_options: dictionary

    :param model_uri: GCS path of the model exported by Tensorflow using
        tensorflow.estimator.export_savedmodel(). It cannot be used with
        model_name or version_name below. See MLEngineBatchPredictionOperator for
        more detail.
    :type model_uri: string

    :param model_name: Used to indicate a model to use for prediction. Can be
        used in combination with version_name, but cannot be used together with
        model_uri. See MLEngineBatchPredictionOperator for more detail. If None,
        then the `dag`'s `default_args['model_name']` will be used.
    :type model_name: string

    :param version_name: Used to indicate a model version to use for prediction,
        in combination with model_name. Cannot be used together with model_uri.
        See MLEngineBatchPredictionOperator for more detail. If None, then the
        `dag`'s `default_args['version_name']` will be used.
    :type version_name: string

    :param dag: The `DAG` to use for all Operators.
    :type dag: airflow.DAG

    :returns: a tuple of three operators, (prediction, summary, validation)
    :rtype: tuple(DataFlowPythonOperator, DataFlowPythonOperator,
                  PythonOperator)
    """

    # Verify that task_prefix doesn't have any special characters except hyphen
    # '-', which is the only allowed non-alphanumeric character by Dataflow.
    if not re.match(r"^[a-zA-Z][-A-Za-z0-9]*$", task_prefix):
        raise AirflowException(
            "Malformed task_id for DataFlowPythonOperator (only alphanumeric "
            "and hyphens are allowed but got: " + task_prefix)

    metric_fn, metric_keys = metric_fn_and_keys
    if not callable(metric_fn):
        raise AirflowException("`metric_fn` param must be callable.")
    if not callable(validate_fn):
        raise AirflowException("`validate_fn` param must be callable.")

    if dag is not None and dag.default_args is not None:
        default_args = dag.default_args
        project_id = project_id or default_args.get('project_id')
        region = region or default_args.get('region')
        model_name = model_name or default_args.get('model_name')
        version_name = version_name or default_args.get('version_name')
        dataflow_options = dataflow_options or \
            default_args.get('dataflow_default_options')

    evaluate_prediction = MLEngineBatchPredictionOperator(
        task_id=(task_prefix + "-prediction"),
        project_id=project_id,
        job_id=batch_prediction_job_id,
        region=region,
        data_format=data_format,
        input_paths=input_paths,
        output_path=prediction_path,
        uri=model_uri,
        model_name=model_name,
        version_name=version_name,
        dag=dag)

    metric_fn_encoded = base64.b64encode(dill.dumps(metric_fn, recurse=True))
    evaluate_summary = DataFlowPythonOperator(
        task_id=(task_prefix + "-summary"),
        py_options=["-m"],
        py_file="airflow.contrib.operators.mlengine_prediction_summary",
        dataflow_default_options=dataflow_options,
        options={
            "prediction_path": prediction_path,
            "metric_fn_encoded": metric_fn_encoded,
            "metric_keys": ','.join(metric_keys)
        },
        dag=dag)
    evaluate_summary.set_upstream(evaluate_prediction)

    def apply_validate_fn(*args, **kwargs):
        prediction_path = kwargs["templates_dict"]["prediction_path"]
        scheme, bucket, obj, _, _ = urlsplit(prediction_path)
        if scheme != "gs" or not bucket or not obj:
            raise ValueError("Wrong format prediction_path: %s",
                             prediction_path)
        summary = os.path.join(obj.strip("/"),
                               "prediction.summary.json")
        gcs_hook = GoogleCloudStorageHook()
        summary = json.loads(gcs_hook.download(bucket, summary))
        return validate_fn(summary)

    evaluate_validation = PythonOperator(
        task_id=(task_prefix + "-validation"),
        python_callable=apply_validate_fn,
        provide_context=True,
        templates_dict={"prediction_path": prediction_path},
        dag=dag)
    evaluate_validation.set_upstream(evaluate_summary)

    return evaluate_prediction, evaluate_summary, evaluate_validation
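
This variant can also fall back to the DAG's default_args for project_id, region, model_name, version_name and dataflow_default_options. A sketch under that assumption, with all names and values illustrative (and assuming the usual airflow DAG and datetime imports); get_metric_fn_and_keys and validate_err_and_count are the docstring's example callables.

default_args = {
    'start_date': datetime.datetime(2021, 1, 1),           # assumed
    'project_id': 'my-project',                            # used when project_id is None
    'region': 'us-central1',                               # used when region is None
    'model_name': 'my_model',                              # used when model_name is None
    'version_name': 'v1',                                  # used when version_name is None
    'dataflow_default_options': {'tempLocation': 'gs://my-bucket/tmp'},  # assumed
}
dag = DAG('evaluate_model', schedule_interval=None, default_args=default_args)

pred, summary, val = create_evaluate_ops(
    task_prefix='eval-model',
    data_format='TEXT',
    input_paths=['gs://my-bucket/eval/*.json'],            # assumed
    prediction_path='gs://my-bucket/eval-output/',         # assumed
    metric_fn_and_keys=get_metric_fn_and_keys(),           # docstring example
    validate_fn=validate_err_and_count,                    # docstring example
    batch_prediction_job_id='eval_model_job',              # assumed
    dag=dag)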
Example No. 25
                schedule_interval=None,
                default_args=default_dag_args) as dag:

    create_locations = BashOperator(
        task_id='create_locations',
        bash_command=
        'bq query --use_legacy_sql=false "call covid19_confirmed.create_locations()"'
    )

    transpose_cases = DataFlowPythonOperator(
        task_id='transpose_cases',
        py_file='/home/airflow/gcs/data/transpose_cases.py',
        gcp_conn_id='google_cloud_default',
        options={
            "num-workers": '3',
            "jobname": 'transpose'
        },
        dataflow_default_options={
            "project": 'data-lake-290221',
            "staging_location": 'gs://dataflow-log-data/staging/',
            "temp_location": 'gs://dataflow-log-data/temp/',
            "region": 'us-central1'
        })

    create_aggregations = BashOperator(
        task_id='create_aggregations',
        bash_command=
        'bq query --use_legacy_sql=false "call covid19_confirmed.create_aggregations()"'
    )

    create_locations >> transpose_cases >> create_aggregations
Example No. 26
    task_id='registered_businesses_gcs',
    image='gcr.io/data-rivers/pgh-finance',
    api_version='auto',
    auto_remove=True,
    environment={
        'ISAT_UN': os.environ['ISAT_UN'],
        'ISAT_PW': os.environ['ISAT_PW'],
        'GCS_AUTH_FILE': '/root/finance-open-data/data-rivers-service-acct.json'
    },
    dag=dag
)


dataflow_task = DataFlowPythonOperator(
    task_id='registered_businesses_dataflow',
    job_name='registered-businesses-dataflow_scripts',
    py_file=(os.getcwd() + '/airflow_scripts/dags/dependencies/dataflow_scripts/registered_businesses_dataflow.py'),
    dag=dag
)


# bq_insert = GoogleCloudStorageToBigQueryOperator(
#     task_id='registered_businesses_bq_insert',
#     destination_project_dataset_table='{}:registered_businesses.registered_businesses'.format(os.environ['GCP_PROJECT']),
#     bucket='{}_finance'.format(os.environ['GCS_PREFIX']),
#     source_objects=["avro_output/{}/{}/{}/*.avro".format(dt.strftime('%Y'),
#                                                          dt.strftime('%m').lower(),
#                                                          dt.strftime("%Y-%m-%d"))],
#     write_disposition='WRITE_APPEND',
#     source_format='AVRO',
#     time_partitioning={'type': 'DAY'},
#     dag=dag
Example No. 27
    dag=dag,
)

dataproc_delete_cluster = DataprocClusterDeleteOperator(
    task_id="delete_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    dag=dag,
    project_id=PROJECT_ID,
    trigger_rule=TriggerRule.ALL_DONE,
)

flow_to_bq = DataFlowPythonOperator(
    task_id="land_registry_prices_to_bigquery",
    dataflow_default_options={
        "project": "gdd-eb47dfd7557212651320890d28",
        "region": "europe-west1",
    },
    py_file="gs://airflow-training-arjan/dataflow_job.py",
    dag=dag,
)

gcs_to_bq = GoogleCloudStorageToBigQueryOperator(
    task_id="write_to_bq",
    bucket=BUCKET,
    source_objects=["average_prices/transfer_date={{ ds }}/*.parquet"],
    destination_project_dataset_table=PROJECT_ID +
    ":prices.land_registry_price${{ ds_nodash }}",
    source_format="PARQUET",
    write_disposition="WRITE_TRUNCATE",
    dag=dag,
)
Example No. 28
import datetime

from airflow import models
from airflow.contrib.operators.dataflow_operator import DataFlowPythonOperator
from airflow.operators import BashOperator
yesterday = datetime.datetime.combine(
    datetime.datetime.today() - datetime.timedelta(1),
    datetime.datetime.min.time())

default_args = {'start_date': yesterday}

with models.DAG('dataflow_python_dag',
                schedule_interval=None,
                default_args=default_args) as dag:

    bash_nothing = BashOperator(task_id='nothing_2',
                                bash_command='echo nothing')

    run_dataflow_python = DataFlowPythonOperator(
        task_id='dataflow_python_task',
        py_file='/home/datatalkswithsadeeq/stocks-project/wordcount.py',
        options={
            'runner': 'DataflowRunner',
            'output': 'gs://stocks-project-2/out1',
            'temp_location': 'gs://stocks-project-2/temp1',
            'staging_location': 'gs://stocks-project-2/staging1',
            'project': 'stocks-project-2'
        },
        gcp_conn_id='cloud-dataflow-hijo-project-from-location')
    bash_nothing >> run_dataflow_python
Example No. 29
                                                        cluster_name="analyse-pricing-{{ ds }}",
                                                        project_id='airflowbolcom-may2829-aaadbb22',
                                                        dag=dag)

gcstobq = GoogleCloudStorageToBigQueryOperator(task_id="gcs_to_bq",
                                               bucket="bvb-data",
                                               source_objects=["output_file_{{ ds }}/part-*"],
                                               destination_project_dataset_table="airflowbolcom-may2829-aaadbb22:prices.land_registry_price${{ ds_nodash }}",
                                               source_format="PARQUET",
                                               write_disposition="WRITE_TRUNCATE",
                                               autodetect=True,
                                               dag=dag)

raw_into_bigquery = DataFlowPythonOperator(task_id="raw_into_bigquery",
                                           dataflow_default_options={"project": 'airflowbolcom-may2829-aaadbb22',
                                                                     "region": "europe-west1",
                                                                     "staging_location": "gs://bvb-data/stg",
                                                                     "temp_location": "gs://bvb-data/tmp"},
                                           py_file="gs://europe-west1-training-airfl-4ecc4ae4-bucket/dataflow_job.py",
                                           options={'input': "gs://bvb-data/daily_load_{{ ds }}",
                                                    'table': "land_registry_price_{{ ds_nodash }}",
                                                    'dataset': "raw_data"},
                                           dag=dag)


pgsl_to_gcs >> dataproc_create_cluster
http_to_gcs >> dataproc_create_cluster
dataproc_create_cluster >> compute_aggregates >> dataproc_delete_cluster
compute_aggregates >> gcstobq
pgsl_to_gcs >> raw_into_bigquery
Example No. 30
        for question in api_dict:
            final_json += str(question) + '\n'

        final_json = final_json.replace("True",
                                        "true").replace("False", "false")

        json_comments = open(path, "a")
        json_comments.write(final_json)
        json_comments.close()

    # Questions metadata

    APITags = DataFlowPythonOperator(py_file=pipeline_api_tags,
                                     options={
                                         'input': tags_path,
                                         'temp_location': temp_bucket,
                                         'project': project
                                     },
                                     task_id='apicallpipeline')

    # Comments and Answers reports

    sql = 'SELECT question_id FROM `{0}.{1}` WHERE creation_date >= TIMESTAMP("{2}")'.format(
        dataset, table_question, yesterday_dash_string)

    Query = python_operator.PythonOperator(task_id='Query',
                                           python_callable=QueryToGCS,
                                           op_kwargs={'sql': sql})

    CommentsExport = python_operator.PythonOperator(
        task_id='CommentsExport', python_callable=CommentsToGCS)
Example No. 31
#     task_id='house_members_df',
#     python_callable=df.house_members_df.main(), ### Needs to change, need --setup_file= and --experiments=allow_non_updatable_job_parameter
#     retries=2,
#     dag=dag
# )

# t3 = BashOperator(
#     task_id='df_v2',
#     bash_command='cd ../house_members_dataflow \
#     py -m house_members_df --setup_file=./setup.py --experiments=allow_non_updatable_job_parameter',
#     dag=dag
# )

t3 = DataFlowPythonOperator(
    task_id='df_v3',
    py_file='../house_members_dataflow/house_members_df.py',
    py_options=[
        '--setup_file={0}'.format('../house_members_dataflow/setup.py'),
        '--experiments=allow_non_updatable_job_parameter'
    ],
    dag=dag)

# If successful, clean out temp CSV files

t4 = BashOperator(task_id='clean_up',
                  bash_command='gsutil rm {0}'.format(gcs_path),
                  dag=dag)

# Build task pipeline order

t1 >> t2 >> t3 >> t4
Example No. 32
    'gcp_conn_id': CONNECTION_ID,
    'google_cloud_conn_id': CONNECTION_ID,
    'write_disposition': 'WRITE_TRUNCATE',
    'allow_large_results': True,
}

with DAG('sandvik_template',
         schedule_interval=timedelta(days=1),
         default_args=default_args) as dag:

    dataflow_test = DataFlowPythonOperator(
        task_id='dataflow-test',
        py_file=Variable.get('DATAFLOW_WRAPPER_STUB'),
        options=dict(
            command='{docker_run} {docker_image} dataflow_test'.format(
                **config),
            startup_log_file=pp.join(Variable.get('DATAFLOW_WRAPPER_LOG_PATH'),
                                     'sandvik_nlp/dataflow_test.log'),
            dest='gs://{sandvikline_bucket}/{output_path}'.format(**config),
            project=config['project_id'],
            runner='DataflowRunner',
            temp_location='gs://{temp_bucket}/dataflow_temp'.format(**config),
            staging_location='gs://{temp_bucket}/dataflow_staging'.format(
                **config),
            disk_size_gb="50",
            max_num_workers="4",
            requirements_file='./requirements.txt',
            setup_file='./setup.py',
        ),
        dag=dag)
Example No. 33
# --------------------------------------------------------------------------------

yesterday = datetime.datetime.now() - datetime.timedelta(days=1)

default_args = {
    'owner': 'airflow',
    'start_date': yesterday,
    'depends_on_past': False,
    'email': [''],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'dataflow_default_options': {
        "project": 'micro-store-218714',
        "zone": 'us-east1-b',
        "stagingLocation": 'gs://baketto1/staging'}
}

dag = DAG(
    dag_id='juliaset', 
    default_args=default_args, 
    schedule_interval=None
    )

task1 = DataFlowPythonOperator(
    task_id='trigger_dataflow_from_airflow',
    py_file='gs://baketto1/juliaset/juliaset_main.py',
    dag=dag
)
Example No. 34
)

dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="create_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id="gdd-05b583b94256b6965bb8c8119a",
    num_workers=2,
    zone="europe-west4-a",
    dag=dag,
)

dataflow_job = DataFlowPythonOperator(
    task_id="land_registry_prices_to_bigquery",
    dataflow_default_options={
        "project": "gdd-05b583b94256b6965bb8c8119a",
        "region": "europe-west1",
    },
    py_file="gs://airflow_training/other/dataflow_job.py",
    dag=dag,
)

for currency in {"EUR", "USD"}:
    http_to_gcs = HttpToGcsOperator(
        task_id="get_currency_" + currency,
        method="GET",
        endpoint=("airflow-training-transform-valutas?date={{ ds }}&"
                  "from=GBP&to=" + currency),
        bucket="airflow_training",
        http_conn_id="airflow-training-currency-http",
        gcs_conn_id="airflow-training-storage-bucket",
        gcs_path="currency/{{ ds }}-" + currency + ".json",