Example #1
    def setUp(self):
        self.key = "test_dag_id"

        task = DummyOperator(task_id='dummy',
                             dag=models.DAG(dag_id=self.key,
                                            default_args={'start_date': days_ago(2)}),
                             owner='airflow')

        d = days_ago(1)
        with create_session() as session:
            session.add(DM(dag_id=self.key))
            session.add(DR(dag_id=self.key))
            session.add(TI(task=task,
                           execution_date=d,
                           state=State.SUCCESS))
            # flush to ensure the task instance is written before the
            # task reschedule because of the FK constraint
            session.flush()
            session.add(LOG(dag_id=self.key, task_id=None, task_instance=None,
                            execution_date=d, event="varimport"))
            session.add(TF(task=task, execution_date=d,
                           start_date=d, end_date=d))
            session.add(TR(task=task, execution_date=d,
                           start_date=d, end_date=d,
                           try_number=1, reschedule_date=d))
Example #2
 def setUpClass(cls):
     dagbag = models.DagBag(include_examples=True)
     cls.dag1 = dagbag.dags['example_bash_operator']
     cls.dag1.sync_to_db()
     cls.dag2 = dagbag.dags['example_subdag_operator']
     cls.dag2.sync_to_db()
     cls.execution_dates = [days_ago(2), days_ago(1)]
Example #3
    def setUp(self):
        self.dagbag = models.DagBag(include_examples=True)
        self.dag1 = self.dagbag.dags['example_bash_operator']
        self.dag2 = self.dagbag.dags['example_subdag_operator']

        self.execution_dates = [days_ago(2), days_ago(1), days_ago(0)]

        self.session = Session()
Example #4
    def setUp(self):
        self.session = settings.Session()
        self.key = "test_dag_id"

        task = DummyOperator(task_id='dummy',
                             dag=models.DAG(dag_id=self.key,
                                            default_args={'start_date': days_ago(2)}),
                             owner='airflow')

        self.session.add(DM(dag_id=self.key))
        self.session.add(DR(dag_id=self.key))
        self.session.add(TI(task=task,
                            execution_date=days_ago(1),
                            state=State.SUCCESS))
        self.session.add(LOG(dag_id=self.key, task_id=None, task_instance=None,
                             execution_date=days_ago(1), event="varimport"))

        self.session.commit()
Example #5
    def setUp(self):
        self.dagbag = models.DagBag(include_examples=True)
        self.dag1 = self.dagbag.dags['example_bash_operator']
        self.dag2 = self.dagbag.dags['example_subdag_operator']

        self.execution_dates = [days_ago(2), days_ago(1)]

        drs = _create_dagruns(self.dag1, self.execution_dates,
                              state=State.RUNNING,
                              run_id_template="scheduled__{}")
        for dr in drs:
            dr.dag = self.dag1
            dr.verify_integrity()

        drs = _create_dagruns(self.dag2,
                              [self.dag2.default_args['start_date']],
                              state=State.RUNNING,
                              run_id_template="scheduled__{}")

        for dr in drs:
            dr.dag = self.dag2
            dr.verify_integrity()
Example #6
    def test_days_ago(self):
        today = pendulum.today()
        today_midnight = pendulum.instance(datetime.fromordinal(today.date().toordinal()))

        self.assertTrue(dates.days_ago(0) == today_midnight)

        self.assertTrue(dates.days_ago(100) == today_midnight + timedelta(days=-100))

        self.assertTrue(dates.days_ago(0, hour=3) == today_midnight + timedelta(hours=3))
        self.assertTrue(dates.days_ago(0, minute=3) == today_midnight + timedelta(minutes=3))
        self.assertTrue(dates.days_ago(0, second=3) == today_midnight + timedelta(seconds=3))
        self.assertTrue(dates.days_ago(0, microsecond=3) == today_midnight + timedelta(microseconds=3))
Example #7
# Example dataset
DATASET = {
    "display_name": "test_video_dataset",
    "video_classification_dataset_metadata": {},
}

IMPORT_INPUT_CONFIG = {"gcs_source": {"input_uris": [GCP_AUTOML_VIDEO_BUCKET]}}

extract_object_id = CloudAutoMLHook.extract_object_id

# Example DAG for AutoML Video Intelligence Classification
with models.DAG(
        "example_automl_video",
        schedule_interval=None,  # Override to match your needs
        start_date=days_ago(1),
        user_defined_macros={"extract_object_id": extract_object_id},
        tags=['example'],
) as example_dag:
    create_dataset_task = AutoMLCreateDatasetOperator(
        task_id="create_dataset_task",
        dataset=DATASET,
        location=GCP_AUTOML_LOCATION)

    dataset_id = create_dataset_task.output["dataset_id"]

    import_dataset_task = AutoMLImportDataOperator(
        task_id="import_dataset_task",
        dataset_id=dataset_id,
        location=GCP_AUTOML_LOCATION,
        input_config=IMPORT_INPUT_CONFIG,
Example #8
    # 'wait_for_downstream': False,
    # 'dag': dag,
    # 'sla': timedelta(hours=2),
    # 'execution_timeout': timedelta(seconds=300),
    # 'on_failure_callback': some_function,
    # 'on_success_callback': some_other_function,
    # 'on_retry_callback': another_function,
    # 'sla_miss_callback': yet_another_function,
    # 'trigger_rule': 'all_success'
}
dag = DAG(
    'blog_example',
    default_args=default_args,
    description='Example dag for airflow blog',
    schedule_interval="0 10 * * *",
    start_date=days_ago(2),
    tags=['example'],
)

p1 = PythonOperator(
    task_id='spider',
    python_callable=spider,
    dag=dag,
)

p2 = PythonOperator(
    task_id='read_db',
    python_callable=read_db,
    dag=dag,
)
Example #9
from __future__ import print_function

import xarray as xa
import codecs, pickle, time
from builtins import range
from pprint import pprint
import airflow
from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago

args = {
    'owner': 'airflow',
    'start_date': days_ago(2),
}

dag = DAG(
    dag_id='ILTest-xarray1',
    default_args=args,
    schedule_interval=None,
)


def print_context(ds, **kwargs):
    pprint(kwargs)
    print(ds)
    return 'Whatever you return gets printed in the logs'


op_print_context = PythonOperator(
    task_id='print_the_context',
Example #10
    failed_alert = MessageOperator(
        task_id="failed_alert",
        http_conn_id="slack",
        webhook_token=slack_webhook_token,
        message=f"{cleandoc(message)}\n\n{formatted_exception}",
        username="******",
    )
    return failed_alert.execute(context=context)


# set the local time zone, so the start_date DAG param can use it in its context
# as stated in the Airflow docs, pendulum must be used to set the timezone
amsterdam = pendulum.timezone("Europe/Amsterdam")

# set start_date to 'yesterday', and get the year, month and day as separate integer values
start_date_dag = str(days_ago(1))
YYYY = 0
MM = 0
DD = 0

# extract the YYYY, MM and DD values as integers
get_YYYY_MM_DD_values = re.search("([0-9]{4})-([0-9]{2})-([0-9]{2})",
                                  start_date_dag)
if get_YYYY_MM_DD_values:
    YYYY = int(get_YYYY_MM_DD_values.group(1))
    MM = int(get_YYYY_MM_DD_values.group(2))
    DD = int(get_YYYY_MM_DD_values.group(3))
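# Note (not in the original snippet): days_ago(1) returns a timezone-aware datetime,
# so the year, month and day could likely be read straight from its attributes
# instead of parsing the string form, e.g.:
#   d = days_ago(1)
#   YYYY, MM, DD = d.year, d.month, d.day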

default_args = {
    "owner": "dataservices",
    "depends_on_past": False,
Example #11
import os
import datetime

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'Airflow',
    'depends_on_past': False,
    'email': os.environ['FAILURE_EMAIL'],
    'start_date': days_ago(0),
    'email_on_failure': True,
}

dag = DAG(dag_id='games',
          default_args=default_args,
          schedule_interval="* * * * *")

t1 = BashOperator(task_id='sklearn_pipeline',
                  bash_command='sudo docker run sklearn_pipeline',
                  dag=dag)
Example #12
TODO: Review the workflow, adapt it to
      your environment and enable the code.
"""

from datetime import timedelta

from airflow import DAG
from airflow.operators.bash import BashOperator
from airflow.operators.python import ShortCircuitOperator
from airflow.providers.docker.operators.docker import DockerOperator
from airflow.utils.dates import days_ago

default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "start_date": days_ago(2),
    "email": ["*****@*****.**"],
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
}

dag = DAG("docker_sample_copy_data",
          default_args=default_args,
          schedule_interval=timedelta(minutes=10))

locate_file_cmd = """
    sleep 10
    find {{params.source_location}} -type f  -printf "%f\n" | head -1
"""
Example #13
from datetime import timedelta

from airflow import DAG
from airflow.utils.dates import days_ago

from dag_test_examples import t_A, t_B

default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "start_date": days_ago(2),
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
    "dbnd_config": {
        "databand": {
            "env": "gcp"
        }
    },
}

with DAG(dag_id="dbnd_dag_at_gcp", default_args=default_args) as dag_remote_fs:
    a = t_A()
    b = t_B(a)

if __name__ == "__main__":
    dag_remote_fs.clear()
    dag_remote_fs.run(start_date=days_ago(0), end_date=days_ago(0))
Example #14
from airflow.providers.docker.operators.docker import DockerOperator
from airflow.utils.dates import days_ago
DATA_PATH = "/Users/mariapopova/Documents/GitHub/chydlife/airflow_ml_dags/data:/data"

default_args = {
    "owner": "airflow",
    "email": ["*****@*****.**"],
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
}

with DAG(
        "download-train-validate",
        default_args=default_args,
        schedule_interval="@daily",
        start_date=days_ago(5),
) as dag:
    download = DockerOperator(
        image="airflow-download",
        command="/data/raw/{{ ds }}",
        network_mode="bridge",
        task_id="docker-airflow-download",
        do_xcom_push=False,
        # !!! HOST folder(NOT IN CONTAINER) replace with yours !!!
        volumes=[DATA_PATH])

    preprocess = DockerOperator(
        image="airflow-preprocess",
        command=
        "--input-dir /data/raw/{{ ds }} --output-dir /data/processed/{{ ds }}",
        task_id="docker-airflow-preprocess",
Example #15
from airflow import DAG
from airflow.models import Variable
from airflow.operators.python_operator import PythonOperator
from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator
from airflow.contrib.operators.bigquery_operator import BigQueryOperator

from airflow.utils.dates import days_ago

PROJECT_ID = Variable.get("project")
LANDING_BUCKET = Variable.get("landing_bucket")
BACKUP_BUCKET = Variable.get("backup_bucket")

default_arguments = {"owner": "YOUR-NAME-HERE", "start_date": days_ago(1)}


def list_objects(bucket=None):
    hook = GoogleCloudStorageHook()
    storage_objects = hook.list(bucket)

    return storage_objects


def move_objects(source_bucket=None, destination_bucket=None, prefix=None, **kwargs):

    storage_objects = kwargs["ti"].xcom_pull(task_ids="list_files")

    hook = GoogleCloudStorageHook()

    for storage_object in storage_objects:
        destination_object = storage_object
Example #16
DEFAULT_ARGS = {"owner": "airflow"}


class GetRequestOperator(BaseOperator):
    """Custom operator to sand GET request to provided url"""

    def __init__(self, *, url: str, **kwargs):
        super().__init__(**kwargs)
        self.url = url

    def execute(self, context):
        return requests.get(self.url).json()


# [START dag_decorator_usage]
@dag(default_args=DEFAULT_ARGS, schedule_interval=None, start_date=days_ago(2))
def example_dag_decorator(email: str = '*****@*****.**'):
    """
    DAG to send server IP to email.

    :param email: Email to send IP to. Defaults to [email protected].
    :type email: str
    """
    get_ip = GetRequestOperator(task_id='get_ip', url="http://httpbin.org/get")

    @task(multiple_outputs=True)
    def prepare_email(raw_json: Dict[str, Any]) -> Dict[str, str]:
        external_ip = raw_json['origin']
        return {
            'subject': f'Server connected from {external_ip}',
            'body': f'Seems like today your server executing Airflow is connected from IP {external_ip}<br>',
Example #17
class DummySkipOperator(DummyOperator):
    """Dummy operator which always skips the task."""

    ui_color = '#e8b7e4'

    def execute(self, context):
        raise AirflowSkipException


def create_test_pipeline(suffix, trigger_rule, dag_):
    """
    Instantiate a number of operators for the given DAG.

    :param str suffix: Suffix to append to the operator task_ids
    :param str trigger_rule: TriggerRule for the join task
    :param DAG dag_: The DAG to run the operators on
    """
    skip_operator = DummySkipOperator(task_id=f'skip_operator_{suffix}', dag=dag_)
    always_true = DummyOperator(task_id=f'always_true_{suffix}', dag=dag_)
    join = DummyOperator(task_id=trigger_rule, dag=dag_, trigger_rule=trigger_rule)
    final = DummyOperator(task_id=f'final_{suffix}', dag=dag_)

    skip_operator >> join
    always_true >> join
    join >> final


dag = DAG(dag_id='example_skip_dag', default_args=args, start_date=days_ago(2), tags=['example'])
create_test_pipeline('1', 'all_success', dag)
create_test_pipeline('2', 'one_success', dag)
Example #18
def test_lineage_backend(mock_emit, inlets, outlets):
    DEFAULT_DATE = days_ago(2)
    mock_emitter = Mock()
    mock_emit.return_value = mock_emitter
    # Using autospec on xcom_pull and xcom_push methods fails on Python 3.6.
    with mock.patch.dict(
            os.environ,
        {
            "AIRFLOW__LINEAGE__BACKEND":
            "datahub_provider.lineage.datahub.DatahubLineageBackend",
            "AIRFLOW__LINEAGE__DATAHUB_CONN_ID":
            datahub_rest_connection_config.conn_id,
            "AIRFLOW__LINEAGE__DATAHUB_KWARGS":
            json.dumps({
                "graceful_exceptions": False,
                "capture_executions": False
            }),
        },
    ), mock.patch("airflow.models.BaseOperator.xcom_pull"), mock.patch(
            "airflow.models.BaseOperator.xcom_push"), patch_airflow_connection(
                datahub_rest_connection_config):
        func = mock.Mock()
        func.__name__ = "foo"

        dag = DAG(dag_id="test_lineage_is_sent_to_backend",
                  start_date=DEFAULT_DATE)

        with dag:
            op1 = DummyOperator(
                task_id="task1_upstream",
                inlets=inlets,
                outlets=outlets,
            )
            op2 = DummyOperator(
                task_id="task2",
                inlets=inlets,
                outlets=outlets,
            )
            op1 >> op2

        # Airflow < 2.2 requires the execution_date parameter. Newer Airflow
        # versions do not require it, but will attempt to find the associated
        # run_id in the database if execution_date is provided. As such, we
        # must fake the run_id parameter for newer Airflow versions.
        if AIRFLOW_VERSION < packaging.version.parse("2.2.0"):
            ti = TaskInstance(task=op2, execution_date=DEFAULT_DATE)
        else:
            ti = TaskInstance(task=op2, run_id=f"test_airflow-{DEFAULT_DATE}")
        ctx1 = {
            "dag": dag,
            "task": op2,
            "ti": ti,
            "task_instance": ti,
            "execution_date": DEFAULT_DATE,
            "ts": "2021-04-08T00:54:25.771575+00:00",
        }

        prep = prepare_lineage(func)
        prep(op2, ctx1)
        post = apply_lineage(func)
        post(op2, ctx1)

        # Verify that the inlets and outlets are registered and recognized by Airflow correctly,
        # or that our lineage backend forces it to.
        assert len(op2.inlets) == 1
        assert len(op2.outlets) == 1
        assert all(map(lambda let: isinstance(let, Dataset), op2.inlets))
        assert all(map(lambda let: isinstance(let, Dataset), op2.outlets))

        # Check that the right things were emitted.
        assert mock_emitter.emit.call_count == 9
        # Running further checks based on python version because args only exists in python 3.7+
        if sys.version_info[:3] > (3, 7):
            assert mock_emitter.method_calls[0].args[
                0].aspectName == "dataFlowInfo"
            assert (
                mock_emitter.method_calls[0].args[0].entityUrn ==
                "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)"
            )

            assert mock_emitter.method_calls[1].args[
                0].aspectName == "ownership"
            assert (
                mock_emitter.method_calls[1].args[0].entityUrn ==
                "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)"
            )

            assert mock_emitter.method_calls[2].args[
                0].aspectName == "globalTags"
            assert (
                mock_emitter.method_calls[2].args[0].entityUrn ==
                "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)"
            )

            assert mock_emitter.method_calls[3].args[
                0].aspectName == "dataJobInfo"
            assert (
                mock_emitter.method_calls[3].args[0].entityUrn ==
                "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )

            assert (mock_emitter.method_calls[4].args[0].aspectName ==
                    "dataJobInputOutput")
            assert (
                mock_emitter.method_calls[4].args[0].entityUrn ==
                "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )
            assert (
                mock_emitter.method_calls[4].args[0].aspect.inputDatajobs[0] ==
                "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task1_upstream)"
            )
            assert (
                mock_emitter.method_calls[4].args[0].aspect.inputDatasets[0] ==
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableConsumed,PROD)"
            )
            assert (
                mock_emitter.method_calls[4].args[0].aspect.outputDatasets[0]
                ==
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableProduced,PROD)"
            )

            assert mock_emitter.method_calls[5].args[0].aspectName == "status"
            assert (
                mock_emitter.method_calls[5].args[0].entityUrn ==
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableConsumed,PROD)"
            )

            assert mock_emitter.method_calls[6].args[0].aspectName == "status"
            assert (
                mock_emitter.method_calls[6].args[0].entityUrn ==
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableProduced,PROD)"
            )

            assert mock_emitter.method_calls[7].args[
                0].aspectName == "ownership"
            assert (
                mock_emitter.method_calls[7].args[0].entityUrn ==
                "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )

            assert mock_emitter.method_calls[8].args[
                0].aspectName == "globalTags"
            assert (
                mock_emitter.method_calls[8].args[0].entityUrn ==
                "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )
Example #19
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# [START composer_grouping_airflow_1]

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.utils.dates import days_ago

DAG_NAME = 'all_tasks_in_one_dag'

args = {'owner': 'airflow', 'start_date': days_ago(1), 'schedule_interval': "@once"}

with DAG(dag_id=DAG_NAME, default_args=args) as dag:

    start = DummyOperator(
        task_id='start'
    )

    task_1 = BashOperator(
        task_id='op-1',
        bash_command=':',
        dag=dag)

    task_2 = BashOperator(
        task_id='op-2',
        bash_command=':',
Example #20
    # 'wait_for_downstream': False,
    # 'dag': dag,
    # 'sla': timedelta(hours=2),
    # 'execution_timeout': timedelta(seconds=300),
    # 'on_failure_callback': some_function,
    # 'on_success_callback': some_other_function,
    # 'on_retry_callback': another_function,
    # 'sla_miss_callback': yet_another_function,
    # 'trigger_rule': 'all_success'
}
with DAG(
        'wf_aws_driver',
        default_args=default_args,
        description='A simple test trigger another DAG',
        schedule_interval=None,
        start_date=days_ago(0),
        tags=['test'],
) as dag:
    dag.doc_md = dedent("""\
    Test WF that triggers another WS
    """)

    aws_test2 = LivyOperator(
        task_id='aws_test2',
        dag=dag,
        livy_conn_id='livy_default',
        file=
        's3a://datagram/user/root/deployments/autogenerated_tr_aws_test_2/ru.neoflex.meta.etl2.spark.aws_test_2-1.0'
        '-SNAPSHOT.jar',
        proxy_user='******',
        args=[
Example #21
GCS_STAGING = os.environ.get('GCP_DATAFLOW_GCS_STAGING',
                             'gs://test-dataflow-example/staging/')
GCS_OUTPUT = os.environ.get('GCP_DATAFLOW_GCS_OUTPUT',
                            'gs://test-dataflow-example/output')
GCS_JAR = os.environ.get(
    'GCP_DATAFLOW_JAR',
    'gs://test-dataflow-example/word-count-beam-bundled-0.1.jar')
GCS_PYTHON = os.environ.get(
    'GCP_DATAFLOW_PYTHON', 'gs://test-dataflow-example/wordcount_debugging.py')

GCS_JAR_PARTS = urlparse(GCS_JAR)
GCS_JAR_BUCKET_NAME = GCS_JAR_PARTS.netloc
GCS_JAR_OBJECT_NAME = GCS_JAR_PARTS.path[1:]

default_args = {
    "start_date": days_ago(1),
    'dataflow_default_options': {
        'tempLocation': GCS_TMP,
        'stagingLocation': GCS_STAGING,
    }
}

with models.DAG(
        "example_gcp_dataflow_native_java",
        default_args=default_args,
        schedule_interval=None,  # Override to match your needs
        tags=['example'],
) as dag_native_java:

    # [START howto_operator_start_java_job]
    start_java_job = DataflowCreateJavaJobOperator(
Example #22
    def __init__(
            self,
            dag,
            name,
            image=None,
            # Directories
            operator_out_dir=None,
            input_operator=None,
            # Airflow
            task_id=None,
            parallel_id=None,
            trigger_rule=TriggerRule.ALL_SUCCESS,
            ram_mem_mb=500,
            ram_mem_mb_lmt=None,
            cpu_millicores=None,
            cpu_millicores_lmt=None,
            gpu_mem_mb=None,
            gpu_mem_mb_lmt=None,
            retries=1,
            retry_delay=timedelta(seconds=60),
            priority_weight=1,
            execution_timeout=timedelta(minutes=90),
            task_concurrency=None,
            manage_cache=None,
            # Other stuff
            cmds=None,
            arguments=None,
            env_vars=None,
            image_pull_secrets=None,
            startup_timeout_seconds=120,
            namespace='flow-jobs',
            image_pull_policy=os.getenv('PULL_POLICY_PODS', 'IfNotPresent'),
            training_operator=False,
            volume_mounts=None,
            volumes=None,
            pod_resources=None,
            enable_proxy=False,
            host_network=False,
            in_cluster=False,
            cluster_context=None,
            labels=None,
            get_logs=True,
            annotations=None,
            affinity=None,
            config_file=None,
            xcom_push=False,
            node_selectors=None,
            secrets=None,
            kind="Pod",
            pool=None,
            pool_slots=None,
            api_version="v1",
            *args,
            **kwargs):

        KaapanaBaseOperator.set_defaults(self,
                                         name=name,
                                         task_id=task_id,
                                         operator_out_dir=operator_out_dir,
                                         input_operator=input_operator,
                                         parallel_id=parallel_id,
                                         trigger_rule=trigger_rule,
                                         pool=pool,
                                         pool_slots=pool_slots,
                                         ram_mem_mb=ram_mem_mb,
                                         ram_mem_mb_lmt=ram_mem_mb_lmt,
                                         cpu_millicores=cpu_millicores,
                                         cpu_millicores_lmt=cpu_millicores_lmt,
                                         gpu_mem_mb=gpu_mem_mb,
                                         gpu_mem_mb_lmt=gpu_mem_mb_lmt,
                                         manage_cache=manage_cache)

        # Airflow
        self.retries = retries
        self.priority_weight = priority_weight
        self.execution_timeout = execution_timeout
        self.task_concurrency = task_concurrency
        self.retry_delay = retry_delay

        self.training_operator = training_operator

        # Kubernetes
        self.image = image
        self.env_vars = env_vars or {}
        self.namespace = namespace
        self.cmds = cmds or []
        self.arguments = arguments or []
        self.labels = labels or {}
        self.startup_timeout_seconds = startup_timeout_seconds
        self.volume_mounts = volume_mounts or []
        self.volumes = volumes or []
        self.image_pull_secrets = image_pull_secrets or []
        self.in_cluster = in_cluster
        self.cluster_context = cluster_context
        self.get_logs = get_logs
        self.image_pull_policy = image_pull_policy
        self.node_selectors = node_selectors or {}
        self.annotations = annotations or {}
        self.affinity = affinity or {}
        self.xcom_push = xcom_push
        self.pod_resources = pod_resources or None
        self.config_file = config_file
        self.api_version = api_version
        self.secrets = secrets
        self.kind = kind
        self.data_dir = os.getenv('DATADIR', "")
        self.result_message = None
        self.host_network = host_network
        self.enable_proxy = enable_proxy

        self.volume_mounts.append(
            VolumeMount('dcmdata',
                        mount_path='/data',
                        sub_path=None,
                        read_only=False))
        volume_config = {
            'hostPath': {
                'type': 'DirectoryOrCreate',
                'path': self.data_dir
            }
        }
        self.volumes.append(Volume(name='dcmdata', configs=volume_config))

        if self.training_operator:
            self.volume_mounts.append(
                VolumeMount('tensorboard',
                            mount_path='/tensorboard',
                            sub_path=None,
                            read_only=False))
            tb_config = {
                'hostPath': {
                    'type': 'DirectoryOrCreate',
                    'path': os.path.join(self.data_dir, "tensorboard")
                }
            }
            self.volumes.append(Volume(name='tensorboard', configs=tb_config))

        if self.pod_resources is None:
            pod_resources = PodResources(
                request_cpu="{}m".format(self.cpu_millicores)
                if self.cpu_millicores != None else None,
                limit_cpu="{}m".format(self.cpu_millicores + 100)
                if self.cpu_millicores != None else None,
                request_memory="{}Mi".format(self.ram_mem_mb),
                limit_memory="{}Mi".format(
                    self.ram_mem_mb_lmt if self.
                    ram_mem_mb_lmt is not None else self.ram_mem_mb + 100),
                limit_gpu=1 if self.gpu_mem_mb is not None else None)
            self.pod_resources = pod_resources

        envs = {
            "WORKFLOW_DIR": str(WORKFLOW_DIR),
            "BATCH_NAME": str(BATCH_NAME),
            "OPERATOR_OUT_DIR": str(self.operator_out_dir),
            "OPERATOR_IN_DIR": str(self.operator_in_dir),
            "BATCHES_INPUT_DIR": "/{}/{}".format(WORKFLOW_DIR, BATCH_NAME)
        }

        if http_proxy is not None and http_proxy != "" and self.enable_proxy:
            envs.update({
                "http_proxy": http_proxy,
                "https_proxy": http_proxy,
                "HTTP_PROXY": http_proxy,
                "HTTPS_PROXY": http_proxy,
            })

        envs.update(self.env_vars)
        self.env_vars = envs
        super().__init__(dag=dag,
                         task_id=self.task_id,
                         retries=self.retries,
                         priority_weight=self.priority_weight,
                         execution_timeout=self.execution_timeout,
                         task_concurrency=self.task_concurrency,
                         pool=self.pool,
                         pool_slots=self.pool_slots,
                         retry_delay=self.retry_delay,
                         email=None,
                         email_on_retry=True,
                         email_on_failure=True,
                         start_date=days_ago(0),
                         depends_on_past=False,
                         wait_for_downstream=False,
                         trigger_rule=self.trigger_rule,
                         on_failure_callback=KaapanaBaseOperator.on_failure,
                         on_success_callback=KaapanaBaseOperator.on_success,
                         on_retry_callback=KaapanaBaseOperator.on_retry,
                         on_execute_callback=KaapanaBaseOperator.on_execute,
                         executor_config=self.executor_config,
                         *args,
                         **kwargs)
Example #23
from itertools import cycle
from functools import partial
from datetime import timedelta, datetime

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago


from nb_runner import cycle_exp, cycle_mutate, \
                      cycle_crossover, cycle_combine, cycle_all,\
                      bo_exp, bo_all
from config import cfg

d = days_ago(1)  # + timedelta(hours=10, minutes=31)

default_args = {
    'owner': cfg.OWNER,
    'depends_on_past': False,
    #'start_date':d,
    'email': False,
    'email_on_failure': False,
    'email_on_retry': False,
    # 'retries': 0,  # overridden in the PythonOperator down below
    # 'retry_delay': timedelta(minutes=5),  # overridden in the PythonOperator down below
}

default_pool = cfg.DAG.DEF_POOL
#schedule_interval = cfg.DAG.SCHED_INTERVAL if cfg.DAG.SCHED_INTERVAL else None #@daily
description = cfg.DAG.DESC + '\n' + json.dumps(cfg, indent=4)
Example #24
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago
from datetime import timedelta

default_args = {
    'owner': 'airflow',  # Run as the airflow user
    'depends_on_past': False,
    'start_date': days_ago(2),  # Start immediately
    'email':
    ['*****@*****.**'],  # Email address to send the report to if there is an error.
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=10),
}

dag = DAG(
    dag_id='aadownloaddata',
    default_args=default_args,
    description='descargadedatos',
    dagrun_timeout=timedelta(minutes=2),
    schedule_interval=timedelta(days=1),
)

CreateDir = BashOperator(task_id='create_dir',
                         depends_on_past=False,
                         bash_command='mkdir -p /tmp/airflow/p2/',
                         dag=dag)
Example #25
from airflow.operators.python_operator import PythonOperator
from airflow.settings import Session
from airflow.utils import timezone
from airflow.utils.dates import days_ago, infer_time_unit, round_time, scale_time_units
from airflow.utils.state import State
from airflow.utils.timezone import datetime
from tests.test_utils.config import conf_vars

DEV_NULL = '/dev/null'
TEST_DAG_FOLDER = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                               'dags')
DEFAULT_DATE = datetime(2015, 1, 1)
DEFAULT_DATE_ISO = DEFAULT_DATE.isoformat()
DEFAULT_DATE_DS = DEFAULT_DATE_ISO[:10]
TEST_DAG_ID = 'unit_tests'
EXAMPLE_DAG_DEFAULT_DATE = days_ago(2)


class OperatorSubclass(BaseOperator):
    """
    An operator to test template substitution
    """
    template_fields = ['some_templated_field']

    def __init__(self, some_templated_field, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.some_templated_field = some_templated_field

    def execute(self, context):
        pass
Example #26
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from airflow.utils.dates import days_ago
from airflow.utils.log.logging_mixin import LoggingMixin
from airflow.models import DAG

log = LoggingMixin().log

try:
    # Kubernetes is optional, so not available in vanilla Airflow
    # pip install apache-airflow[kubernetes]
    from airflow.contrib.operators.kubernetes_pod_operator import KubernetesPodOperator

    args = {'owner': 'airflow', 'start_date': days_ago(2)}

    dag = DAG(dag_id='example_kubernetes_operator',
              default_args=args,
              schedule_interval=None)

    tolerations = [{'key': "key", 'operator': 'Equal', 'value': 'value'}]

    k = KubernetesPodOperator(namespace='default',
                              image="ubuntu:16.04",
                              cmds=["bash", "-cx"],
                              arguments=["echo", "10"],
                              labels={"foo": "bar"},
                              name="airflow-test-pod",
                              in_cluster=False,
                              task_id="task",
Example #27
GCF_ENTRYPOINT = os.environ.get('GCF_ENTRYPOINT', 'helloWorld')
GCF_RUNTIME = 'nodejs6'
GCP_VALIDATE_BODY = os.environ.get('GCP_VALIDATE_BODY', True)
# [END howto_operator_gcf_deploy_variables]

# [START howto_operator_gcf_deploy_body]
body = {
    "name": FUNCTION_NAME,
    "entryPoint": GCF_ENTRYPOINT,
    "runtime": GCF_RUNTIME,
    "httpsTrigger": {}
}
# [END howto_operator_gcf_deploy_body]

# [START howto_operator_gcf_default_args]
default_args = {'start_date': dates.days_ago(1)}
# [END howto_operator_gcf_default_args]

# [START howto_operator_gcf_deploy_variants]
if GCF_SOURCE_ARCHIVE_URL:
    body['sourceArchiveUrl'] = GCF_SOURCE_ARCHIVE_URL
elif GCF_SOURCE_REPOSITORY:
    body['sourceRepository'] = {'url': GCF_SOURCE_REPOSITORY}
elif GCF_ZIP_PATH:
    body['sourceUploadUrl'] = ''
    default_args['zip_path'] = GCF_ZIP_PATH
elif GCF_SOURCE_UPLOAD_URL:
    body['sourceUploadUrl'] = GCF_SOURCE_UPLOAD_URL
else:
    raise Exception("Please provide one of the source_code parameters")
# [END howto_operator_gcf_deploy_variants]
Example #28
import sys
sys.path.insert(0, '..')
# from function.make_pdf import pdf_main
# from function.first import main


def tt1(param, **kwargs):
    print('tt1', param)
    # main()


def tt2(param, **kwargs):
    print('tt2', param)


args = {'owner': 'geonho', 'start_date': days_ago(n=1)}

dag = DAG(dag_id='test_20210422',
          default_args=args,
          schedule_interval='@daily')

d1 = PythonOperator(task_id='task1',
                    provide_context=True,
                    python_callable=tt1,
                    op_kwargs={'param': 'apple'},
                    dag=dag)
d2 = PythonOperator(task_id='task2',
                    provide_context=True,
                    python_callable=tt2,
                    op_kwargs={'param': 'apple'},
                    dag=dag)
Example #29
"""

from os import getenv

from airflow import DAG
from airflow.providers.amazon.aws.operators.imap_attachment_to_s3 import ImapAttachmentToS3Operator
from airflow.utils.dates import days_ago

# [START howto_operator_imap_attachment_to_s3_env_variables]
IMAP_ATTACHMENT_NAME = getenv("IMAP_ATTACHMENT_NAME", "test.txt")
IMAP_MAIL_FOLDER = getenv("IMAP_MAIL_FOLDER", "INBOX")
IMAP_MAIL_FILTER = getenv("IMAP_MAIL_FILTER", "All")
S3_DESTINATION_KEY = getenv("S3_DESTINATION_KEY", "s3://bucket/key.json")
# [END howto_operator_imap_attachment_to_s3_env_variables]

default_args = {"start_date": days_ago(1)}

with DAG(dag_id="example_imap_attachment_to_s3",
         default_args=default_args,
         schedule_interval=None,
         tags=['example']) as dag:
    # [START howto_operator_imap_attachment_to_s3_task_1]
    task_transfer_imap_attachment_to_s3 = ImapAttachmentToS3Operator(
        imap_attachment_name=IMAP_ATTACHMENT_NAME,
        s3_key=S3_DESTINATION_KEY,
        imap_mail_folder=IMAP_MAIL_FOLDER,
        imap_mail_filter=IMAP_MAIL_FILTER,
        task_id='transfer_imap_attachment_to_s3',
        dag=dag)
    # [END howto_operator_imap_attachment_to_s3_task_1]
Example #30
        "repoSource": {
            "repoName": GCP_SOURCE_REPOSITORY_NAME,
            "branchName": "master"
        }
    },
    "steps": [{
        "name": "gcr.io/cloud-builders/docker",
        "args": ["build", "-t", "gcr.io/$PROJECT_ID/$REPO_NAME", "."],
    }],
    "images": ["gcr.io/$PROJECT_ID/$REPO_NAME"],
}
# [END howto_operator_create_build_from_repo_body]

with models.DAG(
        "example_gcp_cloud_build",
        default_args=dict(start_date=dates.days_ago(1)),
        schedule_interval=None,
        tags=['example'],
) as dag:
    # [START howto_operator_create_build_from_storage]
    create_build_from_storage = CloudBuildCreateOperator(
        task_id="create_build_from_storage",
        project_id=GCP_PROJECT_ID,
        body=create_build_from_storage_body)
    # [END howto_operator_create_build_from_storage]

    # [START howto_operator_create_build_from_storage_result]
    create_build_from_storage_result = BashOperator(
        bash_command=
        "echo '{{ task_instance.xcom_pull('create_build_from_storage')['images'][0] }}'",
        task_id="create_build_from_storage_result",
Example #31
from marquez_airflow import DAG
from airflow.operators.postgres_operator import PostgresOperator
from airflow.operators.sensors import ExternalTaskSensor
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'datascience',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG('etl_orders_7_days',
          schedule_interval='@hourly',
          catchup=False,
          default_args=default_args,
          description='Loads newly placed orders weekly.')

# Wait for new_food_deliveries DAG to complete
t1 = ExternalTaskSensor(task_id='wait_for_new_food_deliveries',
                        external_dag_id='new_food_deliveries',
                        mode='reschedule',
                        dag=dag)

# Wait for etl_orders DAG to complete
t2 = ExternalTaskSensor(task_id='wait_for_etl_orders',
                        external_dag_id='etl_orders',
                        mode='reschedule',
                        dag=dag)
Example #32
        # task 3
        t3 = PythonOperator(task_id='python_write_file',
                            depends_on_past=False,
                            python_callable=WriteToFile,
                            email=['*****@*****.**'],
                            email_on_failure=True,
                            dag=dag_subdag)
        t3_complete = datetime.now()

    return dag_subdag, t2_complete, t3_complete


dag = DAG(
    'sample_sub_dag',
    default_args=default_args,
    start_date=days_ago(1),  # Start date for the workflow is necessary
    description='A sample workflow',
    schedule_interval=None)

# task 1
command1 = """
echo "Time: $(date)" >> /home/karan/Attempt_ApacheAirflow/t1.log
"""
t1 = BashOperator(task_id='print_date',
                  depends_on_past=False,
                  bash_command=command1,
                  dag=dag)


# task 3 - python function
def WriteToFile():
Example #33
    },
}  # type: Dict[str, Any]
# [END howto_operator_gcp_transfer_create_job_body_gcp]

# [START howto_operator_gcp_transfer_update_job_body]
update_body = {
    PROJECT_ID: GCP_PROJECT_ID,
    TRANSFER_JOB: {DESCRIPTION: "{}_updated".format(GCP_DESCRIPTION)},
    TRANSFER_JOB_FIELD_MASK: "description",
}
# [END howto_operator_gcp_transfer_update_job_body]

list_filter_dict = {FILTER_PROJECT_ID: GCP_PROJECT_ID, FILTER_JOB_NAMES: []}

# [START howto_operator_gcp_transfer_default_args]
default_args = {'start_date': days_ago(1)}
# [END howto_operator_gcp_transfer_default_args]

with models.DAG(
    'example_gcp_transfer', default_args=default_args, schedule_interval=None  # Override to match your needs
) as dag:

    # [START howto_operator_gcp_transfer_create_job]
    create_transfer_job_from_aws = GcpTransferServiceJobCreateOperator(
        task_id="create_transfer_job_from_aws", body=aws_to_gcs_transfer_body
    )
    # [END howto_operator_gcp_transfer_create_job]

    wait_for_operation_to_start = GCPTransferServiceWaitForJobStatusSensor(
        task_id="wait_for_operation_to_start",
        job_name="{{task_instance.xcom_pull('create_transfer_job_from_aws')['name']}}",
Example #34
from airflow.models import DAG
from airflow.utils.dates import days_ago
from airflow.operators.python_operator import PythonOperator

# Parameters
args = {'owner': 'janilson', 'start_date': days_ago(1)}

# DAG creation
dag = DAG(dag_id="my_simple_dag", default_args=args, schedule_interval=None)


# Python function to be executed by the tasks
def run_this_func(**context):
    print('hi')


with dag:
    # Test task 1
    run_this_task = PythonOperator(
        task_id='run_this',
        python_callable=run_this_func,
        provide_context=True,
    )

    # Test task 2
    run_this_task2 = PythonOperator(
        task_id='run_this2',
        python_callable=run_this_func,
        provide_context=True,
    )
Example #35
 def setUp(self):
     self.dagbag = models.DagBag(include_examples=True)
     self.dag1 = self.dagbag.dags['example_bash_operator']
     self.dag2 = self.dagbag.dags['example_subdag_operator']
     self.execution_dates = [days_ago(2), days_ago(1), days_ago(0)]
Example #36
]
DB_TABLE_SCHEMA = DbTableSchema(
    schema_name=DB_SCHEMA_NAME,
    table_name=DB_TABLE_NAME,
    columns=DB_TABLE_COLUMNS
)
NO_DB_TABLE_SCHEMA = []

SQL = f"SELECT * FROM {DB_NAME}.{DB_TABLE_NAME.name};"

DAG_ID = 'email_discounts'
DAG_OWNER = 'datascience'
DAG_DEFAULT_ARGS = {
    'owner': DAG_OWNER,
    'depends_on_past': False,
    'start_date': days_ago(7),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}
DAG_DESCRIPTION = 'Email discounts to customers that have experienced order delays daily'

DAG = dag = DAG(
    DAG_ID,
    schedule_interval='@weekly',
    default_args=DAG_DEFAULT_ARGS,
    description=DAG_DESCRIPTION
)

TASK_ID = 'select'
TASK = SnowflakeOperator(
Example #37
from airflow.utils import dates

project = 'your-project-id'  # Change this to your own GCP project_id
topic = 'example-topic'  # Cloud Pub/Sub topic
subscription = 'subscription-to-example-topic'  # Cloud Pub/Sub subscription
# Sample messages to push/pull
messages = [
    {'data': b64encode(b'Hello World')},
    {'data': b64encode(b'Another message')},
    {'data': b64encode(b'A final message')}
]

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': dates.days_ago(2),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'project': project,
    'topic': topic,
    'subscription': subscription,
}


echo_template = '''
{% for m in task_instance.xcom_pull(task_ids='pull-messages') %}
    echo "AckID: {{ m.get('ackId') }}, Base64-Encoded: {{ m.get('message') }}"
{% endfor %}
'''
Example #38
from airflow import DAG
from airflow.operators.dagrun_operator import TriggerDagRunOperator
from airflow.sensors.external_task_sensor import ExternalTaskSensor
from airflow.utils.dates import days_ago

with DAG(dag_id="dag_referenced_task_dag_id_exists_fail",
         schedule_interval=None,
         start_date=days_ago(1)) as dag:
    TriggerDagRunOperator(task_id="test_trigger", trigger_dag_id="nonexistent")
    ExternalTaskSensor(task_id="test_sensor_dag",
                       external_dag_id="nonexistent")
    ExternalTaskSensor(task_id="test_sensor_task",
                       external_dag_id="nonexistent",
                       external_task_id="non-task")