dag = DAG('hello_world3',
          default_args=default_args,
          schedule_interval=None,
          catchup=False)


def print_context():
    print("Hello yoo")
    with open('/storage/hello-world-example/some1.txt', 'a') as a:
        a.write("Executed now2 yo 1" + str(datetime.now()))
    time.sleep(10)
    return 'Whatever you return gets printed in the logs'


def print_context2():
    with open('/storage/hello-world-example/some1.txt', 'a') as a:
        a.write("Executed now2 yo 2" + str(datetime.now()))
    time.sleep(15)
    return 'Whatever you return gets printed in the logs'


t1 = PythonOperator(task_id='print_the_context1',
                    python_callable=print_context,
                    dag=dag)

t2 = PythonOperator(task_id='print_the_context2',
                    python_callable=print_context2,
                    dag=dag)

t2.set_upstream(t1)
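
# In newer Airflow code the same dependency is usually written with the bitshift
# operator; an equivalent of the set_upstream call above:
t1 >> t2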

default_args = {
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

with DAG(
        'air_quality_index',
        default_args=default_args,
        schedule_interval='@hourly',
) as dag:

    task_1 = PythonOperator(task_id='create_table',
                            provide_context=True,
                            python_callable=create_statement,
                            op_kwargs={
                                'host': 'localhost',
                                'dbname': 'postgres',
                                'user': '******',
                                'password': '******',
                                'statement': create_statment
                            },
                            dag=dag)

    task_2 = PythonOperator(task_id='get_data_from_api',
                            provide_context=True,
                            python_callable=get_data_api,
                            dag=dag)

    task_3 = PythonOperator(task_id='load_json_file',
                            provide_context=True,
                            python_callable=load_json_file,
                            dag=dag)
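
    # The snippet is cut off before any ordering is declared; a plausible chaining
    # of the three tasks above, inferred from their names, would be:
    task_1 >> task_2 >> task_3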
                   username=conn_conf_dict[conn_id].login,
                   password=conn_conf_dict[conn_id].password,
                   table=table_name,
                   ufile_path=UFILE_PATH % (db_name, table_name),
                   query=query,
                   m=18 if table_name == 'channel_response_code' else 20),
        dag=dag,
    )

    # check table
    check_table = PythonOperator(
        task_id='check_table_{}'.format(hive_table_name),
        priority_weight=priority_weight_nm,
        python_callable=run_check_table,
        provide_context=True,
        op_kwargs={
            'db_name': db_name,
            'table_name': table_name,
            'conn_id': conn_id,
            'hive_table_name': hive_table_name
        },
        dag=dag)
    # add partitions
    add_partitions = HiveOperator(
        task_id='add_partitions_{}'.format(hive_table_name),
        priority_weight=priority_weight_nm,
        hql='''
                ALTER TABLE {table} ADD IF NOT EXISTS PARTITION (dt = '{{{{ tomorrow_ds }}}}',hour = '{{{{ execution_date.strftime("%H") }}}}')
            '''.format(table=hive_table_name),
        schema=HIVE_DB,
        dag=dag)
    from (
        select 1 as star_rating, 'Very bad' as rating_title
        union all
        select 2 as star_rating, 'Bad' as rating_title
        union all
        select 3 as star_rating, 'Not good' as rating_title
        union all
        select 4 as star_rating, 'Good' as rating_title
        union all
        select 5 as star_rating, 'Excellent' as rating_title
    ) r;
    """
    cur.execute(query)
    conn.commit()
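
# Only the tail of the fill_rating callable survives above; a minimal sketch of how
# the connection and cursor are typically obtained from the dwh_conn_id kwarg passed
# below. The hook choice and the sketch's name are assumptions, not the original code:
from airflow.hooks.postgres_hook import PostgresHook


def fill_rating_sketch(dwh_conn_id, **kwargs):
    hook = PostgresHook(postgres_conn_id=dwh_conn_id)
    conn = hook.get_conn()
    cur = conn.cursor()
    cur.execute("SELECT 1")  # stand-in for the INSERT ... SELECT query shown above
    conn.commit()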


create_calendar = PythonOperator(task_id="fill_calendar",
                                 dag=dag,
                                 python_callable=fill_calendar,
                                 op_kwargs={"dwh_conn_id": "dwh"})

create_rating = PythonOperator(task_id="fill_rating",
                               dag=dag,
                               python_callable=fill_rating,
                               op_kwargs={"dwh_conn_id": "dwh"})
start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> create_calendar >> end_operator
start_operator >> create_rating >> end_operator
Example #5
DEFAULT_ARGS = {
    'schedule_interval': '@daily',
    'max_active_runs': 1,
    'retries': 3,
    'retry_delay': timedelta(minutes=1)
}

DAG_ID = 'covid'

with DAG(dag_id=DAG_ID, default_args=DEFAULT_ARGS) as dag:
    with open('/usr/local/airflow/dags/covid/sqls/ddl_covid_timeseries.sql', 'r') as f:
        sql = f.read()
    create_table_if_not_exists = PostgresOperator(task_id='check_table', postgres_conn_id='postgres_staging', sql=sql)

    extract = PythonOperator(
        task_id='extract',
        python_callable=get_covid_data,
        op_kwargs={'uri':'https://api.covid19india.org/v4/timeseries.json'},
        provide_context=True
    )

    transform = PythonOperator(
        task_id='transform',
        python_callable=json_to_df,
        op_kwargs={'start_date': '2020-01-01',
                   'end_date': '9999-12-31'},
        provide_context=True
    )

    stage_load = PythonOperator(
        task_id='stage_load',
        python_callable=load_to_stage_area,
        provide_context=True
    )
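
    # The snippet is cut off here; a plausible ordering of the tasks defined above,
    # inferred from their names, would be:
    create_table_if_not_exists >> extract >> transform >> stage_load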

args = {
    'start_date': datetime(2020, 1, 27),
}

dag = DAG(
    dag_id='my_second_dag',
    default_args=args,
    schedule_interval=timedelta(minutes=150),
)


def _print_exec_date(**context):
    print("This is my execution date: " + str(context["execution_date"]))


print_execution_date = PythonOperator(
    task_id="print_execution_date",
    python_callable=_print_exec_date,
    provide_context=True,
    dag=dag,
)
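
# A minimal Airflow 2.x-style sketch of the same task (the 2.x import path below is
# an assumption; this example targets 1.x): the template context is injected into
# the callable automatically, so the provide_context flag is no longer needed.
from airflow.operators.python import PythonOperator as PythonOperatorV2

print_execution_date_v2 = PythonOperatorV2(
    task_id="print_execution_date_v2",
    python_callable=_print_exec_date,  # still receives **context, without provide_context
    dag=dag,
)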

the_end = DummyOperator(
    task_id='the_end',
    dag=dag,
)

# attach each wait task to the DAG and wire it between the two fixed tasks
for i in (1, 5, 10):
    wait = BashOperator(task_id=f"wait_{i}", bash_command=f"sleep {i}", dag=dag)
    print_execution_date >> wait >> the_end
    if test_mode:
        print(" 'foo' was passed in via test={} command : kwargs[params][foo] \
               = {}".format(test_mode, params["foo"]))
    # Print out the value of "miff", passed in below via the Python Operator
    print(" 'miff' was passed in via task params = {}".format(params["miff"]))
    return 1


my_templated_command = """
    echo " 'foo was passed in via Airflow CLI Test command with value {{ params.foo }} "
    echo " 'miff was passed in via BashOperator with value {{ params.miff }} "
"""

run_this = PythonOperator(
    task_id='run_this',
    python_callable=my_py_command,
    params={"miff": "agg"},
    dag=dag,
)

also_run_this = BashOperator(
    task_id='also_run_this',
    bash_command=my_templated_command,
    params={"miff": "agg"},
    dag=dag,
)


def print_env_vars(test_mode):
    """
    Print out the "foo" param passed in via
    `airflow tasks test example_passing_params_via_test_command env_var_test_task <date>`
    """
Example #8
import datetime
import logging

from airflow import DAG
from airflow.operators.python_operator import PythonOperator


def hello_world():
    logging.info("Hello Ben!")


dag = DAG('Exercise1', start_date=datetime.datetime.now())

greet_task = PythonOperator(task_id="hello_world_task",
                            python_callable=hello_world,
                            dag=dag)
Example #9
def greet():
    print('Trying to write to the file')
    with open('./greet.txt', 'a+', encoding='utf8') as f:
        now = dt.datetime.now()
        t = now.strftime("%Y-%m-%d %H:%M")
        f.write(str(t) + '\n')
    return 'Greeted'

def respond():
    return 'Greet Responded Again'

default_args = {
    'owner': 'airflow',
    'start_date': dt.datetime(2018, 9, 24, 10, 00, 00),
    'concurrency': 1,
    'retries': 0
}

with DAG('my_simple_dag',
         default_args=default_args,
         schedule_interval='*/10 * * * *',
         ) as dag:
    opr_hello = BashOperator(task_id='say_Hi', bash_command='echo "Hi!!"')

    opr_greet = PythonOperator(task_id='greet', python_callable=greet)

    opr_sleep = BashOperator(task_id='sleep_me', bash_command='sleep 5')

    opr_respond = PythonOperator(task_id='respond', python_callable=respond)

    opr_hello >> opr_greet >> opr_sleep >> opr_respond
Example #10
        parameters={
            "first_parameter": "a_value",
            "second_parameter": "18"
        },
        # parameters="resources/paremeter.json", You can also pass a path to a json file containing your param
        jenkins_connection_id=
        "your_jenkins_connection"  # T he connection must be configured first
    )

    def grab_artifact_from_jenkins(**context):
        """
        Grab an artifact from the previous job.
        The python-jenkins library doesn't expose a method for this,
        but the request can easily be built manually.
        """
        hook = JenkinsHook("your_jenkins_connection")
        jenkins_server = hook.get_jenkins_server()
        url = context['task_instance'].xcom_pull(task_ids='trigger_job')
        # The JenkinsJobTriggerOperator stores the job URL in the XCom variable corresponding to the task.
        # You can then use it to access artifacts or to look up the build number.
        # The URL looks like: http://jenkins_url/job/job_name/job_number/
        url = url + "artifact/myartifact.xml"  # Or any other artifact name
        request = Request(url)
        response = jenkins_server.jenkins_open(request)
        return response  # The artifact content ends up in an XCom variable for later use

    artifact_grabber = PythonOperator(
        task_id='artifact_grabber', python_callable=grab_artifact_from_jenkins)

    job_trigger >> artifact_grabber
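
    # A hypothetical downstream step (the names below are invented for illustration,
    # not part of the original example) showing how the artifact content returned
    # above can be pulled back out of XCom by a later task:
    def print_artifact(**context):
        artifact = context['task_instance'].xcom_pull(task_ids='artifact_grabber')
        print(artifact)

    print_artifact_task = PythonOperator(
        task_id='print_artifact',
        python_callable=print_artifact,
        provide_context=True)

    artifact_grabber >> print_artifact_task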
# -------------------------------------------------------------------------------
# dag
# these args will get passed on to each operator
# you can override them on a per-task basis during operator initialization
default_args = {
    'owner': 'TongYu',
    'catchup': False,
    'start_date': trans_utc_datetime('0:00:00'),
}
dag = DAG(
    'valuation_date_update_dag',
    catchup=False,
    default_args=default_args,
    schedule_interval='0,1 17 * * *',
    dagrun_timeout=timedelta(minutes=10),
    description='valuation date manager dag')
# ----------------------------------------


PythonOperator(
    task_id='valuation_date_update_task',
    python_callable=set_valuation_date,
    execution_timeout=timedelta(minutes=10),
    dag=dag)

PythonOperator(
    task_id='calendar_import_year_update_task',
    python_callable=set_calendar_import_year,
    execution_timeout=timedelta(minutes=10),
    dag=dag)
def csvToJson():
    df=pd.read_csv('/home/demilsonfayika/dirty-data.csv')
    for i,r in df.iterrows():
        print(r['name'])
    df.to_json('anglo.json',orient='records')



default_args = {
    'owner': 'Demilson',
    'start_date': dt.datetime(2020, 12, 15),
    'retries': 1,
    'retry_delay': dt.timedelta(minutes=5),
}


with DAG('MyCSVDAG',
         default_args=default_args,
         schedule_interval=timedelta(minutes=5),      # '0 * * * *',
         ) as dag:

    print_starting = BashOperator(task_id='starting',
                                  bash_command='echo "I am reading the CSV now....."')

    csvJson = PythonOperator(task_id='convertCSVtoJson',
                             python_callable=csvToJson)

    print_starting >> csvJson
Example #13
                               object=f"rocket_launches/ds={ds}",
                               mime_type='application/json')


def _print_stats(ds, **context):
    gcloud_storage_hook = GoogleCloudStorageHook()
    tmp_file_handle = NamedTemporaryFile(delete=True)
    gcloud_storage_hook.download(bucket="nice_bucket",
                                 object=f"rocket_launches/ds={ds}",
                                 filename=tmp_file_handle.name)
    data = json.load(tmp_file_handle)
    rockets_launched = [launch["name"] for launch in data["launches"]]
    rockets_str = ""
    if rockets_launched:
        rockets_str = f" ({' & '.join(rockets_launched)})"
    print(f"{len(rockets_launched)} rocket launch(es) on {ds}{rockets_str}.")


download_rocket_launches = PythonOperator(
    task_id="download_rocket_launches",
    python_callable=_download_rocket_launches,
    provide_context=True,
    dag=dag,
)
print_stats = PythonOperator(task_id="print_stats",
                             python_callable=_print_stats,
                             provide_context=True,
                             dag=dag)
download_rocket_launches >> print_stats
Example #14
    task_id="create_oldest",
    dag=dag,
    sql="""
        BEGIN;
        DROP TABLE IF EXISTS older_riders;
        CREATE TABLE older_riders AS (
            SELECT * FROM trips WHERE birthyear > 0 AND birthyear <= 1945
        );
        COMMIT;
    """,
    postgres_conn_id="redshift"
)

log_oldest_task = PythonOperator(
    task_id="log_oldest",
    dag=dag,
    python_callable=log_oldest
)
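
# log_oldest is referenced above but not shown; a minimal sketch, assuming it reads
# back from the older_riders table over the same "redshift" connection:
from airflow.hooks.postgres_hook import PostgresHook
import logging


def log_oldest():
    redshift_hook = PostgresHook("redshift")
    records = redshift_hook.get_records(
        "SELECT birthyear FROM older_riders ORDER BY birthyear ASC LIMIT 1")
    if records and records[0]:
        logging.info(f"Oldest rider was born in {records[0][0]}")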


create_youngest_task = PostgresOperator(
    task_id="create_youngest",
    dag=dag,
    sql="""
        BEGIN;
        DROP TABLE IF EXISTS younger_riders;
        CREATE TABLE younger_riders AS (
            SELECT * FROM trips WHERE birthyear > 2000
        );
        COMMIT;
    """,
Example #15
def transform_codes_to_parquet(**kwargs):
    # ti is the Task Instance
    ti = kwargs['ti']
    cluster_id = ti.xcom_pull(task_ids='create_cluster')
    cluster_dns = emr.get_cluster_dns(cluster_id)
    headers = emr.create_spark_session(cluster_dns, 'spark')
    session_url = emr.wait_for_idle_session(cluster_dns, headers)
    statement_response = emr.submit_statement(
        session_url, '/root/airflow/dags/transform/codes.scala')
    emr.track_statement_progress(cluster_dns, statement_response.headers)
    emr.kill_spark_session(session_url)


# Define the individual tasks using Python Operators
create_cluster = PythonOperator(task_id='create_cluster',
                                python_callable=create_emr,
                                dag=dag)

wait_for_cluster_completion = PythonOperator(
    task_id='wait_for_cluster_completion',
    python_callable=wait_for_completion,
    dag=dag)

transform_movies = PythonOperator(task_id='transform_codes',
                                  python_callable=transform_codes_to_parquet,
                                  dag=dag)

terminate_cluster = PythonOperator(task_id='terminate_cluster',
                                   python_callable=terminate_emr,
                                   trigger_rule='all_done',
                                   dag=dag)
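
# The snippet ends before the ordering is declared; a plausible chaining, inferred
# from the task names, would be:
create_cluster >> wait_for_cluster_completion >> transform_movies >> terminate_cluster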
Example #16
def MonthlyPipeline():
    MONTHLY_RELEASE_TRIGGER = '15 17 * * 4#3'

    def MonthlyGenerateTestArgs(**kwargs):
        """Loads the configuration that will be used for this Iteration."""
        conf = kwargs['dag_run'].conf
        if conf is None:
            conf = dict()

        # If the version is overridden we use it; otherwise we fall back to its
        # default or monthly value.
        version = conf.get('VERSION') or istio_common_dag.GetVariableOrDefault(
            'monthly-version', None)
        if not version or version == 'INVALID':
            raise ValueError('version needs to be provided')
        Variable.set('monthly-version', 'INVALID')

        # GCS_MONTHLY_STAGE_PATH is of the form 'prerelease/{version}'
        gcs_path = 'prerelease/%s' % (version)

        branch = conf.get('BRANCH') or istio_common_dag.GetVariableOrDefault(
            'monthly-branch', None)
        if not branch or branch == 'INVALID':
            raise ValueError('branch needs to be provided')
        Variable.set('monthly-branch', 'INVALID')
        commit = conf.get('COMMIT') or branch
        mfest_commit = conf.get('MFEST_COMMIT') or branch

        default_conf = environment_config.GetDefaultAirflowConfig(
            branch=branch,
            commit=commit,
            gcs_path=gcs_path,
            mfest_commit=mfest_commit,
            pipeline_type='monthly',
            verify_consistency='true',
            version=version)

        config_settings = dict()
        for name in default_conf.iterkeys():
            config_settings[name] = conf.get(name) or default_conf[name]

        # These are the extra params that are passed to the dags for monthly release
        monthly_conf = dict()
        monthly_conf['DOCKER_HUB'] = 'istio'
        monthly_conf['GCR_RELEASE_DEST'] = 'istio-io'
        monthly_conf['GCS_GITHUB_PATH'] = 'istio-secrets/github.txt.enc'
        monthly_conf['RELEASE_PROJECT_ID'] = 'istio-io'
        # GCS_MONTHLY_RELEASE_PATH is of the form  'istio-release/releases/{version}'
        monthly_conf[
            'GCS_MONTHLY_RELEASE_PATH'] = 'istio-release/releases/%s' % (
                version)
        for name in monthly_conf.iterkeys():
            config_settings[name] = conf.get(name) or monthly_conf[name]

        testMonthlyConfigSettings(config_settings)
        return config_settings

    def ReportMonthlySuccessful(task_instance, **kwargs):
        del kwargs

    dag, tasks, addAirflowBashOperator = istio_common_dag.MakeCommonDag(
        MonthlyGenerateTestArgs,
        'istio_monthly_dag',
        schedule_interval=MONTHLY_RELEASE_TRIGGER,
        extra_param_lst=monthly_extra_params)

    addAirflowBashOperator('release_push_github_docker_template',
                           'github_and_docker_release',
                           need_commit=True)
    addAirflowBashOperator('release_tag_github_template',
                           'github_tag_repos',
                           need_commit=True)

    mark_monthly_complete = PythonOperator(
        task_id='mark_monthly_complete',
        python_callable=ReportMonthlySuccessful,
        provide_context=True,
        dag=dag,
    )
    tasks['mark_monthly_complete'] = mark_monthly_complete

    # tasks['generate_workflow_args']
    tasks['get_git_commit'].set_upstream(tasks['generate_workflow_args'])
    tasks['run_cloud_builder'].set_upstream(tasks['get_git_commit'])
    tasks['run_release_qualification_tests'].set_upstream(
        tasks['run_cloud_builder'])
    tasks['modify_values_helm'].set_upstream(
        tasks['run_release_qualification_tests'])
    tasks['copy_files_for_release'].set_upstream(tasks['modify_values_helm'])
    tasks['github_and_docker_release'].set_upstream(
        tasks['copy_files_for_release'])
    tasks['github_tag_repos'].set_upstream(tasks['github_and_docker_release'])
    tasks['mark_monthly_complete'].set_upstream(tasks['github_tag_repos'])

    return dag
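
# For Airflow to discover the DAG, the return value is typically bound to a
# module-level name; a minimal usage sketch:
dag = MonthlyPipeline()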
Example #17
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    output_df.to_csv(str(outdir + '/predout.csv'))

def pull_test_data():
    fetch_test_data =ast.literal_eval(Variable.get("fetch_test_data"))
    response=invoke_endpoint(fetch_test_data)
    output_dict=ast.literal_eval(response['1:Fetch Housing Price Test Data']['message'])
    output_df=pd.DataFrame(output_dict)
    outdir='/storage/housing_price_prediction'
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    output_df.to_csv(str(outdir + '/test.csv'))

t1 = PythonOperator(
    task_id='testing_data_processing',
    python_callable=testing_data_processing,
    dag=dag)

t2 = PythonOperator(
    task_id='testing_model',
    python_callable=testing_model,
    dag=dag)

t3 = PythonOperator(
    task_id='pull_predicted_labels',
    python_callable=pull_predicted_labels,
    dag=dag)
    
t4 = PythonOperator(
    task_id='pull_test_data',
    python_callable=pull_test_data,
    dag=dag)


def fun_task_timeout_monitor(ds, dag, **op_kwargs):
    dag_ids = dag.dag_id

    msg = [{
        "dag": dag,
        "db": "opay_dw",
        "table": "{dag_name}".format(dag_name=dag_ids),
        "partition": "country_code=NG/dt={pt}".format(pt=ds),
        "timeout": "3000"
    }]

    TaskTimeoutMonitor().set_task_monitor(msg)


task_timeout_monitor = PythonOperator(task_id='task_timeout_monitor',
                                      python_callable=fun_task_timeout_monitor,
                                      provide_context=True,
                                      dag=dag)

##----------------------------------------- Variables ---------------------------------------##
db_name = "opay_dw"

table_name = "dm_opay_user_trans_portrait_df"
hdfs_path = "oss://opay-datalake/opay/opay_dw/" + table_name


def dm_opay_user_trans_portrait_df_sql_task(ds):
    HQL = '''
    
    set mapred.max.split.size=1000000;
    set hive.exec.dynamic.partition.mode=nonstrict;
    set hive.exec.parallel=true;
                              poke_interval=10,
                              timeout=300)


def transform_func(**kwargs):
    folder_path = FSHook(conn_id=FILE_CONNECTION_ID).get_path()
    file_path = f"{folder_path}/{FILE_NAME}"
    destination_file = f"{folder_path}/{OUTPUT_TRANSFORM_FILE}"
    df = pd.read_csv(file_path, header=0, encoding='ISO-8859-1')
    df.to_csv(destination_file, index=False)
    os.remove(file_path)
    return destination_file


transform_process = PythonOperator(dag=dag,
                                   task_id='transform_process',
                                   python_callable=transform_func,
                                   provide_context=True)


def insert_process(**kwargs):
    ti = kwargs['ti']
    source_file = ti.xcom_pull(task_ids='transform_process')
    db_connection = MySqlHook('airflow_db').get_sqlalchemy_engine()

    df = pd.read_csv(source_file)

    with db_connection.begin() as transaction:
        transaction.execute("DELETE FROM test.confirmed_table WHERE 1=1")
        df.to_sql("confirmed_table",
                  con=transaction,
                  schema="test",
Example #20
# Instructions
# Define a function that uses the Python logger to log a message. Then finish
# filling in the details of the DAG below. Once you've done that, run
# "/opt/airflow/start.sh" to start the web server. When the Airflow web server is
# ready, open the Airflow UI using the "Access Airflow" button, turn your DAG "On",
# and then run your DAG. If you get stuck, take a look at the solution file or the
# video walkthrough on the next page.

import datetime
import logging

from airflow import DAG
from airflow.operators.python_operator import PythonOperator


#
# TODO: Define a function for the PythonOperator to call and have it log something
#
def my_function():
    logging.info('This is my first pipeline')


dag = DAG('lesson1.exercise1', start_date=datetime.datetime.now())

#
# TODO: Uncomment the operator below and replace the arguments labeled <REPLACE> below
#

greet_task = PythonOperator(task_id="task1",
                            python_callable=my_function,
                            dag=dag)
Example #21
dag = DAG(
    dag_id="my_simple_dag",
    start_date=datetime(year=2020, month=1, day=1, hour=12, minute=1,
                        second=1),
    schedule_interval="@yearly",
    max_active_runs=1,
)
opr_hello = BashOperator(
    task_id="say_Hi",
    bash_command='echo "Hi!!"',
    dag=dag,
)

opr_greet = PythonOperator(
    task_id="greet",
    python_callable=greet,
    dag=dag,
)
opr_sleep = BashOperator(
    task_id="sleep_me",
    bash_command="sleep 5",
    dag=dag,
)

opr_respond = PythonOperator(
    task_id="respond",
    python_callable=respond,
    dag=dag,
)

opr_spark = PythonOperator(
dag = DAG('lesson2.exercise3',
          start_date=datetime.datetime(2018, 1, 1, 0, 0, 0, 0),
          end_date=datetime.datetime(2018, 12, 1, 0, 0, 0, 0),
          schedule_interval='@monthly',
          max_active_runs=1)

create_trips_table = PostgresOperator(
    task_id="create_trips_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_TRIPS_TABLE_SQL)

copy_trips_task = PythonOperator(
    task_id='load_trips_from_s3_to_redshift',
    dag=dag,
    python_callable=load_trip_data_to_redshift,
    provide_context=True,  # provide context to our Python Operator
)

create_stations_table = PostgresOperator(
    task_id="create_stations_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
)

copy_stations_task = PythonOperator(
    task_id='load_stations_from_s3_to_redshift',
    dag=dag,
    python_callable=load_station_data_to_redshift,
)
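
# The snippet stops before the ordering is declared; in this exercise the create
# steps usually run before the matching copy steps, e.g.:
create_trips_table >> copy_trips_task
create_stations_table >> copy_stations_task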
Example #23
with DAG("store_dag",
         default_args=default_args,
         schedule_interval='@daily',
         template_searchpath=['/usr/local/airflow/sql_files'],
         catchup=False) as dag:

    # task 1: check that the source file exists in the input directory
    # (note that the file lives inside the Airflow container)
    t1 = BashOperator(
        task_id="check_file_exists",
        bash_command="shasum ~/store_files_airflow/raw_store_transactions.csv",
        retries=1,
        retry_delay=timedelta(seconds=15))

    # task 2: clean data (remove special characters)
    t2 = PythonOperator(task_id="clean_raw_csv", python_callable=data_cleaner)

    # task 3: create table
    t3 = MySqlOperator(task_id="create_mysql_table",
                       mysql_conn_id="mysql_conn",
                       sql="create_table.sql")

    # task 4: insert cleaned data into table
    t4 = MySqlOperator(task_id="insert_into_table",
                       mysql_conn_id="mysql_conn",
                       sql="insert_into_table.sql",
                       dag=dag)

    # task 5: calculate store-wise and location-wise profit (yesterday) and save results as csv
    t5 = MySqlOperator(task_id="select_from_table",
                       mysql_conn_id="mysql_conn",
Example #24
api_tse_cand_contas_extraction = DivulgacaoCandContasTSEProcessor(
    db_uri=MONGO_URI, election_year=TSE_YEAR_DATA)

args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(730),
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 1
}

dag = DAG(dag_id='import_candidate_extra_data',
          default_args=args,
          schedule_interval=None,
          dagrun_timeout=timedelta(minutes=60))

start_import = DummyOperator(task_id='start_import', dag=dag)

import_candidate_extra_data = PythonOperator(
    task_id='import_extra_data_candidate',
    python_callable=api_tse_cand_contas_extraction.run,
    dag=dag)

import_candidate_extra_data.set_upstream(start_import)

if __name__ == '__main__':
    dag.cli()
Example #25
        data.rename(columns=COLUMNS).to_sql('muertes', con=connection, schema='test', if_exists='append', index=False)

    os.remove(full_path)

    logger.info(f"Rows inserted {len(data.index)}")


dag = DAG('muertos', description='Muertos',
          default_args={
              'owner': 'jsique',
              'depends_on_past': False,
              'max_active_runs': 1,
              'start_date': days_ago(5)
          },
          schedule_interval='0 1 * * *',
          catchup=False)

sensor = FileSensor(task_id="file_sensor_task_muertos",
                    dag=dag,
                    filepath='time_series_covid19_deaths_global.csv',
                    fs_conn_id=FILE_CONNECTION_NAME,
                    poke_interval=10,
                    timeout=600)

etl = PythonOperator(task_id="dead_etl",
                     provide_context=True,
                     python_callable=etl_process,
                     dag=dag
                     )

sensor >> etl
Example #26
    df = pd.DataFrame.from_dict(records, orient='index')
    df['reviewText'] = df['reviewText'].apply(lambda x: x.replace('|', ''))
    df.to_csv(output_path, sep='|', index=False)


with DAG(
        dag_id="create_fact_review_table",
        schedule_interval="@daily",
        default_args=default_args,
        catchup=False) as dag:


    # Step 1: Unzip and store as csv
    unzip_file_store_as_csv = PythonOperator(
        task_id='unzip_file_store_as_csv',
        python_callable=unzip_to_csv,
        op_kwargs={'input_path': '/usr/local/airflow/dags/files/reviews_Musical_Instruments.json.gz',
                   'output_path': '/usr/local/airflow/dags/files/review_data.csv'}
    )

    # Step 2: Move json file to hdfs storage
    move_to_hdfs = BashOperator(
        task_id="move_to_hdfs",
        bash_command="""
            hdfs dfs -mkdir -p /fact_review && \
            hdfs dfs -put -f $AIRFLOW_HOME/dags/files/review_data.csv /fact_review
            """
    )

    # Step 3: Create a hive table on our sku_data
    creating_fact_table = HiveOperator(
        task_id="creating_fact_review_table",
    if weekday == "Mon":
        person = "email_bob"
    elif weekday == "Wed":
        person = "email_alice"
    elif weekday == "Fri":
        person = "email_joe"
    else:
        person = "unkown"

    return person


print_week_day = PythonOperator(
    task_id="print_week_day",
    python_callable=_get_weekday,
    provide_context=True,
    dag=dag,
)

branching = BranchPythonOperator(task_id="branching",
                                 python_callable=_get_weekday,
                                 provide_context=True,
                                 dag=dag)

join = DummyOperator(task_id="join", trigger_rule="none_failed", dag=dag)

print_week_day >> branching

persons = ["email_bob", "email_alice", "email_joe"]
for person in persons:
    branching >> DummyOperator(task_id=person, dag=dag) >> join
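
# Only the body of _get_weekday survives at the top of this example; a plausible
# reconstruction of its missing head (the name and signature below are assumptions),
# taking the weekday abbreviation from the run's execution_date:
def _get_weekday_sketch(execution_date, **context):
    # map the abbreviated weekday name ("Mon", "Wed", "Fri") to the person to email
    weekday = execution_date.strftime("%a")
    return {"Mon": "email_bob", "Wed": "email_alice", "Fri": "email_joe"}.get(weekday, "unknown")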
Example #28
# [END howto_operator_http_task_del_op]
# [START howto_operator_http_http_sensor_check]
task_http_sensor_check = HttpSensor(
    task_id='api_health_check',
    http_conn_id='rest-connection',
    endpoint='/',
    request_params={},
    # response_check=lambda response: "httpbin" in response.text,
    poke_interval=5,
    # on_failure_callback=notify_email,
    dag=dag,
)

# Task 3: Save JSON data locally
# save_and_transform = PythonOperator(
#     task_id="save_and_transform", 
#     python_callable=transform_json,
#     provide_context=True,
# )

save_employee = PythonOperator(
    task_id="save_employee_transform", 
    python_callable=save_emp_json,
    provide_context=True
)

task_http_sensor_check >> task_save_employee >> save_employee >> task_get_byid_employee
save_employee >> task_get_all_employee
# save_employee >> task_update_employee >> task_get_byid_employee
Example #29
import sys
from airflow.operators.python_operator import PythonOperator
from datetime import datetime
from airflow import DAG
# business imports
sys.path.append('/home/airflow/python/')
from cl_cr_update import jira_update

dag = DAG('cl_cr_update',
          description='Run updates for Jira issues.',
          schedule_interval='*/5 * * * *',
          start_date=datetime(2019, 12, 7), catchup=False)

step_1 = PythonOperator(task_id='step_1',
                        python_callable=jira_update,
                        dag=dag)
Example #30
        callback_url = conf_json['callback_url']

        input_json_dir = Path(conf_json['input_json_dir'])

        context['ti'].xcom_push(key='mount_volumes', value=mount_volumes)
        context['ti'].xcom_push(key='callback_url', value=callback_url)
        context['ti'].xcom_push(key='input_json_dir', value=input_json_dir)

    def post_process(**context):
        print('start post process.')
        callback_url = context['ti'].xcom_pull(key='callback_url')
        requests.post(callback_url)

    t1 = PythonOperator(
        task_id='pre_process',
        python_callable=pre_process,
        dag=dag
    )

    t3 = PythonOperator(
        task_id='post_process',
        python_callable=post_process,
        trigger_rule=TriggerRule.ALL_DONE,  # always run, even if an upstream task failed
        dag=dag
    )

    t2 = DockerOperatorEx(
        task_id='main_process',
        image='qunomon/eval_mnist_data_coverage:0.1',
        docker_url='unix://var/run/docker.sock',
        api_version='auto',