dag = DAG('hello_world3',
          default_args=default_args,
          schedule_interval=None,
          catchup=False)


def print_context():
    print("Hello yoo")
    with open('/storage/hello-world-example/some1.txt', 'a') as a:
        a.write("Executed now2 yo 1" + str(datetime.now()))
    time.sleep(10)
    return 'Whatever you return gets printed in the logs'


def print_context2():
    with open('/storage/hello-world-example/some1.txt', 'a') as a:
        a.write("Executed now2 yo 2" + str(datetime.now()))
    time.sleep(15)
    return 'Whatever you return gets printed in the logs'


t1 = PythonOperator(task_id='print_the_context1',
                    python_callable=print_context,
                    dag=dag)

t2 = PythonOperator(task_id='print_the_context2',
                    python_callable=print_context2,
                    dag=dag)

t2.set_upstream(t1)
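
# In newer Airflow code the same dependency is usually written with the bitshift
# operator; an equivalent of the set_upstream call above:
t1 >> t2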

default_args = {
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

with DAG(
        'air_quality_index',
        default_args=default_args,
        schedule_interval='@hourly',
) as dag:

    task_1 = PythonOperator(task_id='create_table',
                            provide_context=True,
                            python_callable=create_statement,
                            op_kwargs={
                                'host': 'localhost',
                                'dbname': 'postgres',
                                'user': '******',
                                'password': '******',
                                'statement': create_statment
                            },
                            dag=dag)

    task_2 = PythonOperator(task_id='get_data_from_api',
                            provide_context=True,
                            python_callable=get_data_api,
                            dag=dag)

    task_3 = PythonOperator(task_id='load_json_file',
                            provide_context=True,
                            python_callable=load_json_file,
                            dag=dag)
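
    # The snippet is cut off before any ordering is declared; a plausible chaining
    # of the three tasks above, inferred from their names, would be:
    task_1 >> task_2 >> task_3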
                   username=conn_conf_dict[conn_id].login,
                   password=conn_conf_dict[conn_id].password,
                   table=table_name,
                   ufile_path=UFILE_PATH % (db_name, table_name),
                   query=query,
                   m=18 if table_name == 'channel_response_code' else 20),
        dag=dag,
    )

    # check table
    check_table = PythonOperator(
        task_id='check_table_{}'.format(hive_table_name),
        priority_weight=priority_weight_nm,
        python_callable=run_check_table,
        provide_context=True,
        op_kwargs={
            'db_name': db_name,
            'table_name': table_name,
            'conn_id': conn_id,
            'hive_table_name': hive_table_name
        },
        dag=dag)
    # add partitions
    add_partitions = HiveOperator(
        task_id='add_partitions_{}'.format(hive_table_name),
        priority_weight=priority_weight_nm,
        hql='''
                ALTER TABLE {table} ADD IF NOT EXISTS PARTITION (dt = '{{{{ tomorrow_ds }}}}',hour = '{{{{ execution_date.strftime("%H") }}}}')
            '''.format(table=hive_table_name),
        schema=HIVE_DB,
        dag=dag)
    from (
        select 1 as star_rating, 'Very bad' as rating_title
        union all
        select 2 as star_rating, 'Bad' as rating_title
        union all
        select 3 as star_rating, 'Not good' as rating_title
        union all
        select 4 as star_rating, 'Good' as rating_title
        union all
        select 5 as star_rating, 'Excellent' as rating_title
    ) r;
    """
    cur.execute(query)
    conn.commit()
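
# Only the tail of the fill_rating callable survives above; a minimal sketch of how
# the connection and cursor are typically obtained from the dwh_conn_id kwarg passed
# below. The hook choice and the sketch's name are assumptions, not the original code:
from airflow.hooks.postgres_hook import PostgresHook


def fill_rating_sketch(dwh_conn_id, **kwargs):
    hook = PostgresHook(postgres_conn_id=dwh_conn_id)
    conn = hook.get_conn()
    cur = conn.cursor()
    cur.execute("SELECT 1")  # stand-in for the INSERT ... SELECT query shown above
    conn.commit()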


create_calendar = PythonOperator(task_id="fill_calendar",
                                 dag=dag,
                                 python_callable=fill_calendar,
                                 op_kwargs={"dwh_conn_id": "dwh"})

create_rating = PythonOperator(task_id="fill_rating",
                               dag=dag,
                               python_callable=fill_rating,
                               op_kwargs={"dwh_conn_id": "dwh"})
start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> create_calendar >> end_operator
start_operator >> create_rating >> end_operator
Example #5
DEFAULT_ARGS = {
    'schedule_interval': '@daily',
    'max_active_runs': 1,
    'retries': 3,
    'retry_delay': timedelta(minutes=1)
}

DAG_ID = 'covid'

with DAG(dag_id=DAG_ID, default_args=DEFAULT_ARGS) as dag:
    with open('/usr/local/airflow/dags/covid/sqls/ddl_covid_timeseries.sql', 'r') as f:
        sql = f.read()
    create_table_if_not_exists = PostgresOperator(task_id='check_table', postgres_conn_id='postgres_staging', sql=sql)

    extract = PythonOperator(
        task_id='extract',
        python_callable=get_covid_data,
        op_kwargs={'uri':'https://api.covid19india.org/v4/timeseries.json'},
        provide_context=True
    )

    transform = PythonOperator(
        task_id='transform',
        python_callable=json_to_df,
        op_kwargs={'start_date': '2020-01-01',
                   'end_date': '9999-12-31'},
        provide_context=True
    )

    stage_load = PythonOperator(
        task_id='stage_load',
        python_callable=load_to_stage_area,
        provide_context=True
    )
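
    # The snippet is cut off here; a plausible ordering of the tasks defined above,
    # inferred from their names, would be:
    create_table_if_not_exists >> extract >> transform >> stage_load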

args = {
    'start_date': datetime(2020, 1, 27),
}

dag = DAG(
    dag_id='my_second_dag',
    default_args=args,
    schedule_interval=timedelta(minutes=150),
)


def _print_exec_date(**context):
    print("This is my execution date: " + str(context["execution_date"]))


print_execution_date = PythonOperator(
    task_id="print_execution_date",
    python_callable=_print_exec_date,
    provide_context=True,
    dag=dag,
)
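
# A minimal Airflow 2.x-style sketch of the same task (the 2.x import path below is
# an assumption; this example targets 1.x): the template context is injected into
# the callable automatically, so the provide_context flag is no longer needed.
from airflow.operators.python import PythonOperator as PythonOperatorV2

print_execution_date_v2 = PythonOperatorV2(
    task_id="print_execution_date_v2",
    python_callable=_print_exec_date,  # still receives **context, without provide_context
    dag=dag,
)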

the_end = DummyOperator(
    task_id='the_end',
    dag=dag,
)

# attach each wait task to the DAG and wire it between the two fixed tasks
for i in (1, 5, 10):
    wait = BashOperator(task_id=f"wait_{i}", bash_command=f"sleep {i}", dag=dag)
    print_execution_date >> wait >> the_end
    if test_mode:
        print(" 'foo' was passed in via test={} command : kwargs[params][foo] \
               = {}".format(test_mode, params["foo"]))
    # Print out the value of "miff", passed in below via the Python Operator
    print(" 'miff' was passed in via task params = {}".format(params["miff"]))
    return 1


my_templated_command = """
    echo " 'foo was passed in via Airflow CLI Test command with value {{ params.foo }} "
    echo " 'miff was passed in via BashOperator with value {{ params.miff }} "
"""

run_this = PythonOperator(
    task_id='run_this',
    python_callable=my_py_command,
    params={"miff": "agg"},
    dag=dag,
)

also_run_this = BashOperator(
    task_id='also_run_this',
    bash_command=my_templated_command,
    params={"miff": "agg"},
    dag=dag,
)


def print_env_vars(test_mode):
    """
    Print out the "foo" param passed in via
    `airflow tasks test example_passing_params_via_test_command env_var_test_task <date>`
    """
Example #8
import datetime
import logging

from airflow import DAG
from airflow.operators.python_operator import PythonOperator


def hello_world():
    logging.info("Hello Ben!")


dag = DAG('Exercise1', start_date=datetime.datetime.now())

greet_task = PythonOperator(task_id="hello_world_task",
                            python_callable=hello_world,
                            dag=dag)
Example #9
def greet():
    print('Trying to write to the file')
    with open('./greet.txt', 'a+', encoding='utf8') as f:
        now = dt.datetime.now()
        t = now.strftime("%Y-%m-%d %H:%M")
        f.write(str(t) + '\n')
    return 'Greeted'

def respond():
    return 'Greet Responded Again'

default_args = {
    'owner': 'airflow',
    'start_date': dt.datetime(2018, 9, 24, 10, 00, 00),
    'concurrency': 1,
    'retries': 0
}

with DAG('my_simple_dag',
         default_args=default_args,
         schedule_interval='*/10 * * * *',
         ) as dag:
    opr_hello = BashOperator(task_id='say_Hi', bash_command='echo "Hi!!"')

    opr_greet = PythonOperator(task_id='greet', python_callable=greet)

    opr_sleep = BashOperator(task_id='sleep_me', bash_command='sleep 5')

    opr_respond = PythonOperator(task_id='respond', python_callable=respond)

    opr_hello >> opr_greet >> opr_sleep >> opr_respond
Example #10
        parameters={
            "first_parameter": "a_value",
            "second_parameter": "18"
        },
        # parameters="resources/paremeter.json", You can also pass a path to a json file containing your param
        jenkins_connection_id=
        "your_jenkins_connection"  # T he connection must be configured first
    )

    def grab_artifact_from_jenkins(**context):
        """
        Grab an artifact from the previous job.
        The python-jenkins library doesn't expose a method for this,
        but the request can easily be built manually.
        """
        hook = JenkinsHook("your_jenkins_connection")
        jenkins_server = hook.get_jenkins_server()
        url = context['task_instance'].xcom_pull(task_ids='trigger_job')
        # The JenkinsJobTriggerOperator stores the job URL in the XCom variable corresponding to the task.
        # You can then use it to access artifacts or to look up the build number.
        # The URL looks like: http://jenkins_url/job/job_name/job_number/
        url = url + "artifact/myartifact.xml"  # Or any other artifact name
        request = Request(url)
        response = jenkins_server.jenkins_open(request)
        return response  # The artifact content ends up in an XCom variable for later use

    artifact_grabber = PythonOperator(
        task_id='artifact_grabber', python_callable=grab_artifact_from_jenkins)

    job_trigger >> artifact_grabber
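
    # A hypothetical downstream step (the names below are invented for illustration,
    # not part of the original example) showing how the artifact content returned
    # above can be pulled back out of XCom by a later task:
    def print_artifact(**context):
        artifact = context['task_instance'].xcom_pull(task_ids='artifact_grabber')
        print(artifact)

    print_artifact_task = PythonOperator(
        task_id='print_artifact',
        python_callable=print_artifact,
        provide_context=True)

    artifact_grabber >> print_artifact_task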
# -------------------------------------------------------------------------------
# dag
# these args will get passed on to each operator
# you can override them on a per-task basis during operator initialization
default_args = {
    'owner': 'TongYu',
    'catchup': False,
    'start_date': trans_utc_datetime('0:00:00'),
}
dag = DAG(
    'valuation_date_update_dag',
    catchup=False,
    default_args=default_args,
    schedule_interval='0,1 17 * * *',
    dagrun_timeout=timedelta(minutes=10),
    description='valuation date manager dag')
# ----------------------------------------


PythonOperator(
    task_id='valuation_date_update_task',
    python_callable=set_valuation_date,
    execution_timeout=timedelta(minutes=10),
    dag=dag)

PythonOperator(
    task_id='calendar_import_year_update_task',
    python_callable=set_calendar_import_year,
    execution_timeout=timedelta(minutes=10),
    dag=dag)
def csvToJson():
    df=pd.read_csv('/home/demilsonfayika/dirty-data.csv')
    for i,r in df.iterrows():
        print(r['name'])
    df.to_json('anglo.json',orient='records')



default_args = {
    'owner': 'Demilson',
    'start_date': dt.datetime(2020, 12, 15),
    'retries': 1,
    'retry_delay': dt.timedelta(minutes=5),
}


with DAG('MyCSVDAG',
         default_args=default_args,
         schedule_interval=timedelta(minutes=5),      # '0 * * * *',
         ) as dag:

    print_starting = BashOperator(task_id='starting',
                                  bash_command='echo "I am reading the CSV now....."')

    csvJson = PythonOperator(task_id='convertCSVtoJson',
                             python_callable=csvToJson)

    print_starting >> csvJson
Example #13
                               object=f"rocket_launches/ds={ds}",
                               mime_type='application/json')


def _print_stats(ds, **context):
    gcloud_storage_hook = GoogleCloudStorageHook()
    tmp_file_handle = NamedTemporaryFile(delete=True)
    gcloud_storage_hook.download(bucket="nice_bucket",
                                 object=f"rocket_launches/ds={ds}",
                                 filename=tmp_file_handle.name)
    data = json.load(tmp_file_handle)
    rockets_launched = [launch["name"] for launch in data["launches"]]
    rockets_str = ""
    if rockets_launched:
        rockets_str = f" ({' & '.join(rockets_launched)})"
    print(f"{len(rockets_launched)} rocket launch(es) on {ds}{rockets_str}.")


download_rocket_launches = PythonOperator(
    task_id="download_rocket_launches",
    python_callable=_download_rocket_launches,
    provide_context=True,
    dag=dag,
)
print_stats = PythonOperator(task_id="print_stats",
                             python_callable=_print_stats,
                             provide_context=True,
                             dag=dag)
download_rocket_launches >> print_stats
Example #14
    task_id="create_oldest",
    dag=dag,
    sql="""
        BEGIN;
        DROP TABLE IF EXISTS older_riders;
        CREATE TABLE older_riders AS (
            SELECT * FROM trips WHERE birthyear > 0 AND birthyear <= 1945
        );
        COMMIT;
    """,
    postgres_conn_id="redshift"
)

log_oldest_task = PythonOperator(
    task_id="log_oldest",
    dag=dag,
    python_callable=log_oldest
)
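
# log_oldest is referenced above but not shown; a minimal sketch, assuming it reads
# back from the older_riders table over the same "redshift" connection:
from airflow.hooks.postgres_hook import PostgresHook
import logging


def log_oldest():
    redshift_hook = PostgresHook("redshift")
    records = redshift_hook.get_records(
        "SELECT birthyear FROM older_riders ORDER BY birthyear ASC LIMIT 1")
    if records and records[0]:
        logging.info(f"Oldest rider was born in {records[0][0]}")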


create_youngest_task = PostgresOperator(
    task_id="create_youngest",
    dag=dag,
    sql="""
        BEGIN;
        DROP TABLE IF EXISTS younger_riders;
        CREATE TABLE younger_riders AS (
            SELECT * FROM trips WHERE birthyear > 2000
        );
        COMMIT;
    """,
Example #15
def transform_codes_to_parquet(**kwargs):
    # ti is the Task Instance
    ti = kwargs['ti']
    cluster_id = ti.xcom_pull(task_ids='create_cluster')
    cluster_dns = emr.get_cluster_dns(cluster_id)
    headers = emr.create_spark_session(cluster_dns, 'spark')
    session_url = emr.wait_for_idle_session(cluster_dns, headers)
    statement_response = emr.submit_statement(
        session_url, '/root/airflow/dags/transform/codes.scala')
    emr.track_statement_progress(cluster_dns, statement_response.headers)
    emr.kill_spark_session(session_url)


# Define the individual tasks using Python Operators
create_cluster = PythonOperator(task_id='create_cluster',
                                python_callable=create_emr,
                                dag=dag)

wait_for_cluster_completion = PythonOperator(
    task_id='wait_for_cluster_completion',
    python_callable=wait_for_completion,
    dag=dag)

transform_movies = PythonOperator(task_id='transform_codes',
                                  python_callable=transform_codes_to_parquet,
                                  dag=dag)

terminate_cluster = PythonOperator(task_id='terminate_cluster',
                                   python_callable=terminate_emr,
                                   trigger_rule='all_done',
                                   dag=dag)
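
# The snippet ends before the ordering is declared; a plausible chaining, inferred
# from the task names, would be:
create_cluster >> wait_for_cluster_completion >> transform_movies >> terminate_cluster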
Example #16
def MonthlyPipeline():
    MONTHLY_RELEASE_TRIGGER = '15 17 * * 4#3'

    def MonthlyGenerateTestArgs(**kwargs):
        """Loads the configuration that will be used for this Iteration."""
        conf = kwargs['dag_run'].conf
        if conf is None:
            conf = dict()

        # If the version is overridden we use it; otherwise we fall back to its
        # default or monthly value.
        version = conf.get('VERSION') or istio_common_dag.GetVariableOrDefault(
            'monthly-version', None)
        if not version or version == 'INVALID':
            raise ValueError('version needs to be provided')
        Variable.set('monthly-version', 'INVALID')

        # GCS_MONTHLY_STAGE_PATH is of the form 'prerelease/{version}'
        gcs_path = 'prerelease/%s' % (version)

        branch = conf.get('BRANCH') or istio_common_dag.GetVariableOrDefault(
            'monthly-branch', None)
        if not branch or branch == 'INVALID':
            raise ValueError('branch needs to be provided')
        Variable.set('monthly-branch', 'INVALID')
        commit = conf.get('COMMIT') or branch
        mfest_commit = conf.get('MFEST_COMMIT') or branch

        default_conf = environment_config.GetDefaultAirflowConfig(
            branch=branch,
            commit=commit,
            gcs_path=gcs_path,
            mfest_commit=mfest_commit,
            pipeline_type='monthly',
            verify_consistency='true',
            version=version)

        config_settings = dict()
        for name in default_conf.iterkeys():
            config_settings[name] = conf.get(name) or default_conf[name]

        # These are the extra params that are passed to the dags for monthly release
        monthly_conf = dict()
        monthly_conf['DOCKER_HUB'] = 'istio'
        monthly_conf['GCR_RELEASE_DEST'] = 'istio-io'
        monthly_conf['GCS_GITHUB_PATH'] = 'istio-secrets/github.txt.enc'
        monthly_conf['RELEASE_PROJECT_ID'] = 'istio-io'
        # GCS_MONTHLY_RELEASE_PATH is of the form  'istio-release/releases/{version}'
        monthly_conf[
            'GCS_MONTHLY_RELEASE_PATH'] = 'istio-release/releases/%s' % (
                version)
        for name in monthly_conf.iterkeys():
            config_settings[name] = conf.get(name) or monthly_conf[name]

        testMonthlyConfigSettings(config_settings)
        return config_settings

    def ReportMonthlySuccessful(task_instance, **kwargs):
        del kwargs

    dag, tasks, addAirflowBashOperator = istio_common_dag.MakeCommonDag(
        MonthlyGenerateTestArgs,
        'istio_monthly_dag',
        schedule_interval=MONTHLY_RELEASE_TRIGGER,
        extra_param_lst=monthly_extra_params)

    addAirflowBashOperator('release_push_github_docker_template',
                           'github_and_docker_release',
                           need_commit=True)
    addAirflowBashOperator('release_tag_github_template',
                           'github_tag_repos',
                           need_commit=True)

    mark_monthly_complete = PythonOperator(
        task_id='mark_monthly_complete',
        python_callable=ReportMonthlySuccessful,
        provide_context=True,
        dag=dag,
    )
    tasks['mark_monthly_complete'] = mark_monthly_complete

    # tasks['generate_workflow_args']
    tasks['get_git_commit'].set_upstream(tasks['generate_workflow_args'])
    tasks['run_cloud_builder'].set_upstream(tasks['get_git_commit'])
    tasks['run_release_qualification_tests'].set_upstream(
        tasks['run_cloud_builder'])
    tasks['modify_values_helm'].set_upstream(
        tasks['run_release_qualification_tests'])
    tasks['copy_files_for_release'].set_upstream(tasks['modify_values_helm'])
    tasks['github_and_docker_release'].set_upstream(
        tasks['copy_files_for_release'])
    tasks['github_tag_repos'].set_upstream(tasks['github_and_docker_release'])
    tasks['mark_monthly_complete'].set_upstream(tasks['github_tag_repos'])

    return dag
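
# For Airflow to discover the DAG, the return value is typically bound to a
# module-level name; a minimal usage sketch:
dag = MonthlyPipeline()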
Example #17
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    output_df.to_csv(str(outdir + '/predout.csv'))

def pull_test_data():
    fetch_test_data =ast.literal_eval(Variable.get("fetch_test_data"))
    response=invoke_endpoint(fetch_test_data)
    output_dict=ast.literal_eval(response['1:Fetch Housing Price Test Data']['message'])
    output_df=pd.DataFrame(output_dict)
    outdir='/storage/housing_price_prediction'
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    output_df.to_csv(str(outdir + '/test.csv'))

t1 = PythonOperator(
    task_id='testing_data_processing',
    python_callable=testing_data_processing,
    dag=dag)

t2 = PythonOperator(
    task_id='testing_model',
    python_callable=testing_model,
    dag=dag)

t3 = PythonOperator(
    task_id='pull_predicted_labels',
    python_callable=pull_predicted_labels,
    dag=dag)
    
t4 = PythonOperator(
    task_id='pull_test_data',
    python_callable=pull_test_data,
    dag=dag)


def fun_task_timeout_monitor(ds, dag, **op_kwargs):
    dag_ids = dag.dag_id

    msg = [{
        "dag": dag,
        "db": "opay_dw",
        "table": "{dag_name}".format(dag_name=dag_ids),
        "partition": "country_code=NG/dt={pt}".format(pt=ds),
        "timeout": "3000"
    }]

    TaskTimeoutMonitor().set_task_monitor(msg)


task_timeout_monitor = PythonOperator(task_id='task_timeout_monitor',
                                      python_callable=fun_task_timeout_monitor,
                                      provide_context=True,
                                      dag=dag)

##----------------------------------------- Variables ---------------------------------------##
db_name = "opay_dw"

table_name = "dm_opay_user_trans_portrait_df"
hdfs_path = "oss://opay-datalake/opay/opay_dw/" + table_name


def dm_opay_user_trans_portrait_df_sql_task(ds):
    HQL = '''
    
    set mapred.max.split.size=1000000;
    set hive.exec.dynamic.partition.mode=nonstrict;
    set hive.exec.parallel=true;
                              poke_interval=10,
                              timeout=300)


def transform_func(**kwargs):
    folder_path = FSHook(conn_id=FILE_CONNECTION_ID).get_path()
    file_path = f"{folder_path}/{FILE_NAME}"
    destination_file = f"{folder_path}/{OUTPUT_TRANSFORM_FILE}"
    df = pd.read_csv(file_path, header=0, encoding='ISO-8859-1')
    df.to_csv(destination_file, index=False)
    os.remove(file_path)
    return destination_file


transform_process = PythonOperator(dag=dag,
                                   task_id='transform_process',
                                   python_callable=transform_func,
                                   provide_context=True)


def insert_process(**kwargs):
    ti = kwargs['ti']
    source_file = ti.xcom_pull(task_ids='transform_process')
    db_connection = MySqlHook('airflow_db').get_sqlalchemy_engine()

    df = pd.read_csv(source_file)

    with db_connection.begin() as transaction:
        transaction.execute("DELETE FROM test.confirmed_table WHERE 1=1")
        df.to_sql("confirmed_table",
                  con=transaction,
                  schema="test",
Example #20
# Instructions
# Define a function that uses the Python logger to log a message. Then finish
# filling in the details of the DAG below. Once you've done that, run
# "/opt/airflow/start.sh" to start the web server. When the Airflow web server is
# ready, open the Airflow UI using the "Access Airflow" button, turn your DAG "On",
# and then run your DAG. If you get stuck, take a look at the solution file or the
# video walkthrough on the next page.

import datetime
import logging

from airflow import DAG
from airflow.operators.python_operator import PythonOperator


#
# TODO: Define a function for the PythonOperator to call and have it log something
#
def my_function():
    logging.info('This is my first pipeline')


dag = DAG('lesson1.exercise1', start_date=datetime.datetime.now())

#
# TODO: Uncomment the operator below and replace the arguments labeled <REPLACE> below
#

greet_task = PythonOperator(task_id="task1",
                            python_callable=my_function,
                            dag=dag)
Example #21
dag = DAG(
    dag_id="my_simple_dag",
    start_date=datetime(year=2020, month=1, day=1, hour=12, minute=1,
                        second=1),
    schedule_interval="@yearly",
    max_active_runs=1,
)
opr_hello = BashOperator(
    task_id="say_Hi",
    bash_command='echo "Hi!!"',
    dag=dag,
)

opr_greet = PythonOperator(
    task_id="greet",
    python_callable=greet,
    dag=dag,
)
opr_sleep = BashOperator(
    task_id="sleep_me",
    bash_command="sleep 5",
    dag=dag,
)

opr_respond = PythonOperator(
    task_id="respond",
    python_callable=respond,
    dag=dag,
)

opr_spark = PythonOperator(
dag = DAG('lesson2.exercise3',
          start_date=datetime.datetime(2018, 1, 1, 0, 0, 0, 0),
          end_date=datetime.datetime(2018, 12, 1, 0, 0, 0, 0),
          schedule_interval='@monthly',
          max_active_runs=1)

create_trips_table = PostgresOperator(
    task_id="create_trips_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_TRIPS_TABLE_SQL)

copy_trips_task = PythonOperator(
    task_id='load_trips_from_s3_to_redshift',
    dag=dag,
    python_callable=load_trip_data_to_redshift,
    provide_context=True,  # provide context to our Python Operator
)

create_stations_table = PostgresOperator(
    task_id="create_stations_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
)

copy_stations_task = PythonOperator(
    task_id='load_stations_from_s3_to_redshift',
    dag=dag,
    python_callable=load_station_data_to_redshift,
)
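
# The snippet stops before the ordering is declared; in this exercise the create
# steps usually run before the matching copy steps, e.g.:
create_trips_table >> copy_trips_task
create_stations_table >> copy_stations_task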
Example #23
with DAG("store_dag",
         default_args=default_args,
         schedule_interval='@daily',
         template_searchpath=['/usr/local/airflow/sql_files'],
         catchup=False) as dag:

    # task 1: check that the source file exists in the input directory
    # (note that the file lives inside the Airflow container)
    t1 = BashOperator(
        task_id="check_file_exists",
        bash_command="shasum ~/store_files_airflow/raw_store_transactions.csv",
        retries=1,
        retry_delay=timedelta(seconds=15))

    # task 2: clean data (remove special characters)
    t2 = PythonOperator(task_id="clean_raw_csv", python_callable=data_cleaner)

    # task 3: create table
    t3 = MySqlOperator(task_id="create_mysql_table",
                       mysql_conn_id="mysql_conn",
                       sql="create_table.sql")

    # task 4: insert cleaned data into table
    t4 = MySqlOperator(task_id="insert_into_table",
                       mysql_conn_id="mysql_conn",
                       sql="insert_into_table.sql",
                       dag=dag)

    # task 5: calculate store-wise and location-wise profit (yesterday) and save results as csv
    t5 = MySqlOperator(task_id="select_from_table",
                       mysql_conn_id="mysql_conn",
Example #24
api_tse_cand_contas_extraction = DivulgacaoCandContasTSEProcessor(
    db_uri=MONGO_URI, election_year=TSE_YEAR_DATA)

args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(730),
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 1
}

dag = DAG(dag_id='import_candidate_extra_data',
          default_args=args,
          schedule_interval=None,
          dagrun_timeout=timedelta(minutes=60))

start_import = DummyOperator(task_id='start_import', dag=dag)

import_candidate_extra_data = PythonOperator(
    task_id='import_extra_data_candidate',
    python_callable=api_tse_cand_contas_extraction.run,
    dag=dag)

import_candidate_extra_data.set_upstream(start_import)

if __name__ == '__main__':
    dag.cli()
Example #25
        data.rename(columns=COLUMNS).to_sql('muertes', con=connection, schema='test', if_exists='append', index=False)

    os.remove(full_path)

    logger.info(f"Rows inserted {len(data.index)}")


dag = DAG('muertos', description='Muertos',
          default_args={
              'owner': 'jsique',
              'depends_on_past': False,
              'max_active_runs': 1,
              'start_date': days_ago(5)
          },
          schedule_interval='0 1 * * *',
          catchup=False)

sensor = FileSensor(task_id="file_sensor_task_muertos",
                    dag=dag,
                    filepath='time_series_covid19_deaths_global.csv',
                    fs_conn_id=FILE_CONNECTION_NAME,
                    poke_interval=10,
                    timeout=600)

etl = PythonOperator(task_id="dead_etl",
                     provide_context=True,
                     python_callable=etl_process,
                     dag=dag
                     )

sensor >> etl
Example #26
    df = pd.DataFrame.from_dict(records, orient='index')
    df['reviewText'] = df['reviewText'].apply(lambda x: x.replace('|', ''))
    df.to_csv(output_path, sep='|', index=False)


with DAG(
        dag_id="create_fact_review_table",
        schedule_interval="@daily",
        default_args=default_args,
        catchup=False) as dag:


    # Step 1: Unzip and store as csv
    unzip_file_store_as_csv = PythonOperator(
        task_id='unzip_file_store_as_csv',
        python_callable=unzip_to_csv,
        op_kwargs={'input_path': '/usr/local/airflow/dags/files/reviews_Musical_Instruments.json.gz',
                   'output_path': '/usr/local/airflow/dags/files/review_data.csv'}
    )

    # Step 2: Move json file to hdfs storage
    move_to_hdfs = BashOperator(
        task_id="move_to_hdfs",
        bash_command="""
            hdfs dfs -mkdir -p /fact_review && \
            hdfs dfs -put -f $AIRFLOW_HOME/dags/files/review_data.csv /fact_review
            """
    )

    # Step 3: Create a hive table on our sku_data
    creating_fact_table = HiveOperator(
        task_id="creating_fact_review_table",
    if weekday == "Mon":
        person = "email_bob"
    elif weekday == "Wed":
        person = "email_alice"
    elif weekday == "Fri":
        person = "email_joe"
    else:
        person = "unkown"

    return person


print_week_day = PythonOperator(
    task_id="print_week_day",
    python_callable=_get_weekday,
    provide_context=True,
    dag=dag,
)

branching = BranchPythonOperator(task_id="branching",
                                 python_callable=_get_weekday,
                                 provide_context=True,
                                 dag=dag)

join = DummyOperator(task_id="join", trigger_rule="none_failed", dag=dag)

print_week_day >> branching

persons = ["email_bob", "email_alice", "email_joe"]
for person in persons:
    branching >> DummyOperator(task_id=person, dag=dag) >> join
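
# Only the body of _get_weekday survives at the top of this example; a plausible
# reconstruction of its missing head (the name and signature below are assumptions),
# taking the weekday abbreviation from the run's execution_date:
def _get_weekday_sketch(execution_date, **context):
    # map the abbreviated weekday name ("Mon", "Wed", "Fri") to the person to email
    weekday = execution_date.strftime("%a")
    return {"Mon": "email_bob", "Wed": "email_alice", "Fri": "email_joe"}.get(weekday, "unknown")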
Example #28
# [END howto_operator_http_task_del_op]
# [START howto_operator_http_http_sensor_check]
task_http_sensor_check = HttpSensor(
    task_id='api_health_check',
    http_conn_id='rest-connection',
    endpoint='/',
    request_params={},
    # response_check=lambda response: "httpbin" in response.text,
    poke_interval=5,
    # on_failure_callback=notify_email,
    dag=dag,
)

# Task 3: Save JSON data locally
# save_and_transform = PythonOperator(
#     task_id="save_and_transform", 
#     python_callable=transform_json,
#     provide_context=True,
# )

save_employee = PythonOperator(
    task_id="save_employee_transform", 
    python_callable=save_emp_json,
    provide_context=True
)

task_http_sensor_check >> task_save_employee >> save_employee >> task_get_byid_employee
save_employee >> task_get_all_employee
# save_employee >> task_update_employee >> task_get_byid_employee
Example #29
import sys
from airflow.operators.python_operator import PythonOperator
from datetime import datetime
from airflow import DAG
# business imports
sys.path.append('/home/airflow/python/')
from cl_cr_update import jira_update

dag = DAG('cl_cr_update',
          description='Run updates for Jira issues.',
          schedule_interval='*/5 * * * *',
          start_date=datetime(2019, 12, 7), catchup=False)

step_1 = PythonOperator(task_id='step_1',
                        python_callable=jira_update,
                        dag=dag)
Example #30
        callback_url = conf_json['callback_url']

        input_json_dir = Path(conf_json['input_json_dir'])

        context['ti'].xcom_push(key='mount_volumes', value=mount_volumes)
        context['ti'].xcom_push(key='callback_url', value=callback_url)
        context['ti'].xcom_push(key='input_json_dir', value=input_json_dir)

    def post_process(**context):
        print('start post process.')
        callback_url = context['ti'].xcom_pull(key='callback_url')
        requests.post(callback_url)

    t1 = PythonOperator(
        task_id='pre_process',
        python_callable=pre_process,
        dag=dag
    )

    t3 = PythonOperator(
        task_id='post_process',
        python_callable=post_process,
        trigger_rule=TriggerRule.ALL_DONE,  # always run, even if an upstream task failed
        dag=dag
    )

    t2 = DockerOperatorEx(
        task_id='main_process',
        image='qunomon/eval_mnist_data_coverage:0.1',
        docker_url='unix://var/run/docker.sock',
        api_version='auto',