Example #1
    'start_date': datetime(2015, 8, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('etl_daily',
          start_date=datetime(2016, 5, 1),
          schedule_interval="0 0 14 * MON-FRI",
          default_args=default_args)

t1 = PythonOperator(task_id='test_airflow',
                    python_callable=test_airflow,
                    dag=dag)

t2 = PythonOperator(task_id='daily_equity_price_ingest',
                    python_callable=daily_equity_price_ingest,
                    dag=dag)

run_this_last = DummyOperator(task_id='run_this_last', dag=dag)

t2.set_upstream(t1)

run_this_last.set_upstream(t2)
Example #2
        echo "{{ ds }}"
        echo "{{ macros.ds_add(ds, 7)}}"
        echo "{{ params.my_param }}"
    {% endfor %}
"""

join = DummyOperator(
    task_id='join',
    trigger_rule='all_done',
    dag=dag
)

sum_up = PythonOperator(
    task_id='sum_up',
    provide_context=True,
    python_callable=sum_up_task,
    dag=dag,
    execution_timeout=timedelta(seconds=60),
    on_failure_callback=notify_failure,
    )

p1.set_upstream(p0)
p2.set_upstream(p0)
p3.set_upstream(p0)
c1.set_upstream(p1)
c2.set_upstream(p2)
c3.set_upstream(p3)
c3.set_downstream(join)
c2.set_downstream(join)
sum_up.set_upstream(join)
Example #3
                                  'yearID', 'franchID', 'teamID', 'W', 'L',
                                  'percentage', 'franchName'
                              ],
                              encoding='utf-8')
    conn.insert_rows(table=table_name, rows=results.values.tolist())
    return table_name


dag = DAG('zylo_example',
          schedule_interval=timedelta(hours=1),
          start_date=datetime(2016, 10, 24),
          default_args=default_args)

t1 = PythonOperator(task_id='get_zip_file',
                    provide_context=True,
                    python_callable=get_zip,
                    dag=dag)

t2 = PythonOperator(task_id='get_top_teams',
                    provide_context=True,
                    python_callable=top_teams,
                    dag=dag)

t3 = PythonOperator(task_id='load_to_MySql',
                    provide_context=True,
                    python_callable=bulk_load_teams,
                    op_kwargs={'table_name': 'top_teams'},
                    dag=dag)

t2.set_upstream(t1)
t3.set_upstream(t2)
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG("bcftools", default_args=default_args,
          schedule_interval=None, concurrency=20000, max_active_runs=20000)


start_analysis_run_task = PythonOperator(
    task_id="start_analysis_run",
    python_callable=start_analysis_run,
    provide_context=True,
    dag=dag)

bcftools_task = PythonOperator(
    task_id="bcftools",
    python_callable=bcftools,
    provide_context=True,
    dag=dag)

bcftools_task.set_upstream(start_analysis_run_task)

complete_analysis_run_task = PythonOperator(
    task_id="complete_analysis_run",
    python_callable=complete_analysis_run,
    provide_context=True,
    dag=dag)

complete_analysis_run_task.set_upstream(bcftools_task)
Example #5
t4 = PythonOperator(task_id='python_{}_3'.format(pub_id),
                    python_callable=my_display_function,
                    op_kwargs={'phase': 'EXTRACT_DATA_START'},
                    dag=dag)
t5 = BashOperator(
    task_id='extractdata_{}'.format(pub_id),
    pool='simba_extract_data',
    bash_command='sh /x/home/dm_hdp_batch/test/projects/steam_donkey/scripts/export_processing.sh ',
    dag=dag)
t6 = PythonOperator(task_id='python_{}_4'.format(pub_id),
                    python_callable=my_display_function,
                    op_kwargs={'phase': 'EXTRACT_DATA_END'},
                    dag=dag)
t7 = TriggerDagRunOperator(task_id='trigger_{}_1'.format(pub_id),
                           trigger_dag_id="SUB_{}_{}".format(
                               sub_id, sub_id_ver),
                           python_callable=conditionally_trigger,
                           params={
                               'condition_param': True,
                               'message': 'Hello World'
                           },
                           dag=dag)
t1.set_upstream(t0)
t2.set_upstream(t1)
t3.set_upstream(t2)
t4.set_upstream(t3)
t5.set_upstream(t4)
t6.set_upstream(t5)
t7.set_upstream(t6)
def generate_dag(area, download_dir, default_args):
    """Generate Landsat8 ingestion DAGs.

    Parameters
    ----------
    area: Landsat8Area
        Configuration parameters for the Landsat8 area to be downloaded
    download_dir: str
        Path of the directory where the Landsat8 files are downloaded
    default_args: dict
        Default arguments for all tasks in the DAG.

    """

    dag = DAG(
        LANDSAT8.id + "_{}".format(area.name),
        description="DAG for downloading, processing and ingesting {} AOI in Landsat8 data "
                    "from scene_list".format(area.name),
        default_args=default_args,
        dagrun_timeout=LANDSAT8.dagrun_timeout,
        schedule_interval=LANDSAT8.dag_schedule_interval,
        catchup=LANDSAT8.catchup,
        params={
            "area": area,
        }
    )
    search_task = Landsat8SearchOperator(
        task_id='search_{}'.format(area.name),
        area=area,
        cloud_coverage=LANDSAT8.cloud_coverage,
        startdate=LANDSAT8.startdate,
        enddate=LANDSAT8.enddate,
        filter_max=LANDSAT8.filter_max,
        order_by=LANDSAT8.order_by,
        order_type=LANDSAT8.order_type,
        db_credentials=CFG.landsat8_postgresql_credentials,
        dag=dag
    )
    generate_html_description = Landsat8ProductDescriptionOperator(
        task_id='generate_html_description',
        description_template=os.path.join(
            TEMPLATES_PATH, "product_abstract.html"),
        download_dir=download_dir,
        dag=dag
    )
    download_thumbnail = Landsat8DownloadOperator(
        task_id="download_thumbnail",
        download_dir=download_dir,
        get_inputs_from=search_task.task_id,
        url_fragment="thumb_small.jpg",
        download_max=LANDSAT8.download_max,
        geoserver_rest_url=CFG.geoserver_rest_url,
        geoserver_oseo_collection=LANDSAT8.geoserver_oseo_collection,
        geoserver_username=CFG.geoserver_username,
        geoserver_password=CFG.geoserver_password,
        dag=dag
    )
    generate_thumbnail = Landsat8ThumbnailOperator(
        task_id='generate_thumbnail',
        get_inputs_from=download_thumbnail.task_id,
        thumb_size_x="64",
        thumb_size_y="64",
        dag=dag
    )
    download_metadata = Landsat8DownloadOperator(
        task_id="download_metadata",
        download_dir=download_dir,
        get_inputs_from=search_task.task_id,
        url_fragment="MTL.txt",
        download_max=LANDSAT8.download_max,
        geoserver_rest_url=CFG.geoserver_rest_url,
        geoserver_oseo_collection=LANDSAT8.geoserver_oseo_collection,
        geoserver_username=CFG.geoserver_username,
        geoserver_password=CFG.geoserver_password,
        dag=dag
    )

    join_task = DummyOperator(
        task_id='landsat8_join',
        dag=dag
    )

    download_tasks = []
    translate_tasks = []
    addo_tasks = []
    upload_tasks = []
    gdalinfo_tasks = []

    for band in area.bands:
        download_band = Landsat8DownloadOperator(
            task_id="download_band{}".format(band),
            download_dir=download_dir,
            get_inputs_from=search_task.task_id,
            url_fragment="B{}.TIF".format(band),
            download_max=LANDSAT8.download_max,
            geoserver_rest_url=CFG.geoserver_rest_url,
            geoserver_oseo_collection=LANDSAT8.geoserver_oseo_collection,
            geoserver_username=CFG.geoserver_username,
            geoserver_password=CFG.geoserver_password,
            dag=dag
        )
        download_tasks.append(download_band)

        translate = GDALTranslateOperator(
            task_id="translate_band{}".format(band),
            get_inputs_from=download_band.task_id,
            dag=dag
        )
        translate_tasks.append(translate)

        addo = GDALAddoOperator(
            task_id="add_overviews_band{}".format(band),
            get_inputs_from=translate.task_id,
            resampling_method="average",
            max_overview_level=128,
            compress_overview="PACKBITS",
            dag=dag
        )
        addo_tasks.append(addo)

        gdalinfo = GDALInfoOperator(
            task_id='landsat8_gdalinfo_band_{}'.format(band),
            get_inputs_from=addo.task_id,
            dag=dag
        )
        gdalinfo_tasks.append(gdalinfo)

        upload = RSYNCOperator(
            task_id="upload_band{}".format(band),
            host=CFG.rsync_hostname,
            remote_usr=CFG.rsync_username,
            ssh_key_file=CFG.rsync_ssh_key,
            remote_dir=LANDSAT8.repository_dir,
            get_inputs_from=addo.task_id,
            dag=dag)
        upload_tasks.append(upload)

        download_band.set_upstream(search_task)
        translate.set_upstream(download_band)
        addo.set_upstream(translate)
        gdalinfo.set_upstream(addo)
        upload.set_upstream(addo)
        join_task.set_upstream(upload)
        join_task.set_upstream(gdalinfo)

    download_task_ids = (task.task_id for task in download_tasks)
    create_original_package_task = PythonOperator(
        task_id="create_original_package",
        python_callable=create_original_package,
        op_kwargs={
            'get_inputs_from': {
                "search_task_id": search_task.task_id,
                "download_task_ids": download_task_ids,
            },
            'out_dir': LANDSAT8.process_dir
        },
        dag=dag)

    upload_original_package_task = RSYNCOperator(
        task_id="upload_original_package",
        host=CFG.rsync_hostname,
        remote_usr=CFG.rsync_username,
        ssh_key_file=CFG.rsync_ssh_key,
        remote_dir=LANDSAT8.original_package_upload_dir,
        get_inputs_from=create_original_package_task.task_id,
        dag=dag)

    # we only need gdalinfo output on one of the granules
    gdalinfo_task = gdalinfo_tasks[0]
    gdalinfo_task_id = gdalinfo_task.task_id

    upload_task_ids = (task.task_id for task in upload_tasks)
    generate_metadata = Landsat8MTLReaderOperator(
        task_id='generate_metadata',
        original_package_download_base_url=LANDSAT8.original_package_download_base_url,
        gs_workspace=LANDSAT8.geoserver_workspace,
        gs_wms_layer=LANDSAT8.geoserver_layer,
        gs_wms_width=LANDSAT8.geoserver_oseo_wms_width,
        gs_wms_height=LANDSAT8.geoserver_oseo_wms_height,
        gs_wms_format=LANDSAT8.geoserver_oseo_wms_format,
        gs_wms_version=LANDSAT8.geoserver_oseo_wms_version,
        gs_wfs_featuretype=LANDSAT8.geoserver_featuretype,
        gs_wfs_format=LANDSAT8.geoserver_oseo_wfs_format,
        gs_wfs_version=LANDSAT8.geoserver_oseo_wfs_version,
        gs_wcs_scale_i=LANDSAT8.geoserver_oseo_wcs_scale_i,
        gs_wcs_scale_j=LANDSAT8.geoserver_oseo_wcs_scale_j,
        gs_wcs_format=LANDSAT8.geoserver_oseo_wcs_format,
        gs_wcs_version=LANDSAT8.geoserver_oseo_wcs_version,
        gs_wcs_coverage_id=LANDSAT8.geoserver_layer,
        get_inputs_from={
            "search_task_id"  : search_task.task_id,
            "metadata_task_id": download_metadata.task_id,
            "upload_task_ids" : upload_task_ids,
            "gdalinfo_task_id": gdalinfo_task_id,
            "upload_original_package_task_id": upload_original_package_task.task_id,
        },
        metadata_xml_path=os.path.join(TEMPLATES_PATH, "metadata.xml"),
        dag=dag
    )

    product_zip_task = Landsat8ProductZipFileOperator(
        task_id='landsat8_product_zip',
        get_inputs_from=[
            generate_html_description.task_id,
            generate_metadata.task_id,
            generate_thumbnail.task_id
        ],
        output_dir=LANDSAT8.process_dir,
        dag=dag
    )

    # curl -vvv -u evoadmin:\! -XPOST -H "Content-type: application/zip" --data-binary @/var/data/Sentinel-2/S2_MSI_L1C/download/S2A_MSIL1C_20170909T093031_N0205_R136_T36VUQ_20170909T093032/product.zip "http://ows-oda.eoc.dlr.de/geoserver/rest/oseo/collections/SENTINEL2/products"
    publish_task = PythonOperator(task_id="publish_product_task",
                                  python_callable=publish_product,
                                  op_kwargs={
                                      'geoserver_username': CFG.geoserver_username,
                                      'geoserver_password': CFG.geoserver_password,
                                      'geoserver_rest_endpoint': '{}/oseo/collections/{}/products'.format(
                                          CFG.geoserver_rest_url, LANDSAT8.geoserver_oseo_collection),
                                      'get_inputs_from': product_zip_task.task_id,
                                  },
                                  dag=dag)
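
    # Hypothetical sketch only (an assumption, not the project's actual publish_product
    # callable): mirroring the curl command above, it POSTs the product zip with basic auth
    # and a Content-type of application/zip to the OSEO collection "products" endpoint.
    def publish_zip_sketch(zip_path, endpoint, username, password):
        import requests
        with open(zip_path, 'rb') as zip_file:
            response = requests.post(endpoint,
                                     auth=(username, password),
                                     headers={'Content-type': 'application/zip'},
                                     data=zip_file)
        response.raise_for_status()
        return response.status_code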

    download_thumbnail.set_upstream(search_task)
    download_metadata.set_upstream(search_task)
    for tid in download_tasks:
        create_original_package_task.set_upstream(tid)
    upload_original_package_task.set_upstream(create_original_package_task)
    generate_metadata.set_upstream(join_task)
    generate_metadata.set_upstream(download_metadata)
    generate_metadata.set_upstream(upload_original_package_task)
    generate_thumbnail.set_upstream(download_thumbnail)
    generate_html_description.set_upstream(search_task)
    product_zip_task.set_upstream(generate_html_description)
    product_zip_task.set_upstream(generate_metadata)
    product_zip_task.set_upstream(generate_thumbnail)
    publish_task.set_upstream(upload_original_package_task)
    publish_task.set_upstream(product_zip_task)

    return dag
    python_callable=CheckReadLogs(),
    dag=dag)

put_file = PythonOperator(
    task_id='put-file-to-s3',
    python_callable=DataPutter(),
    dag=dag)

delete_object = PythonOperator(
    task_id='delete-object-from-s3',
    python_callable=DeleteObject(),
    dag=dag)

cleanup = BashOperator(
    task_id='cleanup',
    bash_command=rm_file,
    trigger_rule=TriggerRule.ALL_DONE,
    dag=dag)



get_file.set_upstream(put_file)
hello_world_docker_write_logs.set_upstream(get_file)
check_read_logs.set_upstream(hello_world_docker_write_logs)
cleanup.set_upstream(check_read_logs)
cleanup.set_upstream(get_file)
delete_object.set_upstream(get_file)



Example #8
    subject='Latest popular links',
    html_content='Check out the latest!!',
    files=['{}/latest_links.txt'.format(RAW_TWEET_DIR)],
    dag=dag)

sub = SubDagOperator(subdag=subdag,
                     task_id='insert_and_id_pop',
                     trigger_rule='one_success',
                     dag=dag)

clear_latest = BashOperator(
    bash_command='rm -rf {}/latest_links.txt'.format(RAW_TWEET_DIR),
    task_id='clear_latest',
    dag=dag)

gen_search_terms.set_upstream(fill_search_terms)

for term in SEARCH_TERMS:
    term_without_punctuation = re.sub(r'\W+', '', term)
    simple_search = PythonOperator(
        task_id='search_{}_twitter'.format(term_without_punctuation),
        provide_context=True,
        python_callable=search_twitter,
        dag=dag,
        params={'query': term})
    simple_search.set_upstream(gen_search_terms)
    simple_search.set_downstream(sub)

sub.set_downstream(email_links)
email_links.set_downstream(clear_latest)
    task_id='hive_s3_location',
    command_type="hivecmd",
    script_location="s3n://dev.canopydata.com/airflow/show_table.hql",
    notify=True,
    tags=['tag1', 'tag2'],
    trigger_rule="all_done",
    dag=dag)

t3 = PythonOperator(
    task_id='compare_result',
    provide_context=True,
    python_callable=compare_result,
    trigger_rule="all_done",
    dag=dag)

t3.set_upstream(t1)
t3.set_upstream(t2)

options = ['hadoop_jar_cmd', 'presto_cmd', 'db_query', 'spark_cmd']

branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options),
    dag=dag)
branching.set_upstream(t3)


join = DummyOperator(
    task_id='join',
    trigger_rule='one_success',
    dag=dag
Example #10
          default_args=default_args,
          schedule_interval=None,
          concurrency=10000,
          max_active_runs=2000)

start_analysis_run_task = PythonOperator(task_id="start_analysis_run",
                                         python_callable=start_analysis_run,
                                         provide_context=True,
                                         dag=dag)

validate_sample_task = PythonOperator(task_id="validate_sample",
                                      python_callable=validate_sample,
                                      provide_context=True,
                                      dag=dag)

validate_sample_task.set_upstream(start_analysis_run_task)

delly_task = PythonOperator(task_id="delly_genotype",
                            python_callable=run_delly,
                            provide_context=True,
                            dag=dag)

delly_task.set_upstream(validate_sample_task)

complete_analysis_run_task = PythonOperator(
    task_id="complete_analysis_run",
    python_callable=complete_analysis_run,
    provide_context=True,
    dag=dag)

complete_analysis_run_task.set_upstream(delly_task)
Example #11
                            dag=dag)

deliver_res_op = """
cp -r {{ params.project_dir }}/{{ params.dag_id }}_results/* {{ params.project_dir }}/results
"""

deliver_res = BashOperator(task_id='Deliver_result',
                           bash_command=deliver_res_op,
                           params={
                               'project_dir': project_directory,
                               'dag_id': dag_id
                           },
                           dag=dag)

preprosessing.set_upstream(clean_up)
ms_concatenation.set_upstream(preprosessing)
clustering_or.set_upstream(ms_concatenation)
taxo_assignation.set_upstream(clustering_or)
biom_generation.set_upstream(clustering_or)
biom_generation.set_upstream(taxo_assignation)
tree_generation.set_upstream(clustering_or)
filter_weak_otus.set_upstream(biom_generation)
biom_conversion.set_upstream(filter_weak_otus)
raw_matrix_generation.set_upstream(biom_conversion)
matrix_normalization.set_upstream(raw_matrix_generation)
matrix_consolidation.set_upstream(matrix_normalization)
output_res.set_upstream(ms_concatenation)
output_res.set_upstream(tree_generation)
output_res.set_upstream(biom_conversion)
output_res.set_upstream(raw_matrix_generation)
output_res.set_upstream(matrix_consolidation)
Example #12

start_analysis_run_task = PythonOperator(
    task_id="start_analysis_run",
    python_callable=start_analysis_run,
    provide_context=True,
    dag=dag)


validate_sample_task = PythonOperator(
    task_id="validate_sample",
    python_callable=validate_sample,
    provide_context=True,
    dag=dag)

validate_sample_task.set_upstream(start_analysis_run_task)

complete_analysis_run_task = PythonOperator(
    task_id="complete_analysis_run",
    python_callable=complete_analysis_run,
    provide_context=True,
    dag=dag)

for contig_name in tracker.util.workflow_common.CONTIG_NAMES:
    freebayes_task = PythonOperator(
        task_id="freebayes_" + contig_name,
        python_callable=run_freebayes,
        op_kwargs={"contig_name": contig_name},
        provide_context=True,
        dag=dag)
Example #13
    # We want subprocess output to bypass logging module otherwise multiline
    # output is squashed together.
    util.run(args, use_print=True, dryrun=dryrun)


build_op = PythonOperator(task_id='build_images',
                          provide_context=True,
                          python_callable=build_images,
                          dag=dag)

setup_cluster_op = PythonOperator(task_id='setup_cluster',
                                  provide_context=True,
                                  python_callable=setup_cluster,
                                  dag=dag)

setup_cluster_op.set_upstream(build_op)

run_tests_op = PythonOperator(task_id='run_tests',
                              provide_context=True,
                              python_callable=run_tests,
                              dag=dag)

run_tests_op.set_upstream(setup_cluster_op)

teardown_cluster_op = PythonOperator(task_id='teardown_cluster',
                                     provide_context=True,
                                     python_callable=teardown_cluster,
                                     dag=dag)

teardown_cluster_op.set_upstream(run_tests_op)
def create_dag(dag_id, schedule, start_date, delta_sensor, airpots_codes,
               default_args):

    dag = DAG(dag_id,
              schedule_interval=schedule,
              start_date=start_date,
              default_args=default_args)

    dag.doc_md = """
    # DAG fetching data from smiles.com.ar
    ### procesing and dumping on postgresql
    """
    """start = TimeDeltaSensor(
        task_id='wait_to_start',
        delta=timedelta(minutes=delta_sensor),
        dag=dag)"""

    start = DummyOperator(task_id="start", dag=dag)

    branches = []

    def return_dates_branches(**kwargs):
        return branches

    gen_url_branch = BranchPythonOperator(
        task_id='generate_url_dates',
        provide_context=True,
        python_callable=return_dates_branches,
        dag=dag)

    def transform_data(**kwargs):
        ti = kwargs['ti']
        raw_data = ti.xcom_pull(task_ids=return_dates_branches())
        data = []
        logging.info(raw_data)
        if raw_data is not None:
            flat_list = [item for sublist in raw_data for item in sublist]
            for row in flat_list:
                row = list(row)
                # add À-ÿ for spanish accents
                date = '/'.join(
                    list(
                        re.compile("([A-ZÀ-ÿ]+)(\d+)([A-ZÀ-ÿ]+)").split(
                            row[1]))[2:4])
                date = dateparser.parse(date,
                                        languages=['pt', 'es'],
                                        date_formats=['%d/%b'
                                                      ]).strftime('%Y-%m-%d')
                row[1] = date
                td = row[4].split(':')
                row[4] = str(timedelta(hours=int(td[0]), minutes=int(td[1])))
                row[5] = int(row[5].replace('.', ''))
                row[6] = int(row[6].replace('.', ''))
                row[8] = row[8].split(' ')[-1]
                row.insert(0, datetime.now().strftime('%Y-%m-%d'))
                data.append(tuple(row))
            return data
        else:
            print('No data received')

    t2 = PythonOperator(
        task_id='transform_data',
        python_callable=transform_data,
        depends_on_past=True,
        trigger_rule=TriggerRule.ALL_SUCCESS,
        provide_context=True,
        dag=dag,
    )

    t2.doc_md = """
    #### Task Documentation
    Transform fetched data
    @return a list of tuples
    """

    # def gen_url_dates(**kwargs):
    date_start = read_scraped_date(airpots_codes)
    date_end = date_start + timedelta(days=AMOUNT_DAYS)
    date_generated = [
        date_start + timedelta(days=x)
        for x in range(0, (date_end - date_start).days)
    ]

    for i, date in enumerate(date_generated):
        date_ml = str(date.timestamp())[:8] + '00000'
        url_dated = """https://www.smiles.com.ar/emission?originAirportCode={}&destinationAirportCode={}&departureDate={}&adults=1&children=0&infants=0&isFlexibleDateChecked=false&tripType=3&currencyCode=BRL&segments=2&departureDate2={}&originAirportCode2={}&destinationAirportCode2={}""".format(
            airpots_codes[0][0], airpots_codes[1], date_ml, date_ml,
            airpots_codes[0][1], airpots_codes[1])

        get_data_op = PythonOperator(
            task_id='get_data_{}and{}to{}_{}'.format(airpots_codes[0][0],
                                                     airpots_codes[0][1],
                                                     airpots_codes[1], i),
            python_callable=get_data_URL,
            op_kwargs={'URL': url_dated},
            trigger_rule=TriggerRule.ONE_SUCCESS,
            provide_context=True,
            dag=dag,
        )
        branches.append(get_data_op.task_id)
        get_data_op.set_upstream(gen_url_branch)
        get_data_op.set_downstream(t2)
        get_data_op.doc_md = """
        #### Task Documentation
        Fetch data from passed url
        return list of semi-parsed data
        """

    insert_data = PythonOperator(
        task_id='insert_data',
        python_callable=insert_into_table,
        provide_context=True,
        dag=dag,
    )

    insert_data.doc_md = """
    #### Task Documentation
    Insert parsed and transformed data into table
    """
    t2.set_downstream(insert_data)
    gen_url_branch.set_upstream(start)

    return dag
Example #15
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG("test", default_args=default_args,
          schedule_interval=None, concurrency=20000, max_active_runs=20000)


start_analysis_run_task = PythonOperator(
    task_id="start_analysis_run",
    python_callable=start_analysis_run,
    provide_context=True,
    dag=dag)

test_task = PythonOperator(
    task_id="test",
    python_callable=run_test,
    provide_context=True,
    dag=dag)

test_task.set_upstream(start_analysis_run_task)

complete_analysis_run_task = PythonOperator(
    task_id="complete_analysis_run",
    python_callable=complete_analysis_run,
    provide_context=True,
    dag=dag)

complete_analysis_run_task.set_upstream(test_task)
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG("sanger_variant_calling",
          default_args=default_args,
          schedule_interval=None,
          concurrency=500,
          max_active_runs=500)

start_analysis_run_task = PythonOperator(task_id="start_analysis_run",
                                         python_callable=start_analysis_run,
                                         provide_context=True,
                                         dag=dag)

run_sanger_callers_task = PythonOperator(task_id="run_sanger_callers",
                                         python_callable=run_sanger_callers,
                                         provide_context=True,
                                         dag=dag)

run_sanger_callers_task.set_upstream(start_analysis_run_task)

complete_analysis_run_task = PythonOperator(
    task_id="complete_analysis_run",
    python_callable=complete_analysis_run,
    provide_context=True,
    dag=dag)

complete_analysis_run_task.set_upstream(run_sanger_callers_task)
Example #17
                            max_overview_level=MAX_OVERVIEW_LEVEL,
                            task_id='gdal_addo_' + str(i),
                            get_inputs_from=warp.task_id,
                            dag=dag)
    addo_tasks.append(addo)

    upload = RSYNCOperator(task_id="upload_granule_{}_task".format(str(i)),
                           host=CFG.rsync_hostname,
                           remote_usr=CFG.rsync_username,
                           ssh_key_file=CFG.rsync_ssh_key,
                           remote_dir=S1GRD1SDV.repository_dir,
                           get_inputs_from=addo.task_id,
                           dag=dag)
    upload_tasks.append(upload)

    band_paths.set_upstream(zip_task)
    warp.set_upstream(band_paths)
    addo.set_upstream(warp)
    upload.set_upstream(addo)

# Metadata Extraction task
addo_task_ids = (task.task_id for task in addo_tasks)
upload_task_ids = (task.task_id for task in upload_tasks)
metadata_task = S1MetadataOperator(
    task_id="extract_metadata_task",
    product_safe_path=None,
    granules_paths=None,
    granules_upload_dir=S1GRD1SDV.repository_dir,
    processing_dir=S1GRD1SDV.process_dir,
    original_package_download_base_url=S1GRD1SDV.original_package_download_base_url,
Example #18
args = {
    'owner': 'airflow',
    'start_date': datetime.now(),
}

dag = DAG(dag_id='my_first_dag', default_args=args, schedule_interval=None)


def print_context(i):
    print(i)
    return 'print_context succeeded for {}'.format(i)


parent = None
for i in range(10):
    '''
    Generate 10 chained print_context tasks, passing 0 to 9 as the
    argument respectively
    '''
    task = \
        PythonOperator(
            task_id='print_the_context.{}'.format(i),
            python_callable=print_context,
            op_kwargs={'i': i},
            dag=dag)

    if parent:
        task.set_upstream(parent)

    parent = task
Example #19
t1 = PythonOperator(
    task_id='clear_scrape_folder',
    python_callable=clear_folder,
    dag=dag)

# TODO properly import python classes
t2 = BashOperator(
    task_id='scrape_profile_images',
    bash_command='cd {} && scrapy crawl csgrad'.format(cspeople_scraper),
    dag=dag)

t3 = PythonOperator(
    task_id='scrape_progress',
    python_callable=print_scrape_in_progress,
    dag=dag)

t4 = BashOperator(
    task_id='create_landmarks',
    bash_command='cd {} && python landmark.py'.format(averageface_path),
    dag=dag)

t5 = BashOperator(
    task_id='create_average_face',
    bash_command='cd {} && python averageface.py'.format(averageface_path),
    dag=dag)

t2.set_upstream(t1)
t3.set_upstream(t1)
t4.set_upstream(t2)
t4.set_upstream(t3)
t5.set_upstream(t4)
Example #20
          default_args=default_args,
          schedule_interval=None,
          concurrency=10000,
          max_active_runs=2000)

start_analysis_run_task = PythonOperator(task_id="start_analysis_run",
                                         python_callable=start_analysis_run,
                                         provide_context=True,
                                         dag=dag)

validate_sample_task = PythonOperator(task_id="validate_sample",
                                      python_callable=validate_sample,
                                      provide_context=True,
                                      dag=dag)

validate_sample_task.set_upstream(start_analysis_run_task)

complete_analysis_run_task = PythonOperator(
    task_id="complete_analysis_run",
    python_callable=complete_analysis_run,
    provide_context=True,
    dag=dag)

for contig_name in tracker.util.workflow_common.CONTIG_NAMES:
    freebayes_task = PythonOperator(task_id="freebayes_" + contig_name,
                                    python_callable=run_freebayes,
                                    op_kwargs={"contig_name": contig_name},
                                    provide_context=True,
                                    dag=dag)

    freebayes_task.set_upstream(validate_sample_task)
Example #21

def done(**_kwargs):
    logging.info("Executing done step.")


clone_op = PythonOperator(task_id='clone_repo',
                          provide_context=True,
                          python_callable=clone_repo,
                          dag=dag)

build_op = PythonOperator(task_id='build_images',
                          provide_context=True,
                          python_callable=build_images,
                          dag=dag)
build_op.set_upstream(clone_op)

py_lint_op = PythonOperator(task_id='pylint',
                            provide_context=True,
                            python_callable=py_checks_gen("lint"),
                            dag=dag)
py_lint_op.set_upstream(clone_op)

py_test_op = PythonOperator(task_id='pytest',
                            provide_context=True,
                            python_callable=py_checks_gen("test"),
                            dag=dag)
py_test_op.set_upstream(clone_op)

setup_cluster_op = PythonOperator(task_id='setup_cluster',
                                  provide_context=True,
Example #22
fetch_tweets = PythonOperator(
    task_id='fetch_tweets',
    python_callable=fetchtweets,
    dag=dag)

# --------------------------------------------------------------------------------
# Clean the eight files. In this step you can get rid of or cherry pick columns
# and different parts of the text
# --------------------------------------------------------------------------------

clean_tweets = PythonOperator(
    task_id='clean_tweets',
    python_callable=cleantweets,
    dag=dag)

clean_tweets.set_upstream(fetch_tweets)
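
# Hypothetical sketch of a cleantweets callable (the real callable is not shown in this
# excerpt): keep only a few columns of interest from each raw tweet CSV and drop retweets.
# The directory, file naming scheme and column names are assumptions for illustration only.
def cleantweets_sketch(raw_dir='/tmp/tweets'):
    import glob
    import pandas as pd
    for path in glob.glob('{}/*_raw.csv'.format(raw_dir)):
        frame = pd.read_csv(path)
        frame = frame[['created_at', 'user', 'text']]                   # cherry-pick columns
        frame = frame[~frame['text'].str.startswith('RT ', na=False)]   # drop retweets
        frame.to_csv(path.replace('_raw.csv', '_clean.csv'), index=False)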

# --------------------------------------------------------------------------------
# In this section you can use a script to analyze the twitter data. Could simply
# be a sentiment analysis through algorithms like bag of words or something more
# complicated. You can also take a look at Web Services to do such tasks
# --------------------------------------------------------------------------------

analyze_tweets = PythonOperator(
    task_id='analyze_tweets',
    python_callable=analyzetweets,
    dag=dag)

analyze_tweets.set_upstream(clean_tweets)
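
# Hypothetical sketch of an analyzetweets callable (not shown in this excerpt): a simple
# bag-of-words sentiment score counting positive minus negative words per tweet. The word
# lists and file layout are assumptions for illustration only.
def analyzetweets_sketch(clean_dir='/tmp/tweets'):
    import glob
    import pandas as pd
    positive = {'good', 'great', 'love', 'nice'}
    negative = {'bad', 'terrible', 'hate', 'awful'}
    for path in glob.glob('{}/*_clean.csv'.format(clean_dir)):
        frame = pd.read_csv(path)
        tokens = frame['text'].fillna('').str.lower().str.split()
        frame['sentiment'] = tokens.apply(
            lambda words: sum(w in positive for w in words) - sum(w in negative for w in words))
        frame.to_csv(path.replace('_clean.csv', '_scored.csv'), index=False)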

# --------------------------------------------------------------------------------
Example #23
    'retry_delay': timedelta(minutes=5),
}

dag = DAG("filter-vcf", default_args=default_args,
          schedule_interval=None, concurrency=20000, max_active_runs=20000)


start_analysis_run_task = PythonOperator(
    task_id="start_analysis_run",
    python_callable=start_analysis_run,
    provide_context=True,
    dag=dag)



filter_task = PythonOperator(
    task_id="filter_variants",
    python_callable=filter_variants,
    provide_context=True,
    dag=dag)

filter_task.set_upstream(start_analysis_run_task)

complete_analysis_run_task = PythonOperator(
    task_id="complete_analysis_run",
    python_callable=complete_analysis_run,
    provide_context=True,
    dag=dag)

complete_analysis_run_task.set_upstream(filter_task)
        }
    }, {
        '$out': tmp_created_collection_per_hour_name
    }]
    results = db.logs.aggregate(pipeline)
    print("Aggregated hour metrics")
    return 'Whatever you return gets printed in the logs'


run_this = PythonOperator(task_id='connect_to_mongodb_and_aggregate_day',
                          provide_context=True,
                          python_callable=connect_to_mongodb_and_aggregate_day,
                          dag=dag)

run_this_also = PythonOperator(
    task_id='connect_to_mongodb_and_aggregate_hour',
    provide_context=True,
    python_callable=connect_to_mongodb_and_aggregate_hour,
    dag=dag)

run_this_also.set_upstream(run_this)

send_email_notification_flow_successful = EmailOperator(
    task_id='send_email_notification_flow_successful',
    to="*****@*****.**",
    subject='custom email from airflow',
    html_content="{{ params['foo'](execution_date) }}",
    params=params,
    dag=dag)

send_email_notification_flow_successful.set_upstream(run_this_also)
simple_search = PythonOperator(task_id='search_twitter',
                               provide_context=True,
                               python_callable=search_twitter,
                               dag=dag,
                               params={'query': '#python'})


move_tweets_to_sqlite = PythonOperator(task_id='csv_to_sqlite',
                                       provide_context=True,
                                       python_callable=csv_to_sqlite,
                                       dag=dag)


id_popular = PythonOperator(task_id='identify_popular_links',
                            provide_context=True,
                            python_callable=identify_popular_links,
                            dag=dag)


email_links = EmailOperator(task_id='email_best_links',
                            to='*****@*****.**',
                            subject='Latest popular links',
                            html_content='Check out the latest!!',
                            files=['{}/latest_links.txt'.format(RAW_TWEET_DIR)],
                            dag=dag)


simple_search.set_downstream(move_tweets_to_sqlite)
id_popular.set_upstream(move_tweets_to_sqlite)
email_links.set_upstream(id_popular)
                            html_content='Check out the latest!!',
                            files=['{}/latest_links.txt'.format(RAW_TWEET_DIR)],
                            dag=dag)


sub = SubDagOperator(subdag=subdag,
                     task_id='insert_and_id_pop',
                     trigger_rule='one_success',
                     dag=dag)


clear_latest = BashOperator(bash_command='rm -rf {}/latest_links.txt'.format(
    RAW_TWEET_DIR), task_id='clear_latest', dag=dag)


gen_search_terms.set_upstream(fill_search_terms)

for term in SEARCH_TERMS:
    term_without_punctuation = re.sub(r'\W+', '', term)
    simple_search = PythonOperator(
        task_id='search_{}_twitter'.format(term_without_punctuation),
        provide_context=True,
        python_callable=search_twitter,
        dag=dag,
        params={'query': term})
    simple_search.set_upstream(gen_search_terms)
    simple_search.set_downstream(sub)

sub.set_downstream(email_links)
email_links.set_downstream(clear_latest)
Example #27
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG("bcftools",
          default_args=default_args,
          schedule_interval=None,
          concurrency=20000,
          max_active_runs=20000)

start_analysis_run_task = PythonOperator(task_id="start_analysis_run",
                                         python_callable=start_analysis_run,
                                         provide_context=True,
                                         dag=dag)

bcftools_task = PythonOperator(task_id="bcftools",
                               python_callable=bcftools,
                               provide_context=True,
                               dag=dag)

bcftools_task.set_upstream(start_analysis_run_task)

complete_analysis_run_task = PythonOperator(
    task_id="complete_analysis_run",
    python_callable=complete_analysis_run,
    provide_context=True,
    dag=dag)

complete_analysis_run_task.set_upstream(bcftools_task)
Example #28
    task_id='setup_jobs',
    provide_context=True,
    python_callable=setup_jobs_fn,
    dag=dag)


def collect_results_fn(ds, **kwargs):
    pprint(kwargs)
    print(ds)


collect_results = PythonOperator(
    task_id='collect_results',
    provide_context=True,
    python_callable=collect_results_fn,
    dag=dag)


for i in range(10):
    '''
    Generating 10 sleeping tasks, sleeping from 0 to 0.9 seconds
    respectively
    '''
    task = PythonOperator(
        task_id='sleep_for_'+str(i),
        python_callable=my_sleeping_function,
        op_kwargs={'random_base': float(i)/10},
        dag=dag)
    task.set_upstream(setup_jobs)
    task.set_downstream(collect_results)
Example #29
  'email_on_retry': False
}


# Set concurrency and max_active_runs to 1, preventing more than one dag instance
# from being created.
dag = DAG(dag_name, default_args=task_args,
          concurrency=1,
          max_active_runs=1,
          schedule_interval=schedule_interval)


get_env = PythonOperator(
    task_id='get-config-from-s3',
    python_callable=ConfigGetter(),
    dag=dag)

set_variables = PythonOperator(
    task_id='set-variables',
    python_callable=BootStrapper(),
    dag=dag)

cleanup = BashOperator(
    task_id='cleanup',
    bash_command=rm_config,
    trigger_rule='all_done',
    dag=dag)


set_variables.set_upstream(get_env)
cleanup.set_upstream(set_variables)
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG("msisensor",
          default_args=default_args,
          schedule_interval=None,
          concurrency=10000,
          max_active_runs=2000)

start_analysis_run_task = PythonOperator(task_id="start_analysis_run",
                                         python_callable=start_analysis_run,
                                         provide_context=True,
                                         dag=dag)

msisensor_task = PythonOperator(task_id='msisensor',
                                python_callable=run_msisensor,
                                provide_context=True,
                                dag=dag)

msisensor_task.set_upstream(start_analysis_run_task)

complete_analysis_run_task = PythonOperator(
    task_id="complete_analysis_run",
    python_callable=complete_analysis_run,
    provide_context=True,
    dag=dag)

complete_analysis_run_task.set_upstream(msisensor_task)
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG("sanger_bwa", default_args=default_args,
          schedule_interval=None, concurrency=500, max_active_runs=500)


start_analysis_run_task = PythonOperator(
    task_id="start_analysis_run",
    python_callable=start_analysis_run,
    provide_context=True,
    dag=dag)

run_bwa_task = PythonOperator(
    task_id="run_bwa",
    python_callable=run_bwa,
    provide_context=True,
    dag=dag)

run_bwa_task.set_upstream(start_analysis_run_task)

complete_analysis_run_task = PythonOperator(
    task_id="complete_analysis_run",
    python_callable=complete_analysis_run,
    provide_context=True,
    dag=dag)

complete_analysis_run_task.set_upstream(run_bwa_task)
Example #32
    if table_exists:
        sqls = [drop_table, create_table, load_data]
        for i in sqls:
            redshift_call(i)
    else:
        sqls = [create_table, load_data]
        for i in sqls:
            redshift_call(i)

postgres_to_local_csv = PythonOperator(
    task_id='postgres_to_local_csv',
    provide_context=True,
    python_callable=get_orders_with_bellhops,
    dag=dag)

local_csv_to_s3 = PythonOperator(
    task_id='local_csv_to_s3',
    provide_context=True,
    python_callable=store_orders_with_bellhops,
    dag=dag)

s3_to_redshift = PythonOperator(
    task_id='s3_to_redshift',
    provide_context=True,
    python_callable=transfer_orders_with_bellhops,
    dag=dag) 

local_csv_to_s3.set_upstream(postgres_to_local_csv)
s3_to_redshift.set_upstream(local_csv_to_s3)
    
def print_hello_world():
    print('this_should_print_hello_world from python')


# Following are defaults which can be overridden later on
default_args = {
    'owner': 'Jackie G',
    'depends_on_past': False,
    'start_date': datetime(2016, 4, 15),
    'email': ['jackies-email'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

dag = DAG('Helloworld', default_args=default_args)

t1 = BashOperator(
    task_id='hello_from_bash',
    bash_command='echo "Task 1 says hello"',
    dag=dag)

t2 = PythonOperator(
    task_id='hello_from_python',
    python_callable=print_hello_world,
    dag=dag)

t2.set_upstream(t1)
Example #34
cluster_id="{{ ti.xcom_pull(key="emr_cluster_id", task_ids="clean_emr_id") }}"
echo $cluster_id
aws emr add-steps --cluster-id $cluster_id --steps Type=spark,Name=pyspark_job,\
Jar="command-runner.jar",\
Args=[\
--deploy-mode,client,\
s3://$bc/sparky.py\
],ActionOnFailure=TERMINATE_CLUSTER
'''
# .format(bucket_pyton, driver_cores, driver_memory, executor_memory, executor_cores)

start_emr = BashOperator(task_id='start_emr',
                         bash_command=start_emr,
                         provide_context=True,
                         xcom_push=True,
                         params={"bucket_log": bucket_log},
                         dag=dag)

clean_emr_id = PythonOperator(task_id='clean_emr_id',
                              python_callable=parse_emr_id,
                              provide_context=True,
                              dag=dag)

add_step = BashOperator(task_id='add_step',
                        bash_command=add_step,
                        provide_context=True,
                        dag=dag)

add_step.set_upstream(clean_emr_id)
clean_emr_id.set_upstream(start_emr)
t2 = PythonOperator(task_id='Twitter_Authorisation',
                    python_callable=maintts,
                    dag=dag)
t3 = PythonOperator(task_id='Fetching_Data',
                    python_callable=fetchsamples,
                    dag=dag)

t4 = PythonOperator(task_id='RSA_Key_256SHA', python_callable=rsakey, dag=dag)
t5 = PythonOperator(task_id='Directory_List', python_callable=dirlis, dag=dag)

t6 = PythonOperator(task_id='Face_Detection', python_callable=faceid, dag=dag)
t7 = PythonOperator(task_id='Video_Capture', python_callable=vidcap, dag=dag)

t8 = PythonOperator(task_id='Read_Validate_Json',
                    python_callable=read,
                    dag=dag)
t9 = PythonOperator(task_id='TOP_10_Places', python_callable=top_10, dag=dag)

tf = PythonOperator(task_id='End_Point', python_callable=end, dag=dag)

t2.set_upstream(t1)
t4.set_upstream(t1)
t3.set_upstream(t2)
t5.set_upstream(t4)
t8.set_upstream(t3)
t9.set_upstream(t8)
t6.set_upstream(t5)
t7.set_upstream(t6)
tf.set_upstream([t7, t9])
Example #36
dag = DAG("example", default_args=default_args, schedule_interval=timedelta(1))

# t1, t2 and t3 are examples of tasks created by instantiating operators
t1 = BashOperator(task_id="print_date", bash_command="date", dag=dag)

t2 = BashOperator(task_id="sleep", bash_command="sleep 5", retries=3, dag=dag)

templated_command = """
    {% for i in range(5) %}
        echo "{{ ds }}"
        echo "{{ macros.ds_add(ds, 7)}}"
        echo "{{ params.my_param }}"
    {% endfor %}
"""

t3 = BashOperator(
    task_id="templated",
    bash_command=templated_command,
    params={"my_param": "Parameter I passed in"},
    dag=dag,
)

t4 = PythonOperator(task_id="python_code",
                    python_callable=example_function,
                    dag=dag)

t2.set_upstream(t1)
t3.set_upstream(t1)
t4.set_upstream(t1)
Example #37
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG("pcawg_bwa",
          default_args=default_args,
          schedule_interval=None,
          concurrency=50,
          max_active_runs=50)

start_analysis_run_task = PythonOperator(task_id="start_analysis_run",
                                         python_callable=start_analysis_run,
                                         provide_context=True,
                                         dag=dag)

run_bwa_task = PythonOperator(task_id="run_bwa",
                              python_callable=run_bwa,
                              provide_context=True,
                              dag=dag)

run_bwa_task.set_upstream(start_analysis_run_task)

complete_analysis_run_task = PythonOperator(
    task_id="complete_analysis_run",
    python_callable=complete_analysis_run,
    provide_context=True,
    dag=dag)

complete_analysis_run_task.set_upstream(run_bwa_task)
def connect_to_mongodb_and_aggregate_hour(ds, **kwargs):
    db = MongoClient().test
    tmp_created_collection_per_hour_name = 'page_per_hour_hits_tmp'
    pipeline = [{
        '$project': {
            'page': '$PAGE',
            'time': {'y': {'$year': '$DATE'}, 'm': {'$month': '$DATE'},
                     'day': {'$dayOfMonth': '$DATE'}, 'h': {'$hour': '$DATE'}}
        }
    }, {
        '$group': {
            '_id': {'p': '$page', 'y': '$time.y', 'm': '$time.m',
                    'd': '$time.day', 'h': '$time.h'},
            'hourly': {'$sum': 1}
        }
    }, {
        '$out': tmp_created_collection_per_hour_name
    }]
    results = db.logs.aggregate(pipeline)
    print("Aggregated hour metrics")
    return 'Whatever you return gets printed in the logs'

run_this = PythonOperator(
    task_id='connect_to_mongodb_and_aggregate_day',
    provide_context=True,
    python_callable=connect_to_mongodb_and_aggregate_day,
    dag=dag)

run_this_also = PythonOperator(
    task_id='connect_to_mongodb_and_aggregate_hour',
    provide_context=True,
    python_callable=connect_to_mongodb_and_aggregate_hour,
    dag=dag)

run_this_also.set_upstream(run_this)

send_email_notification_flow_successful = EmailOperator(
    task_id='send_email_notification_flow_successful',
    to="*****@*****.**",
    subject='custom email from airflow',
    html_content="{{ params['foo'](execution_date) }}",
    params=params,
    dag=dag)

send_email_notification_flow_successful.set_upstream(run_this_also)
Example #39
""" Simple subdag example """
from airflow import DAG
from airflow.operators import PythonOperator
from twitter_airflow import csv_to_sqlite, identify_popular_links
from datetime import datetime, timedelta

default_args = {
    'owner': 'admin',
    'depends_on_past': False,
    'start_date': datetime(2016, 1, 1),
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

subdag = DAG('generate_twitter_dags.insert_and_id_pop',
             default_args=default_args)

move_tweets_to_sqlite = PythonOperator(task_id='csv_to_sqlite',
                                       provide_context=True,
                                       python_callable=csv_to_sqlite,
                                       dag=subdag)

id_popular = PythonOperator(task_id='identify_popular_links',
                            provide_context=True,
                            python_callable=identify_popular_links,
                            dag=subdag,
                            params={'write_mode': 'a'})

id_popular.set_upstream(move_tweets_to_sqlite)
Example #40

start_analysis_run_task = PythonOperator(
    task_id="start_analysis_run",
    python_callable=start_analysis_run,
    provide_context=True,
    dag=dag)


validate_sample_task = PythonOperator(
    task_id="validate_sample",
    python_callable=validate_sample,
    provide_context=True,
    dag=dag)

validate_sample_task.set_upstream(start_analysis_run_task)

delly_task = PythonOperator(
    task_id="delly_genotype",
    python_callable=run_delly,
    provide_context=True,
    dag=dag)

delly_task.set_upstream(validate_sample_task)

complete_analysis_run_task = PythonOperator(
    task_id="complete_analysis_run",
    python_callable=complete_analysis_run,
    provide_context=True,
    dag=dag)
def my_sleeping_function(random_base):
    '''This is a function that will run within the DAG execution'''
    time.sleep(random_base)


def connect_to_mongodb_and_print(ds, **kwargs):
    db = MongoClient().zips
    buildinfo = db.command("buildinfo")
    print(buildinfo)
    return 'Whatever you return gets printed in the logs'

run_this = PythonOperator(
    task_id='connect_to_mongodb_and_print',
    provide_context=True,
    python_callable=connect_to_mongodb_and_print,
    dag=dag)

for i in range(10):
    '''
    Generating 10 sleeping tasks, sleeping from 0 to 9 seconds
    respectively
    '''
    task = PythonOperator(
        task_id='sleep_for_'+str(i),
        python_callable=my_sleeping_function,
        op_kwargs={'random_base': i},
        dag=dag)

    task.set_upstream(run_this)
Example #42
                    if res is not None and len(res) > 0:
                        category_id = res[0]

                        sql = """
                            insert into alpha.notecard_categories (notecard_id, category_id)
                            values (%s,%s)
                        """

                        cur.execute(sql, (notecard_id, category_id))
                        conn.commit()

    # move file to process folder upon completion
    shutil.move(os.path.join(filepath, file), os.path.join(destination, file))

    return True

populate_task = PythonOperator(
    task_id='populate_csv',
    provide_context=True,
    depends_on_past=True,
    python_callable=populate_db,
    dag=dag)

# trigger = TriggerDagRunOperator(
#     task_id='trigger_dag_rerun',
#     trigger_dag_id=task_name,
#     dag=dag)

populate_task.set_upstream(sensor_task)
    '''This is a function that will run within the DAG execution'''
    time.sleep(random_base)


def connect_to_monary_and_print_aggregation(ds, **kwargs):
    m = Monary()
    pipeline = [{"$group": {"_id": "$state", "totPop": {"$sum": "$pop"}}}]
    states, population = m.aggregate("zips", "data", pipeline, ["_id", "totPop"], ["string:2", "int64"])
    strs = list(map(lambda x: x.decode("utf-8"), states))
    result = list("%s: %d" % (state, pop) for (state, pop) in zip(strs, population))
    print (result)
    return 'Whatever you return gets printed in the logs'

run_this = PythonOperator(
    task_id='connect_to_monary_and_print_aggregation',
    provide_context=True,
    python_callable=connect_to_monary_and_print_aggregation,
    dag=dag)

for i in range(10):
    '''
    Generating 10 sleeping tasks, sleeping from 0 to 9 seconds
    respectively
    '''
    task = PythonOperator(
        task_id='sleep_for_'+str(i),
        python_callable=my_sleeping_function,
        op_kwargs={'random_base': i},
        dag=dag)
    task.set_upstream(run_this)
Example #44
          default_args=default_args,
          schedule_interval=None,
          concurrency=50,
          max_active_runs=50)

start_analysis_run_task = PythonOperator(task_id="start_analysis_run",
                                         python_callable=start_analysis_run,
                                         provide_context=True,
                                         dag=dag)

metadata_task = PythonOperator(task_id="prepare_metadata",
                               python_callable=prepare_metadata,
                               provide_context=True,
                               dag=dag)

metadata_task.set_upstream(start_analysis_run_task)

cgsubmit_task = PythonOperator(task_id="submit_metadata",
                               python_callable=submit_metadata,
                               provide_context=True,
                               dag=dag)

cgsubmit_task.set_upstream(metadata_task)

gtupload_task = PythonOperator(task_id="upload_sample",
                               python_callable=upload_sample,
                               provide_context=True,
                               dag=dag)

gtupload_task.set_upstream(cgsubmit_task)
Example #45
                                 'out_dirpath':
                                 './openeo_job/result/'
                             }, {
                                 'name': 'save_raster'
                             }, {
                                 'name': 'get_cube_metadata'
                             }, {
                                 'name':
                                 'to_pickle',
                                 'filepath':
                                 './openeo_job/result/save_13.dc;str'
                             }]
                         },
                         queue='process')

nir_2.set_upstream([dc_0])

red_3.set_upstream([dc_0])

blue_4.set_upstream([dc_0])

sub_5.set_upstream([nir_2, red_3])

p1_6.set_upstream([red_3])

p2_7.set_upstream([blue_4])

sum_8.set_upstream([nir_2, p1_6, p2_7])

div_9.set_upstream([sub_5, sum_8])
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG("sanger_variant_calling", default_args=default_args,
          schedule_interval=None, concurrency=500, max_active_runs=500)


start_analysis_run_task = PythonOperator(
    task_id="start_analysis_run",
    python_callable=start_analysis_run,
    provide_context=True,
    dag=dag)

run_sanger_callers_task = PythonOperator(
    task_id="run_sanger_callers",
    python_callable=run_sanger_callers,
    provide_context=True,
    dag=dag)

run_sanger_callers_task.set_upstream(start_analysis_run_task)

complete_analysis_run_task = PythonOperator(
    task_id="complete_analysis_run",
    python_callable=complete_analysis_run,
    provide_context=True,
    dag=dag)

complete_analysis_run_task.set_upstream(run_sanger_callers_task)