    catchup=False
)

start_task = DummyOperator(
    task_id='start_task',
    dag=main_dag
)

# Task to fetch the CDC min dates and date-field references and download them to the local instance
cdc_times_to_local_instance = PythonOperator(
    task_id='cdc_times_to_local_instance',
    python_callable=functs.cdc_times_to_local_instance,
    op_kwargs={
        'cdc_times_and_dates_folder_local_location': cdc_times_and_dates_folder_local_location,
        'snowflake_username': snowflake_username,
        'snowflake_password': snowflake_password,
        'snowflake_account': snowflake_account,
        'snowflake_database': snowflake_database,
        'snowflake_stage_schema': snowflake_stage_schema
    },
    dag=main_dag
)


# Creates the tasks dynamically. Each one will process one chunk of data.
def create_dynamic_task_tos3(database_name, table):
    return PythonOperator(
        task_id='upload_to_S3_task_' + database_name + '_' + table,
        python_callable=functs.upload_table_to_S3_with_hook_v2,
        pool='massive_pool',
        op_kwargs={
            'Source_System_Name': Source_System_Name,
# DAG-level settings.
with DAG(dag_id=dag_name,
         schedule_interval='@daily',
         start_date=datetime.strptime('2020-04-07 00:00:00', "%Y-%m-%d %H:%M:%S"),
         max_active_runs=1,
         concurrency=1,
         default_args={
             'project_id': 'silicon-parity-282607',
             'email': '*****@*****.**',
             'email_on_failure': True,
             'email_on_retry': False
         }) as dag:

    push_cluster_name = PythonOperator(dag=dag,
                                       task_id="push-cluster-name",
                                       provide_context=True,
                                       python_callable=push_cluster_name)

    # The task of creating a cluster.
    dataproc_create_cluster = DataprocClusterCreateOperator(
        task_id='dataproc-create-cluster',
        project_id='silicon-parity-282607',
        region='us-central1',
        master_machine_type='n1-standard-2',
        worker_machine_type='n1-standard-2',
        cluster_name='{{ ti.xcom_pull(key="cluster_name", task_ids="push-cluster-name") }}',
        num_workers=2)

    # The task of running the Spark job.
    dataproc_spark_process = DataProcSparkOperator(
""" Checks whether Zip is installed. :return: True if it is installed, False if not. :rtype: bool """ return_code = os.system("zip") if return_code != 0: raise SystemError("The zip binary is missing") def hello(): print("Hello") # You don't have to use any special KubernetesExecutor configuration if you don't want to start_task = PythonOperator( task_id="start_task", python_callable=hello, executor_config={"KubernetesExecutor": {"image": "apache/airflow:master-ci" }} ) # But you can if you want to one_task = PythonOperator( task_id="one_task", python_callable=hello, executor_config={"KubernetesExecutor": {"image": "apache/airflow:master-ci" }} ) # Use the zip binary, which is only found in this special docker image # two_task = PythonOperator( # task_id="two_task", # python_callable=use_zip_binary, # executor_config={"KubernetesExecutor": {"image": "airflow/ci_zip:latest"}}
    'start_date': datetime(2019, 1, 1),
    'retry_delay': timedelta(minutes=5)
}

# A context manager would allow you not to duplicate the dag parameter in each
# operator; here the dag is passed explicitly instead (see the sketch below).
dag = DAG('stock_historic_etl_dag',
          default_args=default_args,
          schedule_interval='@once')

start_task = DummyOperator(task_id='dummy_start', dag=dag)

upload_historic_news_to_S3_task = PythonOperator(
    task_id='upload_historic_news_to_S3',
    python_callable=upload_file_to_S3_with_hook,
    op_kwargs={
        'filename': '/root/airflow/dags/download/stocknews.zip',
        'key': 'raw-historic-data/stocknews.zip',
        'bucket_name': 'stock.etl',
    },
    dag=dag)

upload_historic_pricing_to_S3_task = PythonOperator(
    task_id='upload_historic_pricing_to_S3',
    python_callable=upload_file_to_S3_with_hook,
    op_kwargs={
        'filename': '/root/airflow/dags/download/price-volume-data-for-all-us-stocks-etfs.zip',
        'key': 'raw-historic-data/stockpricing.zip',
        'bucket_name': 'stock.etl',
    },
    dag=dag)
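# A minimal sketch of the context-manager form mentioned above (illustrative
# only; it reuses the callable and paths already defined, and lets you drop
# the explicit dag=dag arguments):
# with DAG('stock_historic_etl_dag',
#          default_args=default_args,
#          schedule_interval='@once') as dag:
#     start_task = DummyOperator(task_id='dummy_start')
#     upload_historic_news_to_S3_task = PythonOperator(
#         task_id='upload_historic_news_to_S3',
#         python_callable=upload_file_to_S3_with_hook,
#         op_kwargs={
#             'filename': '/root/airflow/dags/download/stocknews.zip',
#             'key': 'raw-historic-data/stocknews.zip',
#             'bucket_name': 'stock.etl',
#         })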
def etl():
    insert_to_db_current_date()


# Following are defaults which can be overridden later on
default_args = {
    'owner': 'kaidokariste',
    'depends_on_past': False,
    'start_date': datetime(2021, 1, 10),
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

# Define the DAG so it runs every 5 minutes
dag = DAG(dag_id="home_pc_insert",
          default_args=default_args,
          schedule_interval="*/5 * * * *")

# Make sure `etl()` is called in the operator. Pass the correct kwargs.
task_recommendations = PythonOperator(
    task_id="test_insert_task",
    python_callable=etl,
    dag=dag
    # op_kwargs={"db_engines": db_engines},
)
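# A minimal sketch of how the commented-out op_kwargs would be wired up
# (assumption: db_engines is a dict of database engines defined elsewhere in
# this file, and insert_to_db_current_date accepts it):
# def etl(db_engines):
#     insert_to_db_current_date(db_engines)
#
# task_recommendations = PythonOperator(
#     task_id="test_insert_task",
#     python_callable=etl,
#     op_kwargs={"db_engines": db_engines},
#     dag=dag,
# )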
def process_utilization_kpi(parent_dag_name, child_dag_name, start_date,
                            schedule_interval, celery_queue, technology,
                            devices, attributes):
    site_names = devices.keys()
    machine_names = list(set([each_site.split('_')[0] for each_site in devices.keys()]))
    # logging.info("Site names by utilization subdag(%s): %s\n" % (technology, site_names))

    utilization_kpi_by_technology = DAG(
        dag_id="%s.%s" % (parent_dag_name, child_dag_name),
        schedule_interval=schedule_interval,
        start_date=start_date,
    )

    def evaluate_condition(rules, current_value):
        result = 'False'
        result_all = []
        for i in range(1, len(rules), 2):
            threshold_value = rules[i].get('value')  # threshold from the rules dict
            operator = rules[i].get('operator')      # operator name from the rules dict
            service_name = rules[i].get('name')
            symbol = operators.get(operator)         # comparison symbol from the operators dict
            if threshold_value != '' and current_value != '':
                try:
                    if eval("%s%s%s" % (float(current_value), str(symbol), float(threshold_value))):
                        result_all.append('True')
                    else:
                        result_all.append('False')
                except (NameError, SyntaxError, TypeError, ValueError):
                    # Fall back to a string comparison when the values are not numeric.
                    if eval('\'' + str(current_value) + '\'' + symbol + '\'' + str(threshold_value) + '\''):
                        result_all.append('True')
                    else:
                        result_all.append('False')
                except Exception:
                    logging.info("Unexpected exception while evaluating the rule")
                    if eval('\'' + str(current_value) + '\'' + symbol + '\'' + str(threshold_value) + '\''):
                        result_all.append('True')
                    else:
                        result_all.append('False')
            else:
                result_all.append('False')
            try:
                # Append the conjunction ('and'/'or') if the rule continues.
                if rules[i + 1] in ('AND', 'OR'):
                    result_all.append(rules[i + 1].lower())
            except IndexError:
                # No conjunction; the rule ended.
                continue
        # logging.info("The result of %s after compiling booleans ---> %s" % (str(service_name), str(result_all)))
        if len(result_all) == 1:
            result = eval(result_all[0])
        elif len(result_all) % 2 != 0:
            result = eval(" ".join(result_all))
        else:
            logging.info("Please check the syntax of the rules")
        return result

    def calculate_severity(service, cur, host_state="", ds=""):
        final_severity = []
        global rules
        if not (ds == "pl" and host_state == "down"):
            # TODO: currently using a loop to get the dict value; a hashed lookup
            # would work provided the total severities per device remain fixed.
            try:
                total_severities = rules.get(service)  # TODO: handle if service not found
                total_severities_len = len(total_severities)
            except TypeError:
                # The specified service has no rule specified in the rules variable.
                return 'unknown'
            # Severity 1 is checked first and has the top priority.
            for i in range(1, total_severities_len + 1):
                current_severity = ""
                sv_rules = total_severities.get("Severity" + str(i))
                if sv_rules[0]:
                    current_severity = sv_rules[0]
                else:
                    current_severity = 'unknown'
                    logging.warning("Please provide severity name for " + str(service))
                result = evaluate_condition(sv_rules, cur)
                if result:
                    # final_severity could later collect all severities and
                    # decide by priority.
                    return current_severity
                else:
                    continue
        elif host_state == "down" and ds == "pl":
            return host_state
        else:
            return "up"
        if ds == "pl" or ds == "rta":
            return 'up'
        else:
            return "ok"

    # Only required for the up and down severities of network devices.
    def age_since_last_state(host, service, state, memc_conn):
        prefix = 'util:'
        key = prefix + host + ':' + service
        out = memc_conn.get(str(key))
        set_key = True
        timestamp = datetime.now()
        now = (timestamp + timedelta(minutes=-(timestamp.minute % 5))).replace(second=0, microsecond=0)
        now = now.strftime('%s')
        age = now
        value = state + ',' + age
        if out:
            out = out.split(',')
            old_state = out[0]
            time_since = out[1]
            if old_state == state:
                set_key = False
                age = time_since
        if set_key:
            memc_conn.set(str(key), value)
        return int(age)

    def get_severity_values(service):
        global rules
        all_sev = rules.get(service)
        sev_values = []
        for i in range(1, len(all_sev) + 1):
            sev_values.append(all_sev.get("Severity" + str(i))[1].get("value"))
        return sev_values

    def extract_utilization_kpi(**kwargs):
        params = kwargs.get('params')
        technology = params.get('technology')
        devices = params.get('devices')
        memc_conn = params.get('memc_conn')
        redis_conn = params.get('redis_conn')
        site_name = params.get('site_name')
        machine_name = params.get('machine_name')
        attributes = params.get('attributes')
        slot_number = params.get('slot_number')
        service_name = attributes.get('service_name')
        utilization_type = attributes.get('utilization_type')
        utilization_key = attributes.get('utilization_key')
        sector_type = attributes.get('sector_type')
        utilization_kpi_capacity = attributes.get('capacity')

        extracted_data_key = str("extracted__utilization__%s__%s__%s__%s__%s" % (
            technology, machine_name, site_name, service_name, slot_number))
        extracted_data = []
        for each_device in devices:
            data_dict = dict()
            sector_id_suffix = "_%s_sec" % sector_type
            bw_suffix = "_%s_bw" % sector_type
            sector_id = memc_conn.get("".join([str(each_device.get('hostname')), str(sector_id_suffix)]))
            utilization = memc_conn.get(utilization_key % (each_device.get('hostname')))
            sector_bw = memc_conn.get("".join([str(each_device.get('hostname')), str(bw_suffix)]))
            if sector_bw and isinstance(sector_bw, basestring):
                sector_bw = literal_eval(sector_bw)
            if utilization and isinstance(utilization, basestring):
                utilization = literal_eval(utilization)
            data_dict['sector_id'] = sector_id
            data_dict['utilization'] = utilization
            data_dict['sector_bw'] = sector_bw
            # Carry the device identity along so the transform step can use it.
            data_dict['hostname'] = each_device.get('hostname')
            data_dict['ip_address'] = each_device.get('ip_address')
            extracted_data.append(data_dict)
        redis_conn.rpush(extracted_data_key, extracted_data)

    def transform_utilization_kpi(**kwargs):
        params = kwargs.get('params')
        technology = params.get('technology')
        devices = params.get('devices')
        memc_conn = params.get('memc_conn')
        redis_conn = params.get('redis_conn')
        site_name = params.get('site_name')
        machine_name = params.get('machine_name')
        attributes = params.get('attributes')
        slot_number = params.get('slot_number')
        service_name = attributes.get('service_name')
        utilization_type = attributes.get('utilization_type')
        utilization_key = attributes.get('utilization_key')
        sector_type = attributes.get('sector_type')
        utilization_kpi_capacity = attributes.get('capacity')

        severity_values = get_severity_values(service_name)
        critical_severity, warning_severity = severity_values

        transformed_data = []
        transformed_data_key = str("transformed__utilization__%s__%s__%s__%s__%s" % (
            technology, machine_name, site_name, service_name, slot_number))
        extracted_data_key = str("extracted__utilization__%s__%s__%s__%s__%s" % (
            technology, machine_name, site_name, service_name, slot_number))
        extracted_data = redis_conn.rget(extracted_data_key)

        for each_dict in extracted_data:
            utilization = each_dict.get('utilization')
            sector_bw = each_dict.get('sector_bw')
            sector_id = each_dict.get('sector_id')
            perf = ""
            utilization_kpi = ""
            data_dict = {}
            state_string = "unknown"
            try:
                if utilization:
                    if technology == "StarmaxIDU" and sector_bw:
                        if sector_bw <= 3:
                            utilization_kpi = (float(utilization) / int(utilization_kpi_capacity)) * 100
                        elif sector_bw > 3:
                            utilization_kpi = (float(utilization) / (2 * int(utilization_kpi_capacity))) * 100
                        else:
                            utilization_kpi = ''
                    if technology == "CanopyPM100AP":
                        utilization_kpi = (float(utilization) / int(utilization_kpi_capacity)) * 100
                if isinstance(utilization_kpi, (float, int)):
                    if technology == "StarmaxIDU":
                        if utilization_kpi > 100:
                            utilization_kpi = 100.00
                    state_string = calculate_severity(service_name, utilization_kpi)
                    perf += '%s_%s_util_kpi' % (sector_type, utilization_type) + "=%s;%s;%s;%s" % (
                        utilization_kpi, warning_severity, critical_severity, sector_id)
            except Exception as e:
                perf = ';%s;%s' % (warning_severity, critical_severity)
                logging.error('Exception: {0}\n'.format(e))
            age_of_state = age_since_last_state(each_dict.get('hostname'), service_name, state_string, memc_conn)
            data_dict['host_name'] = each_dict.get('hostname')
            data_dict['address'] = each_dict.get('ip_address')
            data_dict['site'] = site_name
            data_dict['perf_data'] = perf
            data_dict['last_state_change'] = age_of_state
            data_dict['state'] = state_string
            data_dict['last_chk'] = time.time()
            data_dict['service_description'] = service_name
            data_dict['age'] = age_of_state
            data_dict['refer'] = sector_id
            transformed_data.append(data_dict)
        redis_conn.rpush(transformed_data_key, transformed_data)

    def aggregate_utilization_kpi(**kwargs):
        params = kwargs.get('params')
        technology = params.get('technology')
        machine_name = params.get('machine_name')
        redis_conn = params.get('redis_conn')
        transformed_data_key_pattern = str("transformed__utilization__%s__%s*" % (technology, machine_name))
        aggregated_data_key = str("aggregated__utilization__%s__%s" % (technology, machine_name))
        aggregated_data = []
        transformed_data_keys = redis_conn.keys(transformed_data_key_pattern)
        for each_key in transformed_data_keys:
            transformed_data = redis_conn.rget(each_key)
            aggregated_data.extend(transformed_data)
        redis_conn.rpush(aggregated_data_key, aggregated_data)

    aggregate_utilization_kpi_dependency = dict()
    for each_machine_name in machine_names:
        aggregate_utilization_kpi_task = PythonOperator(
            task_id="Aggregate_utilization_kpi_of_%s" % each_machine_name,
            provide_context=True,
            python_callable=aggregate_utilization_kpi,
            params={
                'technology': technology,
                "machine_name": each_machine_name,
                "redis_conn": redis_cnx
            },
            dag=utilization_kpi_by_technology
        )
        aggregate_utilization_kpi_dependency[each_machine_name] = aggregate_utilization_kpi_task
        """
        insert_data_in_mysql_task = DummyOperator(
            task_id="Insert_into_mysql_of_%s" % each_machine_name,
            dag=utilization_kpi_by_technology
        )
        update_data_in_mysql_task = DummyOperator(
            task_id="Update_into_mysql_of_%s" % each_machine_name,
            dag=utilization_kpi_by_technology
        )
        update_data_in_mysql_task << aggregate_utilization_kpi_task
        insert_data_in_mysql_task << aggregate_utilization_kpi_task
        """

    for each_site_name in site_names:
        machine_name = each_site_name.split("_")[0]
        devices_by_site = devices.get(each_site_name)
        for each_attribute in attributes:
            slot_number = 1
            while devices_by_site:
                # Process the devices of each site in slots of 100.
                slot_of_devices = devices_by_site[:100]
                if slot_of_devices:
                    extract_utilization_kpi_task = PythonOperator(
                        task_id="Extract_of_%s_%s_Slot_%s" % (
                            each_site_name, each_attribute.get('service_name'), slot_number),
                        provide_context=True,
                        python_callable=extract_utilization_kpi,
                        params={
                            'technology': technology,
                            "site_name": each_site_name,
                            "machine_name": machine_name,
                            "devices": slot_of_devices,
                            "attributes": each_attribute,
                            "memc_conn": memc_cnx,
                            "redis_conn": redis_cnx,
                            "slot_number": slot_number
                        },
                        dag=utilization_kpi_by_technology
                    )
                    transform_utilization_kpi_task = PythonOperator(
                        task_id="Transform_of_%s_%s_Slot_%s" % (
                            each_site_name, each_attribute.get('service_name'), slot_number),
                        provide_context=True,
                        python_callable=transform_utilization_kpi,
                        params={
                            'technology': technology,
                            "site_name": each_site_name,
                            "machine_name": machine_name,
                            "devices": slot_of_devices,
                            "attributes": each_attribute,
                            "memc_conn": memc_cnx,
                            "redis_conn": redis_cnx,
                            "slot_number": slot_number
                        },
                        dag=utilization_kpi_by_technology
                    )
                    extract_utilization_kpi_task >> transform_utilization_kpi_task
                    aggregate_utilization_kpi_dependency[machine_name] << transform_utilization_kpi_task
                devices_by_site = devices_by_site[100:]
                slot_number += 1

    return utilization_kpi_by_technology
        names=[
            'yearID', 'franchID', 'teamID', 'W', 'L', 'percentage', 'franchName'
        ],
        encoding='utf-8')
    conn.insert_rows(table=table_name, rows=results.values.tolist())
    return table_name


dag = DAG('zylo_example',
          schedule_interval=timedelta(hours=1),
          start_date=datetime(2016, 10, 24),
          default_args=default_args)

t1 = PythonOperator(task_id='get_zip_file',
                    provide_context=True,
                    python_callable=get_zip,
                    dag=dag)

t2 = PythonOperator(task_id='get_top_teams',
                    provide_context=True,
                    python_callable=top_teams,
                    dag=dag)

t3 = PythonOperator(task_id='load_to_MySql',
                    provide_context=True,
                    python_callable=bulk_load_teams,
                    op_kwargs={'table_name': 'top_teams'},
                    dag=dag)

t2.set_upstream(t1)
t3.set_upstream(t2)
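# The same ordering can also be declared with Airflow's bitshift composition,
# equivalent to the two set_upstream calls above:
# t1 >> t2 >> t3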
"login": "******", "password": "******" }) new_var = models.Variable() new_var.key = "sql_path" new_var.set_val("/usr/local/airflow/sql") session.add(new_var) session.commit() new_pool = models.Pool() new_pool.pool = "postgres_dwh" new_pool.slots = 10 new_pool.description = "Allows max. 10 connections to the DWH" session.add(new_pool) session.commit() session.close() dag = airflow.DAG('init_docker_example', schedule_interval="@once", default_args=args, max_active_runs=1) t1 = PythonOperator(task_id='initialize_etl_example', python_callable=initialize_etl_example, provide_context=False, dag=dag)
DEFAULT_DATE = timezone.datetime(2019, 4, 17)
srcDir = os.getcwd() + '/dags/repo/examples/hello_2.11-1.0.jar'

args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
dag = DAG('test_dag_id', default_args=args)


def get_some_value(**kwargs):
    some_value = 10
    return some_value


task1 = PythonOperator(task_id='run_task_1',
                       python_callable=get_some_value,
                       provide_context=True,
                       dag=dag)

task2 = SparkSubmitOperator(
    task_id='run_sparkSubmit_job',
    conn_id='spark_default',
    java_class='hello',
    application=srcDir,
    name='airflow-spark-job',
    verbose=True,
    application_args=["{{ti.xcom_pull(task_ids='run_task_1')}}"],
    conf={'master': 'local'},
    dag=dag,
)

task1 >> task2
for countrycode, countryname in country_dict.items():
    for hslevelcode in hslevel_list:
        for currency in currency_list:
            task1 = SeleniumOperator(
                script=get_df,
                script_args=[data_folder, countrycode, hslevelcode, currency],
                task_id='Extract_Data_' + countrycode + '_' + hslevelcode + '_' + currency)

            task2 = PythonOperator(
                task_id='Transform_Data_' + countrycode + '_' + hslevelcode + '_' + currency,
                # op_kwargs={'countrycode': countrycode},
                python_callable=transform_data)

            task3 = PythonOperator(
                task_id='Load_Data_' + countrycode + '_' + hslevelcode + '_' + currency,
                op_kwargs={
                    'countryname': countryname,
                    'hslevel': hslevelcode,
                    'currency': currency,
                    'countrycode': countrycode
                },
                python_callable=load_data)

            # Set dependencies
            task1 >> task2 >> task3
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': min_10,
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('scrape_cdc', default_args=default_args)

run_this = PythonOperator(task_id='print_the_context',
                          provide_context=True,
                          python_callable=print_context,
                          dag=dag)

t1 = BashOperator(task_id='testairflow',
                  bash_command=f'python {file_path}',
                  dag=dag)

t1.set_downstream(run_this)

if __name__ == "__main__":
    dag.cli()
    dag=dag)

# Zip inspector and extractor task
zip_task = ZipInspector(task_id='zip_inspector',
                        extension_to_search='tiff',
                        get_inputs_from=download_task.task_id,
                        dag=dag)

warp_tasks = []
addo_tasks = []
upload_tasks = []
band_paths_tasks = []
for i in range(1, 3):
    band_paths = PythonOperator(
        task_id="get_band_paths_" + str(i),
        python_callable=prepare_band_paths,
        op_kwargs={'get_inputs_from': zip_task.task_id},
        dag=dag)
    band_paths_tasks.append(band_paths)

    warp = GDALWarpOperator(task_id='gdalwarp_' + str(i),
                            target_srs=TARGET_SRS,
                            tile_size=TILE_SIZE,
                            overwrite=OVERWRITE,
                            dstdir=S1GRD1SDV.process_dir,
                            get_inputs_from=band_paths.task_id,
                            dag=dag)
    warp_tasks.append(warp)

    addo = GDALAddoOperator(trigger_rule=TriggerRule.ALL_SUCCESS,
                            resampling_method=RESAMPLING_METHOD,
from datetime import datetime, timedelta
import os

from airflow import DAG
from airflow.operators import PythonOperator
from airflow.hooks.postgres_hook import PostgresHook


def create_table(**context):
    redshift_hook = PostgresHook('redshift')
    queries = open('/home/workspace/airflow/create_tables_capstone.sql', 'r').read()
    redshift_hook.run(queries)
    return


default_args = {'owner': 'Stan Taov'}

# Create DAG with previously provided default_args
dag = DAG('dag_tables_capstone',
          default_args=default_args,
          description='Create tables in Redshift',
          start_date=datetime.now(),
          schedule_interval='@once')

# This operator calls the create_table function to create the tables
create_tables_in_redshift = PythonOperator(task_id='Create_tables_in_redshift',
                                           dag=dag,
                                           provide_context=True,
                                           python_callable=create_table)
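# Note: a dynamic start_date like datetime.now() is generally discouraged in
# Airflow, since it changes on every DAG-file parse; a fixed date is the usual
# pattern. A minimal sketch of the alternative (the date itself is arbitrary):
# dag = DAG('dag_tables_capstone',
#           default_args=default_args,
#           description='Create tables in Redshift',
#           start_date=datetime(2021, 1, 1),  # fixed, parse-independent
#           schedule_interval='@once')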
    def execute(self, context):
        log.info('--------------------S1Metadata_PLUGIN running------------')
        task_instance = context['task_instance']
        log.info("Receiving from 'get_input_from':\n{}".format(self.get_inputs_from))

        download_task_id = self.get_inputs_from['download_task_id']
        addo_task_ids = self.get_inputs_from['addo_task_ids']
        upload_task_ids = self.get_inputs_from['upload_task_ids']
        archive_product_task_id = self.get_inputs_from['archive_product_task_id']

        downloaded = context['task_instance'].xcom_pull(task_ids=download_task_id,
                                                        key=XCOM_RETURN_KEY)

        local_granules_paths = []
        for tid in addo_task_ids:
            local_granules_path = context['task_instance'].xcom_pull(task_ids=tid,
                                                                     key=XCOM_RETURN_KEY)
            if local_granules_path:
                local_granules_paths += local_granules_path

        uploaded_granules_paths = context['task_instance'].xcom_pull(task_ids=upload_task_ids,
                                                                     key=XCOM_RETURN_KEY)
        original_package_path = context['task_instance'].xcom_pull(task_ids=archive_product_task_id,
                                                                   key=XCOM_RETURN_KEY)
        granules_dict, bbox = collect_granules_metadata(local_granules_paths,
                                                        self.granules_upload_dir,
                                                        self.bands_dict)

        if not downloaded:
            log.info("No products from download task, nothing to do.")
            return list()
        if not local_granules_paths:
            log.info("No local granules from processing, nothing to do.")
            return list()
        if not uploaded_granules_paths:
            log.info("No uploaded granules from upload task, nothing to do.")
            return list()
        if not original_package_path:
            log.info("No original package path from the original package upload task, nothing to do.")
            return list()

        safe_package_path = downloaded.keys()[0]
        safe_package_filename = os.path.basename(safe_package_path)
        product_id = downloaded[safe_package_path].get('title')
        originalPackageLocation = self.original_package_download_base_url + safe_package_filename
        processing_dir = os.path.join(self.processing_dir, product_id)
        if not os.path.exists(processing_dir):
            os.makedirs(processing_dir)

        log.info('safe_package_path: {}'.format(safe_package_path))
        log.info('local_granules_paths: {}'.format(local_granules_paths))

        s1reader = S1GDALReader(safe_package_path)
        product_metadata = s1reader.get_metadata()
        product_metadata['footprint'] = s1reader.get_footprint()
        log.info(pprint.pformat(product_metadata, indent=4))

        timeStart = product_metadata['ACQUISITION_START_TIME']
        timeEnd = product_metadata['ACQUISITION_STOP_TIME']

        owslinks_dict = create_owslinks_dict(
            product_identifier=product_id,
            timestart=timeStart,
            timeend=timeEnd,
            granule_bbox=bbox,
            gs_workspace=self.gs_workspace,
            gs_wms_layer=self.gs_wms_layer,
            gs_wms_width=self.gs_wms_width,
            gs_wms_height=self.gs_wms_height,
            gs_wms_format=self.gs_wms_format,
            gs_wms_version=self.gs_wms_version,
            gs_wfs_featuretype=self.gs_wfs_featuretype,
            gs_wfs_format=self.gs_wfs_format,
            gs_wfs_version=self.gs_wfs_version,
            gs_wcs_coverage_id=self.gs_wcs_coverage_id,
            gs_wcs_scale_i=self.gs_wcs_scale_i,
            gs_wcs_scale_j=self.gs_wcs_scale_j,
            gs_wcs_format=self.gs_wcs_format,
            gs_wcs_version=self.gs_wcs_version)

        # create thumbnail
        # TODO: create a proper thumbnail from the quicklook; also remove the temp file
        log.info("Creating thumbnail")
        thumbnail_path = os.path.join(processing_dir, "thumbnail.png")
        quicklook_path = s1reader.get_quicklook()
        log.info(pprint.pformat(quicklook_path))
        copyfile(quicklook_path, thumbnail_path)

        search_params_dict = create_search_dict(product_metadata, originalPackageLocation)
        log.info(pprint.pformat(search_params_dict))

        metadata_dict = create_metadata_dict(product_metadata)
        log.info(pprint.pformat(metadata_dict))

        description_dict = create_description_dict(product_metadata, originalPackageLocation)
        log.info(pprint.pformat(description_dict))

        # create description.html and dump it to file
        log.info("Creating description.html")
        html_description = create_product_description(description_dict)
        search_params_dict['htmlDescription'] = html_description

        # create metadata XML
        log.info("Creating metadata.xml")
        metadata_xml = create_product_metadata(metadata_dict)

        po = PythonOperator(task_id="s1_metadata_dictionary_creation",
                            python_callable=create_product_zip,
                            op_kwargs={
                                'processing_dir': processing_dir,
                                'search_params_dict': search_params_dict,
                                'description_html': html_description,
                                'metadata_xml': metadata_xml,
                                'granules_dict': granules_dict,
                                'owslinks_dict': owslinks_dict,
                                'thumbnail_path': thumbnail_path
                            })

        out = po.execute(context)
        zip_paths = list()
        if out:
            zip_paths.append(out)
        return zip_paths
    lines = text.split("\n")
    # Start from index 1 to skip the header, and drop empty strings.
    return tuple("('" + x.split(",")[0] + "','" + x.split(",")[1] + "')"
                 for x in lines[1:] if x != "")


def load(**kwargs):
    cur = get_Redshift_connection()
    lines = kwargs["ti"].xcom_pull(key="return_value", task_ids="perform_transform")
    sql = """BEGIN;
    TRUNCATE TABLE ysjune1051.name_gender;
    INSERT INTO ysjune1051.name_gender VALUES {lines};
    END;""".format(lines=",".join(lines))
    cur.execute(sql)


task_extract = PythonOperator(
    task_id="perform_extract",
    python_callable=extract,
    op_kwargs={"url": "https://s3-geospatial.s3-us-west-2.amazonaws.com/name_gender.csv"},
    provide_context=True,
    dag=dag_second_assignment
)

task_transform = PythonOperator(
    task_id="perform_transform",
    python_callable=transform,
    provide_context=True,
    dag=dag_second_assignment
)

task_load = PythonOperator(
    task_id="perform_load",
    python_callable=load,
    provide_context=True,
    dag=dag_second_assignment
)
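# For context, a minimal sketch of the `extract` callable referenced above
# (assumption: the real implementation lives elsewhere in this file); it
# fetches the CSV text whose lines `transform` then splits:
# import requests
#
# def extract(url, **kwargs):
#     return requests.get(url).text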
]

time_str = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

with open("/home/tmadmin/NITIN/VIPUL/logs/provis/comparision-provis-%s.csv" % (time_str), "wb") as out:
    wr = csv.writer(out)
    for datalist in all_data_sv:
        wr.writerow(datalist)

with open("/home/tmadmin/NITIN/VIPUL/summary-provis.csv", "ab") as out:
    wr2 = csv.writer(out)
    wr2.writerow(summary)

test_ul_issue_kpi = PythonOperator(task_id="test_ul_issue",
                                   provide_context=True,
                                   python_callable=test_ul_issue_kpi,
                                   dag=test_ul_issue_dag,
                                   queue=Q_PUBLIC)

test_provis_kpi = PythonOperator(task_id="test_provision_kpi",
                                 provide_context=True,
                                 python_callable=test_provis_kpi,
                                 dag=test_ul_issue_dag,
                                 queue=Q_PUBLIC)

ULISSUE = ExternalTaskSensor(
    external_dag_id="UL_ISSUE_KPI.StarmaxIDU",
    external_task_id="aggregate_ul_issue_bs_ospf1",
    task_id="sense_ospf1_ul_issue",
    poke_interval=20,
    trigger_rule='all_done',
    'retries': 1,
    'retry_delay': timedelta(minutes=15),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

# schedule_interval belongs on the DAG itself, not in default_args.
dag = DAG('racer-nightly-to-s3', default_args=default_args, schedule_interval='@daily')


def download_google_sheet(ds, **kwargs):
    '''This is a function that will run within the DAG execution'''
    print("running google sheet")
    df = pd.read_csv(
        'https://docs.google.com/spreadsheets/d/1vYg2lvDL9Q4ddzUgKVTvTyCUcF2EgnNsxNwffHo28lo/export?gid=0&format=csv'
    )
    execution_date = kwargs['execution_date'].date()
    df.to_csv('~/shelters-' + str(execution_date) + '.csv')
    return df


run_this = PythonOperator(task_id='download_google_sheet',
                          provide_context=True,
                          python_callable=download_google_sheet,
                          dag=dag)
    schedule_interval=None)


def my_sleeping_function(random_base):
    '''This is a function that will run within the DAG execution'''
    time.sleep(random_base)


def print_context(ds, **kwargs):
    pprint(kwargs)
    print(ds)
    return 'Whatever you return gets printed in the logs'


run_this = PythonOperator(task_id='print_the_context',
                          provide_context=True,
                          python_callable=print_context,
                          dag=dag)

for i in range(10):
    '''
    Generate 10 sleeping tasks, sleeping from 0.0 to 0.9 seconds respectively
    '''
    task = PythonOperator(task_id='sleep_for_' + str(i),
                          python_callable=my_sleeping_function,
                          op_kwargs={'random_base': float(i) / 10},
                          dag=dag)
    task.set_upstream(run_this)
from airflow.operators import DummyOperator, PythonOperator
from datetime import datetime, timedelta
import airflow.hooks.S3_hook
from helper import t1, t2, t3, t4
# from helper import upload_file_to_S3_with_hook

default_args = {
    'owner': 'airflow',
    'start_date': datetime(2020, 9, 24),
    'retry_delay': timedelta(seconds=5)
}

# Using the context manager allows you not to duplicate the dag parameter in each operator
with DAG('S3_dag_test', default_args=default_args, schedule_interval='@once') as dag:

    # The operator variables deliberately shadow the imported callables; the
    # right-hand side is evaluated before each reassignment.
    t1 = PythonOperator(task_id="data_collection", python_callable=t1, dag=dag)
    t2 = PythonOperator(task_id="download_from_s3", python_callable=t2, dag=dag)
    t3 = PythonOperator(task_id="transform", python_callable=t3, dag=dag)
    t4 = PythonOperator(task_id="upload_to_salesforce1", python_callable=t4, dag=dag)

    t1 >> t2 >> t3 >> t4
    # t4
'''
from airflow import DAG
from airflow.operators import BashOperator, PythonOperator
from datetime import datetime, timedelta

from load_stock_data import load_koersen_from_ASN

# Following are defaults which can be overridden later on
default_args = {
    'owner': 'mattooren',
    'depends_on_past': False,
    'start_date': datetime(2019, 12, 23),
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

# schedule_interval is a DAG argument, not a default_args key.
dag = DAG('ASN_Koersen', default_args=default_args, schedule_interval='0 18 * * *')

# t1, t2, t3 and t4 are examples of tasks created using operators
t5 = PythonOperator(task_id='koersen_loader',
                    provide_context=True,
                    python_callable=load_koersen_from_ASN,
                    dag=dag)
logging.error("Succeessfully Updated Age") else: logging.info("recieved f*****g None") except Exception: logging.error( "Unable to actually insert te updated data into redis") except Exception: logging.info("Unable to get latest refer") traceback.print_exc() initiate_dag = PythonOperator(task_id="Initiate", provide_context=False, python_callable=init_kpi, dag=ul_issue_dag, queue=Q_PUBLIC) for machine in machines: for site in machine.get('sites'): site_name = site.get('name') all_sites.append(site_name) #site = eval(site) #print site.get('name') for technology in technologies: CHILD_DAG_NAME = "%s" % (technology) #logging.info("For Tech %s"%(technology)) ul_subdag_task = SubDagOperator(subdag=process_ul_issue_kpi( PARENT_DAG_NAME, CHILD_DAG_NAME, datetime(2017, 2, 24),
    'product/product.json',
    'product/granules.json',
    'product/thumbnail.jpeg',
    'product/owsLinks.json'
]

product_zip_task = Sentinel2ProductZipOperator(
    task_id='create_product_zip_task',
    target_dir=S2MSIL1C.download_dir,
    generated_files=generated_files_list,
    placeholders=placeholders_list,
    get_inputs_from=download_task.task_id,
    dag=dag)

# curl -vvv -u evoadmin:\! -XPOST -H "Content-type: application/zip" --data-binary @/var/data/Sentinel-2/S2_MSI_L1C/download/S2A_MSIL1C_20170909T093031_N0205_R136_T36VUQ_20170909T093032/product.zip "http://ows-oda.eoc.dlr.de/geoserver/rest/oseo/collections/SENTINEL2/products"
publish_task = PythonOperator(task_id="publish_product_task",
                              python_callable=publish_product,
                              op_kwargs={
                                  'geoserver_username': CFG.geoserver_username,
                                  'geoserver_password': CFG.geoserver_password,
                                  'geoserver_rest_endpoint': '{}/oseo/collections/{}/products'.format(
                                      CFG.geoserver_rest_url, S2MSIL1C.geoserver_oseo_collection),
                                  'get_inputs_from': product_zip_task.task_id,
                              },
                              dag=dag)

search_task >> download_task >> archive_task >> thumbnail_task >> metadata_task >> archive_wldprj_task >> product_zip_task >> publish_task
def load(lines):
    logging.info("load started")
    cur = get_Redshift_connection()
    sql = "BEGIN;DELETE FROM raw_data.name_gender;"
    for l in lines:
        if l != '':
            (name, gender) = l.split(",")
            sql += "INSERT INTO raw_data.name_gender VALUES ('{name}', '{gender}');".format(
                name=name, gender=gender)
    sql += "END;"
    cur.execute(sql)
    logging.info(sql)
    logging.info("load done")


def etl():
    link = "https://s3-geospatial.s3-us-west-2.amazonaws.com/name_gender.csv"
    data = extract(link)
    lines = transform(data)
    load(lines)


dag_second_assignment = DAG(
    dag_id='second_assignment',
    start_date=datetime(2020, 8, 10),  # the DAG will not run if this date is in the future
    schedule_interval='0 2 * * *')  # adjust as needed

task = PythonOperator(task_id='perform_etl',
                      python_callable=etl,
                      dag=dag_second_assignment)
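# String-formatted SQL is injection-prone and breaks on values containing
# quotes. A minimal sketch of the safer parameterized form (assumes the
# cursor follows the DB-API paramstyle used by psycopg2):
# rows = [tuple(l.split(",")) for l in lines if l != '']
# cur.execute("BEGIN;")
# cur.executemany("INSERT INTO raw_data.name_gender VALUES (%s, %s);", rows)
# cur.execute("END;")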
from airflow import DAG
from airflow.operators import PythonOperator, DummyOperator
from datetime import timedelta, datetime

from get_sensor_data import get_sensor_data

default_args = {
    'owner': 'ubuntu',
    'depends_on_past': False,
    'start_date': datetime(2020, 9, 28),
    'retries': 3,
    'retry_delay': timedelta(seconds=30)
}

with DAG(dag_id='purpleair_sensors',
         description="Request PurpleAir sensor data",
         default_args=default_args,
         schedule_interval=timedelta(minutes=5),
         catchup=False) as dag:

    t1 = DummyOperator(task_id='dummy_task')
    t2 = PythonOperator(task_id='purpleair_api',
                        python_callable=get_sensor_data)

    t1 >> t2
    :param List[Dict[str, object]] test_scenarios: [Required] Test scenarios to encode
    :return: str
    """
    json_encoded = json.dumps(test_scenarios)
    utf_encoded = json_encoded.encode("utf-8")
    b64_encoded = base64.b64encode(utf_encoded).decode("utf-8")
    return b64_encoded


with models.DAG(
    "burnham",
    schedule_interval="@daily",
    default_args=DEFAULT_ARGS,
) as dag:

    # Generate a UUID for this test run
    generate_burnham_test_run_uuid = PythonOperator(
        task_id="generate_burnham_test_run_uuid",
        python_callable=lambda: str(uuid.uuid4()),
    )

    burnham_test_run = '{{ task_instance.xcom_pull("generate_burnham_test_run_uuid") }}'

    # This Airflow macro is added to sensors to filter out rows by submission_timestamp
    start_timestamp = "{{ dag_run.start_date.isoformat() }}"

    # Create burnham clients that complete missions and submit pings
    client1 = burnham_run(
        task_id="client1",
        burnham_test_run=burnham_test_run,
        burnham_test_name=DEFAULT_TEST_NAME,
        burnham_missions=["MISSION G: FIVE WARPS, FOUR JUMPS", "MISSION C: ONE JUMP"],
        burnham_spore_drive="tardigrade",
        owner=DAG_OWNER,
        email=DAG_EMAIL,
DAG_NAME = 'External_Task_Sensor_Test'

args = {'owner': 'airflow', 'start_date': airflow.utils.dates.days_ago(10)}

dag = DAG(
    dag_id=DAG_NAME,
    catchup=False,
    default_args=args,
    schedule_interval='25 10 * * *',
)


def dummy_call(**kwargs):
    return "Nothing to do.."


start_task = PythonOperator(task_id='start_task',
                            python_callable=dummy_call,
                            provide_context=True,
                            dag=dag)

external_task_dependent = ExternalTaskSensor(task_id='dependent_on_dag_2_task',
                                             external_dag_id='XCOM_TEST',
                                             external_task_id='run_file_2',
                                             dag=dag)

end_task = PythonOperator(task_id='end_task',
                          python_callable=dummy_call,
                          provide_context=True,
                          dag=dag)

start_task >> external_task_dependent >> end_task
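# Caveat worth noting: by default ExternalTaskSensor looks for a run of
# XCOM_TEST with the same execution_date as this DAG. If the two DAGs are on
# different schedules (an assumption here, since XCOM_TEST's schedule is not
# shown), the offset must be supplied, e.g. via execution_delta:
# external_task_dependent = ExternalTaskSensor(task_id='dependent_on_dag_2_task',
#                                              external_dag_id='XCOM_TEST',
#                                              external_task_id='run_file_2',
#                                              execution_delta=timedelta(minutes=30),
#                                              dag=dag)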
from application.scheduled.prune import remove_stale_artifacts

from airflow import DAG
from airflow.operators import PythonOperator
from datetime import datetime, timedelta

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    'start_date': datetime(2017, 7, 1)
}

dag = DAG('remove_stale_artifacts', default_args=default_args)

remove_stale_artifacts_task = PythonOperator(
    task_id='remove_stale_artifacts_task',
    provide_context=False,
    python_callable=remove_stale_artifacts,
    dag=dag)
args.append("--junit_path=" + junit_path) args.append("--project=" + GCB_PROJECT) # We want subprocess output to bypass logging module otherwise multiline # output is squashed together. util.run(args, use_print=True, dryrun=dryrun) return run_py_checks def done(**_kwargs): logging.info("Executing done step.") clone_op = PythonOperator(task_id='clone_repo', provide_context=True, python_callable=clone_repo, dag=dag) build_op = PythonOperator(task_id='build_images', provide_context=True, python_callable=build_images, dag=dag) build_op.set_upstream(clone_op) py_lint_op = PythonOperator(task_id='pylint', provide_context=True, python_callable=py_checks_gen("lint"), dag=dag) py_lint_op.set_upstream(clone_op) py_test_op = PythonOperator(task_id='pytest',
    f2 = df['spent'].values
    X = np.array(list(zip(f1, f2)))
    kmeans = KMeans(n_clusters=5).fit(X)
    log.info(kmeans.cluster_centers_)


dag = DAG('my_sql_test_dag',
          description='SQL tutorial DAG',
          schedule_interval='0 12 * * *',
          start_date=datetime(2018, 3, 20),
          catchup=False)

simple_select_mysql_task = PythonOperator(task_id='simple_select_mysql',
                                          provide_context=True,
                                          python_callable=simple_select_mysql,
                                          dag=dag)

simple_elbow_data_task = PythonOperator(task_id='simple_elbow_data_task',
                                        provide_context=True,
                                        python_callable=simple_elbow_data,
                                        dag=dag)

simple_kmeans_data_task = PythonOperator(task_id='simple_kmeans_data_task',
                                         provide_context=True,
                                         python_callable=simple_kmeans_data,
                                         dag=dag)

simple_select_mysql_task >> simple_elbow_data_task
simple_select_mysql_task >> simple_kmeans_data_task
    # Delete current rows in the target table
    snql_hook.run("""DELETE FROM dim_sneakers""")

    # Insert new rows into the target table
    snql_hook.insert_rows('dim_sneakers', staging_results)


def delete_from_staging():
    staging_hook = PostgresHook('snql_staging')
    staging_hook.run('DELETE FROM dim_sneakers;')


create_staging_table = PythonOperator(task_id='create_staging_table',
                                      python_callable=create_staging,
                                      email_on_failure=True,
                                      email='*****@*****.**',
                                      dag=dag)

pull_and_insert_to_staging = SQLTemplatedPythonOperator(
    task_id='pull_and_insert_to_staging',
    templates_dict={'query': 'dim_sneakers/extract.sql'},
    python_callable=pull_and_insert,
    params={'ds': datetime.utcnow()},
    email_on_failure=True,
    email='*****@*****.**',
    provide_context=True,
    dag=dag)

create_target_table = PythonOperator(task_id='create_target_table',
                                     python_callable=create_target,