Example #1
    ,catchup = False
)

start_task = DummyOperator(
        task_id='start_task',
        dag=main_dag
)

# Defining task to fetch cdc min dates and datefield references and download them to local instance
cdc_times_to_local_instance = PythonOperator(
    task_id = 'cdc_times_to_local_instance',
    python_callable = functs.cdc_times_to_local_instance,
    op_kwargs={
    'cdc_times_and_dates_folder_local_location': cdc_times_and_dates_folder_local_location,
    'snowflake_username': snowflake_username,
    'snowflake_password': snowflake_password,
    'snowflake_account': snowflake_account,
    'snowflake_database': snowflake_database,
    'snowflake_stage_schema': snowflake_stage_schema
    },
    dag = main_dag
)
        
# Creates the upload tasks dynamically. Each one processes one chunk of data (see the wiring sketch after this snippet).
def create_dynamic_task_tos3(database_name, table):
    return PythonOperator(
        task_id='upload_to_S3_task_' + database_name + '_' + table,
        python_callable=functs.upload_table_to_S3_with_hook_v2,
        pool = 'massive_pool',
        op_kwargs={		   
        'Source_System_Name': Source_System_Name,
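
The snippet above is cut off inside op_kwargs, but a factory like create_dynamic_task_tos3 is typically wired up by looping over the tables and chaining each generated task between fixed start/end markers. A minimal sketch, assuming a hypothetical tables_by_database mapping and an end_task DummyOperator (neither appears in the original):

end_task = DummyOperator(task_id='end_task', dag=main_dag)

# Hypothetical mapping of database name -> list of tables to export.
tables_by_database = {'sales_db': ['orders', 'customers']}

for database_name, tables in tables_by_database.items():
    for table in tables:
        upload_task = create_dynamic_task_tos3(database_name, table)
        # Each chunk runs after the CDC metadata download and before the end marker.
        cdc_times_to_local_instance >> upload_task >> end_task

start_task >> cdc_times_to_local_instance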
Example #2
# DAG-level settings.
with DAG(dag_id=dag_name,
         schedule_interval='@daily',
         start_date=datetime.strptime('2020-04-07 00:00:00',
                                      "%Y-%m-%d %H:%M:%S"),
         max_active_runs=1,
         concurrency=1,
         default_args={
             'project_id': 'silicon-parity-282607',
             'email': '*****@*****.**',
             'email_on_failure': True,
             'email_on_retry': False
         }) as dag:

    push_cluster_name = PythonOperator(dag=dag,
                                       task_id="push-cluster-name",
                                       provide_context=True,
                                       python_callable=push_cluster_name)

    # The task of creating a cluster.
    dataproc_create_cluster = DataprocClusterCreateOperator(
        task_id='dataproc-create-cluster',
        project_id='silicon-parity-282607',
        region='us-central1',
        master_machine_type='n1-standard-2',
        worker_machine_type='n1-standard-2',
        cluster_name=
        '{{ ti.xcom_pull(key="cluster_name", task_ids="push-cluster-name") }}',
        num_workers=2)

    # The task of running the Spark job.
    dataproc_spark_process = DataProcSparkOperator(
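
The snippet stops inside the Spark task. One common way to finish this pattern, sketched here under the assumption that the airflow.contrib Dataproc operators are in use and that the cluster should be torn down afterwards, is to pull the same templated cluster name into a delete step and chain everything inside the same with block:

    # Assumed tear-down step (not in the original snippet).
    dataproc_delete_cluster = DataprocClusterDeleteOperator(
        task_id='dataproc-delete-cluster',
        project_id='silicon-parity-282607',
        region='us-central1',
        cluster_name=
        '{{ ti.xcom_pull(key="cluster_name", task_ids="push-cluster-name") }}',
        trigger_rule='all_done')  # delete the cluster even if the Spark job failed

    push_cluster_name >> dataproc_create_cluster >> dataproc_spark_process >> dataproc_delete_cluster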
Example #3
        """
        Checks whether Zip is installed.
        :return: True if it is installed, False if not.
        :rtype: bool
        """
        return_code = os.system("zip")
        if return_code != 0:
            raise SystemError("The zip binary is missing")
    
    def hello():
        print("Hello")

    # You don't have to use any special KubernetesExecutor configuration if you don't want to
    start_task = PythonOperator(
        task_id="start_task",
        python_callable=hello
    )

    # But you can if you want to
    one_task = PythonOperator(
        task_id="one_task",
        python_callable=hello,
        executor_config={"KubernetesExecutor": {"image": "apache/airflow:master-ci" }}
    )

    # Use the zip binary, which is only found in this special docker image
    # two_task = PythonOperator(
    #     task_id="two_task",
    #     python_callable=use_zip_binary,
    #     executor_config={"KubernetesExecutor": {"image": "airflow/ci_zip:latest"}}
Example #4
    'start_date': datetime(2019, 1, 1),
    'retry_delay': timedelta(minutes=5)
}

# A `with DAG(...)` context manager would avoid repeating the dag parameter in each operator; this example passes dag= explicitly instead.
dag = DAG('stock_historic_etl_dag',
          default_args=default_args,
          schedule_interval='@once')

start_task = DummyOperator(task_id='dummy_start', dag=dag)

upload_historic_news_to_S3_task = PythonOperator(
    task_id='upload_historic_news_to_S3',
    python_callable=upload_file_to_S3_with_hook,
    op_kwargs={
        'filename': '/root/airflow/dags/download/stocknews.zip',
        'key': 'raw-historic-data/stocknews.zip',
        'bucket_name': 'stock.etl',
    },
    dag=dag)

upload_historic_pricing_to_S3_task = PythonOperator(
    task_id='upload_historic_pricing_to_S3',
    python_callable=upload_file_to_S3_with_hook,
    op_kwargs={
        'filename':
        '/root/airflow/dags/download/price-volume-data-for-all-us-stocks-etfs.zip',
        'key': 'raw-historic-data/stockpricing.zip',
        'bucket_name': 'stock.etl',
    },
    dag=dag)
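
upload_file_to_S3_with_hook is referenced above but not shown. A minimal sketch of such a helper, assuming an Airflow 1.10-style S3Hook and an 'aws_default' connection (the helper name and arguments come from the op_kwargs above; the body is an assumption):

from airflow.hooks.S3_hook import S3Hook

def upload_file_to_S3_with_hook(filename, key, bucket_name):
    # Uses the S3 connection configured in Airflow (assumed conn_id 'aws_default').
    hook = S3Hook(aws_conn_id='aws_default')
    hook.load_file(filename=filename, key=key, bucket_name=bucket_name, replace=True)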
Example #5
def etl():
    insert_to_db_current_date()

# Following are defaults which can be overridden later on
default_args = {
    'owner': 'kaidokariste',
    'depends_on_past': False,
    'start_date': datetime(2021, 1, 10),
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

# Define the DAG so it runs every 5 minutes
dag = DAG(dag_id="home_pc_insert",
          default_args=default_args,
          schedule_interval="*/5 * * * *")

# Make sure `etl()` is called in the operator; pass any required kwargs via op_kwargs (see the sketch after this task).
task_recommendations = PythonOperator(
    task_id="test_insert_task",
    python_callable=etl,
    dag=dag
    #op_kwargs={"db_engines": db_engines},
)
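
If etl() did take arguments (for example the db_engines hinted at in the commented-out line), op_kwargs is how they would reach the callable. A sketch only; etl_with_engine and the placeholder db_engines value are hypothetical:

# Placeholder; the original presumably builds real engine objects elsewhere.
db_engines = {"default": None}

def etl_with_engine(db_engines):
    insert_to_db_current_date()

task_recommendations_with_kwargs = PythonOperator(
    task_id="test_insert_task_with_kwargs",
    python_callable=etl_with_engine,
    op_kwargs={"db_engines": db_engines},  # forwarded as keyword arguments to etl_with_engine
    dag=dag,
)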



Example #6
def process_utilization_kpi( 	parent_dag_name, 
				child_dag_name, 
				start_date, 
				schedule_interval,
				celery_queue, 
				technology, 
				devices, 
				attributes
				):

    	site_names = devices.keys()
    	machine_names = list(set([each_site.split('_')[0] for each_site in devices.keys()]))

    	#logging.info("Site names by utilization subdag(%s): %s\n"%(technology,site_names))

    	utilization_kpi_by_technology = DAG(
    		dag_id="%s.%s" % (parent_dag_name, child_dag_name),
    		schedule_interval=schedule_interval,
    		start_date=start_date,
  		)

	def evaluate_condition(rules,current_value):
		result =  'False'
		result_all = []
		
		for i in range(1,len(rules),2):
			threshold_value = rules[i].get('value') #get threshold from rules dict
			operator = rules[i].get('operator') #get operator from rules
			service_name = rules[i].get('name')
			symbol = operators.get(operator) #get symbol from dict
			if threshold_value != '' and current_value != '':
				#logging.info("\n Evaluating ")
				
				#logging.info("Evaluating the Formula ---> %s%s%s of %s as %s"%(str(current_value),str(symbol),str(threshold_value) , str(service_name) ,eval("%s%s%s"%(float(current_value),str(symbol),float(threshold_value)))))
				try:
					if eval("%s%s%s"%(float(current_value),str(symbol),float(threshold_value))):
						result_all.append('True')
					else:
						result_all.append('False')
				except (NameError, SyntaxError, TypeError, ValueError):
					
					if eval('\''+str(current_value)+'\''+symbol+'\''+str(threshold_value)+'\''):
						result_all.append('True')
					else:
						result_all.append('False')
				except Exception:
					logging.info("Some WTF Exception")
					if eval('\''+str(current_value)+'\''+symbol+'\''+str(threshold_value)+'\''):
						result_all.append('True')
					else:
						result_all.append('False')
			else:
				result_all.append('False')

			try:
				#logging.info(rules)
				#logging.info("i="+str(i))
				if rules[i+1] in ('AND', 'OR'):
					result_all.append(rules[i+1].lower())
				
			except IndexError:
					#logging.info('No Conjugator or the rule ended')
					continue
		#logging.info("The Result of %s After compiling booleans ---> %s"%(str(service_name),str(result_all)))
		if len(result_all) == 1:
			result = eval(result_all[0])
		elif len(result_all) % 2 != 0:
			result = eval(" ".join(result_all))

		else:

			logging.info("Please Check the syntax of rules")

		#logging.info("returning ; %s"%str(result))
		return result
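
	# A hypothetical layout of the module-level `rules` and `operators` globals that
	# evaluate_condition / calculate_severity / get_severity_values appear to expect
	# (an illustrative assumption, not part of the original source):
	#
	#   operators = {"greater_equal": ">=", "less_equal": "<=", "equal": "=="}
	#   rules = {
	#       "sector_utilization": {
	#           "Severity1": ["critical", {"name": "sector_utilization", "operator": "greater_equal", "value": "90"}],
	#           "Severity2": ["warning",  {"name": "sector_utilization", "operator": "greater_equal", "value": "75"}],
	#       },
	#   }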

	def calculate_severity(service,cur,host_state="",ds=""):

		final_severity = []
		global rules 
		
		if not (ds == "pl" and host_state == "down"):
			#TODO: Currently using a loop to get the dict value; could use a hashed lookup provided the total severities per device remain fixed. Need to consult.
			try:
		
				total_severities = rules.get(service) #TODO: Handle if service not found
				total_severities_len = len(total_severities)
			#Severity 1 will be the first to checked and should be the top priority i.e if 
			except TypeError:
				#logging.info("The specified service "+service+" does not have a rule specified in rules variable")
				return 'unknown'
			
			for i in range(1,total_severities_len+1):
					current_severity = ""
					sv_rules = total_severities.get("Severity"+str(i))
					
					if sv_rules[0]:
						current_severity = sv_rules[0]
					else:
						current_severity = 'unknown'
						logging.warning("Please provide severity name for " + str(service))

					result = evaluate_condition(sv_rules,cur)
					if result:	
						return current_severity
						#final_severity =  final_severity.append(evaluate_condition(rules,cur)) #Later can be used to get all the SEV and then based upon priority decide Severity
						#logging.info("The Final Result for Service "+service+" is " + str(result) +" having Val "+ str(cur) +" and Severity : "+ str(current_severity))
						
					else:
						continue
		elif host_state=="down" and ds == "pl":
			return host_state
		else:
			return "up"

		if (ds == "pl" or ds == "rta"):
			return 'up'
		else:
			return "ok"
		#only required for UP and Down servereties of network devices

	def age_since_last_state(host, service, state, memc_conn):

		prefix = 'util:'
		key = prefix + host + ':' + service
		out = memc_conn.get(str(key))
		set_key = True
		timestamp = datetime.now() 
		now = (timestamp + timedelta(minutes=-(timestamp.minute % 5))).replace(second=0, microsecond=0)
		now = now.strftime('%s')
		age = now
		value = state + ',' + age
		if out:
			out = out.split(',')
			old_state = out[0]
			time_since = out[1]
			if old_state == state:
				set_key = False
				age = time_since
		if set_key:
			memc_conn.set(str(key), value)
		return int(age)

	def get_severity_values(service):
		global rules
		all_sev = rules.get(service)
		sev_values = []
		for i in range(1,len(all_sev)+1):
			sev_values.append(all_sev.get("Severity"+str(i))[1].get("value"))
		return sev_values

    	def extract_utilization_kpi(**kwargs):

		params = kwargs.get('params')

		technology = params.get('technology')
		devices = params.get('devices')
		memc_conn = params.get('memc_conn')
		redis_conn = params.get('redis_conn')
		site_name = params.get('site_name')
		machine_name = params.get('machine_name')
		attributes = params.get('attributes')
		slot_number = params.get('slot_number')

		service_name = attributes.get('service_name')
		utilization_type = attributes.get('utilization_type')
		utilization_key = attributes.get('utilization_key')
		sector_type = attributes.get('sector_type')
		utilization_kpi_capacity = attributes.get('capacity')

		extracted_data_key = str("extracted__utilization__%s__%s__%s__%s__%s"%(technology,machine_name,site_name,service_name,slot_number))
		extracted_data = []

		for each_device in devices:

			data_dict = dict()
			sector_id_suffix = "_%s_sec" % sector_type
			bw_suffix = "_%s_bw" % sector_type

			sector_id = memc_conn.get("".join([str(each_device.get('hostname')), str(sector_id_suffix)]))
			utilization = memc_conn.get(utilization_key % (each_device.get('hostname')))
			sector_bw = memc_conn.get("".join([str(each_device.get('hostname')), str(bw_suffix)]))

			if sector_bw and isinstance(sector_bw, basestring):
				sector_bw = literal_eval(sector_bw)
			if utilization and isinstance(utilization, basestring):
				utilization = literal_eval(utilization)

			# Carry device identity through so the transform step can build its output rows.
			data_dict['hostname'] = each_device.get('hostname')
			data_dict['ip_address'] = each_device.get('ip_address')
			data_dict['sector_id'] = sector_id
			data_dict['utilization'] = utilization
			data_dict['sector_bw'] = sector_bw

			extracted_data.append(data_dict)

		redis_conn.rpush(extracted_data_key, extracted_data)

    	def transform_utilization_kpi(**kwargs):

		params = kwargs.get('params')

		technology = params.get('technology')
		devices = params.get('devices')
		memc_conn = params.get('memc_conn')
		redis_conn = params.get('redis_conn')
		site_name = params.get('site_name')
		machine_name = params.get('machine_name')
		attributes = params.get('attributes')
		slot_number = params.get('slot_number')

		service_name = attributes.get('service_name')
		utilization_type = attributes.get('utilization_type')
		utilization_key = attributes.get('utilization_key')
		sector_type = attributes.get('sector_type')
		utilization_kpi_capacity = attributes.get('capacity')

		severity_values = get_severity_values(service_name)
		critical_severity, warning_severity = severity_values

		transformed_data = []

		transformed_data_key = str("transformed__utilization__%s__%s__%s__%s__%s"%(technology,machine_name,site_name,service_name,slot_number))
		extracted_data_key = str("extracted__utilization__%s__%s__%s__%s__%s"%(technology,machine_name,site_name,service_name,slot_number))
		extracted_data = redis_conn.rget(extracted_data_key)

		for each_dict in extracted_data:

			utilization = each_dict.get('utilization')
			sector_bw = each_dict.get('sector_bw')
			sector_id = each_dict.get('sector_id')

			perf = ""
			utilization_kpi = ""			
			data_dict = {}
			state_string = "unknown"

			try:
				if utilization:
					if technology == "StarmaxIDU" and sector_bw:

						if sector_bw <= 3:
							utilization_kpi = (float(utilization)/int(utilization_kpi_capacity))*100
						elif sector_bw > 3:
							utilization_kpi = (float(utilization)/(2*int(utilization_kpi_capacity)))*100
						else:
							utilization_kpi = ''

					if technology == "CanopyPM100AP":

						utilization_kpi = (float(utilization)/int(utilization_kpi_capacity))*100

				if isinstance(utilization_kpi, (float, int)):

					if technology == "StarmaxIDU":
						if utilization_kpi > 100:
							utilization_kpi = 100.00

					state_string = calculate_severity(service_name, utilization_kpi)

				perf += '%s_%s_util_kpi' % (sector_type, utilization_type) + "=%s;%s;%s;%s" % (utilization_kpi, warning_severity, critical_severity, sector_id)

			except Exception as e:
				perf = ';%s;%s' % (warning_severity, critical_severity)

				logging.error('Exception: {0}\n'.format(e))

			age_of_state = age_since_last_state(each_dict.get('hostname'), service_name, state_string, memc_conn)

			data_dict['host_name'] = each_dict.get('hostname')
			data_dict['address'] = each_dict.get('ip_address')
			data_dict['site'] = site_name
			data_dict['perf_data'] = perf
			data_dict['last_state_change'] = age_of_state
			data_dict['state'] = state_string
			data_dict['last_chk'] = time.time()
			data_dict['service_description'] = service_name
			data_dict['age'] = age_of_state
			data_dict['refer'] = sector_id

			transformed_data.append(data_dict)

		redis_conn.rpush(transformed_data_key, transformed_data)

    	def aggregate_utilization_kpi(**kwargs):

		params = kwargs.get('params')

		technology = params.get('technology')
		machine_name = params.get('machine_name')
		redis_conn = params.get('redis_conn')

		transformed_data_key_pattern = str("transformed__utilization__%s__%s*"%(technology,machine_name)) 
		aggregated_data_key = str("aggregated__utilization__%s__%s"%(technology, machine_name))
		aggregated_data = []

		transformed_data_keys = redis_conn.keys(transformed_data_key_pattern)
		for each_key in transformed_data_keys:
			transformed_data = redis_conn.rget(each_key)
			aggregated_data.extend(transformed_data)

		redis_conn.rpush(aggregated_data_key, aggregated_data)
    	
    	aggregate_utilization_kpi_dependency = dict()
    	for each_machine_name in machine_names:
	
		aggregate_utilization_kpi_task = PythonOperator(
		    	task_id = "Aggregate_utilization_kpi_of_%s"%each_machine_name,
		    	provide_context = True,
		    	python_callable = aggregate_utilization_kpi,
		    	params = {
		    		'technology': technology,
		    		"machine_name": each_machine_name,
		    		"redis_conn": redis_cnx
		    		},
			dag=utilization_kpi_by_technology
			)

		aggregate_utilization_kpi_dependency[each_machine_name] = aggregate_utilization_kpi_task

		"""
		insert_data_in_mysql_task = DummyOperator(
			task_id ="Insert_into_mysql_of_%s"%each_machine_name,
			dag=utilization_kpi_by_technology
			)

		update_data_in_mysql_task = DummyOperator(
			task_id ="Update_into_mysql_of_%s"%each_machine_name,
			dag=utilization_kpi_by_technology
			)

		update_data_in_mysql_task << aggregate_utilization_kpi_task
		insert_data_in_mysql_task << aggregate_utilization_kpi_task
   		"""

    	for each_site_name in site_names:
		machine_name = each_site_name.split("_")[0]
    		devices_by_site = devices.get(each_site_name)

    		for each_attribute in attributes:
    	    		slot_number = 1    	

    	   		while devices_by_site:
	        		slot_of_devices = devices_by_site[:100]

				if slot_of_devices:
    	            			extract_utilization_kpi_task = PythonOperator(
						task_id = "Extract_of_%s_%s_Slot_%s"%(each_site_name, each_attribute.get('service_name'), slot_number),
						provide_context = True,
						python_callable = extract_utilization_kpi,
						params = { 
							'technology': technology,
							"site_name": each_site_name,
				   			"machine_name": machine_name,
				   			"devices": slot_of_devices,
				   			"attributes": each_attribute,
				   			"memc_conn": memc_cnx,
				   			"redis_conn": redis_cnx,
							"slot_number": slot_number
				 		},
						dag = utilization_kpi_by_technology
					)
		    			
 	    	    			transform_utilization_kpi_task = PythonOperator(
  	    					task_id = "Transform_of_%s_%s_Slot_%s"%(each_site_name, each_attribute.get('service_name'), slot_number),
						provide_context = True,
						python_callable = transform_utilization_kpi,
						params = { 
							'technology': technology,
							"site_name": each_site_name,
							"machine_name": machine_name,
				    			"devices": slot_of_devices,
				    			"attributes": each_attribute,
				    			"memc_conn": memc_cnx,
				    			"redis_conn": redis_cnx,
				    			"slot_number": slot_number
				 		},
						dag = utilization_kpi_by_technology
	    				)
		    
                    			extract_utilization_kpi_task >> transform_utilization_kpi_task
		    
	            			aggregate_utilization_kpi_dependency[machine_name] << transform_utilization_kpi_task
	            			
	        		devices_by_site = devices_by_site[100:]
	        		slot_number += 1

    	return utilization_kpi_by_technology
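
Factories like this are normally attached to a parent DAG through a SubDagOperator, as the later UL-issue example does. A minimal sketch of that wiring; parent_dag, devices, attributes, celery_queue, memc_cnx and redis_cnx are assumed to exist at module level:

from airflow.operators.subdag_operator import SubDagOperator

PARENT_DAG_NAME = "UTILIZATION_KPI"   # assumed; must match parent_dag.dag_id
CHILD_DAG_NAME = "StarmaxIDU"         # one subdag per technology

utilization_subdag_task = SubDagOperator(
    task_id=CHILD_DAG_NAME,           # task_id must equal child_dag_name so the "parent.child" dag_ids line up
    subdag=process_utilization_kpi(
        PARENT_DAG_NAME,
        CHILD_DAG_NAME,
        start_date=datetime(2017, 2, 24),
        schedule_interval=parent_dag.schedule_interval,  # subdag keeps the parent's schedule
        celery_queue=celery_queue,
        technology="StarmaxIDU",
        devices=devices,
        attributes=attributes),
    dag=parent_dag)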
Example #7
                              names=[
                                  'yearID', 'franchID', 'teamID', 'W', 'L',
                                  'percentage', 'franchName'
                              ],
                              encoding='utf-8')
    conn.insert_rows(table=table_name, rows=results.values.tolist())
    return table_name


dag = DAG('zylo_example',
          schedule_interval=timedelta(hours=1),
          start_date=datetime(2016, 10, 24),
          default_args=default_args)

t1 = PythonOperator(task_id='get_zip_file',
                    provide_context=True,
                    python_callable=get_zip,
                    dag=dag)

t2 = PythonOperator(task_id='get_top_teams',
                    provide_context=True,
                    python_callable=top_teams,
                    dag=dag)

t3 = PythonOperator(task_id='load_to_MySql',
                    provide_context=True,
                    python_callable=bulk_load_teams,
                    op_kwargs={'table_name': 'top_teams'},
                    dag=dag)

t2.set_upstream(t1)
t3.set_upstream(t2)
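
The set_upstream calls above are equivalent to the bitshift syntax used in most of the other examples:

# Same dependencies expressed with the bitshift operators:
t1 >> t2 >> t3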
            "login": "******",
            "password": "******"
        })

    new_var = models.Variable()
    new_var.key = "sql_path"
    new_var.set_val("/usr/local/airflow/sql")
    session.add(new_var)
    session.commit()

    new_pool = models.Pool()
    new_pool.pool = "postgres_dwh"
    new_pool.slots = 10
    new_pool.description = "Allows max. 10 connections to the DWH"

    session.add(new_pool)
    session.commit()

    session.close()


dag = airflow.DAG('init_docker_example',
                  schedule_interval="@once",
                  default_args=args,
                  max_active_runs=1)

t1 = PythonOperator(task_id='initialize_etl_example',
                    python_callable=initialize_etl_example,
                    provide_context=False,
                    dag=dag)
Example #9
DEFAULT_DATE = timezone.datetime(2019, 4, 17)

srcDir = os.getcwd() + '/dags/repo/examples/hello_2.11-1.0.jar'

args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
dag = DAG('test_dag_id', default_args=args)


def get_some_value(**kwargs):
    some_value = 10
    return some_value


task1 = PythonOperator(task_id='run_task_1',
                       python_callable=get_some_value,
                       provide_context=True,
                       dag=dag)

task2 = SparkSubmitOperator(
    task_id='run_sparkSubmit_job',
    conn_id='spark_default',
    java_class='hello',
    application=srcDir,
    name='airflow-spark-job',
    verbose=True,
    application_args=["{{ti.xcom_pull(task_ids='run_task_1')}}"],
    conf={'master': 'local'},
    dag=dag,
)
task1 >> task2
Example #10
    for countrycode, countryname in country_dict.items():
        for hslevelcode in hslevel_list:
            for currency in currency_list:

                task1 = SeleniumOperator(
                    script=get_df,
                    script_args=[
                        data_folder, countrycode, hslevelcode, currency
                    ],
                    task_id='Extract_Data_' + countrycode + '_' + hslevelcode +
                    '_' + currency)

                task2 = PythonOperator(
                    task_id='Transform_Data_' + countrycode + '_' +
                    hslevelcode + '_' + currency,
                    # op_kwargs={'countrycode': countrycode},
                    python_callable=transform_data)

                task3 = PythonOperator(task_id='Load_Data_' + countrycode +
                                       '_' + hslevelcode + '_' + currency,
                                       op_kwargs={
                                           'countryname': countryname,
                                           'hslevel': hslevelcode,
                                           'currency': currency,
                                           'countrycode': countrycode
                                       },
                                       python_callable=load_data)

                # Set Dependencies
                task1 >> task2 >> task3
Example #11

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': min_10,
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
  }


dag = DAG('scrape_cdc', default_args=default_args)

run_this = PythonOperator(
    task_id='print_the_context',
    provide_context=True,
    python_callable=print_context,
    dag=dag)

t1 = BashOperator(
    task_id='testairflow',
    bash_command=f'python {file_path}',
    dag=dag)

t1.set_downstream(run_this)

if __name__ == "__main__":
    dag.cli()
Example #12
                             dag=dag)

# Zip Inspector and Extractor Task
zip_task = ZipInspector(task_id='zip_inspector',
                        extension_to_search='tiff',
                        get_inputs_from=download_task.task_id,
                        dag=dag)

warp_tasks = []
addo_tasks = []
upload_tasks = []
band_paths_tasks = []
for i in range(1, 3):
    band_paths = PythonOperator(
        task_id="get_band_paths_" + str(i),
        python_callable=prepare_band_paths,
        op_kwargs={'get_inputs_from': zip_task.task_id},
        dag=dag)
    band_paths_tasks.append(band_paths)

    warp = GDALWarpOperator(task_id='gdalwarp_' + str(i),
                            target_srs=TARGET_SRS,
                            tile_size=TILE_SIZE,
                            overwrite=OVERWRITE,
                            dstdir=S1GRD1SDV.process_dir,
                            get_inputs_from=band_paths.task_id,
                            dag=dag)
    warp_tasks.append(warp)

    addo = GDALAddoOperator(trigger_rule=TriggerRule.ALL_SUCCESS,
                            resampling_method=RESAMPLING_METHOD,
Example #13
from datetime import datetime, timedelta
import os
from airflow import DAG
from airflow.operators import PythonOperator
from airflow.hooks.postgres_hook import PostgresHook


def create_table(**context):
    redshift_hook = PostgresHook('redshift')
    queries = open('/home/workspace/airflow/create_tables_capstone.sql',
                   'r').read()
    redshift_hook.run(queries)
    return


default_args = {'owner': 'Stan Taov'}

# Create DAG with previously provided default_args
dag = DAG('dag_tables_capstone',
          default_args=default_args,
          description='Create tables in Redshift',
          start_date=datetime.now(),
          schedule_interval='@once')

# This operator calls create_table function to create tables
create_tables_in_redshift = PythonOperator(task_id='Create_tables_in_redshift',
                                           dag=dag,
                                           provide_context=True,
                                           python_callable=create_table)
Example #14
    def execute(self, context):
        log.info('--------------------S1Metadata_PLUGIN running------------')
        task_instance = context['task_instance']

        log.info("Receiving from 'get_input_from':\n{}".format(
            self.get_inputs_from))

        download_task_id = self.get_inputs_from['download_task_id']
        addo_task_ids = self.get_inputs_from['addo_task_ids']
        upload_task_ids = self.get_inputs_from['upload_task_ids']
        archive_product_task_id = self.get_inputs_from[
            'archive_product_task_id']

        downloaded = context['task_instance'].xcom_pull(
            task_ids=download_task_id, key=XCOM_RETURN_KEY)

        local_granules_paths = []
        for tid in addo_task_ids:
            local_granules_path = context['task_instance'].xcom_pull(
                task_ids=tid, key=XCOM_RETURN_KEY)
            if local_granules_path:
                local_granules_paths += local_granules_path
        uploaded_granules_paths = context['task_instance'].xcom_pull(
            task_ids=upload_task_ids, key=XCOM_RETURN_KEY)
        original_package_path = context['task_instance'].xcom_pull(
            task_ids=archive_product_task_id, key=XCOM_RETURN_KEY)
        granules_dict, bbox = collect_granules_metadata(
            local_granules_paths, self.granules_upload_dir, self.bands_dict)

        if not downloaded:
            log.info("No products from Download task, Nothing to do.")
            return list()
        if not local_granules_paths:
            log.info("No local granules from processing, Nothing to do.")
            return list()
        if not uploaded_granules_paths:
            log.info("No uploaded granules from upload task, Nothing to do.")
            return list()
        if not original_package_path:
            log.info(
                "No original package path from original package upload task, Nothing to do."
            )
            return list()

        safe_package_path = downloaded.keys()[0]
        safe_package_filename = os.path.basename(safe_package_path)
        product_id = downloaded[safe_package_path].get('title')
        originalPackageLocation = self.original_package_download_base_url + safe_package_filename
        processing_dir = os.path.join(self.processing_dir, product_id)
        if not os.path.exists(processing_dir):
            os.makedirs(processing_dir)

        log.info('safe_package_path: {}'.format(safe_package_path))
        log.info('local_granules_paths: {}'.format(local_granules_paths))

        s1reader = S1GDALReader(safe_package_path)
        product_metadata = s1reader.get_metadata()
        product_metadata['footprint'] = s1reader.get_footprint()
        log.info(pprint.pformat(product_metadata, indent=4))

        timeStart = product_metadata['ACQUISITION_START_TIME']
        timeEnd = product_metadata['ACQUISITION_STOP_TIME']

        owslinks_dict = create_owslinks_dict(
            product_identifier=product_id,
            timestart=timeStart,
            timeend=timeEnd,
            granule_bbox=bbox,
            gs_workspace=self.gs_workspace,
            gs_wms_layer=self.gs_wms_layer,
            gs_wms_width=self.gs_wms_width,
            gs_wms_height=self.gs_wms_height,
            gs_wms_format=self.gs_wms_format,
            gs_wms_version=self.gs_wms_version,
            gs_wfs_featuretype=self.gs_wfs_featuretype,
            gs_wfs_format=self.gs_wfs_format,
            gs_wfs_version=self.gs_wfs_version,
            gs_wcs_coverage_id=self.gs_wcs_coverage_id,
            gs_wcs_scale_i=self.gs_wcs_scale_i,
            gs_wcs_scale_j=self.gs_wcs_scale_j,
            gs_wcs_format=self.gs_wcs_format,
            gs_wcs_version=self.gs_wcs_version)

        # create thumbnail
        # TODO: create proper thumbnail from quicklook. Also remove temp file
        log.info("Creating thumbnail")
        thumbnail_path = os.path.join(processing_dir, "thumbnail.png")
        quicklook_path = s1reader.get_quicklook()
        log.info(pprint.pformat(quicklook_path))
        copyfile(quicklook_path, thumbnail_path)

        search_params_dict = create_search_dict(product_metadata,
                                                originalPackageLocation)
        log.info(pprint.pformat(search_params_dict))

        metadata_dict = create_metadata_dict(product_metadata)
        log.info(pprint.pformat(metadata_dict))

        description_dict = create_description_dict(product_metadata,
                                                   originalPackageLocation)
        log.info(pprint.pformat(description_dict))

        # create description.html and dump it to file
        log.info("Creating description.html")
        html_description = create_product_description(description_dict)
        search_params_dict['htmlDescription'] = html_description

        # create metadata XML
        log.info("Creating metadata.xml")
        metadata_xml = create_product_metadata(metadata_dict)

        po = PythonOperator(task_id="s1_metadata_dictionary_creation",
                            python_callable=create_procuct_zip,
                            op_kwargs={
                                'processing_dir': processing_dir,
                                'search_params_dict': search_params_dict,
                                'description_html': html_description,
                                'metadata_xml': metadata_xml,
                                'granules_dict': granules_dict,
                                'owslinks_dict': owslinks_dict,
                                'thumbnail_path': thumbnail_path
                            })

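        # Note: instantiating a PythonOperator here and calling execute() directly runs
        # create_procuct_zip inline inside this operator's own execute(); it is not
        # registered with a DAG or scheduled as a separate task.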
        out = po.execute(context)
        zip_paths = list()
        if out:
            zip_paths.append(out)
        return zip_paths
Example #15
    lines = text.split("\n")
    return tuple("('" + x.split(",")[0] + "','" + x.split(",")[1] + "')" for x in lines[1:] if x != "") # index 1부터 시작하게 하여 header 무시, 빈 문자열 제거

def load(**kwargs):
    cur = get_Redshift_connection()
    lines = kwargs["ti"].xcom_pull(key="return_value", task_ids="perform_transform")
    sql = """BEGIN; 
             TRUNCATE TABLE ysjune1051.name_gender; 
             INSERT INTO ysjune1051.name_gender VALUES {lines}; 
             END;""".format(lines=",".join(lines))
    cur.execute(sql)

task_extract = PythonOperator(
    task_id="perform_extract",
    python_callable=extract,
    op_kwargs={"url": "https://s3-geospatial.s3-us-west-2.amazonaws.com/name_gender.csv"},
    provide_context=True,
    dag=dag_second_assignment
)

task_transform = PythonOperator(
    task_id="perform_transform",
    python_callable=transform,
    provide_context=True,
    dag=dag_second_assignment
)

task_load = PythonOperator(
    task_id="perform_load",
    python_callable=load,
    provide_context=True,
Example #16
    ]
    time_str = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    with open(
            "/home/tmadmin/NITIN/VIPUL/logs/provis/comparision-provis-%s.csv" %
        (time_str), "wb") as out:
        wr = csv.writer(out)
        for datalist in all_data_sv:
            wr.writerow(datalist)

    with open("/home/tmadmin/NITIN/VIPUL/summary-provis.csv", "ab") as out:
        wr2 = csv.writer(out)
        wr2.writerow(summary)

test_ul_issue_kpi = PythonOperator(task_id="test_ul_issue",
                                   provide_context=True,
                                   python_callable=test_ul_issue_kpi,
                                   dag=test_ul_issue_dag,
                                   queue=Q_PUBLIC)

test_provis_kpi = PythonOperator(task_id="test_provision_kpi",
                                 provide_context=True,
                                 python_callable=test_provis_kpi,
                                 dag=test_ul_issue_dag,
                                 queue=Q_PUBLIC)

ULISSUE = ExternalTaskSensor(
    external_dag_id="UL_ISSUE_KPI.StarmaxIDU",
    external_task_id="aggregate_ul_issue_bs_ospf1",
    task_id="sense_ospf1_ul_issue",
    poke_interval=20,
    trigger_rule='all_done',
Example #17
    'retries':
    1,
    'retry_delay':
    timedelta(minutes=15),
    'scheduler_interval':
    '@daily'
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('racer-nightly-to-s3', default_args=default_args)


def download_google_sheet(ds, **kwargs):
    '''This is a function that will run within the DAG execution'''
    print("running google sheet")
    df = pd.read_csv(
        'https://docs.google.com/spreadsheets/d/1vYg2lvDL9Q4ddzUgKVTvTyCUcF2EgnNsxNwffHo28lo/export?gid=0&format=csv'
    )
    execution_date = kwargs['execution_date'].date()
    df.to_csv('~/shelters-' + str(execution_date) + '.csv')
    return df


run_this = PythonOperator(task_id='download_google_sheet',
                          provide_context=True,
                          python_callable=download_google_sheet,
                          dag=dag)
Example #18
          schedule_interval=None)


def my_sleeping_function(random_base):
    '''This is a function that will run within the DAG execution'''
    time.sleep(random_base)


def print_context(ds, **kwargs):
    pprint(kwargs)
    print(ds)
    return 'Whatever you return gets printed in the logs'


run_this = PythonOperator(task_id='print_the_context',
                          provide_context=True,
                          python_callable=print_context,
                          dag=dag)

for i in range(10):
    '''
    Generating 10 sleeping tasks, sleeping from 0.0 to 0.9 seconds
    respectively
    '''
    task = PythonOperator(task_id='sleep_for_' + str(i),
                          python_callable=my_sleeping_function,
                          op_kwargs={'random_base': float(i) / 10},
                          dag=dag)

    task.set_upstream(run_this)
Example #19
from airflow.operators import DummyOperator, PythonOperator
from datetime import datetime, timedelta
import airflow.hooks.S3_hook
from helper import t1, t2, t3, t4
#from helper import upload_file_to_S3_with_hook

default_args = {
    'owner': 'airflow',
    'start_date': datetime(2020, 9, 24),
    'retry_delay': timedelta(seconds=5)
}
# Using the context manager allows you not to duplicate the dag parameter in each operator
with DAG('S3_dag_test', default_args=default_args,
         schedule_interval='@once') as dag:

    t1 = PythonOperator(task_id="data_collection", python_callable=t1, dag=dag)

    t2 = PythonOperator(task_id="downlaod_from_s3",
                        python_callable=t2,
                        dag=dag)

    t3 = PythonOperator(task_id="transform", python_callable=t3, dag=dag)

    t4 = PythonOperator(task_id="upload_to_salesforce1",
                        python_callable=t4,
                        dag=dag)

    t1 >> t2 >> t3 >> t4

    #t4
    '''
Example #20
from airflow import DAG
from airflow.operators import BashOperator, PythonOperator
from datetime import datetime, timedelta
from load_stock_data import load_koersen_from_ASN

# Following are defaults which can be overridden later on
default_args = {
    'owner': 'mattooren',
    'depends_on_past': False,
    'start_date': datetime(2019, 12, 23),
    'schedule_interval': '0 18 * * *',
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

dag = DAG('ASN_Koersen', default_args=default_args)
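# Note: 'schedule_interval' inside default_args is not read by the DAG; default_args only
# provide defaults for operator arguments. To actually run at 18:00 daily it would have to
# be passed to DAG(), e.g. DAG('ASN_Koersen', default_args=default_args, schedule_interval='0 18 * * *').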

# t5 is an example of a task created using the PythonOperator

t5 = PythonOperator(task_id='koersen_loader',
                    provide_context=True,
                    python_callable=load_koersen_from_ASN,
                    dag=dag)
Example #21
                logging.error("Succeessfully Updated Age")
            else:
                logging.info("recieved f*****g None")

        except Exception:
            logging.error(
                "Unable to actually insert te updated data into redis")

    except Exception:
        logging.info("Unable to get latest refer")
        traceback.print_exc()


initiate_dag = PythonOperator(task_id="Initiate",
                              provide_context=False,
                              python_callable=init_kpi,
                              dag=ul_issue_dag,
                              queue=Q_PUBLIC)

for machine in machines:
    for site in machine.get('sites'):
        site_name = site.get('name')
        all_sites.append(site_name)
        #site = eval(site)
        #print site.get('name')

for technology in technologies:
    CHILD_DAG_NAME = "%s" % (technology)
    #logging.info("For Tech %s"%(technology))
    ul_subdag_task = SubDagOperator(subdag=process_ul_issue_kpi(
        PARENT_DAG_NAME, CHILD_DAG_NAME, datetime(2017, 2, 24),
Example #22
    'product/product.json', 'product/granules.json', 'product/thumbnail.jpeg',
    'product/owsLinks.json'
]

product_zip_task = Sentinel2ProductZipOperator(
    task_id='create_product_zip_task',
    target_dir=S2MSIL1C.download_dir,
    generated_files=generated_files_list,
    placeholders=placeholders_list,
    get_inputs_from=download_task.task_id,
    dag=dag)

# curl -vvv -u evoadmin:\! -XPOST -H "Content-type: application/zip" --data-binary @/var/data/Sentinel-2/S2_MSI_L1C/download/S2A_MSIL1C_20170909T093031_N0205_R136_T36VUQ_20170909T093032/product.zip "http://ows-oda.eoc.dlr.de/geoserver/rest/oseo/collections/SENTINEL2/products"
publish_task = PythonOperator(task_id="publish_product_task",
                              python_callable=publish_product,
                              op_kwargs={
                                  'geoserver_username':
                                  CFG.geoserver_username,
                                  'geoserver_password':
                                  CFG.geoserver_password,
                                  'geoserver_rest_endpoint':
                                  '{}/oseo/collections/{}/products'.format(
                                      CFG.geoserver_rest_url,
                                      S2MSIL1C.geoserver_oseo_collection),
                                  'get_inputs_from':
                                  product_zip_task.task_id,
                              },
                              dag=dag)

search_task >> download_task >> archive_task >> thumbnail_task >> metadata_task >> archive_wldprj_task >> product_zip_task >> publish_task
Example #23
def load(lines):
    logging.info("load started")
    cur = get_Redshift_connection()
    sql = "BEGIN;DELETE FROM TABLE raw_data.name_gender;"
    for l in lines:
        if l != '':
            (name, gender) = l.split(",")
            sql += "INSERT INTO raw_data.name_gender VALUES ('{name}', '{gender}');"
    sql += "END;"
    cur.execute(sql)
    logging.info(sql)
    logging.info("load done")


def etl():
    link = "https://s3-geospatial.s3-us-west-2.amazonaws.com/name_gender.csv"
    data = extract(link)
    lines = transform(data)
    load(lines)


dag_second_assignment = DAG(
    dag_id='second_assignment',
    start_date=datetime(2020, 8, 10),  # it will not run if this date is in the future
    schedule_interval='0 2 * * *')  # adjust as needed

task = PythonOperator(task_id='perform_etl',
                      python_callable=etl,
                      dag=dag_second_assignment)
Example #24
from airflow import DAG
from airflow.operators import PythonOperator, DummyOperator
from datetime import timedelta, datetime
from get_sensor_data import get_sensor_data

default_args = {
    'owner': 'ubuntu',
    'depends_on_past': False,
    'start_date': datetime(2020, 9, 28),
    'retries': 3,
    'retry_delay': timedelta(seconds=30)
}

with DAG(dag_id='purpleair_sensors',
         description="Request PurpleAir sensor data",
         default_args=default_args,
         schedule_interval=timedelta(minutes=5),
         catchup=False) as dag:

    t1 = DummyOperator(task_id='dummy_task')
    t2 = PythonOperator(task_id='purpleair_api',
                        python_callable=get_sensor_data)

    t1 >> t2
Example #25
    :param List[Dict[str, object]] test_scenarios:  [Required] Test scenarios to encode
    :return: str, the base64-encoded JSON
    """
    json_encoded = json.dumps(test_scenarios)
    utf_encoded = json_encoded.encode("utf-8")
    b64_encoded = base64.b64encode(utf_encoded).decode("utf-8")
    return b64_encoded


with models.DAG(
    "burnham", schedule_interval="@daily", default_args=DEFAULT_ARGS,
) as dag:

    # Generate a UUID for this test run
    generate_burnham_test_run_uuid = PythonOperator(
        task_id="generate_burnham_test_run_uuid",
        python_callable=lambda: str(uuid.uuid4()),
    )
    burnham_test_run = '{{ task_instance.xcom_pull("generate_burnham_test_run_uuid") }}'

    # This Airflow macro is added to sensors to filter out rows by submission_timestamp
    start_timestamp = "{{ dag_run.start_date.isoformat() }}"

    # Create burnham clients that complete missions and submit pings
    client1 = burnham_run(
        task_id="client1",
        burnham_test_run=burnham_test_run,
        burnham_test_name=DEFAULT_TEST_NAME,
        burnham_missions=["MISSION G: FIVE WARPS, FOUR JUMPS", "MISSION C: ONE JUMP"],
        burnham_spore_drive="tardigrade",
        owner=DAG_OWNER,
        email=DAG_EMAIL,
Example #26
DAG_NAME = 'External_Task_Sensor_Test'
args = {'owner': 'airflow', 'start_date': airflow.utils.dates.days_ago(10)}

dag = DAG(
    dag_id=DAG_NAME,
    catchup=False,
    default_args=args,
    schedule_interval='25 10 * * *',
)


def dummy_call(**kwargs):
    return "Nothing to do.."


start_task = PythonOperator(task_id='start_task',
                            python_callable=dummy_call,
                            provide_context=True,
                            dag=dag)

external_task_dependent = ExternalTaskSensor(task_id='dependent_on_dag_2_task',
                                             external_dag_id='XCOM_TEST',
                                             external_task_id='run_file_2',
                                             dag=dag)

end_task = PythonOperator(task_id='end_task',
                          python_callable=dummy_call,
                          provide_context=True,
                          dag=dag)

start_task >> external_task_dependent >> end_task
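
ExternalTaskSensor only succeeds once the external task has a run for the same execution_date, so if XCOM_TEST is on a different schedule than the 25 10 * * * used here, an offset has to be supplied. A minimal sketch, assuming a hypothetical 25-minute difference between the two schedules:

from datetime import timedelta  # assumed import

external_task_dependent_shifted = ExternalTaskSensor(
    task_id='dependent_on_dag_2_task_shifted',
    external_dag_id='XCOM_TEST',
    external_task_id='run_file_2',
    execution_delta=timedelta(minutes=25),  # shift the execution_date the sensor waits on
    dag=dag)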
Example #27
from application.scheduled.prune import remove_stale_artifacts

from airflow import DAG
from airflow.operators import PythonOperator
from datetime import datetime, timedelta

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    'start_date': datetime(2017, 7, 1)
}

dag = DAG('remove_stale_artifacts', default_args=default_args)

remove_stale_artifacts_task = PythonOperator(
    task_id='remove_stale_artifacts_task',
    provide_context=False,
    python_callable=remove_stale_artifacts,
    dag=dag)
Example #28
        args.append("--junit_path=" + junit_path)
        args.append("--project=" + GCB_PROJECT)

        # We want subprocess output to bypass logging module otherwise multiline
        # output is squashed together.
        util.run(args, use_print=True, dryrun=dryrun)

    return run_py_checks


def done(**_kwargs):
    logging.info("Executing done step.")


clone_op = PythonOperator(task_id='clone_repo',
                          provide_context=True,
                          python_callable=clone_repo,
                          dag=dag)

build_op = PythonOperator(task_id='build_images',
                          provide_context=True,
                          python_callable=build_images,
                          dag=dag)
build_op.set_upstream(clone_op)

py_lint_op = PythonOperator(task_id='pylint',
                            provide_context=True,
                            python_callable=py_checks_gen("lint"),
                            dag=dag)
py_lint_op.set_upstream(clone_op)

py_test_op = PythonOperator(task_id='pytest',
Example #29
    f2 = df['spent'].values
    X = np.array(list(zip(f1, f2)))
    kmeans = KMeans(n_clusters=5).fit(X)
    log.info(kmeans.cluster_centers_)


dag = DAG('my_sql_test_dag',
          description='SQL tutorial DAG',
          schedule_interval='0 12 * * *',
          start_date=datetime(2018, 3, 20),
          catchup=False)


simple_select_mysql_task = \
    PythonOperator(task_id='simple_select_mysql',
                   provide_context=True,
                   python_callable=simple_select_mysql,
                   dag=dag)

simple_elbow_data_task = PythonOperator(task_id='simple_elbow_data_task',
                                        provide_context=True,
                                        python_callable=simple_elbow_data,
                                        dag=dag)

simple_kmeans_data_task = PythonOperator(task_id='simple_kmeans_data_task',
                                         provide_context=True,
                                         python_callable=simple_kmeans_data,
                                         dag=dag)

simple_select_mysql_task >> simple_elbow_data_task
simple_select_mysql_task >> simple_kmeans_data_task
Example #30
    # Delete current rows in the target table
    snql_hook.run("""DELETE FROM dim_sneakers""")

    #Insert new rows into target table
    snql_hook.insert_rows('dim_sneakers', staging_results)


def delete_from_staging():

    staging_hook = PostgresHook('snql_staging')
    staging_hook.run('DELETE FROM dim_sneakers;')


create_staging_table = PythonOperator(task_id='create_staging_table',
                                      python_callable=create_staging,
                                      email_on_failure=True,
                                      email='*****@*****.**',
                                      dag=dag)

pull_and_insert_to_staging = SQLTemplatedPythonOperator(
    task_id='pull_and_insert_to_staging',
    templates_dict={'query': 'dim_sneakers/extract.sql'},
    python_callable=pull_and_insert,
    params={'ds': datetime.utcnow()},
    email_on_failure=True,
    email='*****@*****.**',
    provide_context=True,
    dag=dag)

create_target_table = PythonOperator(task_id='create_target_table',
                                     python_callable=create_target,