conn = sqlite.get_conn()
    query = "select * from twitter_terms"
    df = pd.read_sql_query(query, conn)
    return random.choice([
        'search_{}_twitter'.format(re.sub(r'\W+', '', t))
        for t in df.search_term.values])


fill_search_terms = PythonOperator(task_id='fill_terms',
                                   provide_context=True,
                                   python_callable=fill_terms,
                                   dag=dag)


gen_search_terms = BranchPythonOperator(task_id='generate_search_terms',
                                        provide_context=True,
                                        python_callable=generate_search_terms,
                                        dag=dag)


email_links = EmailOperator(task_id='email_best_links',
                            to='*****@*****.**',
                            subject='Latest popular links',
                            html_content='Check out the latest!!',
                            files=['{}/latest_links.txt'.format(RAW_TWEET_DIR)],
                            dag=dag)


sub = SubDagOperator(subdag=subdag,
                     task_id='insert_and_id_pop',
                     trigger_rule='one_success',
                     dag=dag)
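
# `fill_terms` is referenced by the PythonOperator above but not shown in this
# excerpt. A minimal sketch of what it might do, assuming it seeds the
# twitter_terms table that generate_search_terms reads (in a real DAG file it
# would have to be defined before the operator that references it; the term
# list below is purely illustrative):
def fill_terms(**context):
    conn = sqlite.get_conn()
    terms = ['python', 'airflow', 'data engineering']  # hypothetical terms
    conn.executemany('insert into twitter_terms (search_term) values (?)',
                     [(t,) for t in terms])
    conn.commit()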
Example #2
def _get_task_id(execution_date, **context):
    return 'email_' + weekday_person_to_email[execution_date.weekday()]
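
# `weekday_person_to_email` is not defined in this excerpt; it is assumed to
# be a dict mapping datetime.weekday() values (0 = Monday) to the user names
# used for the email_* task ids below. A hypothetical example:
weekday_person_to_email = {
    0: 'bob',    # Monday
    1: 'joe',    # Tuesday
    2: 'alice',  # Wednesday
    3: 'joe',    # Thursday
    4: 'alice',  # Friday
    5: 'bob',    # Saturday
    6: 'alice',  # Sunday
}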


def _print_weekday(execution_date: datetime, **context):
    print(execution_date.strftime('%a'))


with dag:
    print_weekday = PythonOperator(
        task_id='print_weekday',
        python_callable=_print_weekday,
        provide_context=True,
    )

    branching = BranchPythonOperator(
        task_id='branching',
        python_callable=_get_task_id,
        provide_context=True,
    )

    users = ['bob', 'alice', 'joe']

    branches = [DummyOperator(task_id='email_' + user) for user in users]

    end = BashOperator(task_id='end',
                       bash_command='echo "That\'s it folks!"',
                       trigger_rule=TriggerRule.ONE_SUCCESS)

    print_weekday >> branching >> branches >> end
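
    # Note: `end` uses TriggerRule.ONE_SUCCESS because the branch skips every
    # email_* task except the selected one; with the default all_success rule
    # the skipped branches would propagate and `end` would be skipped as well.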
Example #3
from airflow.operators import BranchPythonOperator, DummyOperator
from airflow.models import DAG
from datetime import datetime, timedelta
import random

seven_days_ago = datetime.combine(datetime.today() - timedelta(7),
                                  datetime.min.time())
args = {
    'owner': 'airflow',
    'start_date': seven_days_ago,
}

dag = DAG(dag_id='example_branch_operator', default_args=args)

cmd = 'ls -l'
run_this_first = DummyOperator(task_id='run_this_first', dag=dag)

options = ['branch_a', 'branch_b', 'branch_c', 'branch_d']

branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options),
    dag=dag)
branching.set_upstream(run_this_first)

for option in options:
    t = DummyOperator(task_id=option, dag=dag)
    t.set_upstream(branching)
    dummy_follow = DummyOperator(task_id='follow_' + option, dag=dag)
    t.set_downstream(dummy_follow)
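
# A common extension of this pattern (not part of the snippet above) is a
# join task that runs once whichever branch was chosen has finished:
join = DummyOperator(task_id='join', trigger_rule='one_success', dag=dag)
for option in options:
    dag.get_task('follow_' + option).set_downstream(join)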
        else:
            df.at[i, 'status'] = status
            print("Failed record {}, message - {}".format(i, status))

    return 'success'


with airflow.DAG('arya_kx_delta_load',
                 catchup=False,
                 default_args=default_args,
                 schedule_interval=datetime.timedelta(hours=12)) as dag:

    Start = DummyOperator(task_id='Start')

    is_new_delta_file = BranchPythonOperator(task_id='is_new_delta_file',
                                             provide_context=True,
                                             python_callable=delta_file_check,
                                             trigger_rule="all_done",
                                             dag=dag)

    load_delta_files = PythonOperator(task_id='load_delta_files',
                                      provide_context=True,
                                      python_callable=load_data,
                                      dag=dag)

    End = DummyOperator(task_id='End')

    Start >> is_new_delta_file
    is_new_delta_file >> [load_delta_files, End]
    load_delta_files >> End
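
    # `delta_file_check` is not shown in this excerpt. As the callable of a
    # BranchPythonOperator it must return the task_id of the path to follow;
    # a sketch under that assumption (new_delta_file_exists is a hypothetical
    # helper standing in for the real file-discovery logic):
    #
    # def delta_file_check(**context):
    #     if new_delta_file_exists():
    #         return 'load_delta_files'
    #     return 'End'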
            logging.info(
                "Opting to send an email to alert the users that processes were killed"
            )
            return send_processes_killed_email.task_id
        else:
            logging.info("enable_kill is set to False")
    else:
        logging.info("Processes to kill list was either None or Empty")

    logging.info(
        "Opting to skip sending an email since no processes were killed")
    return ""


email_or_not_branch = BranchPythonOperator(task_id="email_or_not_branch",
                                           python_callable=branch_function,
                                           provide_context=True,
                                           dag=dag)

send_processes_killed_email = EmailOperator(
    task_id="send_processes_killed_email",
    to=PROCESS_KILLED_EMAIL_ADDRESSES,
    subject=PROCESS_KILLED_EMAIL_SUBJECT,
    html_content="""
    <html>
        <body>

            <h2>Dag Run Information</h2>
            <table>
                <tr><td><b> ID: </b></td><td>{{ dag_run.id }}</td></tr>
                <tr><td><b> DAG ID: </b></td><td>{{ dag_run.dag_id }}</td></tr>
                <tr><td><b> Execution Date: </b></td><td>{{ dag_run.execution_date }}</td></tr>
Example #6
Column 2: hour, 1-23 (0 means midnight)
Column 3: day of month, 1-31
Column 4: month, 1-12
Column 5: day of week, 0-6 (0 means Sunday)
Column 6: the command to run
'''

dag = DAG('source_data_count',
          default_args=default_args,
          schedule_interval="0 12 * * *")

run_this_first = DummyOperator(task_id='run_this_first', dag=dag)

# Run the monthly count only on the first Sunday of the month; otherwise
# branch to the 'ignore_not_sunday' task.
branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: ('source_count'
                             if datetime.now().day <= 7
                             and datetime.today().weekday() == 6
                             else 'ignore_not_sunday'),
    dag=dag)
branching.set_upstream(run_this_first)

esucc = EmailOperator(task_id='email_success_' + dag.dag_id,
                      to=email_addr,
                      subject=dag.dag_id + ' [success] on ' +
                      datetime.now().strftime('%Y-%m-%d'),
                      html_content='Congratulation!',
                      trigger_rule='all_success',
                      dag=dag)

source_count = BashOperator(
    task_id='source_count',
    bash_command='/disk1/source_data_count; ./daily_table_count.sh > out.log ',
Example #7
from airflow.models import DAG
from airflow.operators import BranchPythonOperator, DummyOperator
from airflow.utils import chain
from datetime import datetime, timedelta
import random

yesterday = datetime.combine(datetime.today() - timedelta(7),
                             datetime.min.time())

default_args = {
    'owner': 'airflow',
    'start_date': yesterday,
}

dag = DAG('branch', default_args=default_args)

t1 = DummyOperator(task_id='task1', dag=dag)

b1 = DummyOperator(task_id='branch1', dag=dag)
b2 = DummyOperator(task_id='branch2', dag=dag)
b3 = DummyOperator(task_id='branch3', dag=dag)

select = BranchPythonOperator(
    task_id='select',
    python_callable=lambda: random.choice(['branch1', 'branch2', 'branch3']),
    dag=dag)

chain(t1, select)
chain(select, b1)
chain(select, b2)
chain(select, b3)
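
# chain() here is just shorthand for the bitshift wiring; the four calls
# above are equivalent to:
#
#   t1 >> select >> [b1, b2, b3]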
    dag=dag)

t3 = PythonOperator(
    task_id='compare_result',
    provide_context=True,
    python_callable=compare_result,
    trigger_rule="all_done",
    dag=dag)

t3.set_upstream(t1)
t3.set_upstream(t2)

options = ['hadoop_jar_cmd', 'presto_cmd', 'db_query', 'spark_cmd']

branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options),
    dag=dag)
branching.set_upstream(t3)


join = DummyOperator(
    task_id='join',
    trigger_rule='one_success',
    dag=dag
)


t4 = QuboleOperator(
    task_id='hadoop_jar_cmd',
    command_type='hadoopcmd',
    sub_command='jar s3://paid-qubole/HadoopAPIExamples/jars/hadoop-0.20.1-dev-streaming.jar -mapper wc -numReduceTasks 0 -input s3://paid-qubole/HadoopAPITests/data/3.tsv -output s3://paid-qubole/HadoopAPITests/data/3_wc',
}
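
# The opening of this `args` dict is cut off above. Judging by the comment
# below and the other examples on this page, it presumably looks roughly like
# the following (values are assumptions):
#
# args = {
#     'owner': 'airflow',
#     'start_date': seven_days_ago,
#     'depends_on_past': True,
# }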

# BranchPython operator that depends on past
# and where tasks may run or be skipped on
# alternating runs
dag = DAG(dag_id='example_branch_dop_operator_v3',
          schedule_interval='*/1 * * * *',
          default_args=args)


def should_run(ds, **kwargs):

    print("------------- exec dttm = {} and minute = {}".format(
        kwargs['execution_date'], kwargs['execution_date'].minute))
    if kwargs['execution_date'].minute % 2 == 0:
        return "oper_1"
    else:
        return "oper_2"


cond = BranchPythonOperator(task_id='condition',
                            provide_context=True,
                            python_callable=should_run,
                            dag=dag)

oper_1 = DummyOperator(task_id='oper_1', dag=dag)
oper_1.set_upstream(cond)

oper_2 = DummyOperator(task_id='oper_2', dag=dag)
oper_2.set_upstream(cond)
def create_dag(dag_id, schedule, start_date, delta_sensor, airpots_codes,
               default_args):

    dag = DAG(dag_id,
              schedule_interval=schedule,
              start_date=start_date,
              default_args=default_args)

    dag.doc_md = """
    # DAG fetching data from smiles.com.ar
    ### procesing and dumping on postgresql
    """
    """start = TimeDeltaSensor(
        task_id='wait_to_start',
        delta=timedelta(minutes=delta_sensor),
        dag=dag)"""

    start = DummyOperator(task_id="start", dag=dag)

    branches = []

    def return_dates_branches(**kwargs):
        return branches

    gen_url_branch = BranchPythonOperator(
        task_id='generate_url_dates',
        provide_context=True,
        python_callable=return_dates_branches,
        dag=dag)

    def transform_data(**kwargs):
        ti = kwargs['ti']
        raw_data = ti.xcom_pull(task_ids=return_dates_branches())
        data = []
        logging.info(raw_data)
        if raw_data is not None:
            flat_list = [item for sublist in raw_data for item in sublist]
            for row in flat_list:
                row = list(row)
                # add À-ÿ for spanish accents
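                # e.g. a raw value like 'SAB07DIC' (assumed format) splits
                # into ['', 'SAB', '07', 'DIC', '']; slice [2:4] gives
                # ['07', 'DIC'], joined as '07/DIC' for dateparser below.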
                date = '/'.join(
                    list(
                        re.compile("([A-ZÀ-ÿ]+)(\d+)([A-ZÀ-ÿ]+)").split(
                            row[1]))[2:4])
                date = dateparser.parse(date,
                                        languages=['pt', 'es'],
                                        date_formats=['%d/%b'
                                                      ]).strftime('%Y-%m-%d')
                row[1] = date
                td = row[4].split(':')
                row[4] = str(timedelta(hours=int(td[0]), minutes=int(td[1])))
                row[5] = int(row[5].replace('.', ''))
                row[6] = int(row[6].replace('.', ''))
                row[8] = row[8].split(' ')[-1]
                row.insert(0, datetime.now().strftime('%Y-%m-%d'))
                data.append(tuple(row))
            return data
        else:
            print('No data received')

    t2 = PythonOperator(
        task_id='transform_data',
        python_callable=transform_data,
        depends_on_past=True,
        trigger_rule=TriggerRule.ALL_SUCCESS,
        provide_context=True,
        dag=dag,
    )

    t2.doc_md = """
    #### Task Documentation
    Transform fetched data
    @return a list of tuples
    """

    # def gen_url_dates(**kwargs):
    date_start = read_scraped_date(airpots_codes)
    date_end = date_start + timedelta(days=AMOUNT_DAYS)
    date_generated = [
        date_start + timedelta(days=x)
        for x in range(0, (date_end - date_start).days)
    ]

    for i, date in enumerate(date_generated):
        date_ml = str(date.timestamp())[:8] + '00000'
        url_dated = """https://www.smiles.com.ar/emission?originAirportCode={}&destinationAirportCode={}&departureDate={}&adults=1&children=0&infants=0&isFlexibleDateChecked=false&tripType=3&currencyCode=BRL&segments=2&departureDate2={}&originAirportCode2={}&destinationAirportCode2={}""".format(
            airpots_codes[0][0], airpots_codes[1], date_ml, date_ml,
            airpots_codes[0][1], airpots_codes[1])

        get_data_op = PythonOperator(
            task_id='get_data_{}and{}to{}_{}'.format(airpots_codes[0][0],
                                                     airpots_codes[0][1],
                                                     airpots_codes[1], i),
            python_callable=get_data_URL,
            op_kwargs={'URL': url_dated},
            trigger_rule=TriggerRule.ONE_SUCCESS,
            provide_context=True,
            dag=dag,
        )
        branches.append(get_data_op.task_id)
        get_data_op.set_upstream(gen_url_branch)
        get_data_op.set_downstream(t2)
        get_data_op.doc_md = """
        #### Task Documentation
        Fetch data from passed url
        return list of semi-parsed data
        """

    insert_data = PythonOperator(
        task_id='insert_data',
        python_callable=insert_into_table,
        provide_context=True,
        dag=dag,
    )

    insert_data.doc_md = """
    #### Task Documentation
    Insert parsed and transformed data into table
    """
    t2.set_downstream(insert_data)
    gen_url_branch.set_upstream(start)

    return dag
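
# Typical use of this factory (sketch): call it once per route and register
# the result in globals() so Airflow discovers the DAGs. Every value below is
# hypothetical; only the signature comes from create_dag above.
#
# for origins, destination in [(('EZE', 'AEP'), 'GRU')]:
#     dag_id = 'smiles_{}_{}_{}'.format(origins[0], origins[1], destination)
#     globals()[dag_id] = create_dag(dag_id,
#                                    schedule='@daily',
#                                    start_date=datetime(2021, 1, 1),
#                                    delta_sensor=30,
#                                    airpots_codes=(origins, destination),
#                                    default_args=default_args)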
Example #12
    # s.connect((socket_ip, socket_port))
    s.connect(socket_path)
    s.settimeout(60.0)
    s.send(query)
    s.shutdown(socket.SHUT_WR)
    output = b''  # recv() returns bytes, so accumulate bytes
    while True:
        try:
            out = s.recv(100000000)
        except socket.timeout as e:
            print('socket timeout ..Exiting')
            if e.args[0] == 'timed out':
                sys.exit(1)
            break
        if not out:
            break
        output += out

    return output


extract_data = BranchPythonOperator(
    task_id="ExtractData",
    provide_context=False,
    python_callable=connect_and_extract,
    dag=main_service_dag)