Example #1
def not_pay_push(**op_kwargs):
    dt = op_kwargs.get('ds')
    env = op_kwargs.get('env', 'prod')
    lagos_9_clock_timestamp = get_lagos_timestamp(dt)
    cursor = get_hive_cursor()
    table_name = 'data_order'
    table_name2 = 'data_user_whitelist'
    if env == 'test':
        table_name += '_dev'
        table_name2 += '_dev'
    cursor.execute("msck repair table oride_db.%s" % table_name)
    cursor.execute("msck repair table oride_db.%s" % table_name2)
    cursor.execute(
        not_pay_hql.format(table_name=table_name,
                           table_name2=table_name2,
                           dt=dt))
    res = [x[0] for x in cursor.fetchall()]
    print("not pay order ids: %d" % len(res))
    step = 100
    db_name = 'sqoop_db'
    if env == 'test':
        db_name += '_test'
    mysql_cursor = get_db_conn(db_name).cursor()
    uids = set()
    for i in range(0, len(res), step):
        tmp = [str(x) for x in res[i:i + step]]
        sql = not_pay_sql.format(ids=','.join(tmp))
        mysql_cursor.execute(sql)
        data = mysql_cursor.fetchall()
        for rec in data:
            uids.add(rec[0])
    print("not pay user ids: %d" % len(uids))
    print(uids)
    for uid in uids:
        send_push(env, 1, uid, lagos_9_clock_timestamp, "not_pay")
Example #2
def hiveresult_to_channel_mysql(ds, **kwargs):
    cursor = get_hive_cursor()
    logging.info(kwargs['sql'].format(ds=ds))
    cursor.execute(kwargs['sql'].format(ds=ds))
    results = cursor.fetchall()
    mysql_conn = get_db_conn('opay_spread_mysql')
    mcursor = mysql_conn.cursor()
    sql_insert = kwargs['sql_insert']
    sql_val = ''
    sql_ext = kwargs['sql_ext']
    sql_count = 0
    for day, channel, driver_type, drivers in results:
        sql_tmp = "('{day}', '{channel}', '{driver_type}', '{dirvers}')".format(
            day=day, channel=channel, driver_type=driver_type, dirvers=drivers)
        if sql_val == '':
            sql_val = sql_tmp
        else:
            sql_val += ',' + sql_tmp
        sql_count += 1
        if sql_count >= 1000:
            sql = sql_insert + ' ' + sql_val + ' ' + sql_ext
            # logging.info(sql)
            mcursor.execute(sql)
            sql_count = 0
            sql_val = ''

    if sql_count > 0:
        sql = sql_insert + ' ' + sql_val + ' ' + sql_ext
        mcursor.execute(sql)

    mysql_conn.commit()
    cursor.close()
    mcursor.close()
    mysql_conn.close()
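The example above batches rows into multi-VALUES statements by string concatenation; the same pattern recurs in Examples #24 and #27. A minimal alternative sketch using DB-API parameter binding, so the driver escapes values itself (the table and column names here are illustrative, not from the original):

def rows_to_mysql_batched(mysql_conn, rows, batch_size=1000):
    # Hypothetical target table/columns; adjust to the real schema.
    sql = ("INSERT INTO promoter_channel_day (day, channel, driver_type, drivers) "
           "VALUES (%s, %s, %s, %s) "
           "ON DUPLICATE KEY UPDATE drivers = VALUES(drivers)")
    cursor = mysql_conn.cursor()
    for i in range(0, len(rows), batch_size):
        # executemany lets the driver quote each value safely
        cursor.executemany(sql, rows[i:i + batch_size])
    mysql_conn.commit()
    cursor.close()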
Example #3
def validate_partition(*op_args, **op_kwargs):
    cursor = get_hive_cursor()
    dt = op_kwargs['ds']
    table_names = op_kwargs['table_names']
    task_name = op_kwargs['task_name']
    for table_name in table_names:
        sql = '''
            show partitions {table_name}
        '''.format(
            table_name=table_name
        )

        cursor.execute(sql)
        res = cursor.fetchall()

        flag = False
        for partition in res:
            if str(partition[0]).find(dt) > -1:
                flag = True
                break

        if not flag:
            comwx.postAppMessage('{table_name}: partition {dt} does not exist, task {task_name} aborted'.format(
                table_name=table_name,
                dt=dt,
                task_name=task_name
            ), '271')
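The substring test above (`find(dt) > -1`) can also match partitions where the date string merely appears inside another value. A stricter sketch that parses each partition spec and compares the dt value exactly (assuming specs of the form 'dt=...' or 'country_code=.../dt=...'):

def partition_exists(cursor, table_name, dt):
    # Return True if some partition's dt value equals dt exactly.
    cursor.execute('show partitions {table_name}'.format(table_name=table_name))
    for (spec,) in cursor.fetchall():
        # spec looks like 'dt=2019-08-01' or 'country_code=nal/dt=2019-08-01'
        parts = dict(kv.split('=', 1) for kv in spec.split('/'))
        if parts.get('dt') == dt:
            return True
    return False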
Example #4
def abnormal_push(**op_kwargs):
    dt = op_kwargs.get('ds')
    env = op_kwargs.get('env', 'prod')
    lagos_9_clock_timestamp = get_lagos_timestamp(dt)
    cursor = get_hive_cursor()
    table_record = 'data_driver_recharge_records'
    table_abnormal = 'data_abnormal_order'
    table_white = 'data_driver_whitelist'
    if env == 'test':
        table_record += '_dev'
        table_abnormal += '_dev'
        table_white += '_dev'
    cursor.execute("msck repair table oride_db.%s" % table_record)
    cursor.execute("msck repair table oride_db.%s" % table_abnormal)
    cursor.execute("msck repair table oride_db.%s" % table_white)
    cursor.execute(
        abnormal_sql.format(table_record=table_record,
                            table_white=table_white,
                            table_abnormal=table_abnormal,
                            dt=dt))
    abnormal_drivers = [x[0] for x in cursor.fetchall()]
    print("abnormal order related drivers: %d" % len(abnormal_drivers))
    print(abnormal_drivers)
    for did in abnormal_drivers:
        send_push(env, 2, did, lagos_9_clock_timestamp, "deduct")
Example #5
def get_location(hive_db, hive_table):
    """
        读取hive 表location
    """

    location = None

    hive_cursor = get_hive_cursor()
    hql = '''
        DESCRIBE FORMATTED {db}.{table} 
    '''.format(db=hive_db, table=hive_table)
    #logging.info(hql)
    hive_cursor.execute(hql)
    res = hive_cursor.fetchall()

    for (col_name, col_type, col_comment) in res:
        col_name = col_name.lower().strip()
        if col_name == 'location:':
            location = col_type
            break

    return location
Example #6
def is_alert(dt, table_names):
    cursor = get_hive_cursor()
    template = "'{table_name}',"
    table_list = ''

    for table_name in table_names:
        if str(table_name).find('.') > -1:
            table_name = str(table_name).split('.')[1]
        table_list += template.format(table_name=table_name)

    table_list = table_list[:-1]  # drop the trailing comma

    sql = '''
        select 
        count(1)
        from 
        oride_bi.oride_meta_import_data
        where dt = '{dt}'
        and table_name in ({table_list})
        and is_import = 0
    '''.format(dt=dt,
               table_list=table_list)

    logging.info(sql)

    cursor.execute(sql)
    res = cursor.fetchall()
    result = int(res[0][0])

    return result
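A usage sketch: the returned count is the number of listed tables whose dt partition was not imported (is_import = 0), so callers treat any non-zero value as an alert. The table names below are illustrative:

if is_alert('2019-08-01', ['oride_db.data_order', 'oride_db.data_user']) > 0:
    raise Exception('upstream import not finished, aborting downstream task')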
Example #7
    def get_country_code(self):
        cursor = get_hive_cursor()

        # get the two-letter country codes present in the partition
        get_sql = '''
        select concat_ws(',',collect_set(country_code)) as country_code from {db}.{table} WHERE dt='{pt}'
        '''.format(pt=self.ds, table=self.table_name, db=self.db_name)

        cursor.execute(get_sql)
        res = cursor.fetchone()

        if len(res[0]) > 1:
            country_code_list = res[0]
            logging.info('Executing two-letter country codes: %s', country_code_list)
        else:
            country_code_list = "nal"
            logging.info('country codes empty, using default value %s', country_code_list)

        return country_code_list
Example #8
def check_key_data_task(ds):
    # check for duplicated primary keys
    HQL_DQC = '''
    SELECT count(1) as nm
    FROM
     (SELECT order_id,
             count(1) as cnt
      FROM oride_dw.{table}

      WHERE dt='{pt}'
      GROUP BY order_id HAVING count(1)>1) t1
    '''.format(
        pt=ds,
        now_day=airflow.macros.ds_add(ds, +1),
        table=table_name
    )

    cursor = get_hive_cursor()
    logging.info('Executing primary key duplication check: %s', HQL_DQC)

    cursor.execute(HQL_DQC)
    res = cursor.fetchone()

    if res[0] > 0:  # any row with cnt > 1 means order_id is duplicated
        raise Exception("Error: duplicated primary key!", res)
    else:
        print("-----> Notice: primary key check passed ......")
Example #9
def check_key_data_task(ds):
    cursor = get_hive_cursor()

    # check for duplicated primary keys
    check_sql = '''
    SELECT count(1)-count(distinct (concat(order_id,'_',user_id))) as cnt
      FROM {db}.{table}
      WHERE dt='{pt}'
      and country_code in ('nal')
    '''.format(pt=ds,
               now_day=airflow.macros.ds_add(ds, +1),
               table=table_name,
               db=db_name)

    logging.info('Executing primary key duplication check: %s', check_sql)

    cursor.execute(check_sql)

    res = cursor.fetchone()

    if res[0] > 0:
        # count(1) > count(distinct ...) means at least one duplicated key
        raise Exception("Error: duplicated primary key!", res)

    print("-----> Notice: primary key check passed ......")
    return 0
Example #10
def csresult_channel_to_mysql(ds, **kwargs):
    cursor = get_hive_cursor()
    logging.info(cssql.format(ds=ds))
    cursor.execute(cssql.format(ds=ds))
    results = cursor.fetchall()
    mysql_conn = get_db_conn('opay_spread_mysql')
    mcursor = mysql_conn.cursor()

    sql_insert = '''
        INSERT INTO promoter_order_day (
            dt, driver_id, driver_type, name, mobile, city_id, distance, income, online_paid, online_total, total_orders,
            arrived_orders, total_comments, bad_comments, total_score, online_time
        ) VALUES
    '''
    # sql_ext is defined but never appended to the statements built below
    sql_ext = '''
        ON DUPLICATE KEY UPDATE
    '''
    sql_val = ''
    sql_count = 0
    for driver_id, dt, name, phone, city, dtype, distance, income, onlineSettlement, onlineTotal, total_orders, arrived_orders, comment, badcomments_num, score, onlinetime in results:
        sql_tmp = '''
            ('{dt}', '{driver_id}', '{driver_type}', '{name}', '{mobile}', '{city_id}', '{distance}', '{income}', '{online_paid}', '{online_total}', '{total_orders}', '{arrived_orders}', '{total_comments}', '{bad_comments}', '{total_score}', '{online_time}')
        '''.format(dt=dt,
                   driver_id=driver_id,
                   driver_type=dtype,
                   name=name.replace("\\", "").replace("'", "\\'"),
                   mobile=phone,
                   city_id=city,
                   distance=distance,
                   income=income,
                   online_paid=onlineSettlement,
                   online_total=onlineTotal,
                   total_orders=total_orders,
                   arrived_orders=arrived_orders,
                   total_comments=comment,
                   bad_comments=badcomments_num,
                   total_score=score,
                   online_time=onlinetime)

        if sql_val == '':
            sql_val = sql_tmp
        else:
            sql_val += ',' + sql_tmp
        sql_count += 1
        if sql_count >= 1000:
            sql = sql_insert + ' ' + sql_val
            mcursor.execute(sql)
            sql_count = 0
            sql_val = ''

    if sql_count > 0:
        sql = sql_insert + ' ' + sql_val
        mcursor.execute(sql)

    mysql_conn.commit()
    cursor.close()
    mcursor.close()
    mysql_conn.close()
Example #11
def get_data_from_hive(ds, execution_date, **op_kwargs):
    # ds = op_kwargs.get('ds', time.strftime('%Y-%m-%d', time.localtime(time.time() - 86400)))
    hql = '''
        SELECT 
            create_date_hour , 
            sub_service_type , 
            state , 
            region , 
            order_status , 
            order_cnt , 
            order_amt,
            country_code , 
            dt , 
            hour
        from opay_dw.app_opay_cico_sum_ng_h
        where 
        country_code = 'NG'
    and concat(dt,' ',hour) >= date_format(default.localTime("{config}", 'NG', '{v_date}', -1), 'yyyy-MM-dd HH')
    and concat(dt,' ',hour) <= date_format(default.localTime("{config}", 'NG', '{v_date}', 0), 'yyyy-MM-dd HH')

    '''.format(
        pt=ds,
        v_date=execution_date.strftime("%Y-%m-%d %H:%M:%S"),
        config=config
    )

    logging.info(hql)
    hive_cursor = get_hive_cursor()
    hive_cursor.execute(hql)
    hive_data = hive_cursor.fetchall()

    mysql_conn = get_db_conn('mysql_bi')
    mcursor = mysql_conn.cursor()

    __data_only_mysql(
        mcursor,
        execution_date
    )

    __data_to_mysql(
        mcursor,
        hive_data,
        [
            'create_date_hour',
            'sub_service_type',
            'state',
            'region',
            'order_status',
            'order_cnt',
            'order_amt',
            'country_code',
            'dt',
            'hour'
        ]
    )

    hive_cursor.close()
    mcursor.close()
Example #12
def send_bdm_dim_file_email(ds, ds_nodash, **kwargs):
    cursor = get_hive_cursor()
    sql = """
        select  
        dt,
        area_name,
        --points,
        bdm_name,
        hbdm_name,
        take_time_avg,
        delivery_time_avg,
        score_peisong_avg,
        cancel_order_cnt,
        concat(cast(nvl(round(sys_cancel_order_cnt * 100 / cancel_order_cnt,1),0) as string),'%'),
        concat(cast(nvl(round(user_cancel_order_cnt * 100/cancel_order_cnt,1),0) as string),'%'),
        concat(cast(nvl(round(merchant_cancel_order_cnt * 100/cancel_order_cnt,1),0) as string),'%')
        
        from ofood_bi.ofood_bdm_area_metrics_report 
        where dt = '{dt}' 

    """.format(dt=ds, ds=ds_nodash)

    headers = [
        'day',
        'area_name',
        #'points',
        'bdm_name',
        'hbdm_name',
        'time_pick',
        'time_peisong',
        'score_peisong',
        'total_cancel',
        # the last three must follow the SELECT order: sys, user, merchant
        'total_auto_cancel',
        'total_user_cancel',
        'total_merchant_cancel'
    ]

    logging.info('Executing: %s', sql)
    cursor.execute(sql)
    rows = cursor.fetchall()

    file_name = '/tmp/ofood_bdm_dim_metrics_{dt}.csv'.format(dt=ds)
    with codecs.open(file_name, 'w', 'utf_8_sig') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(headers)
        f_csv.writerows(rows)

    # send mail
    email_to = Variable.get("ofood_honour_metrics_receivers").split()
    # email_to = ['*****@*****.**']
    email_subject = 'ofood BDM fulfillment daily data_{dt}'.format(dt=ds)
    email_body = 'ofood BDM fulfillment daily data'
    send_email(email_to,
               email_subject,
               email_body, [file_name],
               mime_charset='utf-8')
Example #13
def check_bad_debts_data(**op_kwargs):
    dt = op_kwargs.get('ds')
    print(dt)
    cursor = get_hive_cursor()
    cursor.execute("hive.execution.engine=tez")
    repair_table_names = ["data_order", "data_order_payment"]
    for name in repair_table_names:
        print(name)
        cursor.execute(repair_table_query % name)
    build_csv(dt)
    cursor.execute(query1.format(dt=dt))
    res1 = cursor.fetchall()
    user_view_bad_debts = {}
    driver_view_bad_debts = {}
    for line in res1:
        (order_id, user_id, driver_id, price) = line
        price = float(price)
        if user_id not in user_view_bad_debts:
            user_view_bad_debts[user_id] = [set(), 0]
        if order_id not in user_view_bad_debts[user_id][0]:
            user_view_bad_debts[user_id][0].add(order_id)
            user_view_bad_debts[user_id][1] += price
        if driver_id not in driver_view_bad_debts:
            driver_view_bad_debts[driver_id] = [set(), 0]
        if order_id not in driver_view_bad_debts[driver_id][0]:
            driver_view_bad_debts[driver_id][0].add(order_id)
            driver_view_bad_debts[driver_id][1] += price
    user_data, driver_data = [], []
    for uid in user_view_bad_debts:
        user_data.append([
            uid,
            len(user_view_bad_debts[uid][0]), user_view_bad_debts[uid][1]
        ])
    for did in driver_view_bad_debts:
        driver_data.append([
            did,
            len(driver_view_bad_debts[did][0]), driver_view_bad_debts[did][1]
        ])
    # sort according to the amount price
    user_data.sort(key=lambda x: x[2], reverse=True)
    driver_data.sort(key=lambda x: x[2], reverse=True)
    user_titles = ["user_id", "amount of order", "amount of price"]
    driver_titles = ["driver_id", "amount of order", "amount of price"]
    user_data = [user_titles] + user_data[:bad_debt_email_limit]
    driver_data = [driver_titles] + driver_data[:bad_debt_email_limit]
    msg = build_html_txt(user_data, driver_data, dt)
    try:
        server = smtplib.SMTP('mail.opay-inc.com', 25)
        server.ehlo()
        server.starttls()
        server.login(sender, password)
        server.sendmail(sender, receivers, msg.as_string())
        print("邮件发送成功")
    except smtplib.SMTPException as e:
        print(e)  # Python 3 exceptions have no .message attribute
Example #14
def get_table_schema(hive_db, hive_table):
    """
        读取hive 表结构
    """

    hive_cursor = get_hive_cursor()
    hql = '''
        DESCRIBE FORMATTED {db}.{table} 
    '''.format(db=hive_db, table=hive_table)
    logging.info(hql)
    hive_cursor.execute(hql)
    res = hive_cursor.fetchall()

    hive_schema = []
    hive_schema_exp = []  # full "name type COMMENT ..." lines; built but not returned
    location = None

    for (column_name, column_type, column_comment) in res:
        col_name = column_name.lower().strip()
        column_type = str(column_type).strip()

        if col_name == 'location:':
            location = column_type
            break

        # replace empty comments with a default ("未知" = "unknown")
        if column_comment == "" or column_comment == "from deserializer":
            column_comment = "未知"

        if col_name == '# col_name' or col_name == '':
            continue

        if col_name == '# partition information':
            if column_comment is None:
                column_comment = "未知"
            break

        _schema = col_name + " " + column_type + " " + "COMMENT" + " " + column_comment.replace(
            "\\n", "") + "\n"
        hive_schema_exp.append(_schema)
        hive_schema.append(col_name + ",--" + column_comment)

    return hive_schema
Example #15
def first_user_data(**op_kwargs):
    cursor = get_hive_cursor()
    dt = op_kwargs.get('ds')
    cursor.execute("SET mapreduce.job.queuename=root.airflow")
    cursor.execute("SET hive.exec.parallel=true")
    hql = """
        SELECT 
            uc.code,
            from_unixtime(unix_timestamp(uo.dt,'yyyy-MM-dd'), 'yyyyMMdd') AS day,
            COUNT(DISTINCT uo.user_id) AS u, 
            unix_timestamp() 
        FROM (SELECT 
                user_id,
                get_json_object(event_value, '$.bind_refferal_code') AS code 
            FROM oride_dw.dwd_oride_driver_cheating_detection_hi 
            ) AS uc 
        JOIN (SELECT 
                dt,
                passenger_id as user_id,
                arrive_time,
                row_number() over(partition by passenger_id order by arrive_time) orders
            FROM oride_dw.dwd_oride_order_base_include_test_di 
            WHERE status IN (4,5) AND 
                dt = '{ds}' 
            ) AS uo 
        ON uc.user_id = uo.user_id 
        WHERE uo.orders = 1 and 
            from_unixtime(uo.arrive_time,'yyyy-MM-dd') = '{ds}' 
        GROUP BY uc.code, uo.dt
    """.format(ds=dt)
    logging.info(hql)
    cursor.execute(hql)
    res = cursor.fetchall()
    mconn = get_db_conn('opay_spread_mysql')
    mysql = mconn.cursor()
    sql = 'insert into promoter_data_day (code, day, pft, create_time) values '
    ext = ' on duplicate key update pft=values(pft), create_time=values(create_time)'
    vals = []
    for (c, d, p, t) in res:
        vals.append("('{c}', '{d}', '{p}', '{t}')".format(c=c, d=d, p=p, t=t))
        if len(vals) >= 1000:
            # logging.info(sql + ",".join(vals) + ext)
            mysql.execute(sql + ",".join(vals) + ext)
            vals = []

    if len(vals) > 0:
        # logging.info(sql + ",".join(vals) + ext)
        mysql.execute(sql + ",".join(vals) + ext)

    mysql.close()
    cursor.close()
Example #16
def send_shop_list_file_email(ds, ds_nodash, **kwargs):
    cursor = get_hive_cursor()
    sql = """
        select  
        dt,
        shop_id,
        title,
        bd_name,
        bdm_name,
        hbdm_name,
        his_order_cnt,
        if(closed = 0,'Y','N'),
        if(is_new_user_act = 1,'Y','N'),
        if(is_promotion_act = 1,'Y','N'),
        yy_peitime,
        product_cnt,
        addr,
        account_number
        
        from ofood_bi.ofood_shop_list_metrics_report 
        where dt = '{dt}' 

    """.format(dt=ds, ds=ds_nodash)

    headers = [
        'day', 'shop_id', 'title', 'bd_name', 'bdm_name', 'hbdm_name',
        'his_order_cnt', 'is_open(Y or N)', 'activity_of_new_user(Y or N)',
        'activity_of_promotion(Y or N)', 'business_time', 'menu_item',
        'location', 'opay_account'
    ]

    logging.info('Executing: %s', sql)
    cursor.execute(sql)
    rows = cursor.fetchall()

    file_name = '/tmp/ofood_shop_list_metrics_{dt}.csv'.format(dt=ds)
    with codecs.open(file_name, 'w', 'utf_8_sig') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(headers)
        f_csv.writerows(rows)

    # send mail
    email_to = Variable.get("ofood_honour_metrics_receivers").split()
    # email_to = ['*****@*****.**']
    email_subject = 'ofood merchant detail list daily data_{dt}'.format(dt=ds)
    email_body = 'ofood merchant detail list daily data'
    send_email(email_to,
               email_subject,
               email_body, [file_name],
               mime_charset='utf-8')
Example #17
    def __init__(self):
        self.hive_cursor = get_hive_cursor()

        self.dingding_alert = DingdingAlert(
            'https://oapi.dingtalk.com/robot/send?access_token=928e66bef8d88edc89fe0f0ddd52bfa4dd28bd4b1d24ab4626c804df8878bb48'
        )
        #self.dingding_alert = DingdingAlert_dev('https://oapi.dingtalk.com/robot/send?access_token=c08440c8e569bb38ec358833f9d577b7638af5aaefbd55e3fd748b798fecc4d4')

        self.alert_url = "http://8.208.14.165:8080/admin/airflow/tree?dag_id="
        self.owner_name = None
        self.hdfs_dir_name = None
Example #18
def import_table(**op_kwargs):
    dt = op_kwargs.get('ds')
    env = op_kwargs.get("env")
    print("running date: %s" % dt)
    cursor = get_hive_cursor()
    conf_name = "sqoop_db"
    if env == "test":
        conf_name += "_test"
    host, port, schema, login, password = get_db_conf(conf_name)
    host += ":" + str(port)
    for table in tables:
        print("importing table: %s" % table)
        hive_table = table
        if env == "test":
            hive_table += "_dev"
        # NB: credentials are interpolated into a shell command here
        os.system(default_command % (sqoop_path, host, schema, login, password, table, hive_table, dt))
        cursor.execute("ALTER TABLE oride_db.%s ADD IF NOT EXISTS PARTITION (dt = '%s')" % (hive_table, dt))
    print("import done")
Example #19
def base_data(**op_kwargs):
    cursor = get_hive_cursor()
    dt = op_kwargs.get('ds')
    cursor.execute("SET mapreduce.job.queuename=root.airflow")
    cursor.execute("SET hive.exec.parallel=true")
    hql = """
        SELECT
            t.code,
            from_unixtime(unix_timestamp(dt,'yyyy-MM-dd'), 'yyyyMMdd') as day,
            COUNT(DISTINCT t.bind_number) as users_count,
            COUNT(DISTINCT if (length(t.bind_device)>0, t.bind_device, NULL)) as device_count, 
            unix_timestamp() 
        FROM oride_dw.dwd_oride_driver_cheating_detection_hi 
        LATERAL VIEW json_tuple(event_value, 'bind_refferal_code', 'bind_number', 'bind_device_id') t AS code, bind_number, bind_device 
        WHERE dt = '{ds}'
        GROUP BY t.code, dt
    """.format(ds=dt)
    logging.info(hql)
    cursor.execute(hql)
    res = cursor.fetchall()
    mconn = get_db_conn('opay_spread_mysql')
    mysql = mconn.cursor()
    sql = 'insert into promoter_data_day (code, day, users_count, device_count, create_time) values '
    ext = """ on duplicate key update 
        users_count=values(users_count), 
        device_count=values(device_count), 
        create_time=values(create_time)
    """
    vals = []
    for (code, day, users, device, t) in res:
        vals.append("('{code}', '{day}', '{user}', '{d}', '{t}')".format(
            code=code, day=day, user=users, d=device, t=t))
        if len(vals) >= 1000:
            # logging.info(sql + ",".join(vals) + ext)
            mysql.execute(sql + ",".join(vals) + ext)
            vals = []

    if len(vals) > 0:
        # logging.info(sql + ",".join(vals) + ext)
        mysql.execute(sql + ",".join(vals) + ext)

    mysql.close()
    cursor.close()
Example #20
def get_max_week(ds):
    sql = '''
        select 
            max(week) as max_week
        from oride_dw.dwm_oride_passenger_act_w
        where datediff('{pt}',dt)<=90 and datediff('{pt}',dt)>=0
    '''.format(
        pt=airflow.macros.ds_add(ds, +6)
    )

    cursor = get_hive_cursor()
    logging.info('Executing: %s', sql)
    cursor.execute(sql)
    week_list = cursor.fetchall()
    cursor.close()
    max_week = None  # stays None when no weeks matched
    for week in week_list:
        max_week = week[0]
    return max_week
Example #21
def build_csv(dt):
    col_name = [
        'a.id', 'a.user_id', 'a.start_name', 'a.end_name', 'a.duration',
        'a.distance', 'a.price', 'a.reward', 'a.driver_id', 'a.take_time',
        'a.wait_time', 'a.pickup_time', 'a.arrive_time', 'a.finish_time',
        'a.cancel_role', 'a.cancel_reason', 'a.cancel_time', 'a.cancel_type',
        'a.status', 'a.dt', 'b.id', 'b.driver_id', 'b.mode', 'b.price',
        'b.coupon_id', 'b.coupon_amount', 'b.amount', 'b.bonus', 'b.balance',
        'b.opay_amount', 'b.reference', 'b.currency', 'b.country',
        'b.zfstatus', 'b.modify_time', 'b.create_time', 'b.dt'
    ]
    cursor = get_hive_cursor()
    cursor.execute(tmp_query.format(dt=dt))
    res = cursor.fetchall()
    with open("/tmp/tainzhi_query_%s.csv" % dt, "w") as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(col_name)
        for elem in res:
            csv_writer.writerow(elem)
    print("tz csv write done")
Example #22
def drop_partions(*op_args, **op_kwargs):
    dt = op_kwargs['ds']
    cursor = get_hive_cursor()
    sql = '''
        show partitions {table_name}
    '''.format(table_name=hive_table)
    cursor.execute(sql)
    res = cursor.fetchall()
    logging.info(res)
    for partition in res:
        prt, = partition
        matched = re.search(r'country_code=(?P<cc>\w+)/dt=(?P<dy>.*)$', prt)
        if matched is None:
            # skip partitions that do not match the expected country_code/dt layout
            continue
        cc = matched.groupdict().get('cc', 'nal')
        dy = matched.groupdict().get('dy', '')
        if dy == dt:
            hql = '''
                ALTER TABLE {table_name} DROP IF EXISTS PARTITION (country_code='{cc}', dt='{dt}')
            '''.format(cc=cc, dt=dt, table_name=hive_table)
            logging.info(hql)
            cursor.execute(hql)
Example #23
def check_key_data_task(ds):

    cursor = get_hive_cursor()

    # check for duplicated primary keys
    check_sql = '''
    SELECT count(1)-count(distinct passenger_id,passenger_number,client_timestamp,
         platform,
         os_version,
         app_name,
         app_version,
         locale,
         device_id,
         device_screen,
         device_model,
         device_manufacturer,
         is_root,
         channel,
         subchannel,
         appsflyer_id) as cnt
      FROM {db}.{table}
      WHERE dt='{pt}'
    '''.format(pt=ds,
               now_day=airflow.macros.ds_add(ds, +1),
               table=table_name,
               db=db_name)
    logging.info('Executing primary key duplication check: %s', check_sql)

    cursor.execute(check_sql)

    res = cursor.fetchone()

    if res[0] > 0:
        # count(1) > count(distinct ...) means at least one duplicated key
        raise Exception("Error: duplicated primary key!", res)

    print("-----> Notice: primary key check passed ......")
    return 0
Example #24
def order_result_to_mysql(ds, **kwargs):
    cursor = get_hive_cursor()
    logging.info(promoter_orderoverview_hql.format(ds=ds))
    cursor.execute(promoter_orderoverview_hql.format(ds=ds))
    results = cursor.fetchall()
    mysql_conn = get_db_conn('opay_spread_mysql')
    mcursor = mysql_conn.cursor()

    sql_insert = 'INSERT INTO promoter_driver_day (day, name, mobile, code, channel, driver_type, firstbill) VALUES'
    sql_ext = 'ON DUPLICATE KEY UPDATE firstbill = values(firstbill)'
    sql_val = ''
    sql_count = 0
    for day, driver_type, channel, name, mobile, code, first, ten in results:
        sql_tmp = "('{day}', '{name}', '{mobile}',  '{code}', '{channel}', '{driver_type}', '{firstbill}')".format(
            day=day,
            name=name.replace("\\", "").replace("'", "\\'"),
            mobile=mobile if (len(mobile) < 20) else '',
            code=code,
            channel=channel,
            driver_type=driver_type,
            firstbill=(first if driver_type == 2 else 0))

        if sql_val == '':
            sql_val = sql_tmp
        else:
            sql_val += ',' + sql_tmp
        sql_count += 1
        if sql_count >= 1000:
            sql = sql_insert + ' ' + sql_val + ' ' + sql_ext
            mcursor.execute(sql)
            sql_count = 0
            sql_val = ''

    if sql_count > 0:
        sql = sql_insert + ' ' + sql_val + ' ' + sql_ext
        mcursor.execute(sql)

    mysql_conn.commit()
    cursor.close()
    mcursor.close()
    mysql_conn.close()
Example #25
def user_label_to_redis(ds, **kwargs):
    label_list = {
        'lab_new_user': 1,
        'lab_login_without_orders': 2,
        'lab_login_have_orders': 3,
        'lab_cancel_ge_finish': 4
    }
    query = """
        SELECT
          user_id,
          lab_new_user,
          lab_login_without_orders,
          lab_login_have_orders,
          lab_cancel_ge_finish,
          phone_number
        FROM
          dashboard.oride_user_label
        WHERE
          dt='{dt}'
    """.format(dt=ds)
    cursor = get_hive_cursor()
    cursor.execute(query)
    results = cursor.fetchall()
    redis_conn = RedisHook(redis_conn_id='redis_user_lab').get_conn()
    expire_time = 86400
    for user_id, lab_new_user, lab_login_without_orders, lab_login_have_orders, lab_cancel_ge_finish, phone_number in results:
        labels = []  # avoid shadowing the builtin name "list"
        if lab_new_user:
            labels.append(label_list['lab_new_user'])
        if lab_login_without_orders:
            labels.append(label_list['lab_login_without_orders'])
        if lab_login_have_orders:
            labels.append(label_list['lab_login_have_orders'])
        if lab_cancel_ge_finish:
            labels.append(label_list['lab_cancel_ge_finish'])
        if len(labels):
            redis_key = 'user_tag_%s' % phone_number
            redis_conn.set(redis_key, json.dumps(labels), ex=expire_time)
            logging.info('user_id:%s, lab_list:%s, key:%s, phone_number:%s' %
                         (user_id, json.dumps(labels), redis_key, phone_number))
    cursor.close()
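A short consumer-side sketch for these keys (same key scheme as above; each user_tag_<phone> key holds a JSON-encoded list of label ids and expires after a day):

import json

def get_user_labels(redis_conn, phone_number):
    # Returns [] when the key is absent or already expired.
    raw = redis_conn.get('user_tag_%s' % phone_number)
    return json.loads(raw) if raw else []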
Example #26
def dirver_daily_summary_insert(ds, **kwargs):
    sql = """
        SELECT
                null as id,
                dt,
                driver_id,
                real_name,
                phone_number,
                group_id,
                nvl(group_name, ''),
                nvl(group_leader, ''),
                order_num,
                order_finished_num,
                order_cancel_num,
                online_time,
                duration_total,
                distance_total,
                comment_scores,
                comment_times,
                peak_time_order_num,
                nvl(app_version, '')
            FROM
                dashboard.oride_driver_daily_summary
            WHERE
                dt='{ds}'
    """.format(ds=ds)
    cursor = get_hive_cursor()
    logging.info("run sql, %s", sql)
    cursor.execute(sql)
    results = cursor.fetchall()
    part_size = 1000
    index = 0
    processes = []
    while index < len(results):
        p = Process(target=dirver_daily_summary_process,
                    args=(results[index:index + part_size], index))
        index += part_size
        processes.append(p)
        p.start()
    for p in processes:
        p.join()
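Starting one Process per 1000-row slice works but can fork many short-lived processes for large result sets. A sketch of the same fan-out with a bounded multiprocessing.Pool (dirver_daily_summary_process is the worker from the example above; the worker count is an assumption):

from multiprocessing import Pool

def run_in_pool(results, part_size=1000, workers=4):
    parts = [(results[i:i + part_size], i) for i in range(0, len(results), part_size)]
    with Pool(processes=workers) as pool:
        # starmap unpacks each (rows, index) tuple into the worker's arguments
        pool.starmap(dirver_daily_summary_process, parts)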
Example #27
def hiveresult_to_mysql(ds, **kwargs):
    cursor = get_hive_cursor()
    logging.info(kwargs['sql'].format(ds=ds))
    cursor.execute(kwargs['sql'].format(ds=ds))
    results = cursor.fetchall()
    mysql_conn = get_db_conn('opay_spread_mysql')
    mcursor = mysql_conn.cursor()
    sql_insert = kwargs['sql_insert']
    sql_val = ''
    sql_ext = kwargs['sql_ext']
    sql_count = 0
    for day, driver_type, channel, name, mobile, code, drivers in results:
        sql_tmp = "('{day}', '{name}', '{mobile}',  '{code}', '{channel}', '{driver_type}', '{dirvers}')".format(
            day=day,
            name=name.replace("\\", "").replace("'", "\\'"),
            code=code,
            mobile=mobile if (len(mobile) < 20) else '',
            channel=channel,
            driver_type=driver_type,
            drivers=drivers)
        if sql_val == '':
            sql_val = sql_tmp
        else:
            sql_val += ',' + sql_tmp
        sql_count += 1
        if sql_count >= 1000:
            sql = sql_insert + ' ' + sql_val + ' ' + sql_ext
            #logging.info(sql)
            mcursor.execute(sql)
            sql_count = 0
            sql_val = ''

    if sql_count > 0:
        sql = sql_insert + ' ' + sql_val + ' ' + sql_ext
        mcursor.execute(sql)

    mysql_conn.commit()
    cursor.close()
    mcursor.close()
    mysql_conn.close()
Example #28
def write_meta_data(table_name, day, result, msg):
    cursor = get_hive_cursor()
    # if not result:
    #     sql = '''
    #         ALTER TABLE oride_db.{table_name} DROP IF EXISTS PARTITION(dt='{day}')
    #     '''.format(
    #         table_name=table_name,
    #         day=day)
    #
    #     cursor.execute(sql)

    sql = '''
        insert into table oride_bi.oride_meta_import_data 
        partition (dt='{day}',table_name='{table_name}')
        select {result},'{msg}','{timestamp}' from default.dual
    '''.format(
        table_name=table_name,
        day=day,
        result=result,
        msg=msg,
        timestamp=now.strftime('%Y-%m-%d %H:%M:%S')
    )

    cursor.execute(sql)
Example #29
def data_volume_monitoring(ds, db_name, table_name, is_valid_success, **op_kwargs):
    cursor = get_hive_cursor()
    sql = """
        SELECT count(1) FROM {db_name}.{table_name} WHERE dt='{dt}'
    """.format(
        db_name=db_name,
        table_name=table_name,
        dt=ds
    )
    logging.info("execute sql:%s", sql)
    cursor.execute(sql)
    res = cursor.fetchone()
    cursor.close()

    row_num = int(res[0])
    logging.info("import data {db}.{table}, row_num:{row_num}".format(db=db_name, table=table_name, row_num=row_num))

    # "true": create _SUCCESS only when data exists; "false": create _SUCCESS even without data
    if is_valid_success.lower() == "false":
        row_num = 1

    if row_num <= 0:
        comwx.postAppMessage("{db}.{table}数据导入异常".format(db=db_name, table=table_name), '271')
        raise Exception('sqoop导入数据异常')
Example #30
def query_data(**op_kwargs):
    dt = op_kwargs.get('ds')
    cursor = get_hive_cursor()
    cursor.execute("set hive.execution.engine=tez")
    repair_table_names = [
        "data_driver_extend", "data_driver_reward", "data_order",
        "data_order_payment", "data_user_extend", "user_action", "client_event"
    ]
    for name in repair_table_names:
        print(name)
        db_name = "oride_source."
        if name.startswith("data"):
            db_name = "oride_db."
        cursor.execute(repair_table_query % (db_name + name))
    cursor.execute(query1.format(dt=dt))
    res1 = cursor.fetchall()
    res1 = map(mapper, list(res1[0]))
    [
        call_num, success_num, gmv, cancel_before_dispatching_num,
        cancel_after_dispatching_by_user_num,
        cancel_after_dispatching_by_driver_num, pickup_num, pickup_total_time,
        take_num, take_total_time, total_driver_price
    ] = res1
    print(1)
    cursor.execute(query2.format(dt=dt))
    res2 = cursor.fetchall()
    res2 = map(mapper, list(res2[0]))
    [pay_num, total_price, total_c_discount, offline_num] = res2
    print(2)
    cursor.execute(query4.format(dt=dt))
    res4 = cursor.fetchall()
    res4 = map(mapper, list(res4[0]))
    [call_user_num, finished_user_num, new_finished_user_num] = res4
    print(4)
    cursor.execute(query5.format(dt=dt))
    res5 = cursor.fetchall()
    res5 = map(mapper, list(res5[0]))
    [total_driver_num, login_driver_num, new_driver_num] = res5
    print(5)
    cursor.execute(query6.format(dt=dt))
    res6 = cursor.fetchall()
    res6 = map(mapper, list(res6[0]))
    [order_driver_num, finished_driver_num, new_finished_driver_num] = res6
    print(6)
    cursor.execute(query7.format(dt=dt))
    res7 = cursor.fetchall()
    res7 = map(mapper, list(res7[0]))
    [bubble_num] = res7
    print(7)
    cursor.execute(query9.format(dt=dt))
    res9 = cursor.fetchall()
    res9 = map(mapper, list(res9[0]))
    [new_passenger_num] = res9
    print(9)
    (transport_efficiency, avg_order_per_driver,
     online_driver_num) = get_driver_data(dt)
    print(10)
    data = [
        success_num, success_num / float(call_num) if call_num > 0 else 0,
        bubble_num, call_num,
        call_num / float(bubble_num) if bubble_num > 0 else 0,
        online_driver_num, order_driver_num,
        round(float(gmv), 2),
        round(float(gmv) / float(success_num) if success_num > 0 else 0, 2),
        round(float(total_driver_price), 2),
        round(float(total_c_discount), 2),
        round(
            float(total_driver_price) /
            float(success_num) if success_num > 0 else 0, 2),
        round(
            float(total_c_discount) /
            float(success_num) if success_num > 0 else 0, 2),
        float(total_driver_price + total_c_discount) /
        float(total_price) if total_price > 0 else 0,
        cancel_before_dispatching_num / float(call_num) if call_num > 0 else 0,
        cancel_after_dispatching_by_user_num /
        float(call_num) if call_num > 0 else 0,
        cancel_after_dispatching_by_driver_num /
        float(call_num) if call_num > 0 else 0,
        round(
            pickup_total_time /
            float(pickup_num * 60) if pickup_num > 0 else 0, 2),
        round(take_total_time / float(take_num) if take_num > 0 else 0,
              2), total_driver_num, new_driver_num, finished_driver_num,
        new_finished_driver_num, new_finished_driver_num /
        float(finished_driver_num) if finished_driver_num > 0 else 0,
        call_user_num, finished_user_num, new_passenger_num,
        new_finished_user_num, new_finished_user_num / float(finished_user_num)
        if finished_driver_num > 0 else 0, new_finished_user_num /
        new_passenger_num if new_passenger_num > 0 else 0, pay_num -
        offline_num, offline_num, transport_efficiency, 0, avg_order_per_driver
    ]
    insert_data = [None, dt] + data
    sql_conn = get_db_conn()
    sql_cursor = sql_conn.cursor()
    sql_cursor.execute(INSERT_SQL, insert_data)
    sql_conn.commit()  # persist the row; the other examples in this set commit explicitly