def not_pay_push(**op_kwargs):
    dt = op_kwargs.get('ds')
    env = op_kwargs.get('env', 'prod')
    lagos_9_clock_timestamp = get_lagos_timestamp(dt)
    cursor = get_hive_cursor()
    table_name = 'data_order'
    table_name2 = 'data_user_whitelist'
    if env == 'test':
        table_name += '_dev'
        table_name2 += '_dev'
    cursor.execute("msck repair table oride_db.%s" % table_name)
    cursor.execute("msck repair table oride_db.%s" % table_name2)
    cursor.execute(not_pay_hql.format(table_name=table_name, table_name2=table_name2, dt=dt))
    res = [x[0] for x in cursor.fetchall()]
    print("not pay order ids: %d" % len(res))
    # Look up the owning users in batches of 100 order ids.
    step = 100
    db_name = 'sqoop_db'
    if env == 'test':
        db_name += '_test'
    mysql_cursor = get_db_conn(db_name).cursor()
    uids = set()
    for i in range(0, len(res), step):
        tmp = [str(x) for x in res[i:i + step]]
        sql = not_pay_sql.format(ids=','.join(tmp))
        mysql_cursor.execute(sql)
        data = mysql_cursor.fetchall()
        for rec in data:
            uids.add(rec[0])
    print("not pay user ids: %d" % len(uids))
    print(uids)
    for uid in uids:
        send_push(env, 1, uid, lagos_9_clock_timestamp, "not_pay")

def hiveresult_to_channel_mysql(ds, **kwargs):
    cursor = get_hive_cursor()
    logging.info(kwargs['sql'].format(ds=ds))
    cursor.execute(kwargs['sql'].format(ds=ds))
    results = cursor.fetchall()
    mysql_conn = get_db_conn('opay_spread_mysql')
    mcursor = mysql_conn.cursor()
    sql_insert = kwargs['sql_insert']
    sql_val = ''
    sql_ext = kwargs['sql_ext']
    sql_count = 0
    for day, channel, driver_type, drivers in results:
        sql_tmp = "('{day}', '{channel}', '{driver_type}', '{drivers}')".format(
            day=day, channel=channel, driver_type=driver_type, drivers=drivers)
        if sql_val == '':
            sql_val = sql_tmp
        else:
            sql_val += ',' + sql_tmp
        sql_count += 1
        # Flush every 1000 rows to keep the INSERT statement bounded.
        if sql_count >= 1000:
            sql = sql_insert + ' ' + sql_val + ' ' + sql_ext
            # logging.info(sql)
            mcursor.execute(sql)
            sql_count = 0
            sql_val = ''
    if sql_count > 0:
        sql = sql_insert + ' ' + sql_val + ' ' + sql_ext
        mcursor.execute(sql)
    mysql_conn.commit()
    cursor.close()
    mcursor.close()
    mysql_conn.close()

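# A minimal wiring sketch for the helper above, assuming it runs inside an
# Airflow DAG. The `dag` object, the task name and the promoter_channel_day
# statements below are hypothetical and only illustrate the expected
# op_kwargs contract (`sql`, `sql_insert`, `sql_ext`):
#
# hiveresult_to_channel_mysql_task = PythonOperator(
#     task_id='hiveresult_to_channel_mysql_task',
#     python_callable=hiveresult_to_channel_mysql,
#     provide_context=True,
#     op_kwargs={
#         'sql': channel_hql,  # hypothetical Hive query yielding (day, channel, driver_type, drivers)
#         'sql_insert': 'INSERT INTO promoter_channel_day (day, channel, driver_type, drivers) VALUES',
#         'sql_ext': 'ON DUPLICATE KEY UPDATE drivers = values(drivers)',
#     },
#     dag=dag,
# )
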
def validate_partition(*op_args, **op_kwargs):
    cursor = get_hive_cursor()
    dt = op_kwargs['ds']
    table_names = op_kwargs['table_names']
    task_name = op_kwargs['task_name']
    for table_name in table_names:
        sql = '''
            show partitions {table_name}
        '''.format(table_name=table_name)
        cursor.execute(sql)
        res = cursor.fetchall()
        flag = False
        for partition in res:
            if str(partition[0]).find(dt) > -1:
                flag = True
                break
        if not flag:
            comwx.postAppMessage(
                '{table_name}: partition {dt} does not exist, task {task_name} aborted'.format(
                    table_name=table_name, dt=dt, task_name=task_name), '271')

def abnormal_push(**op_kwargs):
    dt = op_kwargs.get('ds')
    env = op_kwargs.get('env', 'prod')
    lagos_9_clock_timestamp = get_lagos_timestamp(dt)
    cursor = get_hive_cursor()
    table_record = 'data_driver_recharge_records'
    table_abnormal = 'data_abnormal_order'
    table_white = 'data_driver_whitelist'
    if env == 'test':
        table_record += '_dev'
        table_abnormal += '_dev'
        table_white += '_dev'
    cursor.execute("msck repair table oride_db.%s" % table_record)
    cursor.execute("msck repair table oride_db.%s" % table_abnormal)
    cursor.execute("msck repair table oride_db.%s" % table_white)
    cursor.execute(abnormal_sql.format(
        table_record=table_record, table_white=table_white,
        table_abnormal=table_abnormal, dt=dt))
    abnormal_drivers = [x[0] for x in cursor.fetchall()]
    print("abnormal order related drivers: %d" % len(abnormal_drivers))
    print(abnormal_drivers)
    for did in abnormal_drivers:
        send_push(env, 2, did, lagos_9_clock_timestamp, "deduct")

def get_location(hive_db, hive_table):
    """Read the storage location of a Hive table."""
    location = None
    hive_cursor = get_hive_cursor()
    hql = '''
        DESCRIBE FORMATTED {db}.{table}
    '''.format(db=hive_db, table=hive_table)
    # logging.info(hql)
    hive_cursor.execute(hql)
    res = hive_cursor.fetchall()
    for (col_name, col_type, col_comment) in res:
        col_name = col_name.lower().strip()
        if col_name == 'location:':
            location = col_type
            break
    return location

def is_alert(dt, table_names):
    cursor = get_hive_cursor()
    template = "'{table_name}',"
    table_list = ''
    for table_name in table_names:
        # Strip the db prefix if the name is qualified as db.table.
        if str(table_name).find('.') > -1:
            table_name = str(table_name).split('.')[1]
        table_list += template.format(table_name=table_name)
    # Drop the trailing comma.
    table_list = table_list[0:len(table_list) - 1]
    sql = '''
        select count(1)
        from oride_bi.oride_meta_import_data
        where dt = '{dt}' and table_name in ({table_list}) and is_import = 0
    '''.format(dt=dt, table_list=table_list)
    logging.info(sql)
    cursor.execute(sql)
    res = cursor.fetchall()
    result = int(res[0][0])
    return result

def get_country_code(self):
    cursor = get_hive_cursor()
    # Fetch the distinct two-letter country codes for the partition.
    get_sql = '''
        select concat_ws(',', collect_set(country_code)) as country_code
        from {db}.{table}
        WHERE dt='{pt}'
    '''.format(pt=self.ds, table=self.table_name, db=self.db_name)
    cursor.execute(get_sql)
    res = cursor.fetchone()
    if len(res[0]) > 1:
        country_code_list = res[0]
        logging.info('Executing two-letter country codes: %s', country_code_list)
    else:
        country_code_list = "nal"
        logging.info('Country codes empty, using default value %s', country_code_list)
    return country_code_list

def check_key_data_task(ds):
    # Primary-key duplication check.
    HQL_DQC = '''
        SELECT count(1) as nm
        FROM (SELECT order_id, count(1) as cnt
              FROM oride_dw.{table}
              WHERE dt='{pt}'
              GROUP BY order_id
              HAVING count(1)>1) t1
    '''.format(pt=ds, table=table_name)
    cursor = get_hive_cursor()
    logging.info('Executing primary-key duplication check: %s', HQL_DQC)
    cursor.execute(HQL_DQC)
    res = cursor.fetchone()
    # Any duplicated order_id fails the check.
    if res[0] > 0:
        raise Exception("Error The primary key repeat !", res)
    else:
        print("-----> Notice Data Export Success ......")

def check_key_data_task(ds):
    cursor = get_hive_cursor()
    # Primary-key duplication check on (order_id, user_id).
    check_sql = '''
        SELECT count(1)-count(distinct (concat(order_id,'_',user_id))) as cnt
        FROM {db}.{table}
        WHERE dt='{pt}' and country_code in ('nal')
    '''.format(pt=ds, table=table_name, db=db_name)
    logging.info('Executing primary-key duplication check: %s', check_sql)
    cursor.execute(check_sql)
    res = cursor.fetchone()
    # Any duplicate pair fails the check.
    if res[0] > 0:
        raise Exception("Error The primary key repeat !", res)
    print("-----> Notice Data Export Success ......")
    return 0

def csresult_channel_to_mysql(ds, **kwargs):
    cursor = get_hive_cursor()
    logging.info(cssql.format(ds=ds))
    cursor.execute(cssql.format(ds=ds))
    results = cursor.fetchall()
    mysql_conn = get_db_conn('opay_spread_mysql')
    mcursor = mysql_conn.cursor()
    sql_insert = '''
        INSERT INTO promoter_order_day (
            dt, driver_id, driver_type, name, mobile, city_id, distance,
            income, online_paid, online_total, total_orders, arrived_orders,
            total_comments, bad_comments, total_score, online_time
        ) VALUES
    '''
    sql_val = ''
    sql_count = 0
    for (driver_id, dt, name, phone, city, driver_type, distance, income,
         onlineSettlement, onlineTotal, total_orders, arrived_orders,
         comment, badcomments_num, score, onlinetime) in results:
        sql_tmp = '''
            ('{dt}', '{driver_id}', '{driver_type}', '{name}', '{mobile}',
             '{city_id}', '{distance}', '{income}', '{online_paid}',
             '{online_total}', '{total_orders}', '{arrived_orders}',
             '{total_comments}', '{bad_comments}', '{total_score}', '{online_time}')
        '''.format(dt=dt,
                   driver_id=driver_id,
                   driver_type=driver_type,
                   name=name.replace("\\", "").replace("'", "\\'"),
                   mobile=phone,
                   city_id=city,
                   distance=distance,
                   income=income,
                   online_paid=onlineSettlement,
                   online_total=onlineTotal,
                   total_orders=total_orders,
                   arrived_orders=arrived_orders,
                   total_comments=comment,
                   bad_comments=badcomments_num,
                   total_score=score,
                   online_time=onlinetime)
        if sql_val == '':
            sql_val = sql_tmp
        else:
            sql_val += ',' + sql_tmp
        sql_count += 1
        # Flush every 1000 rows to keep the INSERT statement bounded.
        if sql_count >= 1000:
            sql = sql_insert + ' ' + sql_val
            mcursor.execute(sql)
            sql_count = 0
            sql_val = ''
    if sql_count > 0:
        sql = sql_insert + ' ' + sql_val
        mcursor.execute(sql)
    mysql_conn.commit()
    cursor.close()
    mcursor.close()
    mysql_conn.close()

def get_data_from_hive(ds, execution_date, **op_kwargs):
    # ds = op_kwargs.get('ds', time.strftime('%Y-%m-%d', time.localtime(time.time() - 86400)))
    hql = '''
        SELECT create_date_hour,
               sub_service_type,
               state,
               region,
               order_status,
               order_cnt,
               order_amt,
               country_code,
               dt,
               hour
        from opay_dw.app_opay_cico_sum_ng_h
        where country_code = 'NG'
          and concat(dt, ' ', hour) >= date_format(default.localTime("{config}", 'NG', '{v_date}', -1), 'yyyy-MM-dd HH')
          and concat(dt, ' ', hour) <= date_format(default.localTime("{config}", 'NG', '{v_date}', 0), 'yyyy-MM-dd HH')
    '''.format(
        v_date=execution_date.strftime("%Y-%m-%d %H:%M:%S"),
        config=config)
    logging.info(hql)
    hive_cursor = get_hive_cursor()
    hive_cursor.execute(hql)
    hive_data = hive_cursor.fetchall()
    mysql_conn = get_db_conn('mysql_bi')
    mcursor = mysql_conn.cursor()
    __data_only_mysql(mcursor, execution_date)
    __data_to_mysql(mcursor, hive_data, [
        'create_date_hour', 'sub_service_type', 'state', 'region',
        'order_status', 'order_cnt', 'order_amt', 'country_code', 'dt', 'hour'
    ])
    hive_cursor.close()
    mcursor.close()

def send_bdm_dim_file_email(ds, ds_nodash, **kwargs):
    cursor = get_hive_cursor()
    sql = """
        select
            dt,
            area_name,
            --points,
            bdm_name,
            hbdm_name,
            take_time_avg,
            delivery_time_avg,
            score_peisong_avg,
            cancel_order_cnt,
            concat(cast(nvl(round(sys_cancel_order_cnt * 100 / cancel_order_cnt, 1), 0) as string), '%'),
            concat(cast(nvl(round(user_cancel_order_cnt * 100 / cancel_order_cnt, 1), 0) as string), '%'),
            concat(cast(nvl(round(merchant_cancel_order_cnt * 100 / cancel_order_cnt, 1), 0) as string), '%')
        from ofood_bi.ofood_bdm_area_metrics_report
        where dt = '{dt}'
    """.format(dt=ds)
    # Header order matches the select list above (sys, user, merchant).
    headers = [
        'day',
        'area_name',
        # 'points',
        'bdm_name',
        'hbdm_name',
        'time_pick',
        'time_peisong',
        'score_peisong',
        'total_cancle',
        'total_auto_cancle',
        'total_user_cancle',
        'total_merchant_cancle'
    ]
    logging.info('Executing: %s', sql)
    cursor.execute(sql)
    rows = cursor.fetchall()
    file_name = '/tmp/ofood_bdm_dim_metrics_{dt}.csv'.format(dt=ds)
    with codecs.open(file_name, 'w', 'utf_8_sig') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(headers)
        f_csv.writerows(rows)
    # send mail
    email_to = Variable.get("ofood_honour_metrics_receivers").split()
    # email_to = ['*****@*****.**']
    email_subject = 'ofood BDM fulfillment daily metrics_{dt}'.format(dt=ds)
    email_body = 'ofood BDM fulfillment daily metrics'
    send_email(email_to, email_subject, email_body, [file_name], mime_charset='utf-8')

def check_bad_debts_data(**op_kwargs):
    dt = op_kwargs.get('ds')
    print(dt)
    cursor = get_hive_cursor()
    cursor.execute("set hive.execution.engine=tez")
    repair_table_names = ["data_order", "data_order_payment"]
    for name in repair_table_names:
        print(name)
        cursor.execute(repair_table_query % name)
    build_csv(dt)
    cursor.execute(query1.format(dt=dt))
    res1 = cursor.fetchall()
    # Aggregate bad debts per user and per driver: {id: [order_id set, total price]}.
    user_view_bad_debts = {}
    driver_view_bad_debts = {}
    for line in res1:
        (order_id, user_id, driver_id, price) = line
        price = float(price)
        if user_id not in user_view_bad_debts:
            user_view_bad_debts[user_id] = [set(), 0]
        if order_id not in user_view_bad_debts[user_id][0]:
            user_view_bad_debts[user_id][0].add(order_id)
            user_view_bad_debts[user_id][1] += price
        if driver_id not in driver_view_bad_debts:
            driver_view_bad_debts[driver_id] = [set(), 0]
        if order_id not in driver_view_bad_debts[driver_id][0]:
            driver_view_bad_debts[driver_id][0].add(order_id)
            driver_view_bad_debts[driver_id][1] += price
    user_data, driver_data = [], []
    for uid in user_view_bad_debts:
        user_data.append([uid, len(user_view_bad_debts[uid][0]), user_view_bad_debts[uid][1]])
    for did in driver_view_bad_debts:
        driver_data.append([did, len(driver_view_bad_debts[did][0]), driver_view_bad_debts[did][1]])
    # Sort according to the total price, descending.
    user_data.sort(key=lambda x: x[2], reverse=True)
    driver_data.sort(key=lambda x: x[2], reverse=True)
    user_titles = ["user_id", "amount of order", "amount of price"]
    driver_titles = ["driver_id", "amount of order", "amount of price"]
    user_data = [user_titles] + user_data[:bad_debt_email_limit]
    driver_data = [driver_titles] + driver_data[:bad_debt_email_limit]
    msg = build_html_txt(user_data, driver_data, dt)
    try:
        server = smtplib.SMTP('mail.opay-inc.com', 25)
        server.ehlo()
        server.starttls()
        server.login(sender, password)
        server.sendmail(sender, receivers, msg.as_string())
        print("mail sent successfully")
    except smtplib.SMTPException as e:
        print(e)

def get_table_schema(hive_db, hive_table):
    """Read the schema of a Hive table."""
    hive_cursor = get_hive_cursor()
    hql = '''
        DESCRIBE FORMATTED {db}.{table}
    '''.format(db=hive_db, table=hive_table)
    logging.info(hql)
    hive_cursor.execute(hql)
    res = hive_cursor.fetchall()
    hive_schema = []
    hive_schema_exp = []
    location = None
    for (column_name, column_type, column_comment) in res:
        col_name = column_name.lower().strip()
        column_type = str(column_type).strip()
        if col_name == 'location:':
            location = column_type
            break
        # Replace empty comments with a placeholder.
        if column_comment == "" or column_comment == "from deserializer":
            column_comment = "unknown"
        if col_name == '# col_name' or col_name == '':
            continue
        if col_name == '# partition information':
            if column_comment is None:
                column_comment = "unknown"
            break
        _schema = col_name + " " + column_type + " " + "COMMENT" + " " + column_comment.replace("\\n", "") + "\n"
        hive_schema_exp.append(_schema)
        hive_schema.append(col_name + ",--" + column_comment)
    return hive_schema

def first_user_data(**op_kwargs):
    cursor = get_hive_cursor()
    dt = op_kwargs.get('ds')
    cursor.execute("SET mapreduce.job.queuename=root.airflow")
    cursor.execute("SET hive.exec.parallel=true")
    hql = """
        SELECT uc.code,
               from_unixtime(unix_timestamp(uo.dt, 'yyyy-MM-dd'), 'yyyyMMdd') AS day,
               COUNT(DISTINCT uo.user_id) AS u,
               unix_timestamp()
        FROM (SELECT user_id,
                     get_json_object(event_value, '$.bind_refferal_code') AS code
              FROM oride_dw.dwd_oride_driver_cheating_detection_hi
             ) AS uc
        JOIN (SELECT dt,
                     passenger_id as user_id,
                     arrive_time,
                     row_number() over(partition by passenger_id order by arrive_time) orders
              FROM oride_dw.dwd_oride_order_base_include_test_di
              WHERE status IN (4,5) AND dt = '{ds}'
             ) AS uo ON uc.user_id = uo.user_id
        WHERE uo.orders = 1
          and from_unixtime(uo.arrive_time, 'yyyy-MM-dd') = '{ds}'
        GROUP BY uc.code, uo.dt
    """.format(ds=dt)
    logging.info(hql)
    cursor.execute(hql)
    res = cursor.fetchall()
    mconn = get_db_conn('opay_spread_mysql')
    mysql = mconn.cursor()
    sql = 'insert into promoter_data_day (code, day, pft, create_time) values '
    ext = ' on duplicate key update pft=values(pft), create_time=values(create_time)'
    vals = []
    for (c, d, p, t) in res:
        vals.append("('{c}', '{d}', '{p}', '{t}')".format(c=c, d=d, p=p, t=t))
        if len(vals) >= 1000:
            # logging.info(sql + ",".join(vals) + ext)
            mysql.execute(sql + ",".join(vals) + ext)
            vals = []
    if len(vals) > 0:
        # logging.info(sql + ",".join(vals) + ext)
        mysql.execute(sql + ",".join(vals) + ext)
    # Commit so the batched upserts persist.
    mconn.commit()
    mysql.close()
    cursor.close()

def send_shop_list_file_email(ds, ds_nodash, **kwargs):
    cursor = get_hive_cursor()
    sql = """
        select
            dt,
            shop_id,
            title,
            bd_name,
            bdm_name,
            hbdm_name,
            his_order_cnt,
            if(closed = 0, 'Y', 'N'),
            if(is_new_user_act = 1, 'Y', 'N'),
            if(is_promotion_act = 1, 'Y', 'N'),
            yy_peitime,
            product_cnt,
            addr,
            account_number
        from ofood_bi.ofood_shop_list_metrics_report
        where dt = '{dt}'
    """.format(dt=ds)
    headers = [
        'day',
        'shop_id',
        'title',
        'bd_name',
        'bdm_name',
        'hbdm_name',
        'his_order_cnt',
        'is_open(Y or N)',
        'activity_of_new_user(Y or N)',
        'activity_of_promotion(Y or N)',
        'business_time',
        'menu_item',
        'location',
        'opay_account'
    ]
    logging.info('Executing: %s', sql)
    cursor.execute(sql)
    rows = cursor.fetchall()
    file_name = '/tmp/ofood_shop_list_metrics_{dt}.csv'.format(dt=ds)
    with codecs.open(file_name, 'w', 'utf_8_sig') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(headers)
        f_csv.writerows(rows)
    # send mail
    email_to = Variable.get("ofood_honour_metrics_receivers").split()
    # email_to = ['*****@*****.**']
    email_subject = 'ofood merchant detail list daily data_{dt}'.format(dt=ds)
    email_body = 'ofood merchant detail list daily data'
    send_email(email_to, email_subject, email_body, [file_name], mime_charset='utf-8')

def __init__(self):
    self.hive_cursor = get_hive_cursor()
    self.dingding_alert = DingdingAlert(
        'https://oapi.dingtalk.com/robot/send?access_token=928e66bef8d88edc89fe0f0ddd52bfa4dd28bd4b1d24ab4626c804df8878bb48'
    )
    # self.dingding_alert = DingdingAlert_dev('https://oapi.dingtalk.com/robot/send?access_token=c08440c8e569bb38ec358833f9d577b7638af5aaefbd55e3fd748b798fecc4d4')
    self.alert_url = "http://8.208.14.165:8080/admin/airflow/tree?dag_id="
    self.owner_name = None
    self.hdfs_dir_name = None

def import_table(**op_kwargs):
    dt = op_kwargs.get('ds')
    env = op_kwargs.get("env")
    print("running date: %s" % dt)
    cursor = get_hive_cursor()
    conf_name = "sqoop_db"
    if env == "test":
        conf_name += "_test"
    host, port, schema, login, password = get_db_conf(conf_name)
    host += ":" + str(port)
    for table in tables:
        print("importing table: %s" % table)
        hive_table = table
        if env == "test":
            hive_table += "_dev"
        os.system(default_command % (sqoop_path, host, schema, login, password, table, hive_table, dt))
        cursor.execute("ALTER TABLE oride_db.%s ADD IF NOT EXISTS PARTITION (dt = '%s')" % (hive_table, dt))
    print("import done")

def base_data(**op_kwargs):
    cursor = get_hive_cursor()
    dt = op_kwargs.get('ds')
    cursor.execute("SET mapreduce.job.queuename=root.airflow")
    cursor.execute("SET hive.exec.parallel=true")
    hql = """
        SELECT t.code,
               from_unixtime(unix_timestamp(dt, 'yyyy-MM-dd'), 'yyyyMMdd') as day,
               COUNT(DISTINCT t.bind_number) as users_count,
               COUNT(DISTINCT if(length(t.bind_device) > 0, t.bind_device, NULL)) as device_count,
               unix_timestamp()
        FROM oride_dw.dwd_oride_driver_cheating_detection_hi
        LATERAL VIEW json_tuple(event_value, 'bind_refferal_code', 'bind_number', 'bind_device_id') t AS code, bind_number, bind_device
        WHERE dt = '{ds}'
        GROUP BY t.code, dt
    """.format(ds=dt)
    logging.info(hql)
    cursor.execute(hql)
    res = cursor.fetchall()
    mconn = get_db_conn('opay_spread_mysql')
    mysql = mconn.cursor()
    sql = 'insert into promoter_data_day (code, day, users_count, device_count, create_time) values '
    ext = """ on duplicate key update
            users_count=values(users_count),
            device_count=values(device_count),
            create_time=values(create_time)
    """
    vals = []
    for (code, day, users, device, t) in res:
        vals.append("('{code}', '{day}', '{user}', '{d}', '{t}')".format(
            code=code, day=day, user=users, d=device, t=t))
        if len(vals) >= 1000:
            # logging.info(sql + ",".join(vals) + ext)
            mysql.execute(sql + ",".join(vals) + ext)
            vals = []
    if len(vals) > 0:
        # logging.info(sql + ",".join(vals) + ext)
        mysql.execute(sql + ",".join(vals) + ext)
    # Commit so the batched upserts persist.
    mconn.commit()
    mysql.close()
    cursor.close()

def get_max_week(ds):
    sql = '''
        select max(week) as max_week
        from oride_dw.dwm_oride_passenger_act_w
        where datediff('{pt}', dt) <= 90 and datediff('{pt}', dt) >= 0
    '''.format(pt=airflow.macros.ds_add(ds, +6))
    cursor = get_hive_cursor()
    logging.info('Executing: %s', sql)
    cursor.execute(sql)
    week_list = cursor.fetchall()
    cursor.close()
    max_week = None
    if len(week_list) > 0:
        for week in week_list:
            max_week = week[0]
    return max_week

def build_csv(dt):
    col_name = [
        'a.id', 'a.user_id', 'a.start_name', 'a.end_name', 'a.duration',
        'a.distance', 'a.price', 'a.reward', 'a.driver_id', 'a.take_time',
        'a.wait_time', 'a.pickup_time', 'a.arrive_time', 'a.finish_time',
        'a.cancel_role', 'a.cancel_reason', 'a.cancel_time', 'a.cancel_type',
        'a.status', 'a.dt',
        'b.id', 'b.driver_id', 'b.mode', 'b.price', 'b.coupon_id',
        'b.coupon_amount', 'b.amount', 'b.bonus', 'b.balance',
        'b.opay_amount', 'b.reference', 'b.currency', 'b.country',
        'b.zfstatus', 'b.modify_time', 'b.create_time', 'b.dt'
    ]
    cursor = get_hive_cursor()
    cursor.execute(tmp_query.format(dt=dt))
    res = cursor.fetchall()
    with open("/tmp/tainzhi_query_%s.csv" % dt, "w") as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(col_name)
        for elem in res:
            csv_writer.writerow(elem)
    print("tz csv write done")

def drop_partions(*op_args, **op_kwargs):
    dt = op_kwargs['ds']
    cursor = get_hive_cursor()
    sql = '''
        show partitions {table_name}
    '''.format(table_name=hive_table)
    cursor.execute(sql)
    res = cursor.fetchall()
    logging.info(res)
    for partition in res:
        prt, = partition
        matched = re.search(r'country_code=(?P<cc>\w+)/dt=(?P<dy>.*)$', prt)
        if matched is None:
            # Skip partitions that do not follow the country_code=/dt= layout.
            continue
        cc = matched.groupdict().get('cc', 'nal')
        dy = matched.groupdict().get('dy', '')
        if dy == dt:
            hql = '''
                ALTER TABLE {table_name} DROP IF EXISTS PARTITION (country_code='{cc}', dt='{dt}')
            '''.format(cc=cc, dt=dt, table_name=hive_table)
            logging.info(hql)
            cursor.execute(hql)

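# The regex above assumes partition strings of the form
# 'country_code=NG/dt=2019-09-01' (inferred from the pattern, not verified
# against the table). A quick interactive sanity check:
#
# >>> import re
# >>> m = re.search(r'country_code=(?P<cc>\w+)/dt=(?P<dy>.*)$', 'country_code=NG/dt=2019-09-01')
# >>> m.groupdict()
# {'cc': 'NG', 'dy': '2019-09-01'}
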
def check_key_data_task(ds):
    cursor = get_hive_cursor()
    # Primary-key duplication check across the full event key.
    check_sql = '''
        SELECT count(1)-count(distinct passenger_id, passenger_number, client_timestamp,
               platform, os_version, app_name, app_version, locale, device_id,
               device_screen, device_model, device_manufacturer, is_root,
               channel, subchannel, appsflyer_id) as cnt
        FROM {db}.{table}
        WHERE dt='{pt}'
    '''.format(pt=ds, table=table_name, db=db_name)
    logging.info('Executing primary-key duplication check: %s', check_sql)
    cursor.execute(check_sql)
    res = cursor.fetchone()
    # Any duplicate row fails the check.
    if res[0] > 0:
        raise Exception("Error The primary key repeat !", res)
    print("-----> Notice Data Export Success ......")
    return 0

def order_result_to_mysql(ds, **kwargs):
    cursor = get_hive_cursor()
    logging.info(promoter_orderoverview_hql.format(ds=ds))
    cursor.execute(promoter_orderoverview_hql.format(ds=ds))
    results = cursor.fetchall()
    mysql_conn = get_db_conn('opay_spread_mysql')
    mcursor = mysql_conn.cursor()
    sql_insert = 'INSERT INTO promoter_driver_day (day, name, mobile, code, channel, driver_type, firstbill) VALUES'
    sql_ext = 'ON DUPLICATE KEY UPDATE firstbill = values(firstbill)'
    sql_val = ''
    sql_count = 0
    for day, driver_type, channel, name, mobile, code, first, ten in results:
        sql_tmp = "('{day}', '{name}', '{mobile}', '{code}', '{channel}', '{driver_type}', '{firstbill}')".format(
            day=day,
            name=name.replace("\\", "").replace("'", "\\'"),
            mobile=mobile if (len(mobile) < 20) else '',
            code=code,
            channel=channel,
            driver_type=driver_type,
            firstbill=(first if driver_type == 2 else 0))
        if sql_val == '':
            sql_val = sql_tmp
        else:
            sql_val += ',' + sql_tmp
        sql_count += 1
        # Flush every 1000 rows to keep the INSERT statement bounded.
        if sql_count >= 1000:
            sql = sql_insert + ' ' + sql_val + ' ' + sql_ext
            mcursor.execute(sql)
            sql_count = 0
            sql_val = ''
    if sql_count > 0:
        sql = sql_insert + ' ' + sql_val + ' ' + sql_ext
        mcursor.execute(sql)
    mysql_conn.commit()
    cursor.close()
    mcursor.close()
    mysql_conn.close()

def user_label_to_redis(ds, **kwargs):
    label_list = {
        'lab_new_user': 1,
        'lab_login_without_orders': 2,
        'lab_login_have_orders': 3,
        'lab_cancel_ge_finish': 4
    }
    query = """
        SELECT user_id, lab_new_user, lab_login_without_orders,
               lab_login_have_orders, lab_cancel_ge_finish, phone_number
        FROM dashboard.oride_user_label
        WHERE dt='{dt}'
    """.format(dt=ds)
    cursor = get_hive_cursor()
    cursor.execute(query)
    results = cursor.fetchall()
    redis_conn = RedisHook(redis_conn_id='redis_user_lab').get_conn()
    expire_time = 86400
    for user_id, lab_new_user, lab_login_without_orders, lab_login_have_orders, lab_cancel_ge_finish, phone_number in results:
        labels = []
        if lab_new_user:
            labels.append(label_list['lab_new_user'])
        if lab_login_without_orders:
            labels.append(label_list['lab_login_without_orders'])
        if lab_login_have_orders:
            labels.append(label_list['lab_login_have_orders'])
        if lab_cancel_ge_finish:
            labels.append(label_list['lab_cancel_ge_finish'])
        if len(labels):
            redis_key = 'user_tag_%s' % phone_number
            redis_conn.set(redis_key, json.dumps(labels), ex=expire_time)
            logging.info('user_id:%s, lab_list:%s, key:%s, phone_number:%s' %
                         (user_id, json.dumps(labels), redis_key, phone_number))
    cursor.close()

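# A hedged sketch of how a downstream consumer might read the labels back.
# It assumes the same `redis_user_lab` connection id; the phone number is a
# made-up placeholder:
#
# conn = RedisHook(redis_conn_id='redis_user_lab').get_conn()
# raw = conn.get('user_tag_2348000000000')
# labels = json.loads(raw) if raw else []
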
def dirver_daily_summary_insert(ds, **kwargs):
    sql = """
        SELECT null as id, dt, driver_id, real_name, phone_number, group_id,
               nvl(group_name, ''), nvl(group_leader, ''), order_num,
               order_finished_num, order_cancel_num, online_time,
               duration_total, distance_total, comment_scores, comment_times,
               peak_time_order_num, nvl(app_version, '')
        FROM dashboard.oride_driver_daily_summary
        WHERE dt='{ds}'
    """.format(ds=ds)
    cursor = get_hive_cursor()
    logging.info("run sql, %s", sql)
    cursor.execute(sql)
    results = cursor.fetchall()
    # Fan the rows out to worker processes in chunks of 1000.
    part_size = 1000
    index = 0
    processes = []
    while index < len(results):
        p = Process(target=dirver_daily_summary_process,
                    args=(results[index:index + part_size], index))
        index += part_size
        processes.append(p)
        p.start()
    for p in processes:
        p.join()

def hiveresult_to_mysql(ds, **kwargs):
    cursor = get_hive_cursor()
    logging.info(kwargs['sql'].format(ds=ds))
    cursor.execute(kwargs['sql'].format(ds=ds))
    results = cursor.fetchall()
    mysql_conn = get_db_conn('opay_spread_mysql')
    mcursor = mysql_conn.cursor()
    sql_insert = kwargs['sql_insert']
    sql_val = ''
    sql_ext = kwargs['sql_ext']
    sql_count = 0
    for day, driver_type, channel, name, mobile, code, drivers in results:
        sql_tmp = "('{day}', '{name}', '{mobile}', '{code}', '{channel}', '{driver_type}', '{drivers}')".format(
            day=day,
            name=name.replace("\\", "").replace("'", "\\'"),
            code=code,
            mobile=mobile if (len(mobile) < 20) else '',
            channel=channel,
            driver_type=driver_type,
            drivers=drivers)
        if sql_val == '':
            sql_val = sql_tmp
        else:
            sql_val += ',' + sql_tmp
        sql_count += 1
        # Flush every 1000 rows to keep the INSERT statement bounded.
        if sql_count >= 1000:
            sql = sql_insert + ' ' + sql_val + ' ' + sql_ext
            # logging.info(sql)
            mcursor.execute(sql)
            sql_count = 0
            sql_val = ''
    if sql_count > 0:
        sql = sql_insert + ' ' + sql_val + ' ' + sql_ext
        mcursor.execute(sql)
    mysql_conn.commit()
    cursor.close()
    mcursor.close()
    mysql_conn.close()

def write_meta_data(table_name, day, result, msg):
    cursor = get_hive_cursor()
    # if not result:
    #     sql = '''
    #         ALTER TABLE oride_db.{table_name} DROP IF EXISTS PARTITION(dt='{day}')
    #     '''.format(table_name=table_name, day=day)
    #     cursor.execute(sql)
    sql = '''
        insert into table oride_bi.oride_meta_import_data
        partition (dt='{day}', table_name='{table_name}')
        select {result}, '{msg}', '{timestamp}' from default.dual
    '''.format(
        table_name=table_name,
        day=day,
        result=result,
        msg=msg,
        timestamp=now.strftime('%Y-%m-%d %H:%M:%S'))
    cursor.execute(sql)

def data_volume_monitoring(ds, db_name, table_name, is_valid_success, **op_kwargs):
    cursor = get_hive_cursor()
    sql = """
        SELECT count(1)
        FROM {db_name}.{table_name}
        WHERE dt='{dt}'
    """.format(db_name=db_name, table_name=table_name, dt=ds)
    logging.info("execute sql:%s", sql)
    cursor.execute(sql)
    res = cursor.fetchone()
    cursor.close()
    row_num = int(res[0])
    logging.info("import data {db}.{table}, row_num:{row_num}".format(db=db_name, table=table_name, row_num=row_num))
    # true: only create _SUCCESS when data exists; false: create _SUCCESS even without data.
    if is_valid_success.lower() == "false":
        row_num = 1
    if row_num <= 0:
        comwx.postAppMessage("Data import error for {db}.{table}".format(db=db_name, table=table_name), '271')
        raise Exception('sqoop data import error')

def query_data(**op_kwargs):
    dt = op_kwargs.get('ds')
    cursor = get_hive_cursor()
    cursor.execute("set hive.execution.engine=tez")
    repair_table_names = [
        "data_driver_extend", "data_driver_reward", "data_order",
        "data_order_payment", "data_user_extend", "user_action", "client_event"
    ]
    for name in repair_table_names:
        print(name)
        db_name = "oride_source."
        if name.startswith("data"):
            db_name = "oride_db."
        cursor.execute(repair_table_query % (db_name + name))
    cursor.execute(query1.format(dt=dt))
    res1 = cursor.fetchall()
    res1 = map(mapper, list(res1[0]))
    [call_num, success_num, gmv, cancel_before_dispatching_num,
     cancel_after_dispatching_by_user_num, cancel_after_dispatching_by_driver_num,
     pickup_num, pickup_total_time, take_num, take_total_time,
     total_driver_price] = res1
    print(1)
    cursor.execute(query2.format(dt=dt))
    res2 = cursor.fetchall()
    res2 = map(mapper, list(res2[0]))
    [pay_num, total_price, total_c_discount, offline_num] = res2
    print(2)
    cursor.execute(query4.format(dt=dt))
    res4 = cursor.fetchall()
    res4 = map(mapper, list(res4[0]))
    [call_user_num, finished_user_num, new_finished_user_num] = res4
    print(4)
    cursor.execute(query5.format(dt=dt))
    res5 = cursor.fetchall()
    res5 = map(mapper, list(res5[0]))
    [total_driver_num, login_driver_num, new_driver_num] = res5
    print(5)
    cursor.execute(query6.format(dt=dt))
    res6 = cursor.fetchall()
    res6 = map(mapper, list(res6[0]))
    [order_driver_num, finished_driver_num, new_finished_driver_num] = res6
    print(6)
    cursor.execute(query7.format(dt=dt))
    res7 = cursor.fetchall()
    res7 = map(mapper, list(res7[0]))
    [bubble_num] = res7
    print(7)
    cursor.execute(query9.format(dt=dt))
    res9 = cursor.fetchall()
    res9 = map(mapper, list(res9[0]))
    [new_passenger_num] = res9
    print(9)
    (transport_efficiency, avg_order_per_driver, online_driver_num) = get_driver_data(dt)
    print(10)
    data = [
        success_num,
        success_num / float(call_num) if call_num > 0 else 0,
        bubble_num,
        call_num,
        call_num / float(bubble_num) if bubble_num > 0 else 0,
        online_driver_num,
        order_driver_num,
        round(float(gmv), 2),
        round(float(gmv) / float(success_num) if success_num > 0 else 0, 2),
        round(float(total_driver_price), 2),
        round(float(total_c_discount), 2),
        round(float(total_driver_price) / float(success_num) if success_num > 0 else 0, 2),
        round(float(total_c_discount) / float(success_num) if success_num > 0 else 0, 2),
        float(total_driver_price + total_c_discount) / float(total_price) if total_price > 0 else 0,
        cancel_before_dispatching_num / float(call_num) if call_num > 0 else 0,
        cancel_after_dispatching_by_user_num / float(call_num) if call_num > 0 else 0,
        cancel_after_dispatching_by_driver_num / float(call_num) if call_num > 0 else 0,
        round(pickup_total_time / float(pickup_num * 60) if pickup_num > 0 else 0, 2),
        round(take_total_time / float(take_num) if take_num > 0 else 0, 2),
        total_driver_num,
        new_driver_num,
        finished_driver_num,
        new_finished_driver_num,
        new_finished_driver_num / float(finished_driver_num) if finished_driver_num > 0 else 0,
        call_user_num,
        finished_user_num,
        new_passenger_num,
        new_finished_user_num,
        # Guard on finished_user_num, the actual denominator.
        new_finished_user_num / float(finished_user_num) if finished_user_num > 0 else 0,
        new_finished_user_num / new_passenger_num if new_passenger_num > 0 else 0,
        pay_num - offline_num,
        offline_num,
        transport_efficiency,
        0,
        avg_order_per_driver
    ]
    insert_data = [None, dt] + data
    sql_conn = get_db_conn()
    sql_cursor = sql_conn.cursor()
    sql_cursor.execute(INSERT_SQL, insert_data)
    # Commit so the daily summary row persists.
    sql_conn.commit()