def demo3(): # query_finance_category_signs(unusual_id='26', category_classify='01') # record = query_finance_shell_daily(unusual_point='13',task_status='doing') # records = query_billds_finance_all_targets(unusual_id='14') # print(records) province_service = ProvinceService() # records = province_service.query_province_names(grade='1') # print(records) origin_name = '东营区' origin_province = province_service.query_belong_province( area_name=origin_name) print(f'origin_province={origin_province}')
cd /you_filed_algos/app PYTHONIOENCODING=utf-8 nohup /root/anaconda3/bin/python /you_filed_algos/app/report/works/full_add/exec_meeting_linshi_data.py & """ log = get_logger(__name__) dest_dir = '/you_filed_algos/prod_kudu_data/temp' dest_file = dest_dir + "/meeting_data.txt" upload_hdfs_path = 'hdfs:///user/hive/warehouse/02_logical_layer_007_h_lf_cw.db/finance_meeting_linshi_analysis/meeting_data.txt' match_area = MatchArea() province_service = ProvinceService() finance_service = FinanceAdministrationService() def refresh_linshi_table(): sql = 'REFRESH 02_logical_layer_007_h_lf_cw.finance_meeting_linshi_analysis' prod_execute_sql(conn_type=CONN_TYPE, sqltype='insert', sql=sql) def init_file(): if not os.path.exists(dest_dir): os.makedirs(dest_dir) if os.path.exists(dest_file): os.remove(dest_file)
def __init__(self): self.province_service = ProvinceService()
class Check13Service: def __init__(self): self.province_service = ProvinceService() def init_file(self): if not os.path.exists(dest_dir): os.makedirs(dest_dir) if os.path.exists(dest_file): os.remove(dest_file) os.mknod(dest_file) def save_fee_data(self): """ 查询超过标准报销费用的住宿费记录 :return: """ self.init_file() sql = f""" SELECT a.bill_id, a.city_name, a.city_grade_name, a.emp_name, a.stand_amount_perday, a.hotel_amount_perday FROM ( SELECT DISTINCT bill_id, city_name, city_grade_name, emp_name,ROUND(stand_amount, 2) as stand_amount_perday, ROUND(hotel_amount/hotel_num , 2) as hotel_amount_perday FROM 01_datamart_layer_007_h_cw_df.finance_rma_travel_accomm WHERE exp_type_name="差旅费" AND hotel_num > 0 AND bill_id is not null AND bill_id != "" )as a {test_limit_cond} """ count_sql = "select count(b.bill_id) from ({sql}) b".format(sql=sql) log.info(count_sql) records = prod_execute_sql(conn_type=CONN_TYPE, sqltype="select", sql=count_sql) count_records = records[0][0] print(f"* count_records ==> {count_records}") max_size = 10 * 10000 limit_size = 10000 select_sql_ls = [] if count_records >= max_size: offset_size = 0 while offset_size <= count_records: if offset_size + limit_size > count_records: limit_size = count_records - offset_size tmp_sql = """ SELECT a.bill_id, a.city_name, a.city_grade_name, a.emp_name,a.stand_amount_perday, a.hotel_amount_perday FROM ( SELECT DISTINCT bill_id, city_name, city_grade_name, emp_name,ROUND(stand_amount, 2) as stand_amount_perday, ROUND(hotel_amount/hotel_num , 2) as hotel_amount_perday FROM 01_datamart_layer_007_h_cw_df.finance_rma_travel_accomm WHERE exp_type_name="差旅费" AND hotel_num > 0 AND bill_id is not null AND bill_id != "" )as a order by stand_amount_perday limit {limit_size} offset {offset_size} """.format(limit_size=limit_size, offset_size=offset_size) select_sql_ls.append(tmp_sql) break else: tmp_sql = """ SELECT a.bill_id, a.city_name, a.city_grade_name, a.emp_name,a.stand_amount_perday, a.hotel_amount_perday FROM ( SELECT DISTINCT bill_id, city_name, city_grade_name, emp_name,ROUND(stand_amount, 2) as stand_amount_perday, ROUND(hotel_amount/hotel_num , 2) as hotel_amount_perday FROM 01_datamart_layer_007_h_cw_df.finance_rma_travel_accomm WHERE exp_type_name="差旅费" AND hotel_num > 0 AND bill_id is not null AND bill_id != "" )as a order by stand_amount_perday limit {limit_size} offset {offset_size} """.format(limit_size=limit_size, offset_size=offset_size) select_sql_ls.append(tmp_sql) offset_size = offset_size + limit_size else: tmp_sql = f""" SELECT a.bill_id, a.city_name, a.city_grade_name, a.emp_name,a.stand_amount_perday, a.hotel_amount_perday FROM ( SELECT DISTINCT bill_id, city_name, city_grade_name, emp_name,ROUND(stand_amount, 2) as stand_amount_perday, ROUND(hotel_amount/hotel_num , 2) as hotel_amount_perday FROM 01_datamart_layer_007_h_cw_df.finance_rma_travel_accomm WHERE exp_type_name="差旅费" AND hotel_num > 0 AND bill_id is not null AND bill_id != "" )as a WHERE a.hotel_amount_perday > a.stand_amount_perday {test_limit_cond} """ select_sql_ls.append(tmp_sql) log.info(f"* 开始分页查询,一共 {len(select_sql_ls)} 页") start_time = time.perf_counter() # threadPool = ThreadPoolExecutor(max_workers=10, thread_name_prefix="thr") # all_task = [threadPool.submit(self.exec_task, (sel_sql)) for sel_sql in select_sql_ls] # wait(all_task, return_when=ALL_COMPLETED) # threadPool.shutdown(wait=True) pool = Pool(30) results = [] for sel_sql in select_sql_ls: rst = pool.spawn(self.exec_task, sel_sql) results.append(rst) gevent.joinall(results) consumed_time = round(time.perf_counter() - start_time) log.info(f"* 一共有数据 {count_records} 条,保存数据耗时 {consumed_time} sec") def exec_task(self, sql): # log.info(sql) records = prod_execute_sql(conn_type=CONN_TYPE, sqltype="select", sql=sql) if records and len(records) > 0: for idx, record in enumerate(records): bill_id = str(record[0]) # bill_id city_name = str(record[1]) # 出差城市名称 city_grade_name = str(record[2]) # 出差城市等级 emp_name = str(record[3]) # 员工名字 stand_amount_perday = float(record[4]) # 每天标准住宿费用 hotel_amount_perday = float(record[5]) # 每天实际花费的住宿费用 province = self.province_service.query_belong_province( city_name) # 出差城市所属的省 # province = province if province else "******" # 可能需要补充 01_datamart_layer_007_h_cw_df.finance_province_city 表中的数据 # city_name = city_name.replace(",", " ") # emp_name = emp_name.replace(",", " ") if ',' in city_name: city_name = city_name.replace(",", " ") if ',' in emp_name: emp_name = emp_name.replace(",", " ") record_str = f"{bill_id},{city_name},{province},{city_grade_name},{emp_name},{stand_amount_perday},{hotel_amount_perday}" # log.info(f"checkpoint_13 {threading.current_thread().name} is running ") # log.info(record_str) with open(dest_file, "a+", encoding="utf-8") as file: file.write(record_str + "\n") def cal_abnormal_data(self, rd_df, coefficient, query_province): # query_province = "江苏省" abnormal_rd_df1 = rd_df[ (rd_df["province"] == query_province) & (rd_df["stand_amount_perday"] >= rd_df["hotel_amount_perday"]) & (rd_df["consume_amount_perday"] > 0)] # print("part1 after filter abnormal_rd_df1 ", len(abnormal_rd_df1)) # print(abnormal_rd_df1.head(10)) # part1 过滤查询 # temp = abnormal_rd_df1.describe()[["consume_amount_perday"]] # std_val = temp.at["std", "consume_amount_perday"] # 标准差 # mean_val = temp.at["mean", "consume_amount_perday"] # 平均值 std_val = abnormal_rd_df1.std().at['consume_amount_perday'] # 标准差 mean_val = abnormal_rd_df1.mean().at['consume_amount_perday'] # 平均值 # 数据的正常范围为 【mean - 2 × std , mean + 2 × std】 max_val = mean_val + coefficient * std_val min_val = mean_val - coefficient * std_val # print(f"{query_province} 方差 => {std_val}, 平均值 => {mean_val}, max_val => {max_val}, min_val => {min_val}") bill_id_ls1 = [] for index, row in abnormal_rd_df1.iterrows(): hotel_amount_perday = row["hotel_amount_perday"] # print("hotel_amount_perday=", hotel_amount_perday) if hotel_amount_perday > max_val or hotel_amount_perday < min_val: bill_id = row["bill_id"] bill_id_ls1.append(bill_id) # part2 过滤查询 abnormal_rd_df2 = rd_df[(rd_df["province"] == query_province) & ( rd_df["stand_amount_perday"] < rd_df["hotel_amount_perday"])] bill_id_ls2 = abnormal_rd_df2["bill_id"].tolist() # 对两个列表合并并去重 rst_ls = list(set(bill_id_ls1 + bill_id_ls2)) log.info(f"checkpoint_13 {query_province} 异常数量={len(rst_ls)}") return rst_ls def analyze_data(self, coefficient=2): log.info(f"* 开始执行检查点13 coefficient={coefficient} ") start_time = time.perf_counter() rd_df = pd.read_csv(dest_file, sep=",", header=None, names=[ "bill_id", "city_name", "province", "city_grade_name", "emp_name", "stand_amount_perday", "hotel_amount_perday" ]) rd_df["consume_amount_perday"] = rd_df["stand_amount_perday"] - rd_df[ "hotel_amount_perday"] # print(rd_df.dtypes) province_names = self.province_service.query_province_names(grade="1") # print(province_names) bill_id_ls = [] for idx, province_name in enumerate(province_names): # print(province_name) tmp_ls = self.cal_abnormal_data(rd_df=rd_df, coefficient=coefficient, query_province=province_name) # bill_id_ls.extend(tmp_ls) exec_sql(tmp_ls) log.info(f"* all results => {len(bill_id_ls)}") # targes_bill_id_ls = query_billds_finance_all_targets(unusual_id="13") # bill_id_ls = [x for x in bill_id_ls if x not in targes_bill_id_ls] # exec_sql(bill_id_ls) consumed_time = round(time.perf_counter() - start_time) log.info(f"* 执行检查点13的数据共耗时 {consumed_time} sec")