import os
import shutil
import time
from datetime import datetime

# Prod_HDFSTools / Test_HDFSTools are the project's HDFS helper classes
# (imported elsewhere; cf. report.commons.test_hdfs_tools in the last snippet).


def main1():
    prod_hdfs = Prod_HDFSTools(conn_type='prod')
    # Recursively download the files under an HDFS directory, e.g.:
    # /user/hive/warehouse/03_basal_layer_hp9clnt200.db/ZTRPT_DWZD
    # /user/hive/warehouse/03_basal_layer_zfybxers00.db/zfybxers00_z_rma_travel_journey_m
    # /user/hive/warehouse/02_logical_layer_001_o_lf_cw.db/BIC/AOCCW01012
    hdfsDirUrl = 'hdfs:///user/hive/warehouse/02_logical_layer_001_o_lf_cw.db/BIC/AOCCW01012'
    localDirUrl = '/my_filed_algos/prod_kudu_data/'

    print('* part1 ')
    hdfsFileUrl_ls = prod_hdfs.downLoadDir_recursion(hdfsDirUrl=hdfsDirUrl, localDirUrl=localDirUrl)

    print('* part2 ')
    print('*** number of HDFS files to process ==> ', len(hdfsFileUrl_ls))
    if os.path.exists(localDirUrl + 'user'):
        shutil.rmtree(localDirUrl + 'user')

    test_hdfs = Test_HDFSTools(conn_type='test')
    print()
    print('* part3 ')
    x_all = datetime.now()
    for index, hdfs_file_url in enumerate(hdfsFileUrl_ls):
        hdfs_file_url = str(hdfs_file_url)
        print(f'processing {len(hdfsFileUrl_ls)} HDFS files, hdfsFileUrl_ls index => {index}')
        print('prod hdfs_file_url => ', hdfs_file_url)

        local_file_name = hdfs_file_url.replace('hdfs://nameservice1/', localDirUrl)
        print('local_file_name => ', local_file_name)

        hdfs_file_url = hdfs_file_url.replace('hdfs://nameservice1/user', 'hdfs:///user')
        print('test hdfs_file_url => ', hdfs_file_url)

        time.sleep(0.01)
        x = datetime.now()
        prod_hdfs.downLoadFile2(hdfs_file_url, local_file_name)
        print('*** downloading one HDFS file took ' + str(datetime.now() - x))

        # Lower-case the upload path
        hdfs_file_url = hdfs_file_url.lower()
        time.sleep(0.01)
        x = datetime.now()
        test_hdfs.uploadFile2(hdfsDirPath=hdfs_file_url, localPath=local_file_name)
        if os.path.exists(local_file_name):
            os.remove(local_file_name)
            print(f'delete file {local_file_name}')
        print('*** uploading one HDFS file took ' + str(datetime.now() - x))
        print('')

    print('total elapsed time ' + str(datetime.now() - x_all))
    print('--- ok , completed work ---')
    prod_hdfs.shutdownJVM()
def main():
    # 66,880 records in total; the run took 1443 sec
    check_meeting_data()

    test_hdfs = Test_HDFSTools(conn_type=CONN_TYPE)
    test_hdfs.uploadFile2(hdfsDirPath=upload_hdfs_path, localPath=dest_file)
    refresh_linshi_table()
    init_file()
    print('--- ok ---')
def upload_hdfs_file():
    test_hdfs = Test_HDFSTools(conn_type=CONN_TYPE)
    for year in ['2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021']:
        dest_file = get_dest_file2(year)
        upload_hdfs_path = get_upload_hdfs_path(year)
        test_hdfs.uploadFile2(hdfsDirPath=upload_hdfs_path, localPath=dest_file)
def main(): """ 处理 22304 条记录,操作共耗时 12 sec """ check_car_linshi_data() test_hdfs = Test_HDFSTools(conn_type=CONN_TYPE) test_hdfs.uploadFile2(hdfsDirPath=upload_hdfs_path, localPath=dest_file) refresh_linshi_table() init_file() print('--- ok ---')
def main(): """ """ check_linshi_office_data() test_hdfs = Test_HDFSTools(conn_type=CONN_TYPE) test_hdfs.uploadFile2(hdfsDirPath=upload_hdfs_path, localPath=dest_file) refresh_linshi_table() #init_file() print('--- 办公费临时表数据已经跑完数据了,ok ---')
def check_prod_hdfs():
    test_hdfs = Test_HDFSTools(conn_type=CONN_TYPE)
    travel_url = 'hdfs:///user/hive/warehouse/02_logical_layer_007_h_lf_cw.db/finance_travel_linshi_analysis'
    office_url = 'hdfs:///user/hive/warehouse/02_logical_layer_007_h_lf_cw.db/finance_offical_linshi_analysis'
    car_url = 'hdfs:///user/hive/warehouse/02_logical_layer_007_h_lf_cw.db/finance_car_linshi_analysis/car_data.txt'
    test_hdfs.ls(travel_url)
def upload_file():
    test_hdfs = Test_HDFSTools(conn_type='test')
    test_hdfs.uploadFile2(hdfsDirPath=upload_hdfs_path, localPath=dest_file)
import time

import gevent
from gevent.pool import Pool


def check_linshi_meeting_data(query_date=query_date):
    init_file()
    columns_ls = [
        'finance_meeting_id', 'meet_addr', 'sales_name', 'sales_addressphone',
        'sales_bank', 'sales_taxno', 'invo_code'
    ]
    columns_str = ",".join(columns_ls)

    sql = """
      select {columns_str} from 01_datamart_layer_007_h_cw_df.finance_meeting_bill
      where !(sales_name is null and sales_addressphone is null and sales_bank is null
              and sales_taxno is null and meet_addr is null)
        AND pstng_date >= '{query_date}'
    """.format(columns_str=columns_str, query_date=query_date)
    # log.info(sql)

    count_sql = 'select count(a.finance_meeting_id) from ({sql}) a'.format(sql=sql)
    log.info(count_sql)
    records = prod_execute_sql(conn_type=CONN_TYPE, sqltype='select', sql=count_sql)
    count_records = int(records[0][0])
    log.info(f'* count_records ==> {count_records}')

    max_size = 2 * 10000
    limit_size = 10000
    select_sql_ls = []

    if count_records >= max_size:
        # Split the query into pages of `limit_size` rows.
        offset_size = 0
        while offset_size <= count_records:
            if offset_size + limit_size > count_records:
                limit_size = count_records - offset_size
                tmp_sql = """
                  select {columns_str} from 01_datamart_layer_007_h_cw_df.finance_meeting_bill
                  where !(sales_name is null and sales_addressphone is null and sales_bank is null
                          and sales_taxno is null and meet_addr is null)
                    AND pstng_date >= '{query_date}'
                  order by finance_meeting_id limit {limit_size} offset {offset_size}
                """.format(columns_str=columns_str, limit_size=limit_size,
                           offset_size=offset_size, query_date=query_date)
                select_sql_ls.append(tmp_sql)
                break
            else:
                tmp_sql = """
                  select {columns_str} from 01_datamart_layer_007_h_cw_df.finance_meeting_bill
                  where !(sales_name is null and sales_addressphone is null and sales_bank is null
                          and sales_taxno is null and meet_addr is null)
                    AND pstng_date >= '{query_date}'
                  order by finance_meeting_id limit {limit_size} offset {offset_size}
                """.format(columns_str=columns_str, limit_size=limit_size,
                           offset_size=offset_size, query_date=query_date)
                select_sql_ls.append(tmp_sql)
                offset_size = offset_size + limit_size
    else:
        tmp_sql = """
          select {columns_str} from 01_datamart_layer_007_h_cw_df.finance_meeting_bill
          where !(sales_name is null and sales_addressphone is null and sales_bank is null
                  and sales_taxno is null and meet_addr is null)
            AND pstng_date >= '{query_date}'
        """.format(columns_str=columns_str, query_date=query_date)
        select_sql_ls.append(tmp_sql)
        # print('*** tmp_sql => ', tmp_sql)

    log.info(f'*** starting paged queries, {len(select_sql_ls)} pages in total')
    start_time = time.perf_counter()

    # threadPool = ThreadPoolExecutor(max_workers=10, thread_name_prefix="thr")
    # all_task = [threadPool.submit(exec_task, (sel_sql)) for sel_sql in select_sql_ls]
    # wait(all_task, return_when=ALL_COMPLETED)
    # threadPool.shutdown(wait=True)

    if count_records > 0:
        pool = Pool(10)
        results = []
        for sel_sql in select_sql_ls:
            rst = pool.spawn(exec_task, sel_sql)
            results.append(rst)
        gevent.joinall(results)

    consumed_time = round(time.perf_counter() - start_time)
    log.info(f'* processed {count_records} records, total time {consumed_time} sec')

    test_hdfs = Test_HDFSTools(conn_type=CONN_TYPE)
    test_hdfs.uploadFile2(hdfsDirPath=upload_hdfs_path, localPath=dest_file)
    refresh_linshi_table()
    init_file()
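# Hedged sketch: `exec_task` is spawned by the gevent pool above but is not
# defined in this section. Assuming each task runs one paginated SELECT and
# appends the rows to the local staging file, it could look roughly like the
# following. `prod_execute_sql`, `CONN_TYPE` and `dest_file` are taken from
# the surrounding code; the tab-separated output format is an assumption.
def exec_task(sel_sql):
    rows = prod_execute_sql(conn_type=CONN_TYPE, sqltype='select', sql=sel_sql)
    with open(dest_file, 'a', encoding='utf-8') as f:
        for row in rows:
            # one record per line, tab-separated, empty string for NULLs (assumed format)
            f.write('\t'.join('' if col is None else str(col) for col in row) + '\n')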
def upload_hdfs_file(year):
    dest_file = get_dest_file(year)
    upload_hdfs_path = get_upload_hdfs_path(year)
    test_hdfs = Test_HDFSTools(conn_type=CONN_TYPE)
    test_hdfs.uploadFile2(hdfsDirPath=upload_hdfs_path, localPath=dest_file)
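# Hedged sketch: `get_dest_file` and `get_upload_hdfs_path` are not shown in
# this section. Assuming they only build year-partitioned paths for the local
# staging file and the target HDFS location, they might look like this; the
# concrete paths below are placeholders, not the project's real locations.
def get_dest_file(year):
    return f'/you_filed_algos/app/doc/finance_data_{year}.txt'  # placeholder path


def get_upload_hdfs_path(year):
    # placeholder HDFS path, one directory per year
    return f'hdfs:///user/hive/warehouse/02_logical_layer_007_h_lf_cw.db/finance_linshi_analysis/{year}'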
# -*- coding: utf-8 -*-
from report.commons.logging import get_logger
from report.commons.test_hdfs_tools import HDFSTools as Test_HDFSTools

log = get_logger(__name__)

dest_file = "/you_filed_algos/app/doc/finance_province_city.txt"
upload_hdfs_path = 'hdfs:///user/hive/warehouse/02_logical_layer_007_h_lf_cw.db/finance_province_city/finance_province_city.txt'
conn_type = 'prod'  # test / prod

test_hdfs = Test_HDFSTools(conn_type=conn_type)
test_hdfs.uploadFile2(hdfsDirPath=upload_hdfs_path, localPath=dest_file)