def get_new_data(config_dict):
    """Collect today's newly added data.

    Compares the planned additions (ids queued from add_data.json) against the
    records already stored under data_info/save; when the counts match the
    collection has already finished, otherwise the remaining items are fetched.

    :param config_dict: configuration dict providing "add_folder_name" and
        "data_info_save_folder_name".
    :return: 1 on success or when nothing is left to do, -1 on failure
        (error-code contract kept for existing callers).
    """
    curr_date = file_utils.get_curr_date()

    # Planned additions produced by the diff step (data_process).
    add_folder_name = config_dict["add_folder_name"]
    add_filename = add_folder_name + "add_data.json"
    new_data_list = file_utils.get_add_data_id(add_filename)
    add_data_count = new_data_list.qsize()

    # Actual progress: number of records already saved under data_info/save.
    data_info_save_folder_name = config_dict["data_info_save_folder_name"]
    file_list = file_utils.get_file_list(data_info_save_folder_name)
    data_info_count = file_utils.data_info_count(file_list)

    # Skip collection when it has already completed for today.
    if add_data_count == data_info_count:
        logging.info("采集日期=%s,新增数据采集已经完成!" % (curr_date))
        return 1
    try:
        logging.info("开始采集今日[%s]新增数据" % (curr_date))
        get_data_info(new_data_list, config_dict)
        return 1
    except Exception as e:
        # Fixed: Python 2 `except BaseException, e:` syntax, and the failure
        # was swallowed silently — log it before returning the error code.
        logging.error("开始采集今日[%s]新增数据失败: %s" % (curr_date, e))
        return -1
def main():
    # Work-in-progress entry point: for every configured data directory,
    # pretty-print the input HTML files and experiment with rewriting
    # self-closing tags.  NOTE(review): see the unreachable code flagged below.
    global config
    config, base_dir_dict = setup_utils.setup()
    # Structural/formatting tags that should never be stripped.
    tag_omit_list = ['html', 'head', 'body', 'p', 'br', 'em', 'time', 'strong', 'i', 'b', 'code', 'pre', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'table', 'thead', 'tbody', 'tr', 'td', 'tfoot']
    # NOTE(review): xml_parser is created but never used in this function.
    xml_parser = etree.HTMLParser()
    # re_ulol = regex.compile(r'(\<(ul|ol)(?!href).*href[ ="]+https*\:\/\/[^=\>]+=http[^\>]+\>(?!</\2\>).*\<(\/\2)\>)')
    # re_li_a = regex.compile(r'(<li[^\>]*\>\s*\<a[^\>]+\shref\=("(javascript|http|\/)[^"]+")[^\>]*\>(?!\<\/a\>).*?\<\/a\>.*?\<\/li\>)')
    # re_href = regex.compile(r'(\<a[^\>]+\shref\=("http[^"]+")[^\>]*\>(?!\<\/a\>).*?\<\/a\>)')
    # re_tags = regex.compile(r'(\<(\w+)[^\>]*\>((?!</\2\>).*?)?\<(\/\2)\>)')
    # Matches an empty element like '<div ...></div>' (\2 backreferences the tag name).
    re_empty = regex.compile(r'(\<([^ /">]+)[^\>]*\>\s*\<\/\2\>)')
    re_head_foot = regex.compile(r'(\<(header|footer|form|script|noscript|iframe|button)[^\>]*\>((?!</\2\>).*?)?\<\/\2\>)')
    re_input = regex.compile(r'(\<(input)[^\>]*\>((?!</\2\>).*?)?\/>)')
    re_comment = regex.compile(r'(\<\!\-\-((?!\-\-\>).*?)?\-\-\>)')
    re_tag_name = regex.compile(r'^<([^ /">]+)')
    # (?r) is the third-party `regex` module's reverse-search flag: matches
    # self-closing '<tag .../>' spans scanning from the end of the string.
    re_reverse = regex.compile(r'((?r)\<(\w+)(?!\w).*?\/>)', regex.DOTALL)
    if not config:
        print('Failed to complete setup, exiting.')
        sys.exit()
    else:
        if 'data_list' in config:
            for data_dir in config['data_list']:
                # Resolve per-dataset input/work/output directories.
                if 'input' in base_dir_dict:
                    i_dir = base_dir_dict['input'].joinpath(data_dir).resolve()
                if 'work' in base_dir_dict:
                    w_dir = base_dir_dict['work'].joinpath(data_dir).resolve()
                if 'output' in base_dir_dict:
                    o_dir = base_dir_dict['output'].joinpath(data_dir).resolve()
                if not w_dir.is_dir():
                    w_dir.mkdir(parents=True)
                if not o_dir.is_dir():
                    o_dir.mkdir(parents=True)
                # # #
                print('\nPretty printing in: ' + str(i_dir))
                input_file_list = file_utils.get_file_list(root_path_str=str(i_dir))
                # NOTE(review): the three variables below are initialized but
                # never used — leftovers of an unfinished refactor, presumably.
                working_file_list = []
                seq_2 = None
                common_elements = None
                for pp_file in input_file_list:
                    print('Working with: ' + str(pp_file))
                    pp_soup = html_utils.make_a_soup(filename=pp_file)
                    base_string = str(pp_soup).replace('\n', '')
                    # Print each self-closing tag rewritten as '<x ...></x>'
                    # (diagnostic only; the rewritten text is not kept).
                    rever = re_reverse.findall(base_string)
                    for r in rever:
                        print(r[0].replace('/>', '>' + '</' + r[1] + '>'))
                    break
                    # NOTE(review): everything below is unreachable because of
                    # the unconditional `break` above, and `working_string` /
                    # `working_filename` are never defined anywhere in this
                    # function — this code path appears unfinished.
                    working_soup = html_utils.make_a_soup(html_doc_string=working_string)
                    output_filename = o_dir.joinpath('minimized_' + working_filename.stem.replace('_pp', '') + '.html')
                    print('output filename: ' + str(output_filename))
                    file_utils.write_file(fn=output_filename, overwrite=True, content=working_soup.prettify())
                print('Done with: ' + str(w_dir))
                # break
        else:
            print('data list not in config.')
def scan_keyword(path, ext, keyword):
    """Search files with the given extension for a keyword.

    For example, look for ``UIStatusBar_Modern`` inside every ``.a`` file
    under ``path``.

    :param path: root directory to scan.
    :param ext: file extension filter passed to ``file_utils.get_file_list``.
    :param keyword: text to look for in each candidate file.
    :return: list of file paths whose content contains *keyword*.
    """
    candidates = file_utils.get_file_list(path, ext)
    return [candidate for candidate in candidates
            if file_contains_keyword(candidate, keyword)]
def get_html_file_list(html_dir):
    """Return the files found under *html_dir*.

    At least two files are required (one baseline plus one to compare);
    otherwise a message is printed and ``None`` is returned.
    """
    found = file_utils.get_file_list(root_path_str=html_dir,
                                     file_stem_str='*',
                                     file_ext_str='*',
                                     recursive_bool=False,
                                     rtn_abs_path_bool=True,
                                     rtn_uri=False)
    count = len(found)
    if count > 1:
        # print(found[0])
        return found
    if count == 1:
        print('Only one file found, exiting.')
        # xml = found[0].read_text()
    else:
        print('File not found, exiting.')
    return None
def check_data(config_dict):
    """Return True when the locally collected id count matches the NMPA
    site's reported total for this data type."""
    folder = config_dict["data_list_folder_name"]
    data_type = config_dict["data_type"]
    collected = file_utils.get_data_info_id(file_utils.get_file_list(folder))
    qsize = collected.qsize()
    logging.info("数据采集总量=:%s" % (qsize))
    total_count = comm_utils.get_curr_nmpa_total_count(data_type)
    logging.info("当前NMPA官网数据总量=:%s" % (total_count))
    return qsize == total_count
def check_data(curr_date):
    """Log whether all of today's planned additions have been collected.

    :param curr_date: collection date used to locate today's config and files.
    """
    get_type = cf.get("base_config", "get_type")  # setting currently unused
    data_type = cf.get("base_config", "data_type")
    root_path = cf.get("base_config", "root_path")
    config_dict = config.get_config(root_path, data_type, curr_date)
    # 3. Planned additions: ids queued from add_data.json.
    planned_queue = file_utils.get_add_data_id(
        config_dict["add_folder_name"] + "add_data.json")
    add_data_count = planned_queue.qsize()
    logging.info("[data_info]采集日期=%s,计划新增数据采集数据总量=:%s",
                 curr_date, add_data_count)
    # data_info > save: records actually collected so far.
    saved_files = file_utils.get_file_list(
        config_dict["data_info_save_folder_name"])
    data_info_count = file_utils.data_info_count(saved_files)
    logging.info("[data_info]采集日期=%s,实际新增数据采集数据总量=:%s",
                 curr_date, data_info_count)
    if add_data_count == data_info_count:
        logging.info("[data_info]采集日期=%s,数据采集完成!", curr_date)
def check_data(config_dict):
    """Return True when the locally collected id count equals the NMPA total.

    :param config_dict: dict with "data_list_folder_name" and "data_type".
    :return: True/False comparison result.
    :raises: re-raises any error from the NMPA total-count query after
        logging it (e.g. a timeout).
    """
    data_list_folder_name = config_dict["data_list_folder_name"]
    data_type = config_dict["data_type"]
    file_list = file_utils.get_file_list(data_list_folder_name)
    id_list = file_utils.get_data_info_id(file_list)
    qsize = id_list.qsize()
    logging.info("查询[data_list]文件夹中数据已采集总量=:%s" % (qsize))
    try:
        # comm_utils.access_data_utils.get_test_timeout()
        total_count = comm_utils.get_curr_nmpa_total_count(data_type)
        logging.info("查询当前NMPA官网数据总量=:%s" % (total_count))
        # Fixed: the original returned via if/else True/False and then had an
        # unreachable `return True` after it; also used Python 2 `except`
        # syntax and over-broad BaseException.
        return qsize == total_count
    except Exception as e:
        logging.error("查询当前NMPA官网数据总量查询失败!")
        logging.error("查询当前NMPA官网数据超时>>>>%s" % (e.args))
        # Bare raise preserves the original traceback (raise e would reset it).
        raise
total_count = int(dataTypeConfig.get_total_count()) #data_list数据保存路径 DATA_LIST_PATH = save_root_path + "/data_list/" file_utils.mkdir_path(DATA_LIST_PATH) # 器械详情保存路径 DATA_INFO_PATH = save_root_path + "/data_info/" file_utils.mkdir_path(DATA_INFO_PATH) #日志保存路径 LOG_PATH = save_root_path + "/logs/" file_utils.mkdir_path(LOG_PATH) #获取采集数据Id集合 file_list = file_utils.get_file_list(DATA_LIST_PATH) id_list = file_utils.get_all_data_id(file_list) print("采集数据总量:%s" % (id_list.qsize())) ''' 获取data_info数据 ''' def start(threadCount): # 启动60个线程,如果cfda官网服务拒绝链接,可适当将线程数量调小一点 threads = [] for i in range(threadCount): thread = myThread(i) # 区分其他线程名字 # 添加线程到线程列表 threads.append(thread)
def main():
    """Strip shared HTML fragments from all but the first work file.

    For each configured data directory, the first ``*_pp.html`` file in the
    work directory is treated as the template: every complete
    ``<tag>...</tag>`` span found in it (except common structural/formatting
    tags) is deleted from the remaining files, which are written to the
    output directory as ``minimized_*.html``.
    """
    global config
    config, base_dir_dict = setup_utils.setup()
    # Matches a complete '<tag ...>...</tag>' element; \2 backreferences the
    # tag name so open and close tags must agree.
    re_tags = regex.compile(r'(\<(\w+)[^\>]*\>((?!</\2\>).*?)?\<(\/\2)\>)')
    if not config:
        print('Failed to complete setup, exiting.')
        sys.exit()
    if 'data_list' not in config:
        print('data list not in config.')
        return
    for data_dir in config['data_list']:
        # Resolve per-dataset input/work/output directories.
        if 'input' in base_dir_dict:
            i_dir = base_dir_dict['input'].joinpath(data_dir).resolve()
        if 'work' in base_dir_dict:
            w_dir = base_dir_dict['work'].joinpath(data_dir).resolve()
        if 'output' in base_dir_dict:
            o_dir = base_dir_dict['output'].joinpath(data_dir).resolve()
        if not w_dir.is_dir():
            w_dir.mkdir(parents=True)
        if not o_dir.is_dir():
            o_dir.mkdir(parents=True)
        print('\nWorking in: ' + str(w_dir))
        html_file_list = file_utils.get_file_list(
            root_path_str=str(w_dir), file_stem_str='*_pp',
            file_ext_str='html')
        # The first file acts as the template for removable fragments.
        base_filename = html_file_list.pop(0)
        # Fixed: the original read the base file twice and computed an unused
        # `suff` value between the two reads; both redundancies removed.
        base_string = html_utils.make_a_string(filename=base_filename)
        m2 = re_tags.findall(base_string, overlapped=True)
        # Keep only fragments whose tag is not a structural/formatting tag.
        m3 = [tup for tup in m2 if tup[1] not in [
            'html', 'head', 'body', 'p', 'em', 'i', 'b',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7'
        ]]
        for next_filename in html_file_list:
            next_string = html_utils.make_a_string(filename=next_filename)
            # Delete each template fragment verbatim from this file.
            for tup in m3:
                next_string = next_string.replace(tup[0], '')
            next_soup = html_utils.make_a_soup(html_doc_string=next_string)
            next_fn = o_dir.joinpath(
                'minimized_' + next_filename.stem.replace('_pp', '') +
                '.html')
            print('next fn: ' + str(next_fn))
            file_utils.write_file(fn=next_fn, overwrite=True,
                                  content=next_soup.prettify())
def main():
    # For each configured data directory: pretty-print every input HTML file
    # into the work directory, then strip boilerplate (tags shared with the
    # first file, HTML comments, nav link <li><a> items, header/footer/form/
    # script blocks, <input> elements, empty <div>s) from every work file and
    # write the result to the output directory as 'minimized_*.html'.
    global config
    config, base_dir_dict = setup_utils.setup()
    # Structural/formatting tags that must never be stripped.
    tag_omit_list = [
        'html', 'head', 'body', 'p', 'br', 'em', 'time', 'strong', 'i', 'b',
        'code', 'pre', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'table',
        'thead', 'tbody', 'tr', 'td', 'tfoot'
    ]
    # re_ulol = regex.compile(r'(\<(ul|ol)(?!href).*href[ ="]+https*\:\/\/[^=\>]+=http[^\>]+\>(?!</\2\>).*\<(\/\2)\>)')
    # <li> wrapping an <a href="javascript|http|/..."> — typical nav items.
    re_li_a = regex.compile(
        r'(<li[^\>]*\>\s*\<a[^\>]+\shref\=("(javascript|http|\/)[^"]+")[^\>]*\>(?!\<\/a\>).*?\<\/a\>.*?\<\/li\>)'
    )
    # re_href = regex.compile(r'(\<a[^\>]+\shref\=("http[^"]+")[^\>]*\>(?!\<\/a\>).*?\<\/a\>)')
    # Complete '<tag ...>...</tag>' element; \2 backreferences the tag name.
    re_tags = regex.compile(r'(\<(\w+)[^\>]*\>((?!</\2\>).*?)?\<(\/\2)\>)')
    re_empty = regex.compile(r'(\<(div)[^\>]*\>\s*\<\/\2\>)')
    re_head_foot = regex.compile(
        r'(\<(header|footer|form|script)[^\>]*\>((?!</\2\>).*?)?\<\/\2\>)')
    re_input = regex.compile(r'(\<(input)[^\>]*\>((?!</\2\>).*?)?\/>)')
    re_comment = regex.compile(r'(\<\!\-\-((?!\-\-\>).*?)?\-\-\>)')
    if not config:
        print('Failed to complete setup, exiting.')
        sys.exit()
    else:
        if 'data_list' in config:
            for data_dir in config['data_list']:
                # Resolve per-dataset input/work/output directories.
                if 'input' in base_dir_dict:
                    i_dir = base_dir_dict['input'].joinpath(data_dir).resolve()
                if 'work' in base_dir_dict:
                    w_dir = base_dir_dict['work'].joinpath(data_dir).resolve()
                if 'output' in base_dir_dict:
                    o_dir = base_dir_dict['output'].joinpath(
                        data_dir).resolve()
                if not w_dir.is_dir():
                    w_dir.mkdir(parents=True)
                if not o_dir.is_dir():
                    o_dir.mkdir(parents=True)
                # # #
                print('\nPretty printing in: ' + str(i_dir))
                pp_file_list = file_utils.get_file_list(
                    root_path_str=str(i_dir))
                # Stage 1: pretty-print each input file into the work dir
                # with a '_pp' suffix.
                for pp_file in pp_file_list:
                    pp_soup = html_utils.make_a_soup(filename=pp_file)
                    # if not pp_file.suffix:
                    #     suff = '.html'
                    # else:
                    #     suff = pp_file.suffix
                    suff = '.html'
                    pp_filename = w_dir.joinpath(
                        str(pp_file.stem) + '_pp' + suff)
                    file_utils.write_file(fn=pp_filename, overwrite=True,
                                          content=pp_soup.prettify())
                print('Done pretty printing in: ' + str(i_dir))
                print('Working in: ' + str(w_dir))
                html_file_list = file_utils.get_file_list(
                    root_path_str=str(w_dir), file_stem_str='*_pp',
                    file_ext_str='html')
                # Stage 2: the first work file is the template whose repeated
                # fragments get removed from every file (itself included).
                base_filename = html_file_list[0]
                base_string = html_utils.make_a_string(filename=base_filename)
                m2 = re_tags.findall(base_string, overlapped=True)
                # m2 = regex.findall(, base_string, overlapped=True)
                # Filter out fragments rooted at a structural tag.
                m3 = []
                for tup in m2:
                    if tup[1] not in tag_omit_list:
                        omit_flag = False
                        for omit in tag_omit_list:
                            if '<' + omit in tup[1]:
                                omit_flag = True
                                break
                        if not omit_flag:
                            m3.append(tup)
                for next_filename in html_file_list:
                    next_string = html_utils.make_a_string(
                        filename=next_filename)
                    for tup in m3:
                        next_string = next_string.replace(tup[0], '')
                    # Each while-loop below re-runs its pattern until no
                    # matches remain, since removals can expose new matches.
                    m4 = re_comment.findall(next_string)
                    while m4:
                        for m in m4:
                            # print(m)
                            # sys.exit()
                            next_string = next_string.replace(m[0], '')
                        m4 = re_comment.findall(next_string)
                    m4 = re_li_a.findall(next_string)
                    while m4:
                        for m in m4:
                            # print(m)
                            # sys.exit()
                            next_string = next_string.replace(m[0], '')
                        m4 = re_li_a.findall(next_string)
                    #
                    # m4 = re_href.findall(next_string)
                    # while m4:
                    #     for m in m4:
                    #         # print(m)
                    #         # sys.exit()
                    #         next_string = next_string.replace(m[0], '')
                    #     m4 = re_href.findall(next_string)
                    #
                    m4 = re_head_foot.findall(next_string)
                    while m4:
                        for m in m4:
                            next_string = next_string.replace(m[0], '')
                        m4 = re_head_foot.findall(next_string)
                    #
                    m4 = re_input.findall(next_string)
                    while m4:
                        for m in m4:
                            next_string = next_string.replace(m[0], '')
                        m4 = re_input.findall(next_string)
                    #
                    m4 = re_empty.findall(next_string, overlapped=True)
                    while m4:
                        for m in m4:
                            next_string = next_string.replace(m[0], '')
                        m4 = re_empty.findall(next_string, overlapped=True)
                    next_soup = html_utils.make_a_soup(
                        html_doc_string=next_string)
                    next_fn = o_dir.joinpath(
                        'minimized_' + next_filename.stem.replace('_pp', '') +
                        '.html')
                    print('next fn: ' + str(next_fn))
                    file_utils.write_file(fn=next_fn, overwrite=True,
                                          content=next_soup.prettify())
                    # break
                print('Done with: ' + str(w_dir))
                # break
        else:
            print('data list not in config.')
def _load_id_list(folder_name):
    """Read every file in *folder_name* and return the collected ids as a list."""
    file_list = file_utils.get_file_list(folder_name)
    id_list = file_utils.get_data_info_id(file_list)
    return list(id_list.queue)


def data_process(config_dict):
    """Diff today's collected ids against the previous day's collection and
    store the added/removed id sets in the add/reduct folders.

    Skips the whole analysis when the add file already exists (the diff for
    today was done before).  If the previous day's collection is empty
    (collection may have failed that day), keeps walking back one more day
    until a non-empty collection is found.

    :param config_dict: dict providing "add_folder_name",
        "reduct_folder_name" and "data_list_folder_name".
    :return: None
    """
    add_filename = config_dict["add_folder_name"] + ADD_DATA_FILENAME
    reduce_filename = config_dict["reduct_folder_name"] + REDUCE_DATA_FILENAME
    data_list_folder_name = config_dict["data_list_folder_name"]
    if os.path.exists(add_filename):
        logging.info("数据分析:本次新增减少[add/reduct]文件已经存储:%s" % (add_filename))
        return
    data_type = cf.get("base_config", "data_type")
    root_path = cf.get("base_config", "root_path")

    # Baseline: walk backwards day by day until a day with a non-empty
    # collection is found.  (The repeated load block was extracted into
    # _load_id_list; logging.warn is deprecated -> logging.warning.)
    last_date_num = 1
    last_date = file_utils.get_last_date(last_date_num)
    curr_data_id_list = _load_id_list(
        config.get_last_root_path(root_path, data_type, last_date) +
        "/data_list/")
    while len(curr_data_id_list) == 0:
        logging.warning("数据分析:[%s]数据采集数量: %s" %
                        (last_date, len(curr_data_id_list)))
        last_date_num += 1
        last_date = file_utils.get_last_date(last_date_num)
        curr_data_id_list = _load_id_list(
            config.get_last_root_path(root_path, data_type, last_date) +
            "/data_list/")
    logging.info("数据分析:[%s]数据采集数量: %s" %
                 (last_date, len(curr_data_id_list)))

    # Today's collected ids.
    new_data_id_list = _load_id_list(data_list_folder_name)
    logging.info("数据分析:今天[%s]数据采集数量: %s" %
                 (file_utils.get_curr_date(), len(new_data_id_list)))

    # Ids present today but not in the baseline day.
    add_data = list(set(new_data_id_list).difference(set(curr_data_id_list)))
    file_utils.write_file(add_filename, str(add_data))
    logging.info("数据分析:本次新增数据:%s" % (len(add_data)))
    if len(add_data) > 0:
        logging.info("本次新增数据标识已经保存")

    # Ids present in the baseline day but missing today.
    reduce_data = list(set(curr_data_id_list).difference(set(new_data_id_list)))
    file_utils.write_file(reduce_filename, str(reduce_data))
    logging.info("数据分析:本次减少数据:%s" % (len(reduce_data)))
    if len(reduce_data) > 0:
        logging.info("本次减少数据标识已经保存")
# Entry setup for the incremental-collection run: load today's config,
# initialize logging, and exit early when today's additions are already done.
curr_date = file_utils.get_curr_date()
curr_root_path = config.get_curr_root_path(root_path, data_type, curr_date)
# 1. Read the base configuration; init.py must have been run beforehand.
if not os.path.exists(curr_root_path + config_filename):
    print("程序运行基础配置信息:%s:未初始化,请先运行init.py!" % (config_filename))
    sys.exit(0)
else:
    config_dict = config.get_config(root_path, data_type, curr_date)
# 2. Initialize logging to today's log file.
log_utils.log_config(curr_root_path + log_name)
# 3. Count the ids planned for collection today.
data_list_folder_name = config_dict["data_list_folder_name"]
file_list = file_utils.get_file_list(data_list_folder_name)
DATA_LIST = file_utils.get_data_info_id(file_list)
add_data_count = DATA_LIST.qsize()
logging.info("[data_info]采集日期=%s,计划新增数据采集数据总量=:%s" % (curr_date, add_data_count))
# data_info > save: count how many records have actually been collected.
data_info_save_folder_name = config_dict["data_info_save_folder_name"]
file_list = file_utils.get_file_list(data_info_save_folder_name)
data_info_count = file_utils.data_info_count(file_list)
logging.info("[data_info]采集日期=%s,实际新增数据采集数据总量=:%s" % (curr_date, data_info_count))
# Nothing left to do for today: stop the script.
if add_data_count == data_info_count:
    logging.info("采集日期=%s,新增数据采集已经完成!" % (curr_date))
    sys.exit(0)
def main():
    # For each configured data directory: pretty-print every input HTML file
    # into the work directory, then diff each remaining file against the
    # first one and write the per-file tag lists plus the matching-block
    # report to the output directory.
    global config
    config, base_dir_dict = setup_utils.setup()
    if not config:
        print('Failed to complete setup, exiting.')
        sys.exit()
    else:
        if 'data_list' in config:
            for data_dir in config['data_list']:
                # Resolve per-dataset input/work/output directories.
                if 'input' in base_dir_dict:
                    i_dir = base_dir_dict['input'].joinpath(data_dir).resolve()
                if 'work' in base_dir_dict:
                    w_dir = base_dir_dict['work'].joinpath(data_dir).resolve()
                if 'output' in base_dir_dict:
                    o_dir = base_dir_dict['output'].joinpath(data_dir).resolve()
                if not w_dir.is_dir():
                    w_dir.mkdir(parents=True)
                if not o_dir.is_dir():
                    o_dir.mkdir(parents=True)
                # root_path_str, file_stem_str='*', file_ext_str='*', recursive_bool=False, xclude_hidden_paths=True, rtn_abs_path_bool=True, rtn_uri=False
                print('Working in: ' + str(i_dir))
                html_file_list = file_utils.get_file_list(root_path_str=str(i_dir))
                # The first input file acts as the diff baseline.
                base_filename = html_file_list.pop(0)
                # # #
                base_soup = html_utils.make_a_soup(filename=base_filename)
                if not base_filename.suffix:
                    suff = '.html'
                else:
                    suff = base_filename.suffix
                pp_filename = w_dir.joinpath(str(base_filename.stem) + '_pp' + suff)
                file_utils.write_file(fn=pp_filename, overwrite=True, content=base_soup.prettify())
                # success = utils.html_write_ppsoup(base_soup, 'test/base_soup.html')
                for next_filename in html_file_list:
                    next_soup = html_utils.make_a_soup(filename=next_filename)
                    if not next_filename.suffix:
                        suff = '.html'
                    else:
                        suff = next_filename.suffix
                    pp_filename = w_dir.joinpath(str(next_filename.stem) + '_pp' + suff)
                    file_utils.write_file(fn=pp_filename, overwrite=True, content=next_soup.prettify())
                    # Diff the baseline soup against this file's soup.
                    base_list, next_list, block_match = html_utils.diff_a_soup(s1=base_soup, s2=next_soup)
                    base_list_filename = w_dir.joinpath(base_filename.stem + '_tabs' + '.txt')
                    next_list_filename = w_dir.joinpath(next_filename.stem + '_tabs' + '.txt')
                    file_utils.write_file(fn=base_list_filename, overwrite=True,
                                          # content=pprint.pformat(base_list))
                                          content=base_list)
                    file_utils.write_file(fn=next_list_filename, overwrite=True,
                                          # content=pprint.pformat(next_list))
                                          content=next_list)
                    # One matching-block report per (baseline, other) pair.
                    block_list_filename = o_dir.joinpath(base_filename.stem + '__' + next_filename.stem + '.txt')
                    file_utils.write_file(fn=block_list_filename, overwrite=True, content=pprint.pformat(block_match))
        else:
            print('data list not in config.')
#coding=utf-8
import shutil
from utils import file_utils
import os
"""
Delete every file inside the folder below (clean out a data_list directory).
"""
data_list_folder_name = 'E:/data/data_source/26/20190727/data_list'
file_list = file_utils.get_file_list(data_list_folder_name)
for file_name in file_list:
    # Fixed: Python 2 `print os.remove(file_name)` only printed the None
    # return value of os.remove.  Delete first, then report which file went.
    os.remove(file_name)
    print(file_name)