def save_data_list_to_disk2(page_index):
    page_size = 1000
    print("page_size=%s,page_index=%s" % (page_size, page_index))
    # print config_dict
    data_type = int(config_dict["data_type"])
    data_list_folder_name = config_dict["data_list_folder_name"]
    # data_list file name (one file per page, PAGE_SIZE records per page)
    data_list_filename = "data_list_%s_%s.json" % (data_type, page_index)
    data_list_filename = data_list_folder_name + data_list_filename
    # List-page URL (read from the config file)
    data_list_url = cf.get("access_url", "data_list_url")
    data_list_url = data_list_url.format(data_type, page_index, page_size)
    logging.debug("Fetch URL: %s" % data_list_url)
    # Fetch the data and save it locally
    data_list_data = access_data_utils.get_data(data_list_url)
    file_utils.write_file(data_list_filename, data_list_data)
    logging.debug("File written: %s" % data_list_filename)
    # NOTE: total_page_no is expected to be a module-level global here.
    logging.info("Page %s fetched, %s pages left, saved to: %s"
                 % (page_index, (total_page_no - page_index), data_list_filename))
    # Sleep 2 seconds so the server does not flag the requests as an attack
    time.sleep(2)
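# config_dict is supplied by the caller and is not defined in this section.
# Judging from the keys read above and below, a minimal example might look
# like this (the values are illustrative assumptions, not from the source):
config_dict = {
    "data_type": "26",                           # NMPA table id
    "get_type": "1",                             # fetch mode flag
    "data_list_folder_name": "data_list/",       # list-page JSON folder
    "data_info_save_folder_name": "data_info/",  # detail-page JSON folder
}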
def save_data_list_to_disk():
    # Records per page (i.e. each file holds 1000 records)
    page_size = 1000
    if total_count < page_size:
        page_size = total_count
    # Work out how many pages there are (ceiling division)
    if total_count % page_size == 0:
        total_page_no = total_count // page_size
    else:
        total_page_no = total_count // page_size + 1
    for index in range(total_page_no):
        page_index = index + 1
        # data_list file name
        data_list_filename = "data_list_%s_%s.json" % (
            dataTypeConfig.get_data_type(), page_index)
        data_list_filename = DATA_LIST_PATH + data_list_filename
        # List-page URL
        app_list_url = dataTypeConfig.get_data_list_url()
        app_list_url = app_list_url.format(dataTypeConfig.get_data_type(),
                                           page_index, page_size)
        # Fetch the data and save it locally
        data_list_data = access_data_utils.get_data(app_list_url)
        file_utils.write_file(data_list_filename, data_list_data)
        time.sleep(2)
def get_curr_nmpa_total_count(data_type):
    try:
        # Build the URL (page 1 with page size 1 is enough to read the count)
        data_list_url = cf.get("access_url", "data_list_url")
        data_list_url = data_list_url.format(data_type, 1, 1)
        # Fetch the data
        data_list_data = access_data_utils.get_data(data_list_url)
        jsonData = json.loads(data_list_data)
        return int(jsonData[0]["COUNT"])
    except BaseException:
        # Re-raise so the caller decides how to handle failures
        raise
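# The functions above read their URLs via cf.get("access_url", ...), i.e.
# from an INI file loaded with ConfigParser. A minimal sketch of that setup;
# the file name and the URL templates are assumptions (only the section and
# option names appear in the code), with placeholders {0}/{1}/{2} matching
# the .format() calls above:
#
#   [access_url]
#   data_list_url = http://mobile.cfda.gov.cn/datasearch/QueryList?tableId={0}&pageIndex={1}&pageSize={2}
#   data_info_url = http://mobile.cfda.gov.cn/datasearch/QueryContent?tableId={0}&id={1}
import ConfigParser

cf = ConfigParser.ConfigParser()
cf.read("config.ini")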
def save_data_list_to_disk(config_dict):
    data_type = int(config_dict["data_type"])
    get_type = int(config_dict["get_type"])  # read here but not used in this function
    data_list_folder_name = config_dict["data_list_folder_name"]
    if file_utils.clear_folder(data_list_folder_name):
        logging.info("Cleared folder: %s" % data_list_folder_name)
    begin_time = time.time()
    logging.info("Fetch started at: %s"
                 % time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    # Current total record count on the official site
    total_count = comm_utils.get_curr_nmpa_total_count(data_type)
    # Records per page (i.e. each file holds 1000 records)
    page_size = 1000
    if total_count < page_size:
        page_size = total_count
    # Work out how many pages there are (ceiling division)
    if total_count % page_size == 0:
        total_page_no = total_count // page_size
    else:
        total_page_no = total_count // page_size + 1
    logging.info("Current NMPA site data: data_type=%s, total=%s, page size=%s, %s pages in total"
                 % (data_type, total_count, page_size, total_page_no))
    for index in range(total_page_no):
        page_index = index + 1
        # data_list file name (one file per page, PAGE_SIZE records per page)
        data_list_filename = "data_list_%s_%s.json" % (data_type, page_index)
        data_list_filename = data_list_folder_name + data_list_filename
        # List-page URL (read from the config file)
        data_list_url = cf.get("access_url", "data_list_url")
        data_list_url = data_list_url.format(data_type, page_index, page_size)
        logging.debug("Fetch URL: %s" % data_list_url)
        # Fetch the data and save it locally
        data_list_data = access_data_utils.get_data(data_list_url)
        file_utils.write_file(data_list_filename, data_list_data)
        logging.debug("File written: %s" % data_list_filename)
        logging.info("Page %s fetched, %s pages left, saved to: %s"
                     % (page_index, (total_page_no - page_index), data_list_filename))
        # Sleep 2 seconds so the server does not flag the requests as an attack
        time.sleep(2)
    end_time = time.time()
    logging.info("Fetch finished at: %s"
                 % time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    logging.info("Total fetch time: %s seconds" % (end_time - begin_time))
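# file_utils is external to this section. A hypothetical sketch of the two
# helpers with the behavior the call sites above assume: clear_folder returns
# a truthy value when it removed something (used to guard the "cleared folder"
# log line), and write_file overwrites one file per page. This is not the
# project's actual implementation:
import os

def clear_folder(folder_name):
    # Delete every regular file directly under folder_name;
    # return True if anything was removed.
    removed = False
    for name in os.listdir(folder_name):
        path = os.path.join(folder_name, name)
        if os.path.isfile(path):
            os.remove(path)
            removed = True
    return removed

def write_file(filename, data):
    # Overwrite filename with data (one JSON list page per file).
    with open(filename, "w") as f:
        f.write(data)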
def get_data_info(thread_name, config_dict):
    # Folder for newly fetched detail data
    data_info_save_folder_name = config_dict["data_info_save_folder_name"]
    while not DATA_LIST.empty():
        try:
            data_id = DATA_LIST.get()
            info = "Thread %s: %d ids still to fetch" % (thread_name, DATA_LIST.qsize())
            logging.info(info)
            # data_info file name (one file per thread)
            save_filename = "data_info_thread_%s.json" % thread_name
            save_filename = data_info_save_folder_name + save_filename
            # Detail-page URL
            data_info_url = cf.get("access_url", "data_info_url")
            data_info_url = data_info_url.format(config_dict["data_type"], data_id)
            # Fetch the data and save it locally
            data_info_data = access_data_utils.get_data(data_info_url)
            data_info_data = data_info_data.replace(
                "\\n\\r", "").decode("gbk").encode("utf-8")
            # Store the record id together with the data
            data = str(data_id) + "==" + data_info_data
            with open(save_filename, 'a') as f:
                f.writelines(data + "\n")
            info = save_filename + " written, id: " + str(data_id)
            logging.debug(info)
            # Sleep 2 seconds so the server does not flag the requests as an attack
            time.sleep(2)
        except urllib2.URLError as e:
            # Put the id back so another attempt can pick it up
            DATA_LIST.put(data_id)
            print("URLError")
            logging.error("URLError")
            logging.error(e)
        except UnboundLocalError as e:
            print("UnboundLocalError")
            logging.error("UnboundLocalError")
            logging.error(e)
def get_data_info(NEW_DATA_LIST, config_dict):
    # Folder for newly fetched detail data
    data_info_save_folder_name = config_dict["data_info_save_folder_name"]
    # Clear the data_info save folder
    if file_utils.clear_folder(data_info_save_folder_name):
        logging.info("Cleared folder: %s" % data_info_save_folder_name)
    while not NEW_DATA_LIST.empty():
        try:
            data_id = NEW_DATA_LIST.get()
            info = "%d ids still to fetch" % NEW_DATA_LIST.qsize()
            logging.info(info)
            # data_info file name
            save_filename = "get_new_data.json"
            save_filename = data_info_save_folder_name + save_filename
            # Detail-page URL
            data_info_url = cf.get("access_url", "data_info_url")
            data_info_url = data_info_url.format(config_dict["data_type"], data_id)
            # Fetch the data and save it locally
            data_info_data = access_data_utils.get_data(data_info_url)
            data_info_data = data_info_data.replace(
                "\\n\\r", "").decode("gbk").encode("utf-8")
            # Store the record id together with the data
            data = str(data_id) + "==" + data_info_data
            with open(save_filename, 'a') as f:
                f.writelines(data + "\n")
            info = save_filename + " written, id: " + str(data_id)
            logging.debug(info)
            # Sleep 2 seconds so the server does not flag the requests as an attack
            time.sleep(2)
        except urllib2.URLError as e:
            logging.error("Failed to fetch new data! %s" % (e.args,))
            raise
def get_data_info(thread_name):
    while not id_list.empty():
        try:
            data_id = id_list.get()
            info = "Thread %s: %d ids still to fetch" % (thread_name, id_list.qsize())
            print(info)
            # data_info file name
            data_info_filename = "data_info_thread_%s.json" % thread_name
            data_info_filename = DATA_INFO_PATH + data_info_filename
            # Detail-page URL
            data_info_url = dataTypeConfig.get_data_info_url()
            data_info_url = data_info_url.format(
                dataTypeConfig.get_data_type(), data_id)
            # Fetch the data and save it locally
            data_info_data = access_data_utils.get_data(data_info_url)
            data_info_data = data_info_data.replace(
                "\\n\\r", "").decode("gbk").encode("utf-8")
            with open(data_info_filename, 'a') as f:
                f.writelines(data_info_data + "\n")
            info = data_info_filename + " written, id: " + str(data_id)
            print(info)
            # Sleep 1 second so the server does not flag the requests as an attack
            time.sleep(1)
        except urllib2.URLError as e:
            # Put the id back so another attempt can pick it up
            id_list.put(data_id)
            print("URLError")
            logging.error("URLError")
            logging.error(e)
        except UnboundLocalError as e:
            print("UnboundLocalError")
            logging.error("UnboundLocalError")
            logging.error(e)
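# The thread-based get_data_info variants above pull ids from a shared queue
# (DATA_LIST / id_list) and are meant to run as worker threads. A minimal
# driver sketch for the single-argument variant, assuming Python 2 (Queue
# module) and hypothetical record ids; the worker count and thread names are
# illustrative:
import threading
from Queue import Queue

id_list = Queue()
for data_id in ["1001", "1002", "1003"]:  # hypothetical record ids
    id_list.put(data_id)

threads = []
for n in range(4):  # 4 workers, illustrative
    t = threading.Thread(target=get_data_info, args=(str(n),))
    t.start()
    threads.append(t)
for t in threads:
    t.join()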
from utils import access_data_utils

print access_data_utils.get_data(
    "http://mobile.cfda.gov.cn/datasearch/QueryList?pageIndex=15701&pageSize=10&tableId=26&searchF=Quick%20"
)
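# access_data_utils is also external to this section. Given the
# urllib2.URLError handling and the gbk-encoded responses above, get_data
# plausibly looks something like this hypothetical sketch (not the project's
# actual implementation; the timeout value is assumed):
import urllib2

def get_data(url, timeout=30):
    # Fetch url and return the raw response body; callers decode gbk
    # themselves where needed.
    response = urllib2.urlopen(url, timeout=timeout)
    try:
        return response.read()
    finally:
        response.close()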