async def _handle_al_goods_is_delete(self, goods_id) -> dict:
    """
    Handle a goods page that cannot be viewed or has been pulled off the shelves.

    The goods_id is looked up with ``al_select_str_1``; when a row already
    exists ``al_update_str_1`` is executed for it, otherwise the situation is
    only logged. ``self.result_data`` is reset in both cases.

    :param goods_id: id of the goods being processed
    :return: the dict built by ``_init_al_pull_off_shelves_goods`` plus a
        boolean ``'before'`` key (whether the goods was already in the db),
        or the result of ``_data_error_init`` when the db lookup raised
    """
    try:
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        is_in_db = sql_cli._select_table(
            sql_str=al_select_str_1,
            params=(str(goods_id),),
        )
    except Exception:
        self.lg.error('数据库连接失败!' + self.error_base_record, exc_info=True)
        return await self._data_error_init()

    self.result_data = {}
    # Default attributes of a pulled-off-shelves goods.
    tmp_data_s = await self._init_al_pull_off_shelves_goods()

    existed_before = is_in_db != []
    if existed_before:
        # Already stored: only its is_delete state has to be updated.
        # params must be a 1-tuple; the original passed `(goods_id)`, which
        # is just the bare value in parentheses.
        sql_cli._update_table_2(
            sql_str=al_update_str_1,
            params=(goods_id,),
            logger=self.lg,
        )
        self.lg.info('@@@ 该商品goods_id原先存在于db中, 此处将其is_delete=1')
    else:
        self.lg.info('@@@ 该商品已下架[但未存在于db中], ** 此处将其插入到db中...')

    # Lets the caller tell whether the goods was in the db beforehand.
    tmp_data_s['before'] = existed_before

    return tmp_data_s
class JDUpdater(AsyncCrawler):
    """Periodic updater for regular jd goods stored in the db."""

    # site_id -> type flag that is put in front of the goods_id before it is
    # handed to the jd parser.
    _SITE_ID_2_JD_TYPE = {
        7: 0,
        8: 0,
        9: 1,
        10: 2,
    }

    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/jd/实时更新/'
        )
        self.sql_cli = None
        self.goods_index = 1
        # Number of goods updated concurrently.
        self.concurrency = 10

    async def _get_db_old_data(self):
        """
        Open a fresh db connection and load the rows that need updating.

        :return: list of rows, or None when the select raised TypeError
        """
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            result = list(self.sql_cli._select_table(sql_str=jd_select_str_1))
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')

        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_new_jd_obj(self, index):
        """Replace ``self.jd`` with a fresh parser on every 10th index."""
        if index % 10 != 0:
            return

        # The parser object must not be shared for too long, otherwise the
        # driver access misbehaves (per the original author's note).
        try:
            del self.jd
        except AttributeError:
            # No parser has been created yet.
            pass
        collect()
        self.jd = JdParse(logger=self.lg)

    async def _get_tmp_item(self, site_id, goods_id):
        """
        Build the ``[jd_type, goods_id]`` list expected by the jd parser.

        For a site_id other than 7/8/9/10 only ``[goods_id]`` is returned.
        """
        tmp_item = []
        jd_type = self._SITE_ID_2_JD_TYPE.get(site_id)
        if jd_type is not None:
            tmp_item.append(jd_type)
        tmp_item.append(goods_id)

        return tmp_item

    async def _update_one_goods_info(self, db_goods_info_obj, index):
        """
        Update a single jd goods.

        :param db_goods_info_obj: db row wrapper exposing goods_id and site_id
        :param index: running index of this goods
        :return: tuple (goods_id, index + 1)
        """
        goods_id = db_goods_info_obj.goods_id
        await self._get_new_jd_obj(index=index)
        self.sql_cli = await _get_new_db_conn(
            db_obj=self.sql_cli,
            index=index,
            logger=self.lg)

        if self.sql_cli.is_connect_success:
            self.lg.info('------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.format(
                goods_id,
                index))
            tmp_item = await self._get_tmp_item(
                site_id=db_goods_info_obj.site_id,
                goods_id=goods_id,)
            data = self.jd.get_goods_data(goods_id=tmp_item)
            # NOTE(review): the default of 1 means a response without an
            # 'is_delete' key is treated as pulled off the shelves - confirm
            # that this is intended.
            if data.get('is_delete', 1) == 1:
                self.lg.info('该商品已下架...')
                self.sql_cli._update_table_2(
                    sql_str=jd_update_str_2,
                    params=(str(get_shanghai_time()), tmp_item[1],),
                    logger=self.lg)
                await async_sleep(1.2)
                index += 1
                self.goods_index = index

                return goods_id, index

            data = self.jd.deal_with_data(goods_id=tmp_item)
            if data != {}:
                data = get_goods_info_change_data(
                    target_short_name='jd',
                    logger=self.lg,
                    data=data,
                    db_goods_info_obj=db_goods_info_obj,)
                self.jd.to_right_and_update_data(data, pipeline=self.sql_cli)
            # An empty dict means nothing could be parsed: nothing to store.
        else:
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')

        index += 1
        self.goods_index = index
        collect()
        # Throttle between requests.
        await async_sleep(1.2)

        return goods_id, index

    async def _update_db(self):
        """Endless loop: load the db rows and update them slice by slice."""
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is not None:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(
                    tasks_params_list=result,
                    step=self.concurrency)
                self.jd = JdParse(logger=self.lg)
                index = 1
                while True:
                    try:
                        slice_params_list = next(tasks_params_list)
                    except AssertionError:
                        # Raised once every slice has been handed out.
                        break

                    tasks = []
                    for item in slice_params_list:
                        db_goods_info_obj = JDDbGoodsInfoObj(item=item, logger=self.lg)
                        self.lg.info('创建 task goods_id: {}'.format(db_goods_info_obj.goods_id))
                        tasks.append(self.loop.create_task(self._update_one_goods_info(
                            db_goods_info_obj=db_goods_info_obj,
                            index=index)))
                        index += 1

                    await _get_async_task_result(tasks=tasks, logger=self.lg)

                self.lg.info('全部数据更新完毕'.center(100, '#'))

            if get_shanghai_time().hour == 0:
                # No updates after midnight: pause for 5.5 hours.
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(5.5)

            try:
                del self.jd
            except AttributeError:
                pass
            collect()

    def __del__(self):
        for attr_name in ('lg', 'loop'):
            try:
                delattr(self, attr_name)
            except AttributeError:
                pass
        collect()
def get_goods_data(self, goods_id):
    """
    Fetch the main detail api for a taobao goods and return its cleaned data.

    :param goods_id: taobao goods id
    :return: the cleaned ``data`` dict of the api response (also stored in
        ``self.result_data``); the pulled-off-shelves dict when the goods was
        redirected; otherwise the result of ``self._data_error_init()``
    """
    self.msg = '------>>>| 对应的手机端地址为: ' + 'https://h5.m.taobao.com/awp/core/detail.htm?id=' + str(goods_id)
    self.lg.info(self.msg)

    # Body of the main api.
    last_url = self._get_last_url(goods_id=goods_id)
    data = Requests.get_url_body(
        url=last_url,
        headers=self.headers,
        params=None,
        timeout=14,
        high_conceal=True,
        ip_pool_type=self.ip_pool_type)
    if data == '':
        self.lg.error('出错goods_id: {0}'.format((goods_id)))
        return self._data_error_init()

    try:
        # Greedy match: everything inside the jsonp wrapper.
        data = re.compile(r'mtopjsonp1\((.*)\)').findall(data)[0]
    except IndexError:
        self.lg.error('data为空! 出错goods_id: {0}'.format(goods_id))
        return self._data_error_init()

    data = json_2_dict(json_str=data, logger=self.lg)
    if data == {}:
        self.lg.error('出错goods_id: {0}'.format(str(goods_id)))
        return self._data_error_init()

    # `or {}` also covers a missing or null 'data' key; the original second
    # check called .get() on data.get('data') without a default.
    inner = data.get('data') or {}
    evaluates = inner.get('seller', {}).get('evaluates')
    redirect_url = inner.get('trade', {}).get('redirectUrl', '')
    if redirect_url != '' and evaluates is None:
        # The goods was pulled off the shelves: its address is redirected.
        self.lg.info('@@@@@@ 该商品已经下架...')
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        if sql_cli.is_connect_success:
            sql_cli._update_table_2(
                sql_str=tb_update_str_3,
                params=(goods_id, ),
                logger=self.lg)
        del sql_cli
        tmp_data_s = self.init_pull_off_shelves_goods()
        self.result_data = {}

        return tmp_data_s

    # Goods that was moved or removed, so its page no longer exists.
    if evaluates is None:
        self.lg.info('data为空, 地址被重定向, 该商品可能已经被转移或下架')
        return self._data_error_init()

    # Blank out bulky fields that are not needed downstream.
    result_data = data['data']
    result_data['rate'] = ''                    # goods reviews
    result_data['resource'] = ''                # buyers' questions
    result_data['vertical'] = ''                # questions and answers
    result_data['seller']['evaluates'] = ''     # description/service/logistics scores

    api_stack = result_data.get('apiStack') or []
    if not api_stack:
        # Without this guard apiStack[0] raises IndexError.
        self.lg.error('出错goods_id: {0}'.format(goods_id))
        return self._data_error_init()

    # Clean apiStack[0]['value'] and write it back.
    api_stack[0]['value'] = self._wash_result_data_apiStack_value(
        goods_id=goods_id,
        result_data_apiStack_value=api_stack[0].get('value', {}))
    result_data['apiStack'] = api_stack

    # mockData arrives as a json string.
    mock_data = json_2_dict(json_str=result_data['mockData'], logger=self.lg)
    if mock_data == {}:
        self.lg.error('出错goods_id: {0}'.format(goods_id))
        return self._data_error_init()
    mock_data['feature'] = ''
    result_data['mockData'] = mock_data

    # {'name': 'esi', 'value': ''} can occur.
    if api_stack[0].get('value', '') == '':
        self.lg.info("result_data.get('apiStack', [])[0].get('value', '')的值为空....")
        result_data['trade'] = {}
        return self._data_error_init()

    # 'trade' is later used to decide whether the goods is off the shelves.
    result_data['trade'] = api_stack[0].get('value', {}).get('trade', {})
    self.result_data = result_data

    return result_data
class GX8899Spider(Crawler):
    """Crawl avatar images from m.gx8899.com and write them to dbo.sina_weibo rows."""

    def __init__(self, logger=None):
        super(GX8899Spider, self).__init__(
            ip_pool_type=IP_POOL_TYPE,
            log_print=True,
            logger=logger,
            log_save_path=MY_SPIDER_LOGS_PATH + '/gx8899/_/',

            is_use_driver=True,
            driver_executable_path=PHANTOMJS_DRIVER_PATH
        )
        self._set_sort_type_name()
        self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        self.update_sql = 'update dbo.sina_weibo set head_img_url=%s, modify_time=%s where id=%s'
        # ids still waiting for a new avatar.
        self.id_list = []
        # Number of ids updated so far.
        self.update_index = 0

    def _set_sort_type_name(self):
        """Set the category names that are crawled."""
        self.sort_type_name_list = [
            # 'weixin',
            # 'nansheng',
            # 'nvsheng',
            'fengjing',
            'jingxuan',
            'wupin',
            'oumei',
            'weimei',
            'heibai',
            'baqi',
            'xiaoqingxin',
            'yijing',
            'beiying',
            'chouyan',
            'sumiao',
            'gexing',
            'xiaohai',
            'qiche',
            'zhiwu',
            'shouhui',
            'weshen',
            'mingxing',
            'jianzhu',
            'renwu',
        ]

    def _get_gx8899_all_img_url(self):
        """Crawl every configured category; return (and store in ``self.fz``) all image urls."""
        self.lg.info('即将开始采集gx8899...')
        fz = []
        for sort_type_name in self.sort_type_name_list:
            tmp = self._get_one_sort_type_name_page_info(sort_type_name)
            if tmp != []:
                fz += tmp

        self.lg.info('@@@ 全部头像抓取完毕!')
        self.fz = fz

        return fz

    def _get_new_wait_2_handle_id_list(self):
        """
        Return the ids waiting to be handled, reloading them from the db when
        the cached list is empty.

        :return: ``self.id_list``, or [] when the reload failed
        """
        sql_str = ''' select top 1000 id from dbo.sina_weibo where sina_type = 'bilibili' and modify_time is null '''
        if self.id_list == []:
            self.lg.info('@@@ 重新获取id_list...')
            self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
            try:
                wait = self.my_pipeline._select_table(sql_str=sql_str)
                self.id_list = [i[0] for i in wait]
            except (TypeError, IndexError):
                # `except TypeError or IndexError` only ever caught
                # TypeError; a tuple is needed to catch both.
                sleep(8)
                return []

        return self.id_list

    @fz_set_timeout(6)
    def oo(self, id, img_url):
        """
        Write ``img_url`` and the current time to the row with this id.

        :return: True unless the update raised
        """
        try:
            self.my_pipeline._update_table_2(
                sql_str=self.update_sql,
                params=(img_url, get_shanghai_time(), id),
                logger=self.lg
            )
        except Exception:
            return False

        return True

    def _get_one_sort_type_name_page_info(self, sort_type_name):
        """
        Walk every list page of one category until the 404 page is reached.

        :param sort_type_name: category name used in the url
        :return: list of all image urls found in that category
        """
        base_url = 'http://m.gx8899.com/{0}/'.format(sort_type_name)
        not_found_re = re.compile(r'<title>404 - 找不到文件或目录。</title>')

        index = 0
        res = []
        while True:
            if index == 0:
                url = base_url
                # The second page is index_2.html.
                index += 1
            else:
                url = base_url + 'index_{0}.html'.format(index)

            self.lg.info('正在抓取{0}'.format(url))
            if index % 15 == 0:
                # Periodically replace the phantomjs driver.
                try:
                    del self.driver
                except AttributeError:
                    pass
                gc.collect()
                self.driver = BaseDriver(
                    executable_path=PHANTOMJS_DRIVER_PATH,
                    logger=self.lg,
                    ip_pool_type=self.ip_pool_type)
                self.lg.info('[+] phantomjs已重置!')

            body = self.driver.get_url_body(url=url)
            if not_found_re.findall(body) != []:
                break

            need = Selector(text=body).css('div#con_tabone_1 li.last a:last-child ::attr(href)').extract()
            pprint(need)
            if need == []:
                self.lg.error('获取到的need为空list!出错地址:{0}'.format(url))
                # NOTE(review): index is not advanced here, so the same url
                # is retried without limit while it keeps yielding no
                # links - confirm this retry is intended.
                continue

            for article_url in need:
                _ = self._get_one_article_page_info(article_url)
                if _ != []:
                    res += _

                self.lg.info('#### 已更新{0}个id !'.format(self.update_index))

            index += 1

        return res

    def _get_one_article_page_info(self, url):
        """
        Collect all image urls of one article page and use each of them to
        update a randomly chosen waiting id.

        :param url: article url
        :return: list of image urls ([] when the page body is empty)
        """
        body = self.driver.get_url_body(url=url)
        if body == '':
            self.lg.info('获取到img list为空list!出错地址:{}'.format(url))
            return []

        need = Selector(text=body).css('div.content p img ::attr(src)').extract()
        if need != []:
            self.lg.info('[+] crawl子地址success')
        else:
            self.lg.info('[-] crawl子地址fail')

        # Write the images to the db.
        for img_url in need:
            try:
                # randint raises ValueError when no id is waiting.
                random_id_index = randint(0, len(self._get_new_wait_2_handle_id_list()) - 1)
            except Exception:
                sleep(5)
                continue

            res = self.oo(
                id=self.id_list[random_id_index],
                img_url=img_url,
            )
            if res:
                self.id_list.pop(random_id_index)
                self.update_index += 1

        return need

    async def _get_one_page_body(self, url, headers):
        """Fetch the body of ``url`` asynchronously."""
        body = await AioHttp.aio_get_url_body(
            url=url,
            headers=headers,
            ip_pool_type=self.ip_pool_type)

        return body

    def _get_loop_run_result(self, **kwargs):
        """Run ``_get_one_page_body`` to completion on the event loop; accepts url and headers."""
        loop = get_event_loop()
        result = loop.run_until_complete(self._get_one_page_body(
            url=kwargs.get('url', ''),
            headers=kwargs.get('headers', {})
        ))

        return result

    def __del__(self):
        # Delete each attribute separately so that a missing driver does not
        # prevent the logger from being released.
        for attr_name in ('driver', 'lg'):
            try:
                delattr(self, attr_name)
            except AttributeError:
                pass
        gc.collect()
class YXGoodsInfoMonitorSpider(AsyncCrawler):
    """cp goods info monitor: compares the prices on m.yiuxiu.com with the db."""

    def __init__(self):
        AsyncCrawler.__init__(
            self,
            ip_pool_type=IP_POOL_TYPE,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/cp/yx_goods_monitor/',
        )
        self.req_num_retries = 5
        self.concurrency = 100
        self.concurrent_type = 1
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        self.init_sql_str()

    async def _fck_run(self):
        """Endless loop: reload the db rows, check every goods, pause 10s."""
        while True:
            self.db_res = await self.get_db_res()
            await self.get_all_goods_info_and_handle_by_goods_id_list(
                goods_id_list=[item[0] for item in self.db_res],
            )
            await async_sleep(10.)

    async def get_db_res(self) -> list:
        """
        Load the rows selected by ``self.sql_tr0``.

        :return: non-empty list of rows
        :raises AssertionError: when nothing could be loaded
        """
        db_res = []
        try:
            db_res = list(self.sql_cli._select_table(sql_str=self.sql_tr0, ))
        except Exception:
            self.lg.error('遇到错误:', exc_info=True)

        assert db_res != []

        return db_res

    async def get_all_goods_info_and_handle_by_goods_id_list(
            self,
            goods_id_list: list):
        """
        Fetch the goods info for every goods_id and handle each result.

        :param goods_id_list: non-empty list of goods ids
        :return: the collected results
        :raises AssertionError: when goods_id_list is empty
        """
        def get_create_task_msg(k) -> str:
            return 'create task[where goods_id: {}] ...'.format(
                k['goods_id'],
            )

        def get_now_args(k) -> list:
            return [
                k['goods_id'],
            ]

        assert goods_id_list != []
        tasks_params_list = [
            {'goods_id': goods_id}
            for goods_id in goods_id_list
        ]
        all_res = await get_or_handle_target_data_by_task_params_list(
            loop=self.loop,
            tasks_params_list=tasks_params_list,
            func_name_where_get_create_task_msg=get_create_task_msg,
            func_name=self.get_goods_info_by_goods_id,
            func_name_where_get_now_args=get_now_args,
            func_name_where_handle_one_res=self.handle_one_res,
            func_name_where_add_one_res_2_all_res=default_add_one_res_2_all_res2,
            one_default_res={},
            step=self.concurrency,
            logger=self.lg,
            concurrent_type=self.concurrent_type,
        )

        return all_res

    def handle_one_res(self, one_res) -> None:
        """
        Compare each fetched goods with its db row and run ``self.sql_tr1``
        when both fetched prices differ from the stored price.

        :param one_res: iterable of dicts as returned by
            ``get_goods_info_by_goods_id``
        """
        # Reconnect on every call.
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        if not self.sql_cli.is_connect_success:
            return

        for item in one_res:
            try:
                goods_id = item.get('goods_id', '')
                assert goods_id != ''
                # Member price.
                tb_price0 = item.get('tb_price0', 0.)
                assert tb_price0 != 0.
                # Price including the "youdian" points.
                tb_price1 = item.get('tb_price1', 0.)
            except AssertionError:
                continue

            # A separate name is used for the db row: the original reused
            # `item` and shadowed the outer loop variable.
            for db_row in self.db_res:
                if goods_id != db_row[0]:
                    continue

                # Only the matching row is converted.
                db_tb_price = round(float(db_row[1]), 2)
                site_id = db_row[3]
                modify_time = db_row[4]
                goods_url = db_row[5]
                # Both prices must differ before the row is flagged, because
                # part of the member prices are displayed wrongly on cp.
                if tb_price1 != db_tb_price and tb_price0 != db_tb_price:
                    now_time = get_shanghai_time()
                    cp_url = 'https://m.yiuxiu.com/Product/Info/{}'.format(
                        goods_id)
                    self.lg.info(
                        'goods_id: {}, 优点价格: {}, 会员价格: {}, db_tb_price: {}, site_id: {}, modify_time: {}, cp_url: {}, goods_url: {}'
                        .format(
                            goods_id,
                            tb_price1,
                            tb_price0,
                            db_tb_price,
                            site_id,
                            modify_time,
                            cp_url,
                            goods_url,
                        ))
                    self.sql_cli._update_table_2(
                        sql_str=self.sql_tr1,
                        params=(
                            now_time,
                            now_time,
                            now_time,
                            goods_id,
                        ),
                        logger=self.lg,
                    )
                # Only the first row with this goods_id is considered.
                break

        return None

    @catch_exceptions_with_class_logger(default_res={})
    def get_goods_info_by_goods_id(self, goods_id: str) -> dict:
        """
        Fetch the goods page and parse its two prices.

        :param goods_id:
        :return: dict with 'goods_id', 'tb_price0' (member price) and
            'tb_price1' (big price + points / 100)
        :raises AssertionError: when the body or one of the fields is empty
        """
        def parse_body(body) -> dict:
            """Parse the prices out of the page body."""
            # Lowest price over all specs. The member price is displayed
            # wrongly for part of the goods, so the big price plus the
            # points is parsed as well and both are used.
            tb_price_sel = {
                'method': 'css',
                'selector': 'div.goodsPriceTips span:nth-child(2) ::text',
            }
            big_price_sel = {
                'method': 'css',
                'selector': 'div.goodsPrice big ::text',
            }
            yd_sel = {
                'method': 're',
                # Raw string: \+ and \d are invalid escapes in a plain literal.
                'selector': r'<span class="yiudianPrice">\+(\d+)优点</span>'
            }
            tb_price0 = parse_field(
                parser=tb_price_sel,
                target_obj=body,
            )
            assert tb_price0 != ''
            big_price = parse_field(
                parser=big_price_sel,
                target_obj=body,
            )
            assert big_price != ''
            yd = parse_field(
                parser=yd_sel,
                target_obj=body,
            )
            assert yd != ''

            return {
                'goods_id': goods_id,
                # Member price.
                'tb_price0': round(float(tb_price0), 2),
                # Price including points.
                'tb_price1': round(float(big_price) + float(yd) / 100, 2),
            }

        headers = get_random_headers(
            user_agent_type=1,
            connection_status_keep_alive=False,
        )
        headers.update({
            'authority': 'm.yiuxiu.com',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-user': '******',
            'sec-fetch-site': 'none',
        })
        url = 'https://m.yiuxiu.com/Product/Info/{}'.format(goods_id)
        body = Requests.get_url_body(
            url=url,
            headers=headers,
            ip_pool_type=self.ip_pool_type,
            num_retries=self.req_num_retries,
            proxy_type=PROXY_TYPE_HTTPS,
        )
        assert body != ''

        res = parse_body(body)
        self.lg.info('[{}] goods_id: {}'.format(
            '+' if res != {} else '-',
            goods_id,
        ))

        return res

    def init_sql_str(self):
        """Set the select (``sql_tr0``) and update (``sql_tr1``) statements."""
        self.sql_tr0 = ''' select MainGoodsID, TaoBaoPrice, Price, SiteID, ModfiyTime, GoodsUrl from dbo.GoodsInfoAutoGet where MainGoodsID is not null and IsDelete=0 '''
        self.sql_tr1 = ''' update dbo.GoodsInfoAutoGet set is_spec_change=1, spec_trans_time=%s, ModfiyTime=%s, IsPriceChange=1, sku_info_trans_time=%s, PriceChangeInfo=SKUInfo where MainGoodsID=%s '''

    def __del__(self):
        for attr_name in ('lg', 'loop'):
            try:
                delattr(self, attr_name)
            except AttributeError:
                pass
        collect()
def get_ali_1688_data(self, goods_id):
    """
    Fetch the mobile 1688 page of a goods and extract its price data.

    :param goods_id: 1688 offer id ('' is rejected)
    :return: the cleaned data dict (also stored in ``self.result_data``);
        for a goods that is off the shelves the dict from
        ``init_pull_off_shelves_goods`` with a boolean 'before' key;
        otherwise the result of ``self._data_error_init()``
    """
    if goods_id == '':
        return self._data_error_init()

    wait_to_deal_with_url = 'https://m.1688.com/offer/' + str(goods_id) + '.html'
    self.my_lg.info('------>>>| 待处理的阿里1688地址为: {0}'.format(wait_to_deal_with_url))

    self.error_base_record = '出错goods_id:{0}'.format(goods_id)
    body = self.my_phantomjs.use_phantomjs_to_get_url_body(
        url=wait_to_deal_with_url,
        css_selector='div.d-content')
    if body == '':
        self.my_lg.error('获取到的body为空str!请检查!' + self.error_base_record)
        return self._data_error_init()

    page_body = body
    try:
        pull_off_shelves = Selector(text=page_body).css('div.d-content p.info::text').extract_first()
    except Exception:
        pull_off_shelves = ''

    if pull_off_shelves == '该商品无法查看或已下架':
        # The goods is off the shelves; it is still recorded.
        try:
            tmp_my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
            is_in_db = tmp_my_pipeline._select_table(
                sql_str=al_select_str_1,
                params=(str(goods_id), ))
        except Exception:
            self.my_lg.error('数据库连接失败!' + self.error_base_record, exc_info=True)
            return self._data_error_init()

        existed_before = is_in_db != []
        if existed_before:
            # Already stored: only its is_delete state has to be updated.
            # params must be a 1-tuple; `(goods_id)` is just the bare value.
            tmp_my_pipeline._update_table_2(
                sql_str=al_update_str_1,
                params=(goods_id,),
                logger=self.my_lg)
            self.my_lg.info('@@@ 该商品goods_id原先存在于db中, 此处将其is_delete=1')
        else:
            self.my_lg.info('@@@ 该商品已下架[但未存在于db中], ** 此处将其插入到db中...')

        # Default attributes of a pulled-off-shelves goods.
        tmp_data_s = self.init_pull_off_shelves_goods()
        # Lets the caller tell whether the goods was in the db beforehand.
        tmp_data_s['before'] = existed_before
        self.result_data = {}

        return tmp_data_s

    # Regular goods: the json blob starts with "beginAmount".
    matched = re.compile(r'{"beginAmount"(.*?)</script></div></div>').findall(page_body)
    is_activity = False
    prefix = r'{"beginAmount"'
    if matched == []:
        # Goods taking part in a short-term promotion: the blob starts with
        # "activityId" and carries the promotion price.
        self.my_lg.info('解析ing..., 该商品正在参与火拼, 此处为火拼价, 为短期活动价格!')
        matched = re.compile(r'{"activityId"(.*?)</script></div></div>').findall(page_body)
        is_activity = True
        prefix = r'{"activityId"'
        if matched == []:
            self.my_lg.error('这个商品对应活动属性未知, 此处不解析, 设置为跳过!' + self.error_base_record)
            return self._data_error_init()

    json_body = json_2_dict(json_str=prefix + matched[0])
    if json_body.get('discountPriceRanges') is None:
        self.my_lg.error('data为空!' + self.error_base_record)
        return self._data_error_init()

    self.result_data = self._wash_discountPriceRanges(body=json_body)
    if is_activity:
        self.is_activity_goods = True

    return self.result_data
def run_forever():
    """
    Endless update loop for the yanxuan goods stored in the db.

    Each round creates a new logger, loads the rows selected by
    ``yx_select_str_1``, updates every goods and then sleeps (5.5 hours when
    the current Shanghai hour is 0, otherwise 60 seconds).
    """
    while True:
        # The logger is created inside the loop so that the file name
        # follows the current date.
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/网易严选/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR,
        )

        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=yx_select_str_1))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            _block_print_db_old_data(result=result, logger=my_lg)
            index = 1
            yanxuan = YanXuanParse(logger=my_lg)
            for item in result:
                if index % 5 == 0:
                    # Release the old parser before creating a new one to
                    # keep memory usage down.
                    del yanxuan
                    yanxuan = YanXuanParse(logger=my_lg)
                    collect()

                sql_cli = _block_get_new_db_conn(
                    db_obj=sql_cli,
                    index=index,
                    logger=my_lg,
                    remainder=10)
                if sql_cli.is_connect_success:
                    my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (str(item[1]), str(index)))
                    yanxuan._get_goods_data(goods_id=item[1])
                    data = yanxuan._deal_with_data()
                    db_goods_info_obj = YXDbGoodsInfoObj(item=item, logger=my_lg)
                    if data == {}:
                        # Nothing could be parsed.
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)
                    elif data.get('is_delete') == 1:
                        # Goods pulled off the shelves is handled separately.
                        my_lg.info('@@@ 该商品已下架...')
                        sql_cli._update_table_2(
                            sql_str=yx_update_str_2,
                            params=(db_goods_info_obj.goods_id, ),
                            logger=my_lg,
                        )
                        sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                        # Advance the index here as well: the original
                        # skipped the increment, so the logged index and the
                        # index-driven refreshes stalled.
                        index += 1
                        continue
                    else:
                        data = get_goods_info_change_data(
                            target_short_name='yx',
                            logger=my_lg,
                            data=data,
                            db_goods_info_obj=db_goods_info_obj,
                        )
                        yanxuan.to_right_and_update_data(data, pipeline=sql_cli)
                else:
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)

                index += 1
                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # No updates after midnight.
            sleep(60 * 60 * 5.5)
        else:
            sleep(60)
        collect()
def get_goods_data(self, goods_id):
    """
    Fetch the detail api for a tmall goods and return its cleaned data.

    :param goods_id: list ``[tmall_type, goods_id]``; [] yields {}
    :return: the cleaned ``data`` dict of the api response extended with
        'type' and 'goods_id' (also stored in ``self.result_data``); the
        pulled-off-shelves dict when the goods was redirected; {} on any
        failure (``self.result_data`` is reset to {} in that case)
    """
    def fail(msg):
        # Log, reset the stored result so that it cannot leak into the next
        # crawl, and return an empty result.
        self.my_lg.error(msg)
        self.result_data = {}
        return {}

    if goods_id == []:
        self.result_data = {}
        return {}

    goods_type = goods_id[0]    # tmall type
    goods_id = goods_id[1]      # tmall goods_id
    tmp_url = 'https://detail.m.tmall.com/item.htm?id=' + str(goods_id)
    self.my_lg.info('------>>>| 得到的移动端地址为: %s' % tmp_url)

    self.headers.update({'Referer': tmp_url})
    last_url = self._get_last_url(goods_id=goods_id)
    body = MyRequests.get_url_body(
        url=last_url,
        headers=self.headers,
        params=None,
        timeout=14)
    if body == '':
        return fail('出错goods_id: {0}'.format((goods_id)))

    try:
        # Greedy match: everything inside the jsonp wrapper. Raw string,
        # because \( is an invalid escape in a plain literal.
        data = re.compile(r'mtopjsonp3\((.*)\)').findall(body)[0]
    except IndexError as e:
        self.my_lg.exception(e)
        self.result_data = {}
        return {}

    if data == '':
        return fail('data为空! 出错type: %s, goods_id: %s' % (str(goods_type), str(goods_id)))

    data = json_2_dict(json_str=data, logger=self.my_lg)
    if data == {}:
        return fail('出错type: %s, goods_id: %s' % (str(goods_type), str(goods_id)))

    evaluates = data.get('data', {}).get('seller', {}).get('evaluates')
    redirect_url = data.get('data', {}).get('trade', {}).get('redirectUrl', '')
    if redirect_url != '' and evaluates is None:
        # The goods was pulled off the shelves: its address is redirected.
        self.my_lg.info('@@@@@@ 该商品已经下架...')
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        if sql_cli.is_connect_success:
            sql_cli._update_table_2(
                sql_str=tm_update_str_3,
                params=(goods_id,),
                logger=self.my_lg)
        del sql_cli
        tmp_data_s = self.init_pull_off_shelves_goods(goods_type)
        self.result_data = {}

        return tmp_data_s

    # Goods that was moved or removed, so its page no longer exists.
    if evaluates is None:
        return fail('data为空, 地址被重定向, 该商品可能已经被转移或下架, 出错type: %s, goods_id: %s' % (str(goods_type), str(goods_id)))

    # Blank out bulky fields that are not needed downstream.
    result_data = data['data']
    result_data['rate'] = ''                    # goods reviews
    result_data['resource'] = ''                # buyers' questions
    result_data['vertical'] = ''                # questions and answers
    result_data['seller']['evaluates'] = ''     # description/service/logistics scores

    api_stack = result_data.get('apiStack') or []
    if not api_stack:
        # Without this guard apiStack[0] raises IndexError.
        return fail('出错type: %s, goods_id: %s' % (str(goods_type), str(goods_id)))

    # Clean apiStack[0]['value'] and write it back.
    api_stack[0]['value'] = self._wash_result_data_apiStack_value(
        goods_id=goods_id,
        result_data_apiStack_value=api_stack[0].get('value', {})
    )
    result_data['apiStack'] = api_stack

    # mockData arrives as a json string.
    mock_data = json_2_dict(json_str=result_data['mockData'], logger=self.my_lg)
    if mock_data == {}:
        return fail('出错type: {0}, goods_id: {1}'.format(goods_type, goods_id))
    mock_data['feature'] = ''
    result_data['mockData'] = mock_data

    # {'name': 'esi', 'value': ''} can occur.
    if api_stack[0].get('value', '') == '':
        result_data['trade'] = {}
        return fail("result_data.get('apiStack', [])[0].get('value', '')的值为空....出错type: %s, goods_id: %s" % (str(goods_type), goods_id))

    # 'trade' is later used to decide whether the goods is off the shelves.
    result_data['trade'] = api_stack[0].get('value', {}).get('trade', {})
    result_data['type'] = goods_type
    result_data['goods_id'] = goods_id
    self.result_data = result_data

    return result_data
class CommentRealTimeUpdateSpider(object):
    """Keeps the stored goods comments of all enabled sites up to date."""

    # site_id -> name of the method that updates the comments of that site.
    # 1 taobao, 2 ali 1688, 3/4/6 tmall, 7-10 jd, 11 zhe800, 12 juanpi,
    # 13 pinduoduo, 25 vip.
    _SITE_ID_2_UPDATE_METHOD = {
        1: '_update_taobao_comment',
        2: '_update_ali_1688_comment',
        3: '_update_tmall_comment',
        4: '_update_tmall_comment',
        6: '_update_tmall_comment',
        7: '_update_jd_comment',
        8: '_update_jd_comment',
        9: '_update_jd_comment',
        10: '_update_jd_comment',
        11: '_update_zhe_800_comment',
        12: '_update_juanpi_comment',
        13: '_update_pinduoduo_comment',
        25: '_update_vip_comment',
    }

    def __init__(self):
        self._set_logger()
        self.msg = ''
        self.debugging_api = self._init_debugging_api()
        self._set_func_name_dict()
        self.sql_str = cm_update_str_1

        api = self.debugging_api
        if api.get(2):
            self.my_lg.info('初始化 1688 phantomjs中...')
            self.ali_1688 = ALi1688CommentParse(logger=self.my_lg)

        if any(api.get(site_id) is True for site_id in (3, 4, 6)):
            self.my_lg.info('初始化 天猫 phantomjs中...')
            self.tmall = TmallCommentParse(logger=self.my_lg)

        if any(api.get(site_id) is True for site_id in (7, 8, 9, 10)):
            self.my_lg.info('初始化 京东 phantomjs中...')
            self.jd = JdCommentParse(logger=self.my_lg)

    def _set_logger(self):
        """Create ``self.my_lg``, writing to a file named after the current date."""
        self.my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/all_comment/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR
        )

    def _init_debugging_api(self):
        """
        Return which site_ids are crawled.

        :return: dict site_id -> bool
        """
        return {
            1: True,
            2: True,
            3: True,
            4: True,
            6: True,
            7: True,
            8: True,
            9: True,
            10: True,
            11: False,
            12: False,
            13: False,
            25: False,
        }

    def _set_func_name_dict(self):
        """
        Set ``self.func_name_dict`` (platform -> call template).

        ``_just_run`` no longer evaluates these templates; the attribute is
        kept unchanged for code that may still read it.
        """
        self.func_name_dict = {
            'taobao': 'self._update_taobao_comment({0}, {1}, {2})',
            'ali': 'self._update_ali_1688_comment({0}, {1}, {2})',
            'tmall': 'self._update_tmall_comment({0}, {1}, {2})',
            'jd': 'self._update_jd_comment({0}, {1}, {2})',
            'zhe_800': 'self._update_zhe_800_comment({0}, {1}, {2})',
            'juanpi': 'self._update_juanpi_comment({0}, {1}, {2})',
            'pinduoduo': 'self._update_pinduoduo_comment({0}, {1}, {2})',
            'vip': 'self._update_vip_comment({0}, {1}, {2})',
        }

    def _dispatch_comment_update(self, index, goods_id, site_id):
        """
        Call the update method registered for ``site_id``.

        :raises KeyError: when no method is registered for site_id
        """
        method_name = self._SITE_ID_2_UPDATE_METHOD[site_id]

        return getattr(self, method_name)(index, goods_id, site_id)

    def _just_run(self):
        """Endless loop: load the goods to update and refresh their comments."""
        while True:
            self._comment_pipeline = SqlServerMyPageInfoSaveItemPipeline()
            try:
                result = list(self._comment_pipeline._select_table(
                    sql_str=cm_select_str_1,
                    logger=self.my_lg))
            except TypeError:
                self.my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
                # Back off instead of retrying in a tight loop while the db
                # is unreachable.
                sleep(5)
                continue

            self.my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            self.my_lg.info(str(result))
            self.my_lg.info('--------------------------------------------------------')
            self.my_lg.info('待更新个数: {0}'.format(len(result)))

            self.my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

            # item: (goods_id, site_id)
            for index, item in enumerate(result):
                goods_id, site_id = item[0], item[1]
                if not self.debugging_api.get(site_id):
                    self.my_lg.info('api为False, 跳过! 索引值[%s]' % str(index))
                    continue

                if index % 20 == 0:
                    # Periodically replace the db connection.
                    try:
                        del self._comment_pipeline
                    except AttributeError:
                        pass
                    self._comment_pipeline = SqlServerMyPageInfoSaveItemPipeline()

                # Direct method dispatch. The original formatted the values
                # into source code and exec'ed it, which evaluated goods_id
                # as a code literal (SyntaxError/NameError for ids with
                # leading zeros or letters). goods_id is now passed exactly
                # as the db returned it.
                self._dispatch_comment_update(index, goods_id, site_id)
                sleep(1.1)

    def _store_comment_result(self, _r):
        """Write one parsed comment result to the db, or log that it is empty."""
        if _r.get('_comment_list', []) != []:
            if self._comment_pipeline.is_connect_success:
                self._comment_pipeline._update_table_2(
                    sql_str=self.sql_str,
                    params=self._get_db_update_params(item=_r),
                    logger=self.my_lg)
        else:
            self.my_lg.info('该商品_comment_list为空list! 此处跳过!')

    def _update_taobao_comment(self, index, goods_id, site_id):
        """
        Update the comments of a taobao goods.

        :param index: running index, only used for logging
        :param goods_id:
        :param site_id:
        """
        if not self.debugging_api.get(site_id):
            return

        self.my_lg.info('------>>>| 淘宝\t\t索引值(%s)' % str(index))

        taobao = TaoBaoCommentParse(logger=self.my_lg)
        _r = taobao._get_comment_data(goods_id=str(goods_id))
        self._store_comment_result(_r)

        del taobao
        gc.collect()

    def _update_ali_1688_comment(self, index, goods_id, site_id):
        """
        Update the comments of an ali 1688 goods.

        :param index: running index; the parser is replaced on every 5th
        :param goods_id:
        :param site_id:
        """
        if not self.debugging_api.get(site_id):
            return

        self.my_lg.info('------>>>| 阿里1688\t\t索引值(%s)' % str(index))

        if index % 5 == 0:
            try:
                del self.ali_1688
            except AttributeError:
                self.my_lg.error('del ali_1688失败!')
            gc.collect()
            self.ali_1688 = ALi1688CommentParse(logger=self.my_lg)

        _r = self.ali_1688._get_comment_data(goods_id=goods_id)
        self._store_comment_result(_r)

    def _update_tmall_comment(self, index, goods_id, site_id):
        """
        Update the comments of a tmall goods.

        :param index: running index; the parser is replaced on every 5th
        :param goods_id:
        :param site_id: 3, 4 or 6; anything else is ignored
        """
        if not self.debugging_api.get(site_id):
            return

        self.my_lg.info('------>>>| 天猫\t\t索引值(%s)' % str(index))

        _type = {3: 0, 4: 1, 6: 2}.get(site_id)
        if _type is None:
            return None

        if index % 5 == 0:
            try:
                del self.tmall
            except AttributeError:
                self.my_lg.info('del tmall失败!')
            gc.collect()
            self.tmall = TmallCommentParse(logger=self.my_lg)

        _r = self.tmall._get_comment_data(type=_type, goods_id=str(goods_id))
        self._store_comment_result(_r)
        gc.collect()

    def _update_jd_comment(self, index, goods_id, site_id):
        """
        Update the comments of a jd goods.

        :param index: running index; the parser is replaced on every 5th
        :param goods_id:
        :param site_id:
        """
        if not self.debugging_api.get(site_id):
            return

        self.my_lg.info('------>>>| 京东\t\t索引值(%s)' % str(index))

        if index % 5 == 0:
            try:
                del self.jd
            except AttributeError:
                self.my_lg.info('del jd失败!')
            gc.collect()
            self.jd = JdCommentParse(logger=self.my_lg)

        _r = self.jd._get_comment_data(goods_id=str(goods_id))
        self._store_comment_result(_r)

    def _update_zhe_800_comment(self, index, goods_id, site_id):
        """Placeholder for zhe800 comments: does nothing."""
        return None

    def _update_juanpi_comment(self, index, goods_id, site_id):
        """Placeholder for juanpi comments: does nothing."""
        return None

    def _update_pinduoduo_comment(self, index, goods_id, site_id):
        """Placeholder for pinduoduo comments: does nothing."""
        return None

    def _update_vip_comment(self, index, goods_id, site_id):
        """Placeholder for vip comments: does nothing."""
        return None

    def _get_db_update_params(self, item):
        """
        Build the params tuple for ``self.sql_str``.

        :param item: dict with 'modify_time', '_comment_list' and 'goods_id'
        :return: (modify_time, _comment_list as a json string, goods_id)
        """
        return (
            item['modify_time'],
            dumps(item['_comment_list'], ensure_ascii=False),
            item['goods_id'],
        )

    def __del__(self):
        # As in the original, the first group stops at the first missing
        # attribute.
        try:
            del self.my_lg
            del self.msg
            del self.debugging_api
        except AttributeError:
            pass
        for attr_name in ('_comment_pipeline', 'tmall'):
            try:
                delattr(self, attr_name)
            except AttributeError:
                pass
        gc.collect()
class MIUpdater(AsyncCrawler):
    """Real-time updater for mia flash-sale (miaosha) goods stored in the db.

    ``_update_db`` loops forever: it loads the stored flash-sale rows, then
    refreshes them in slices of ``self.concurrency`` concurrent tasks.
    """

    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/蜜芽/秒杀实时更新/',
            ip_pool_type=IP_POOL_TYPE,
        )
        self.delete_sql_str = mia_delete_str_3
        # Number of goods refreshed concurrently per slice.
        self.concurrency = 8
        self.tmp_sql_server = None
        self.goods_index = 1

    async def _get_pc_headers(self) -> dict:
        """Build request headers for m.mia.com with a random PC user agent."""
        return {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'm.mia.com',
            'User-Agent': get_random_pc_ua(),
        }

    async def _get_db_old_data(self):
        """Run the cleanup statement, then load the rows to refresh.

        :return: list of db rows, or None when the select failed with a
            TypeError (logged as a db connection failure)
        """
        self.tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            self.tmp_sql_server._delete_table(sql_str=mia_delete_str_4)
            await async_sleep(5)
            result = list(
                self.tmp_sql_server._select_table(sql_str=mia_select_str_3))
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')

        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_miaosha_end_time(self, miaosha_time):
        """Extract the flash-sale end time from the stored json string.

        :param miaosha_time: json str holding a 'miaosha_end_time' value
            formatted as '%Y-%m-%d %H:%M:%S'
        :return: end time as an int timestamp (local-time ``time.mktime``)
        """
        miaosha_end_time = json.loads(miaosha_time).get('miaosha_end_time')
        miaosha_end_time = int(
            str(
                time.mktime(
                    time.strptime(miaosha_end_time, '%Y-%m-%d %H:%M:%S')))[0:10])

        return miaosha_end_time

    async def _get_new_mia_obj(self, index):
        """Replace the shared MiaParse object on every 10th index.

        The original author notes that one parser object cannot be shared
        indefinitely, otherwise driver access misbehaves.
        """
        if index % 10 == 0:
            try:
                del self.mia_miaosha
            except AttributeError:
                # Nothing to release yet.
                pass
            collect()
            self.mia_miaosha = MiaParse()

    async def _update_one_goods_info(self, item, index) -> tuple:
        """Refresh (or retire) a single flash-sale goods row.

        :param item: db row; item[0] is goods_id, item[1] the miaosha_time
            json str, item[2] the promotion pid
        :param index: 1-based running index of this goods
        :return: (goods_id, res) where res is the db operation result
            (False when nothing was written)
        """
        res = False
        goods_id = item[0]
        miaosha_time = item[1]
        pid = item[2]
        miaosha_end_time = await self._get_miaosha_end_time(miaosha_time)
        await self._get_new_mia_obj(index)
        self.tmp_sql_server = await _get_new_db_conn(
            db_obj=self.tmp_sql_server,
            index=index,
            logger=self.lg,
            remainder=30)

        if self.tmp_sql_server.is_connect_success:
            is_recent_time = await self._is_recent_time(miaosha_end_time)
            if is_recent_time == 0:
                # Ended more than 24h ago: retire the row.
                res = self.tmp_sql_server._update_table_2(
                    sql_str=mia_update_str_6,
                    params=(goods_id, ),
                    logger=self.lg)
                self.lg.info('过期的goods_id为({}), 限时秒杀开始时间为({}), 删除成功!'.format(
                    goods_id,
                    json_2_dict(miaosha_time).get('miaosha_begin_time')))
                await async_sleep(.5)
                self.goods_index = index + 1

                return goods_id, res

            elif is_recent_time == 2:
                # Ended within the last 24h: deliberately left untouched.
                self.goods_index = index + 1

                return goods_id, res

            else:
                # 1: still inside the window that must be refreshed.
                self.lg.info(
                    '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
                    format(goods_id, index))
                tmp_url = 'https://m.mia.com/instant/seckill/seckillPromotionItem/' + str(
                    pid)
                # NOTE(review): nothing visible in this class assigns
                # self.headers; fall back to the class's own header builder
                # only when the attribute is missing, instead of crashing.
                try:
                    headers = self.headers
                except AttributeError:
                    headers = await self._get_pc_headers()
                body = Requests.get_url_body(
                    url=tmp_url,
                    headers=headers,
                    had_referer=True,
                    ip_pool_type=self.ip_pool_type)
                body = '' if body == '' or body == '[]' else body
                try:
                    tmp_data = json_2_dict(body, default_res={})
                    assert tmp_data != {}, 'tmp_data为空dict!'
                except AssertionError:
                    self.lg.error('遇到错误:', exc_info=True)
                    self.goods_index = index + 1
                    await async_sleep(.3)

                    return goods_id, res

                item_list = tmp_data.get('item_list', [])
                # Every goods_id currently listed under this pid.
                miaosha_goods_all_goods_id = [
                    item_1.get('item_id', '') for item_1 in item_list
                ]
                if goods_id not in miaosha_goods_all_goods_id:
                    # Removed from the promotion on the site side.
                    self.lg.info('该商品已被下架限时秒杀活动,此处将其删除')
                    res = self.tmp_sql_server._update_table_2(
                        sql_str=mia_update_str_6,
                        params=(goods_id, ),
                        logger=self.lg)
                    self.lg.info('下架的goods_id为({}), 删除成功!'.format(goods_id))
                    self.goods_index = index + 1
                    await async_sleep(.3)

                    return goods_id, res

                else:
                    # Still listed: refresh its data.
                    res = await self._one_update(
                        item_list=item_list,
                        goods_id=goods_id,
                        tmp_data=tmp_data)

        else:
            self.lg.info('数据库连接失败,数据库可能关闭或者维护中')

        # Throttle between goods.
        await async_sleep(MIA_SPIKE_SLEEP_TIME)
        self.goods_index = index + 1
        collect()

        return goods_id, res

    async def _update_db(self) -> None:
        """Run the flash-sale real-time update loop forever."""
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is not None:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(
                    tasks_params_list=result,
                    step=self.concurrency)
                self.mia_miaosha = MiaParse()
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                    except AssertionError:
                        # All slices consumed: normal exit.
                        break

                    tasks = []
                    for item in slice_params_list:
                        self.lg.info('创建 task goods_id: {}'.format(item[0]))
                        tasks.append(
                            self.loop.create_task(
                                self._update_one_goods_info(item=item, index=index)))
                        index += 1

                    await _get_async_task_result(tasks=tasks, logger=self.lg)

                self.lg.info('全部数据更新完毕'.center(100, '#'))

            if get_shanghai_time().hour == 0:
                # No updates right after midnight: sleep 5.5 hours.
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(2.5 * 60)

            try:
                del self.mia_miaosha
            except AttributeError:
                pass
            collect()

    async def _one_update(self, **kwargs) -> bool:
        """Refresh a goods that is still listed in its promotion.

        :param kwargs: item_list (promotion items), goods_id, tmp_data
            (promotion payload carrying 'p_info')
        :return: result of the table update; False when nothing was updated
        """
        res = False
        item_list = kwargs.get('item_list')
        goods_id = kwargs.get('goods_id')
        tmp_data = kwargs.get('tmp_data')

        try:
            begin_time, end_time = await self._get_begin_time_and_end_time(tmp_data)
        except (ValueError, TypeError):
            # Missing or malformed p_info times: skip this goods instead of
            # letting the exception escape the update task.
            self.lg.error('遇到错误:', exc_info=True)
            return res

        for item_2 in item_list:
            if item_2.get('item_id', '') != goods_id:
                continue

            self.mia_miaosha.get_goods_data(goods_id=goods_id)
            goods_data = self.mia_miaosha.deal_with_data()
            if goods_data == {}:
                # Empty parse result: skip this entry.
                continue

            goods_data['goods_id'] = str(goods_id)
            goods_data['price'] = item_2.get('active_price')
            goods_data['taobao_price'] = item_2.get('active_price')
            goods_data['sub_title'] = item_2.get('short_info', '')
            goods_data['miaosha_time'] = {
                'miaosha_begin_time': timestamp_to_regulartime(begin_time),
                'miaosha_end_time': timestamp_to_regulartime(end_time),
            }
            goods_data['miaosha_begin_time'], goods_data[
                'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                    miaosha_time=goods_data['miaosha_time'])
            res = self.mia_miaosha.update_mia_xianshimiaosha_table(
                data=goods_data,
                pipeline=self.tmp_sql_server)
            break

        return res

    async def _get_begin_time_and_end_time(self, tmp_data) -> tuple:
        """Read the promotion window from tmp_data['p_info'].

        :return: (begin_time, end_time) as int timestamps
        :raises ValueError: when a time is missing or not formatted as
            '%Y/%m/%d %H:%M:%S'
        """
        begin_time = tmp_data.get('p_info', {}).get('start_time', '')
        end_time = tmp_data.get('p_info', {}).get('end_time', '')
        # Convert the str times into timestamps.
        begin_time = int(
            time.mktime(time.strptime(begin_time, '%Y/%m/%d %H:%M:%S')))
        end_time = int(
            time.mktime(time.strptime(end_time, '%Y/%m/%d %H:%M:%S')))

        return begin_time, end_time

    async def _is_recent_time(self, timestamp) -> int:
        """Classify a flash-sale end time relative to now.

        :param timestamp: flash-sale end timestamp
        :return:
            0: ended more than 24 hours ago (to be retired)
            1: end time is still in the future (to be refreshed)
            2: ended within the last 24 hours (kept untouched; the 24h
               grace period lets the backend sync the delisting first)
        """
        time_1 = int(timestamp)
        time_2 = datetime_to_timestamp(get_shanghai_time())
        diff_time = time_1 - time_2
        if diff_time < -86400:
            return 0
        elif diff_time > 0:
            return 1
        else:
            return 2

    def __del__(self):
        try:
            del self.lg
        except AttributeError:
            pass
        try:
            del self.loop
        except AttributeError:
            pass
        try:
            del self.mia_miaosha
        except AttributeError:
            pass
        collect()
def run_forever():
    """Refresh every stored yanxuan goods in an endless loop.

    Each pass creates a fresh logger (a module-level one would keep
    writing to the same file), loads the goods rows, refreshes them one
    by one, then sleeps: 5.5 hours when the hour is 0, otherwise 60s.
    """
    while True:
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/网易严选/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR
        )

        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server._select_table(sql_str=yx_select_str_1))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info('--------------------------------------------------------')
            my_lg.info('总计待更新个数: {0}'.format(len(result)))

            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            # The parser is created here and periodically recreated below
            # so its memory can be released.
            yanxuan = YanXuanParse(logger=my_lg)
            # enumerate keeps the index advancing on every row, including
            # the delisted branch that leaves the iteration early.
            for index, item in enumerate(result, start=1):
                if index % 5 == 0:
                    del yanxuan
                    yanxuan = YanXuanParse(logger=my_lg)
                    gc.collect()

                if index % 10 == 0:
                    # Reconnect every 10 rows to avoid a stale long-lived
                    # connection.
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    my_lg.info('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (str(item[1]), str(index)))
                    yanxuan._get_goods_data(goods_id=item[1])

                    data = yanxuan._deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[1]
                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[2],
                            shelf_time=item[5],
                            delete_time=item[6])

                        if data.get('is_delete') == 1:
                            # Delisted goods are handled separately.
                            my_lg.info('@@@ 该商品已下架...')
                            tmp_sql_server._update_table_2(sql_str=yx_update_str_2, params=(item[1],), logger=my_lg)
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            continue

                        else:
                            data['_is_price_change'], data['_price_change_info'] = _get_price_change_info(
                                old_price=item[3],
                                old_taobao_price=item[4],
                                new_price=data['price'],
                                new_taobao_price=data['taobao_price']
                            )
                            try:
                                old_sku_info = format_price_info_list(price_info_list=json_2_dict(item[7]), site_id=30)
                            except AttributeError:
                                # Already stored in the formatted shape.
                                old_sku_info = item[7]
                            data['_is_price_change'], data['sku_info_trans_time'] = get_sku_info_trans_record(
                                old_sku_info=old_sku_info,
                                new_sku_info=format_price_info_list(data['price_info_list'], site_id=30),
                                is_price_change=item[8] if item[8] is not None else 0
                            )

                        yanxuan.to_right_and_update_data(data, pipeline=tmp_sql_server)
                    else:
                        # Parser returned no data for this goods.
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)

                else:
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)

                gc.collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # No updates right after midnight.
            sleep(60 * 60 * 5.5)
        else:
            sleep(60)
        gc.collect()
async def get_coupon_url_list_by_goods_id_list(self, slice_params_list) -> list:
    """
    Fetch the coupon url for every goods in the given slice, then stamp
    each goods' coupon_check_time in the db.

    :param slice_params_list: dicts carrying at least 'goods_id' and 'site_id'
    :return: the non-empty task results
    """
    def describe_task(task_params) -> str:
        goods_id = task_params['goods_id']
        site_id = task_params['site_id']
        return 'create task[where goods_id: {}, site_id: {}] ...'.format(
            goods_id,
            site_id,
        )

    def build_task_args(task_params) -> list:
        return [task_params['goods_id']]

    all_res = await get_or_handle_target_data_by_task_params_list(
        loop=self.loop,
        tasks_params_list=slice_params_list,
        func_name_where_get_create_task_msg=describe_task,
        func_name=self.get_tm_coupon_url_from_lq5u,
        func_name_where_get_now_args=build_task_args,
        func_name_where_handle_one_res=None,
        func_name_where_add_one_res_2_all_res=default_add_one_res_2_all_res2,
        one_default_res='',
        step=self.concurrency,
        logger=self.lg,
        concurrent_type=self.concurrent_type,
        func_timeout=25,
    )

    # Keep only the tasks that produced something.
    res = [one_res for one_res in all_res if one_res != '']

    # Record the check time for every goods of the slice, found or not.
    sql_cli = SqlServerMyPageInfoSaveItemPipeline()
    sql_str = 'update dbo.GoodsInfoAutoGet set coupon_check_time=%s where GoodsID=%s'
    for task_params in slice_params_list:
        goods_id = task_params['goods_id']
        try:
            is_updated = sql_cli._update_table_2(
                sql_str=sql_str,
                params=(get_shanghai_time(), goods_id),
                logger=self.lg,
            )
        except Exception:
            self.lg.error('遇到错误:', exc_info=True)
            is_updated = False

        mark = '+' if is_updated else '-'
        self.lg.info('[{}] update goods_id: {} coupon_check_time'.format(mark, goods_id))

    # Release the db client and the raw results before collecting.
    del sql_cli
    del all_res
    collect()

    return res
class ZWMSpider(AsyncCrawler):
    """Sync merchant records from the agent.yrmpay.com console into the db.

    ``_fck_run`` logs in, pulls the business settlement records and the
    business/shop management records page by page, stores them, then
    sleeps ``self.sleep_time`` minutes and repeats.
    """

    def __init__(self):
        AsyncCrawler.__init__(
            self,
            ip_pool_type=IP_POOL_TYPE,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/zwm/_/',
        )
        self.init_zwm_pwd()
        self.concurrency = 20
        self.num_retries = 6
        # Page limits are used as range(1, limit), i.e. the limit itself is
        # not fetched.
        self.max_transaction_details_page_num = 20
        self.max_business_settlement_records_page_num = 20
        # Must be raised once the data no longer fits in this many pages.
        self.max_business_manage_page_num = 80
        self.login_cookies_dict = {}
        # Minutes to sleep between two sync rounds.
        self.sleep_time = 5

    def init_zwm_pwd(self):
        """Load the console username/password from the json file at ZWM_PWD_PATH.

        :raises KeyError: when 'username' or 'pwd' is missing
        :raises AssertionError: when either value is empty
        """
        with open(ZWM_PWD_PATH, 'r') as f:
            # Strip newlines and spaces so the file content parses as one
            # compact json string.
            ori_data = ''.join(
                line.replace('\n', '').replace(' ', '') for line in f)

        data = json_2_dict(
            json_str=ori_data,
            logger=self.lg,
            default_res={},)
        self.zwm_username, self.zwm_pwd = data['username'], data['pwd']
        assert self.zwm_username != '' and self.zwm_pwd != ''

    async def _fck_run(self) -> None:
        """Run the login -> fetch -> store cycle forever."""
        while True:
            try:
                login_res = await self._login()
                assert login_res is True, '登录失败, 退出后续同步操作!'

                # Transaction details are not synced here (per the original
                # author a dedicated interface already provides them).

                # All business settlement records.
                self.lg.info('获取所有商户结算记录...')
                all_business_settlement_records = await self._get_all_business_settlement_records_by_something()
                self.lg.info('len_now_business_settlement_records: {}'.format(len(all_business_settlement_records)))
                await self._wash_save_all_business_settlement_records(target_list=all_business_settlement_records)
                self.lg.info('\n')

                # All business and shop management records.
                self.lg.info('获取所有商户及门店管理记录 ...')
                all_business_manage_records = await self._get_all_business_manage_records_by_something()
                self.lg.info('len_all_business_manage_records: {}'.format(len(all_business_manage_records)))
                await self._wash_save_all_business_manage_records(target_list=all_business_manage_records)
                self.lg.info('\n')

            except Exception:
                self.lg.error('遇到错误:', exc_info=True)

            self.lg.info('## 同步完成 ##')
            self.lg.info('休眠 {} minutes ...'.format(self.sleep_time))

            await async_sleep(60 * self.sleep_time)

    async def _login(self) -> bool:
        """Log in to the console and keep the session cookies.

        :return: True on success (cookies stored in
            ``self.login_cookies_dict``), False on any failure
        """
        headers = await self._get_random_pc_headers()
        headers.update({
            'Referer': 'https://agent.yrmpay.com/JHAdminConsole/loginNew.jsp',
        })
        file_load = {
            'loginName': self.zwm_username,
            'userPassword': self.zwm_pwd,
        }
        m = MultipartEncoder(fields=file_load)
        headers.update({
            'Content-Type': m.content_type
        })
        login_url = 'https://agent.yrmpay.com/JHAdminConsole/foreigncard/permissionsLogin.do'

        with session() as _session:
            try:
                response = _session.post(
                    url=login_url,
                    headers=headers,
                    data=m,
                    proxies=self._get_proxies(),)
                login_res = json_2_dict(
                    json_str=response.text,
                    default_res={},
                    logger=self.lg, ).get('message', '')
                assert login_res == '登录成功', '登录失败!'
                self.lg.info(login_res)
                self.login_cookies_dict = response.cookies.get_dict()
                assert self.login_cookies_dict != {}, 'self.login_cookies_dict != 空dict!'
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                return False

        return True

    async def _wash_save_all_business_manage_records(self, target_list: list):
        """Clean the fetched business manage records and insert/update them.

        Records that fail validation are logged and skipped.

        :param target_list: raw record dicts as returned by the console
        :return: None
        """
        all_res = []
        for item in target_list:
            try:
                now_time = get_shanghai_time()
                create_time, modify_time, approval_status_change_time = now_time, now_time, now_time
                agent_name = item['agentName']
                top_agent_name = item['topAgentName']
                shop_type = item['merType']

                is_high_quality_shop = item['isHighQualityMer']
                if is_high_quality_shop == '否':
                    is_high_quality_shop = 0
                elif is_high_quality_shop == '是':
                    is_high_quality_shop = 1
                else:
                    raise ValueError('is_high_quality_shop value: {} 异常!'.format(is_high_quality_shop))

                shop_id = item.get('jhmid', '')
                assert shop_id != ''
                shop_chat_name = item.get('merchantName', '')
                assert shop_chat_name != ''
                phone_num = item.get('phone', '')
                assert phone_num != ''
                shop_chant_num = int(item['merchantNum'])
                sale = item['sale']
                is_real_time = 0 if item['isRealTime'] == '未开通' else 1
                approve_date = date_parse(item['approveDate'])
                rate = round(Decimal(item['rate']), 4)
                account_type = item['accType']
                apply_time = date_parse(item['applyTime'])
                # May be empty.
                process_context = item.get('processContext', '')
                is_non_contact = 0 if item['isNonContact'] == '未开通' else 1

                approval_status = item['approvalStatus']
                if approval_status == '待审核':
                    approval_status = 1
                elif approval_status == '审核通过':
                    approval_status = 0
                elif approval_status == '退回':
                    approval_status = 2
                else:
                    raise ValueError('approval_status value: {} 异常'.format(approval_status))

                # The console's own id is used as-is: per the original
                # author it is constant and unique.
                unique_id = item['id']

            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                continue

            zwm_item = ZWMBusinessManageRecordItem()
            zwm_item['unique_id'] = unique_id
            zwm_item['create_time'] = create_time
            zwm_item['modify_time'] = modify_time
            zwm_item['agent_name'] = agent_name
            zwm_item['top_agent_name'] = top_agent_name
            zwm_item['shop_type'] = shop_type
            zwm_item['is_high_quality_shop'] = is_high_quality_shop
            zwm_item['shop_id'] = shop_id
            zwm_item['shop_chat_name'] = shop_chat_name
            zwm_item['phone_num'] = phone_num
            zwm_item['shop_chant_num'] = shop_chant_num
            zwm_item['sale'] = sale
            zwm_item['is_real_time'] = is_real_time
            zwm_item['approve_date'] = approve_date
            zwm_item['rate'] = rate
            zwm_item['account_type'] = account_type
            zwm_item['apply_time'] = apply_time
            zwm_item['process_context'] = process_context
            zwm_item['is_non_contact'] = is_non_contact
            zwm_item['approval_status'] = approval_status
            zwm_item['approval_status_change_time'] = approval_status_change_time
            all_res.append(dict(zwm_item))

        await self._insert_or_update_shop_manage_records_table(all_res=all_res)

        return None

    async def _insert_or_update_shop_manage_records_table(self, all_res: list):
        """Insert new manage records and update the ones already stored.

        Returns early (nothing written) when the existing rows cannot be
        loaded or the table yields no unique ids.

        :param all_res: cleaned record dicts
        :return: None
        """
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            db_data = self.sql_cli._select_table(
                sql_str=zwm_select_str_2,
                params=None,
                logger=self.lg, )
            db_unique_id_list = [item[0] for item in db_data]
            assert db_unique_id_list != [], 'db_unique_id_list != []'
            self.lg.info('len_db_unique_id_list: {}'.format(len(db_unique_id_list)))
        except Exception:
            self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
            self.lg.error('遇到错误:', exc_info=True)
            return None

        # Set membership keeps the per-record existence check O(1).
        db_unique_id_set = set(db_unique_id_list)
        new_add_count = 0
        for item in all_res:
            unique_id = item['unique_id']
            if unique_id not in db_unique_id_set:
                # Insert.
                self.lg.info('inserting unique_id: {} ...'.format(unique_id))
                params = await self._get_insert_item_params2(item=item)
                try:
                    res = self.sql_cli._insert_into_table_2(
                        sql_str=zwm_insert_str_2,
                        params=params,
                        logger=self.lg)
                    if res:
                        new_add_count += 1
                except Exception:
                    self.lg.error('遇到错误:', exc_info=True)
                    continue
            else:
                db_old_approval_status, db_old_approval_status_change_time = await self._get_dd_old_approval_status_and_approval_status_change_time(
                    db_data=db_data,
                    unique_id=unique_id,)
                item['approval_status_change_time'] = await self._get_new_approval_status_change_time(
                    db_old_approval_status=db_old_approval_status,
                    db_old_approval_status_change_time=db_old_approval_status_change_time,
                    new_approval_status=item['approval_status'],
                    new_approval_status_change_time=item['approval_status_change_time'])
                # Update.
                self.lg.info('updating unique_id: {} ...'.format(unique_id))
                params = await self._get_update_item_params(item=item)
                try:
                    res = self.sql_cli._update_table_2(
                        sql_str=zwm_update_str_1,
                        params=params,
                        logger=self.lg)
                except Exception:
                    self.lg.error('遇到错误:', exc_info=True)
                    continue

            if not self.sql_cli.is_connect_success:
                # Connection dropped: reconnect for the next record.
                self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()

        self.lg.info('table.zwm_buss_manage_records新增个数: {}'.format(new_add_count))

    async def _get_new_approval_status_change_time(self,
                                                   db_old_approval_status,
                                                   db_old_approval_status_change_time,
                                                   new_approval_status,
                                                   new_approval_status_change_time):
        """Decide the approval_status_change_time to store.

        When the db already holds a change time: keep it if the status is
        unchanged, otherwise use the current time. When the db holds none,
        the passed ``new_approval_status_change_time`` is returned as-is.
        """
        if db_old_approval_status_change_time is not None:
            if db_old_approval_status == new_approval_status:
                new_approval_status_change_time = db_old_approval_status_change_time
            else:
                new_approval_status_change_time = get_shanghai_time()

        return new_approval_status_change_time

    async def _get_dd_old_approval_status_and_approval_status_change_time(self, db_data: list, unique_id: str) -> tuple:
        """Look up the stored approval status for a unique_id.

        :param db_data: db rows; row[0] is the unique_id, row[1] and row[2]
            are returned
        :param unique_id:
        :return: (row[1], row[2]) of the first matching row; implicitly
            None when no row matches
        """
        for item in db_data:
            if unique_id == item[0]:
                return item[1], item[2]

    async def _get_all_business_manage_records_by_something(self,):
        """Fetch all business/shop management records across pages.

        :return: the aggregated result of the per-page tasks
        """
        async def get_tasks_params_list(max_business_manage_page_num) -> list:
            """Build one task param dict per page."""
            return [
                {'page_num': page_num}
                for page_num in range(1, max_business_manage_page_num)
            ]

        def get_create_task_msg(k) -> str:
            return 'create task[where page_num: {}]...'.format(k['page_num'])

        def get_now_args(k) -> list:
            return [
                k['page_num'],
            ]

        res = await get_or_handle_target_data_by_task_params_list(
            loop=self.loop,
            tasks_params_list=await get_tasks_params_list(
                max_business_manage_page_num=self.max_business_manage_page_num),
            func_name_where_get_create_task_msg=get_create_task_msg,
            func_name=self._get_one_page_business_manage_records_by_something,
            func_name_where_get_now_args=get_now_args,
            func_name_where_handle_one_res=None,
            func_name_where_add_one_res_2_all_res=default_add_one_res_2_all_res,
            one_default_res=[],
            step=self.concurrency,
            logger=self.lg,
            get_all_res=True,)

        return res

    @catch_exceptions_with_class_logger(default_res=[])
    def _get_one_page_business_manage_records_by_something(self,
                                                           page_num: int,
                                                           start_date: str = None,
                                                           end_date: str = None,):
        """Fetch one page of business/shop management records.

        :param page_num: 1-based page number (100 records per page)
        :param start_date: currently ignored, see the note in the body
        :param end_date: eg: '2019-07-20 09:39'; defaults to now
        :return: the page's 'materialList' (empty list when absent)
        """
        # The window deliberately starts at a fixed early date so every
        # shop is re-collected: old shops' approval status can still
        # change and must stay in sync. The start_date argument is
        # therefore overridden.
        start_date = '2018-01-01 00:00'
        end_date = (str(get_shanghai_time()) if end_date is None else end_date)[0:16]
        self.lg.info('start_date: {}, end_date: {}'.format(start_date, end_date))

        headers = self.get_random_pc_headers()
        headers.update({
            'Accept': '*/*',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Referer': 'https://agent.yrmpay.com/JHAdminConsole/merchantMaterial/page.do',
            'X-Requested-With': 'XMLHttpRequest',
        })
        params = (
            ('_dc', get_now_13_bit_timestamp()),
        )
        data = {
            'merchantCode': '',
            'accType': '',
            'phone': '',
            'approveDate': '',
            'merchantName': '',
            'processStatus': '',
            'startTime': start_date,
            'endTime': end_date,
            'agentName': '',
            'page': str(page_num),
            # Record offset: 0, 100, 200, ...
            'start': str((page_num - 1) * 100),
            'limit': '100',
        }
        url = 'https://agent.yrmpay.com/JHAdminConsole/merchantMaterial/materialList.do'
        body = Requests.get_url_body(
            method='post',
            url=url,
            headers=headers,
            params=params,
            cookies=self.login_cookies_dict,
            data=data,
            ip_pool_type=self.ip_pool_type,
            num_retries=self.num_retries,)
        assert body != '', 'body不为空值!'
        res = json_2_dict(
            json_str=body,
            logger=self.lg,
            default_res={}).get('materialList', [])

        self.lg.info('[{}] page_num: {}'.format(
            '+' if res != [] else '-',
            page_num,
        ))

        return res

    async def _wash_save_all_business_settlement_records(self, target_list):
        """Clean the fetched settlement records and store the new ones.

        Returns early (nothing written) when the existing rows cannot be
        loaded or the table yields no unique ids.

        :param target_list: raw record dicts as returned by the console
        :return: None
        """
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            db_data = self.sql_cli._select_table(
                sql_str=zwm_select_str_1,
                params=None,
                logger=self.lg,)
            db_unique_id_list = [item[0] for item in db_data]
            assert db_unique_id_list != [], 'db_unique_id_list != []'
            self.lg.info('len_db_unique_id_list: {}'.format(len(db_unique_id_list)))
        except Exception:
            self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
            self.lg.error('遇到错误:', exc_info=True)
            return None

        # Set membership keeps the per-record existence check O(1).
        db_unique_id_set = set(db_unique_id_list)
        all_res = []
        for item in target_list:
            try:
                create_time = get_shanghai_time()
                shop_name = item.get('merName', '')
                assert shop_name != ''
                shop_id = item.get('mid', '')
                assert shop_id != ''
                agent_name = item['agentName']
                top_agent_name = item['topAgentName']
                date_settle_type = item['settleType']
                trans_amount = item.get('transAmt', '')
                assert trans_amount != ''
                trans_amount = round(Decimal(trans_amount), 2)
                service_charge = round(Decimal(item['mda']), 2)
                accounting_amount = round(Decimal(item['mnamt']), 2)
                # Normally '20190704'; the abnormal form is
                # '20190824-20190824', of which the first day is used.
                txn_day = item['txnDay'].split('-')[0]
                trans_date = date_parse(txn_day)
                trans_status = item['status']
                if trans_status == '已结算':
                    trans_status = 0
                else:
                    raise ValueError('trans_status: {}, 未知交易状态!'.format(trans_status))
                settle_type = item['type']
                settle_date = date_parse(item['minDay'])
                # Derive the unique id from the identifying fields.
                unique_id = get_uuid3(
                    target_str=shop_id + str(date_settle_type) + str(trans_amount) + \
                               str(service_charge) + str(trans_date) + \
                               str(settle_type) + str(settle_date),)
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                continue

            if unique_id in db_unique_id_set:
                # Already stored.
                continue

            settle_record_item = ZWMBusinessSettlementRecordItem()
            settle_record_item['unique_id'] = unique_id
            settle_record_item['create_time'] = create_time
            settle_record_item['shop_name'] = shop_name
            settle_record_item['shop_id'] = shop_id
            settle_record_item['agent_name'] = agent_name
            settle_record_item['top_agent_name'] = top_agent_name
            settle_record_item['date_settle_type'] = date_settle_type
            settle_record_item['trans_amount'] = trans_amount
            settle_record_item['service_charge'] = service_charge
            settle_record_item['accounting_amount'] = accounting_amount
            settle_record_item['trans_date'] = trans_date
            settle_record_item['trans_status'] = trans_status
            settle_record_item['settle_type'] = settle_type
            settle_record_item['settle_date'] = settle_date
            all_res.append(dict(settle_record_item))

        self.lg.info('未存储个数: {}'.format(len(all_res)))
        await self._save_all_business_settlement_records(all_res=all_res)

        return None

    async def _save_all_business_settlement_records(self, all_res) -> None:
        """Insert the new settlement records one by one.

        :param all_res: cleaned record dicts not yet stored
        :return: None
        """
        new_add_count = 0
        for item in all_res:
            unique_id = item['unique_id']
            self.lg.info('saving unique_id: {} ...'.format(unique_id))
            params = await self._get_insert_item_params(item=item)
            try:
                res = self.sql_cli._insert_into_table_2(
                    sql_str=zwm_insert_str_1,
                    params=params,
                    logger=self.lg)
                if res:
                    new_add_count += 1
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                continue

            if not self.sql_cli.is_connect_success:
                # Connection dropped: reconnect for the next record.
                self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()

        self.lg.info('新增个数: {}'.format(new_add_count))

        return None

    async def _get_insert_item_params(self, item) -> tuple:
        """Build the insert params tuple for a settlement record.

        :param item: cleaned settlement record dict
        :return: values in the order expected by zwm_insert_str_1
        """
        return tuple([
            item['unique_id'],
            item['create_time'],
            item['shop_name'],
            item['shop_id'],
            item['agent_name'],
            item['top_agent_name'],
            item['date_settle_type'],
            item['trans_amount'],
            item['service_charge'],
            item['accounting_amount'],
            item['trans_date'],
            item['trans_status'],
            item['settle_type'],
            item['settle_date'],
        ])

    async def _get_insert_item_params2(self, item) -> tuple:
        """Build the insert params tuple for the zwm_buss_manage_records table.

        :param item: cleaned manage record dict
        :return: values in the order expected by zwm_insert_str_2
        """
        return tuple([
            item['unique_id'],
            item['create_time'],
            item['modify_time'],
            item['agent_name'],
            item['top_agent_name'],
            item['shop_type'],
            item['is_high_quality_shop'],
            item['shop_id'],
            item['shop_chat_name'],
            item['phone_num'],
            item['shop_chant_num'],
            item['sale'],
            item['is_real_time'],
            item['approve_date'],
            item['rate'],
            item['account_type'],
            item['apply_time'],
            item['process_context'],
            item['is_non_contact'],
            item['approval_status'],
            item['approval_status_change_time'],
        ])

    async def _get_update_item_params(self, item: dict) -> tuple:
        """Build the update params tuple for the zwm_buss_manage_records table.

        :param item: cleaned manage record dict
        :return: values in the order expected by zwm_update_str_1
            (unique_id last)
        """
        return tuple([
            item['modify_time'],
            item['agent_name'],
            item['top_agent_name'],
            item['shop_type'],
            item['is_high_quality_shop'],
            item['shop_id'],
            item['shop_chat_name'],
            item['phone_num'],
            item['shop_chant_num'],
            item['sale'],
            item['is_real_time'],
            item['approve_date'],
            item['rate'],
            item['account_type'],
            item['apply_time'],
            item['process_context'],
            item['is_non_contact'],
            item['approval_status'],
            item['approval_status_change_time'],

            item['unique_id'],
        ])

    async def _wash_and_save_all_transaction_details(self, target_list: list):
        """Placeholder: cleaning/storing transaction details is not implemented.

        :param target_list:
        :return:
        """
        pass

    async def _get_all_business_settlement_records_by_something(self):
        """Fetch all business settlement records across pages.

        :return: the aggregated result of the per-page tasks
        """
        async def get_tasks_params_list(max_business_settlement_records_page_num) -> list:
            """Build one task param dict per page."""
            return [
                {'page_num': page_num}
                for page_num in range(1, max_business_settlement_records_page_num)
            ]

        def get_create_task_msg(k) -> str:
            return 'create task[where page_num: {}]...'.format(k['page_num'])

        def get_now_args(k) -> list:
            return [
                k['page_num'],
            ]

        res = await get_or_handle_target_data_by_task_params_list(
            loop=self.loop,
            tasks_params_list=await get_tasks_params_list(
                max_business_settlement_records_page_num=self.max_business_settlement_records_page_num),
            func_name_where_get_create_task_msg=get_create_task_msg,
            func_name=self._get_one_page_business_settlement_records_by_something,
            func_name_where_get_now_args=get_now_args,
            func_name_where_add_one_res_2_all_res=default_add_one_res_2_all_res,
            one_default_res=[],
            step=self.concurrency,
            logger=self.lg,
            get_all_res=True,)

        return res

    @catch_exceptions_with_class_logger(default_res=[])
    def _get_one_page_business_settlement_records_by_something(self,
                                                               page_num: int,
                                                               start_date: str = None,
                                                               end_date: str = None,
                                                               mid: str = '',
                                                               agent_name: str = '') -> list:
        """Fetch one page of business settlement records.

        :param page_num: 1-based page number (100 records per page)
        :param start_date: eg: '2019-07-01'; defaults to the date returned
            by ``get_1_on_the_month``
        :param end_date: eg: '2019-07-16'; defaults to today
        :param mid: merchant number
        :param agent_name: top-level agency name
        :return: the page's 'data' list (empty list when absent)
        """
        start_date = str(self.get_1_on_the_month() if start_date is None else start_date).split(' ')[0]
        end_date = (str(get_shanghai_time()) if end_date is None else end_date).split(' ')[0]
        self.lg.info('start_date: {}, end_date: {}'.format(start_date, end_date))

        headers = self.get_random_pc_headers()
        headers.update({
            'Accept': '*/*',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Referer': 'https://agent.yrmpay.com/JHAdminConsole/merSettle/querySettleJsp.do',
            'X-Requested-With': 'XMLHttpRequest',
        })
        params = (
            ('_dc', get_now_13_bit_timestamp()),
        )
        data = {
            'startDate': start_date,
            'endDate': end_date,
            'mid': mid,
            'agentName': agent_name,
            # First 8 characters of the login name.
            'loginAgentId': self.zwm_username[0:8],
            'page': str(page_num),
            # Record offset: 0, 100, 200, ...
            'start': str((page_num - 1) * 100),
            'limit': '100',
        }
        url = 'https://agent.yrmpay.com/JHAdminConsole/merSettle/queryMerSettleList.do'
        body = Requests.get_url_body(
            method='post',
            url=url,
            headers=headers,
            params=params,
            cookies=self.login_cookies_dict,
            data=data,
            ip_pool_type=self.ip_pool_type,
            num_retries=self.num_retries,)
        assert body != '', 'body不为空值!'
        res = json_2_dict(
            json_str=body,
            logger=self.lg,
            default_res={}).get('data', [])

        self.lg.info('[{}] page_num: {}'.format(
            '+' if res != [] else '-',
            page_num,
        ))

        return res

    async def _get_all_transaction_details(self) -> list:
        """Fetch all transaction flow records across pages.

        :return: flat list of the records of every fetched page
        """
        async def _get_tasks_params_list() -> list:
            """Build one task param dict per page."""
            return [
                {'page_num': page_num}
                for page_num in range(1, self.max_transaction_details_page_num)
            ]

        tasks_params_list = await _get_tasks_params_list()
        tasks_params_list_obj = TasksParamsListObj(
            tasks_params_list=tasks_params_list,
            step=self.concurrency,)

        all_res = []
        while True:
            try:
                slice_params_list = tasks_params_list_obj.__next__()
            except AssertionError:
                # All slices consumed.
                break

            tasks = []
            for k in slice_params_list:
                page_num = k['page_num']
                self.lg.info('create task[where page_num: {}]...'.format(page_num))
                func_args = [
                    page_num,
                ]
                tasks.append(self.loop.create_task(
                    unblock_func(
                        func_name=self._get_one_page_transaction_details_by_something,
                        func_args=func_args,
                        logger=self.lg,)))

            one_res = await async_wait_tasks_finished(tasks=tasks)
            # Flatten the per-page lists.
            for i in one_res:
                all_res.extend(i)

        return all_res

    @catch_exceptions_with_class_logger(default_res=[])
    def _get_one_page_transaction_details_by_something(self,
                                                       page_num: int,
                                                       start_date: str = None,
                                                       end_date: str = None,
                                                       transaction_status: str = '',
                                                       mer_name: str = '',
                                                       order_no: str = '',
                                                       mid: str = '',
                                                       agent_name: str = '',
                                                       pay_channel: str = '',
                                                       sale_name: str = '',) -> list:
        """Fetch one page of transaction flow records.

        :param page_num: 1-based page number (20 records per page)
        :param start_date: eg: '2019-07-16 00:00'; defaults to today 0:00
        :param end_date: eg: '2019-07-16 10:02'; defaults to now
        :param transaction_status: all: '' | success: '1' | refunded: '3'
        :param mer_name: merchant name to query
        :param order_no: order number
        :param mid: merchant number
        :param agent_name: top-level agency name
        :param pay_channel: any: '' | wechat: '50' | alipay: '51' |
            wechat barcode: '55' | alipay barcode: '56' |
            wechat mini program: '67'
        :param sale_name: sales name
        :return: the page's 'data' list (empty list when absent)
        """
        start_date = self.get_0_00_on_the_day() if start_date is None else start_date
        end_date = str(get_shanghai_time()) if end_date is None else end_date

        headers = self.get_random_pc_headers()
        headers.update({
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': '*/*',
            'Referer': 'https://agent.yrmpay.com/JHAdminConsole/limafuReport/transflow.do',
            'X-Requested-With': 'XMLHttpRequest',
        })
        params = (
            ('_dc', get_now_13_bit_timestamp()),
        )
        data = {
            'startDate': start_date,
            'endDate': end_date,
            'type': '2',
            'status': transaction_status,
            'payChannel': pay_channel,
            'orderNo': order_no,
            'merName': mer_name,
            'mid': mid,
            'agentName': agent_name,
            'saleName': sale_name,
            'page': str(page_num),
            # Record offset: 0, 20, 40, ...
            'start': str((page_num - 1) * 20),
            'limit': '20',
        }
        url = 'https://agent.yrmpay.com/JHAdminConsole/limafuReport/querylimafuTransFlow.do'
        body = Requests.get_url_body(
            method='post',
            url=url,
            headers=headers,
            params=params,
            cookies=self.login_cookies_dict,
            data=data,
            ip_pool_type=self.ip_pool_type,
            num_retries=self.num_retries,)
        assert body != '', 'body不为空值!'
        res = json_2_dict(
            json_str=body,
            logger=self.lg,
            default_res={}).get('data', [])

        self.lg.info('[{}] page_num: {}'.format(
            '+' if res != [] else '-',
            page_num,
        ))

        return res

    def get_0_00_on_the_day(self) -> str:
        """Return today's 0:00 as a 'YYYY-MM-DD 00:00:00' str."""
        now_time = get_shanghai_time()

        return str(datetime(
            year=now_time.year,
            month=now_time.month,
            day=now_time.day))

    def get_1_on_the_month(self) -> str:
        """Return the default settlement window start.

        Despite the name this is the 5th of the *previous* month at 0:00
        (as a 'YYYY-MM-DD 00:00:00' str), so month-end flows are not missed.
        """
        now_time = get_shanghai_time()
        day = 5

        if now_time.month > 1:
            year, month = now_time.year, now_time.month - 1
        else:
            # January: the previous month is December of the previous
            # year, otherwise the window would start in the future.
            year, month = now_time.year - 1, 12

        return str(datetime(
            year=year,
            month=month,
            day=day,))

    def _get_proxies(self) -> dict:
        """Fetch proxies from the configured ip pool.

        :raises AssertionError: when no proxy could be obtained
        """
        proxies = Requests._get_proxies(ip_pool_type=self.ip_pool_type, )
        assert proxies != {}, 'proxies不为空dict!'

        return proxies

    async def _get_random_pc_headers(self) -> dict:
        """Async wrapper around ``get_random_pc_headers``."""
        return self.get_random_pc_headers()

    @staticmethod
    def get_random_pc_headers() -> dict:
        """Build random request headers targeting agent.yrmpay.com."""
        headers = get_random_headers(
            upgrade_insecure_requests=False,
            cache_control='',)
        headers.update({
            'Origin': 'https://agent.yrmpay.com',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'accept': 'text/plain, */*; q=0.01',
            'X-Requested-With': 'XMLHttpRequest',
        })
        return headers

    def __del__(self):
        try:
            del self.lg
            del self.login_cookies_dict
        except AttributeError:
            pass
        try:
            del self.loop
        except AttributeError:
            pass
        collect()